From b2c7d957e299c21e9d9ea974ee7eeddd81aefa3e Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sun, 18 Apr 2021 23:58:24 -0700 Subject: [PATCH 01/42] Added DeltaTree2, which can be shared between updated versions of the same tree, but so far it is 50% slower. --- fdbserver/DeltaTree.h | 578 ++++++++++++++++++++++++++++- fdbserver/VersionedBTree.actor.cpp | 250 +++++++++++-- 2 files changed, 785 insertions(+), 43 deletions(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index f9ddd465b6..abc0896969 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -26,6 +26,8 @@ #include "fdbserver/Knobs.h" #include +#define deltatree_printf(args...) + typedef uint64_t Word; // Get the number of prefix bytes that are the same between a and b, up to their common length of cl static inline int commonPrefixLength(uint8_t const* ap, uint8_t const* bp, int cl) { @@ -198,10 +200,6 @@ struct DeltaTree { smallOffsets.left = offset; } } - - int size(bool large) const { - return delta(large).size() + (large ? 
sizeof(smallOffsets) : sizeof(largeOffsets)); - } }; static constexpr int SmallSizeLimit = std::numeric_limits::max(); @@ -356,8 +354,6 @@ public: Mirror(const void* treePtr = nullptr, const T* lowerBound = nullptr, const T* upperBound = nullptr) : tree((DeltaTree*)treePtr), lower(lowerBound), upper(upperBound) { - // TODO: Remove these copies into arena and require users of Mirror to keep prev and next alive during its - // lifetime lower = new (arena) T(arena, *lower); upper = new (arena) T(arena, *upper); @@ -875,7 +871,10 @@ private: int deltaSize = item.writeDelta(node.delta(largeNodes), *base, commonPrefix); node.delta(largeNodes).setPrefixSource(prefixSourcePrev); - // printf("Serialized %s to %p\n", item.toString().c_str(), &root.delta(largeNodes)); + printf("Serialized %s to offset %d data: %s\n", + item.toString().c_str(), + (uint8_t*)&node - (uint8_t*)this, + StringRef((uint8_t*)&node.delta(largeNodes), deltaSize).toHexString().c_str()); // Continue writing after the serialized Delta. uint8_t* wptr = (uint8_t*)&node.delta(largeNodes) + deltaSize; @@ -899,3 +898,568 @@ private: return wptr - (uint8_t*)&node; } }; + +// ------------------------------------------------------------------ +#pragma pack(push, 1) +template +struct DeltaTree2 { + typedef typename T::Partial Partial; + + struct { + uint16_t numItems; // Number of items in the tree. + uint32_t nodeBytesUsed; // Bytes used by nodes (everything after the tree header) + uint32_t nodeBytesFree; // Bytes left at end of tree to expand into + uint32_t nodeBytesDeleted; // Delta bytes deleted from tree. Note that some of these bytes could be borrowed by + // descendents. + uint8_t initialHeight; // Height of tree as originally built + uint8_t maxHeight; // Maximum height of tree after any insertion. Value of 0 means no insertions done. 
+ bool largeNodes; // Node size, can be calculated as capacity > SmallSizeLimit but it will be used a lot + }; + struct Node { + // Offsets are relative to the start of the tree + union { + struct { + uint32_t leftChild; + uint32_t rightChild; + uint32_t leftParent; + uint32_t rightParent; + + } largeOffsets; + struct { + uint16_t leftChild; + uint16_t rightChild; + uint16_t leftParent; + uint16_t rightParent; + } smallOffsets; + }; + + static int headerSize(bool large) { return large ? sizeof(largeOffsets) : sizeof(smallOffsets); } + + // Delta is located after the offsets, which differs by node size + DeltaT& delta(bool large) { return large ? *(DeltaT*)(&largeOffsets + 1) : *(DeltaT*)(&smallOffsets + 1); }; + + // Delta is located after the offsets, which differs by node size + const DeltaT& delta(bool large) const { + return large ? *(DeltaT*)(&largeOffsets + 1) : *(DeltaT*)(&smallOffsets + 1); + }; + + std::string toString(DeltaTree2* tree) const { + return format("Node{offset=%d leftChild=%d rightChild=%d leftParent=%d rightParent=%d delta=%s}", + tree->nodeOffset(this), + getLeftChildOffset(tree->largeNodes), + getRightChildOffset(tree->largeNodes), + getLeftParentOffset(tree->largeNodes), + getRightParentOffset(tree->largeNodes), + delta(tree->largeNodes).toString().c_str()); + } + +#define getMember(m) (large ? 
largeOffsets.m : smallOffsets.m) +#define setMember(m, v) \ + if (large) { \ + largeOffsets.m = v; \ + } else { \ + smallOffsets.m = v; \ + } + + void setRightChildOffset(bool large, int offset) { setMember(rightChild, offset); } + void setLeftChildOffset(bool large, int offset) { setMember(leftChild, offset); } + void setRightParentOffset(bool large, int offset) { setMember(rightParent, offset); } + void setLeftParentOffset(bool large, int offset) { setMember(leftParent, offset); } + + int getRightChildOffset(bool large) const { return getMember(rightChild); } + int getLeftChildOffset(bool large) const { return getMember(leftChild); } + int getRightParentOffset(bool large) const { return getMember(rightParent); } + int getLeftParentOffset(bool large) const { return getMember(leftParent); } + + int size(bool large) const { return delta(large).size() + headerSize(large); } +#undef getMember +#undef setMember + }; + + static constexpr int SmallSizeLimit = std::numeric_limits::max(); + static constexpr int LargeTreePerNodeExtraOverhead = sizeof(Node::largeOffsets) - sizeof(Node::smallOffsets); + +#pragma pack(pop) + + int nodeOffset(const Node* n) const { return (uint8_t*)n - (uint8_t*)this; } + Node* nodeAt(int offset) { return offset == 0 ? nullptr : (Node*)((uint8_t*)this + offset); } + Node* root() { return numItems == 0 ? 
nullptr : (Node*)(this + 1); } + + int size() const { return sizeof(DeltaTree2) + nodeBytesUsed; } + int capacity() const { return size() + nodeBytesFree; } + + Node& newNode() { return *(Node*)((uint8_t*)this + size()); } + +public: + struct DecodeCache : FastAllocated { + DecodeCache(const T& lowerBound = T(), const T& upperBound = T()) + : lowerBound(arena, lowerBound), upperBound(arena, upperBound) {} + + Arena arena; + T lowerBound; + T upperBound; + std::unordered_map> partials; + Optional& get(int offset) { return partials[offset]; } + + void clear() { + partials.clear(); + Arena a; + lowerBound = T(a, lowerBound); + upperBound = T(a, upperBound); + arena = a; + } + }; + + // Cursor provides a way to seek into a DeltaTree and iterate over its contents + // The cursor needs a DeltaTree pointer and a DecodeCache, which can be shared + // with other DeltaTrees which were incrementally modified to produce the the + // tree that this cursor is referencing. + struct Cursor { + Cursor() : cache(nullptr), node(nullptr) {} + + Cursor(DecodeCache* cache, DeltaTree2* tree) : cache(cache), tree(tree) { node = tree->root(); } + + DeltaTree2* tree; + DecodeCache* cache; + Node* node; + + std::string toString() const { + return format("Cursor{tree=%p cache=%p node=%s item=%s", + tree, + cache, + node == nullptr ? "null" : node->toString(tree).c_str(), + node == nullptr ? "" : get().toString().c_str()); + } + + bool valid() const { return node != nullptr; } + + // Get T for Node n, and provide to n's delta the base and local decode cache entries to use/modify + const T get(Node* n) const { + DeltaT& delta = n->delta(tree->largeNodes); + + // If this node's cache is populated, then the delta can create T from that alone + Optional& c = cache->get(tree->nodeOffset(n)); + if (c.present()) { + return delta.apply(c.get()); + } + + // Otherwise, get the base T + bool basePrev = delta.getPrefixSource(); + int baseOffset = + basePrev ? 
n->getLeftParentOffset(tree->largeNodes) : n->getRightParentOffset(tree->largeNodes); + + // If baseOffset is 0, then base T is DecodeCache's lower or upper bound + if (baseOffset == 0) { + return delta.apply(cache->arena, basePrev ? cache->lowerBound : cache->upperBound, c); + } + + return delta.apply(cache->arena, get(tree->nodeAt(baseOffset)), c); + } + + const T get() const { return get(node); } + + // const tT getOrUpperBound() const { return valid() ? node->item : *mirror->upperBound(); } + + bool operator==(const Cursor& rhs) const { return node == rhs.node; } + bool operator!=(const Cursor& rhs) const { return node != rhs.node; } + + // The seek methods, of the form seek[Less|Greater][orEqual](...) are very similar. + // They attempt move the cursor to the [Greatest|Least] item, based on the name of the function. + // Then will not "see" erased records. + // If successful, they return true, and if not then false a while making the cursor invalid. + // These methods forward arguments to the seek() overloads, see those for argument descriptions. + template + bool seekLessThan(Args... args) { + int cmp = seek(args...); + if (cmp < 0 || (cmp == 0 && node != nullptr)) { + movePrev(); + } + return _hideDeletedBackward(); + } + + template + bool seekLessThanOrEqual(Args... args) { + int cmp = seek(args...); + if (cmp < 0) { + movePrev(); + } + return _hideDeletedBackward(); + } + + template + bool seekGreaterThan(Args... args) { + int cmp = seek(args...); + if (cmp > 0 || (cmp == 0 && node != nullptr)) { + moveNext(); + } + return _hideDeletedForward(); + } + + template + bool seekGreaterThanOrEqual(Args... args) { + int cmp = seek(args...); + if (cmp > 0) { + moveNext(); + } + return _hideDeletedForward(); + } + + // seek() moves the cursor to a node containing s or the node that would be the parent of s if s were to be + // added to the tree. If the tree was empty, the cursor will be invalid and the return value will be 0. 
+ // Otherwise, returns the result of s.compare(item at cursor position) + // Does not skip/avoid deleted nodes. + int seek(const T& s, int skipLen = 0) { + node = nullptr; + deltatree_printf("seek(%s) start %s\n", s.toString().c_str(), toString().c_str()); + Node* n = tree->root(); + int cmp = 0; + + while (n != nullptr) { + node = n; + cmp = s.compare(get(), skipLen); + deltatree_printf("seek(%s) move %s cmp=%d\n", s.toString().c_str(), toString().c_str(), cmp); + if (cmp == 0) { + break; + } + + n = (cmp > 0) ? tree->nodeAt(n->getRightChildOffset(tree->largeNodes)) + : tree->nodeAt(n->getLeftChildOffset(tree->largeNodes)); + } + + return cmp; + } + + bool moveFirst() { + Node* n = tree->root(); + node = n; + deltatree_printf("moveFirst start %s\n", toString().c_str()); + while (n != nullptr) { + n = tree->nodeAt(n->getLeftChildOffset(tree->largeNodes)); + if (n != nullptr) { + node = n; + deltatree_printf("moveFirst move %s\n", toString().c_str()); + } + } + return _hideDeletedForward(); + } + + bool moveLast() { + Node* n = tree->root(); + node = n; + deltatree_printf("moveLast start %s\n", toString().c_str()); + while (n != nullptr) { + n = tree->nodeAt(n->getRightChildOffset(tree->largeNodes)); + if (n != nullptr) { + node = n; + deltatree_printf("moveLast move %s\n", toString().c_str()); + } + } + return _hideDeletedBackward(); + } + + // Try to move to next node, sees deleted nodes. 
+ void _moveNext() { + deltatree_printf("_moveNext start %s\n", toString().c_str()); + // Try to go right + Node* n = tree->nodeAt(node->getRightChildOffset(tree->largeNodes)); + + // If we couldn't go right, then the answer is our next ancestor + if (n == nullptr) { + node = tree->nodeAt(node->getRightParentOffset(tree->largeNodes)); + deltatree_printf("_moveNext move1 %s\n", toString().c_str()); + } else { + // Go left as far as possible + do { + node = n; + deltatree_printf("_moveNext move2 %s\n", toString().c_str()); + n = tree->nodeAt(n->getLeftChildOffset(tree->largeNodes)); + } while (n != nullptr); + } + } + + // Try to move to previous node, sees deleted nodes. + void _movePrev() { + deltatree_printf("_movePrev start %s\n", toString().c_str()); + // Try to go left + Node* n = tree->nodeAt(node->getLeftChildOffset(tree->largeNodes)); + // If we couldn't go left, then the answer is our prev ancestor + if (n == nullptr) { + node = tree->nodeAt(node->getLeftParentOffset(tree->largeNodes)); + deltatree_printf("_movePrev move1 %s\n", toString().c_str()); + } else { + // Go right as far as possible + do { + node = n; + deltatree_printf("_movePrev move2 %s\n", toString().c_str()); + n = tree->nodeAt(n->getRightChildOffset(tree->largeNodes)); + } while (n != nullptr); + } + } + + bool moveNext() { + _moveNext(); + return _hideDeletedForward(); + } + + bool movePrev() { + _movePrev(); + return _hideDeletedBackward(); + } + + bool isErased() const { return node->delta(tree->largeNodes).getDeleted(); } + + // Erase current item by setting its deleted flag to true. + // Tree header is updated if a change is made. + void erase() { + auto& delta = node->delta(tree->largeNodes); + if (!delta.getDeleted()) { + delta.setDeleted(true); + --tree->numItems; + tree->nodeBytesDeleted += (delta.size() + Node::headerSize(tree->largeNodes)); + } + } + + // Un-erase current item by setting its deleted flag to false. + // Tree header is updated if a change is made. 
+ void unErase() { + auto& delta = node->delta(tree->largeNodes); + if (delta.getDeleted()) { + delta.setDeleted(false); + ++tree->numItems; + tree->nodeBytesDeleted -= (delta.size() + Node::headerSize(tree->largeNodes)); + } + } + + // Erase k by setting its deleted flag to true. Returns true only if k existed + bool erase(const T& k, int skipLen = 0) { + Cursor c = *this; + if (c.seek(k, skipLen) == 0 && !c.isErased()) { + c.erase(); + return true; + } + return false; + } + + // Try to insert k into the DeltaTree, updating byte counts and initialHeight if they + // have changed (they won't if k already exists in the tree but was deleted). + // Returns true if successful, false if k does not fit in the space available + // or if k is already in the tree (and was not already deleted). + // Insertion on an empty tree returns false as well. + bool insert(const T& k, int skipLen = 0, int maxHeightAllowed = std::numeric_limits::max()) { + deltatree_printf("insert %s\n", k.toString().c_str()); + + if (tree->numItems == 0) { + return false; + } + + Cursor c = *this; + int height = 0; + // TODO: Inline seek here to add height output + + int cmp = c.seek(k, skipLen); + Node* parent = c.node; + + // If the item is found, mark it erased if it isn't already + if (cmp == 0) { + if (c.isErased()) { + c.unErase(); + return true; + } + return false; + } + + if (height > maxHeightAllowed) { + return false; + } + + Node& child = tree->newNode(); + int childOffset = tree->nodeOffset(&child); + + // If k > c then k becomes c's right child + bool addingRight = cmp > 0; + int leftParentOffset, rightParentOffset; + + // Point either the right or left child of c to the new node + // Set parent pointers for n + if (addingRight) { + // parent is the new node's left parent since n is the right child of parent + leftParentOffset = tree->nodeOffset(parent); + rightParentOffset = parent->getRightParentOffset(tree->largeNodes); + } else { + // parent is the new node's right parent since n is 
the left child of parent + leftParentOffset = parent->getLeftParentOffset(tree->largeNodes); + rightParentOffset = tree->nodeOffset(parent); + } + + T leftBase = leftParentOffset == 0 ? cache->lowerBound : get(tree->nodeAt(leftParentOffset)); + T rightBase = rightParentOffset == 0 ? cache->upperBound : get(tree->nodeAt(rightParentOffset)); + + int common = leftBase.getCommonPrefixLen(rightBase, skipLen); + int commonWithLeftParent = k.getCommonPrefixLen(leftBase, common); + int commonWithRightParent = k.getCommonPrefixLen(rightBase, common); + bool borrowFromLeft = commonWithLeftParent >= commonWithRightParent; + const T& base = borrowFromLeft ? leftBase : rightBase; + int commonPrefix = borrowFromLeft ? commonWithLeftParent : commonWithRightParent; + + int deltaSize = k.deltaSize(base, commonPrefix, false); + int nodeSpace = deltaSize + Node::headerSize(tree->largeNodes); + + if (nodeSpace > tree->nodeBytesFree) { + return false; + } + + if (addingRight) { + parent->setRightChildOffset(tree->largeNodes, childOffset); + } else { + parent->setLeftChildOffset(tree->largeNodes, childOffset); + } + child.setLeftParentOffset(tree->largeNodes, leftParentOffset); + child.setRightParentOffset(tree->largeNodes, rightParentOffset); + child.setRightChildOffset(tree->largeNodes, 0); + child.setLeftChildOffset(tree->largeNodes, 0); + + DeltaT& childDelta = child.delta(tree->largeNodes); + int written = k.writeDelta(childDelta, base, commonPrefix); + ASSERT(deltaSize == written); + childDelta.setPrefixSource(borrowFromLeft); + + tree->nodeBytesUsed += nodeSpace; + tree->nodeBytesFree -= nodeSpace; + ++tree->numItems; + + // Update max height of the tree if necessary + if (height > tree->maxHeight) { + tree->maxHeight = height; + } + + return true; + } + + private: + bool _hideDeletedBackward() { + while (node != nullptr && node->delta(tree->largeNodes).getDeleted()) { + _movePrev(); + } + return node != nullptr; + } + + bool _hideDeletedForward() { + while (node != nullptr && 
node->delta(tree->largeNodes).getDeleted()) { + _moveNext(); + } + return node != nullptr; + } + }; + + // Returns number of bytes written + int build(int spaceAvailable, const T* begin, const T* end, const T* lowerBound, const T* upperBound) { + largeNodes = spaceAvailable > SmallSizeLimit; + int count = end - begin; + numItems = count; + nodeBytesDeleted = 0; + initialHeight = (uint8_t)log2(count) + 1; + maxHeight = 0; + + // The boundary leading to the new page acts as the last time we branched right + if (count > 0) { + nodeBytesUsed = buildSubtree( + *root(), begin, end, lowerBound, upperBound, 0, 0, lowerBound->getCommonPrefixLen(*upperBound, 0)); + } else { + nodeBytesUsed = 0; + } + nodeBytesFree = spaceAvailable - size(); + return size(); + } + +private: + int buildSubtree(Node& node, + const T* begin, + const T* end, + const T* leftParent, + const T* rightParent, + int leftParentOffset, + int rightParentOffset, + int subtreeCommon) { + + int count = end - begin; + + // Find key to be stored in root + int mid = perfectSubtreeSplitPointCached(count); + const T& item = begin[mid]; + + int commonWithPrev = item.getCommonPrefixLen(*leftParent, subtreeCommon); + int commonWithNext = item.getCommonPrefixLen(*rightParent, subtreeCommon); + + bool prefixSourcePrev; + int commonPrefix; + const T* base; + if (commonWithPrev >= commonWithNext) { + prefixSourcePrev = true; + commonPrefix = commonWithPrev; + base = leftParent; + } else { + prefixSourcePrev = false; + commonPrefix = commonWithNext; + base = rightParent; + } + + int deltaSize = item.writeDelta(node.delta(largeNodes), *base, commonPrefix); + node.delta(largeNodes).setPrefixSource(prefixSourcePrev); + + // Continue writing after the serialized Delta. 
+ uint8_t* wptr = (uint8_t*)&node.delta(largeNodes) + deltaSize; + + int leftChildOffset; + // Serialize left subtree + if (count > 1) { + leftChildOffset = wptr - (uint8_t*)this; + deltatree_printf("%p: offset=%d count=%d serialize left subtree leftChildOffset=%d\n", + this, + nodeOffset(&node), + count, + leftChildOffset); + + wptr += buildSubtree(*(Node*)wptr, + begin, + begin + mid, + leftParent, + &item, + leftParentOffset, + nodeOffset(&node), + commonWithPrev); + } else { + leftChildOffset = 0; + } + + int rightChildOffset; + // Serialize right subtree + if (count > 2) { + rightChildOffset = wptr - (uint8_t*)this; + deltatree_printf("%p: offset=%d count=%d serialize right subtree rightChildOffset=%d\n", + this, + nodeOffset(&node), + count, + rightChildOffset); + + wptr += buildSubtree(*(Node*)wptr, + begin + mid + 1, + end, + &item, + rightParent, + nodeOffset(&node), + rightParentOffset, + commonWithNext); + } else { + rightChildOffset = 0; + } + + node.setLeftChildOffset(largeNodes, leftChildOffset); + node.setRightChildOffset(largeNodes, rightChildOffset); + node.setLeftParentOffset(largeNodes, leftParentOffset); + node.setRightParentOffset(largeNodes, rightParentOffset); + + deltatree_printf("%p: Serialized %s as %s\n", this, item.toString().c_str(), node.toString(this).c_str()); + + return wptr - (uint8_t*)&node; + } +}; diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index d7133c9ad7..0ac28c0dc5 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -7109,9 +7109,9 @@ ACTOR Future randomReader(VersionedBTree* btree) { struct IntIntPair { IntIntPair() {} IntIntPair(int k, int v) : k(k), v(v) {} - IntIntPair(Arena& arena, const IntIntPair& toCopy) { *this = toCopy; } + typedef IntIntPair Partial; struct Delta { bool prefixSource; bool deleted; @@ -7120,6 +7120,15 @@ struct IntIntPair { IntIntPair apply(const IntIntPair& base, Arena& arena) { return { base.k + dk, base.v + dv }; } 
+ IntIntPair apply(const Partial& cache) { return cache; } + + IntIntPair apply(Arena& arena, const IntIntPair& base, Optional& cache) { + if (!cache.present()) { + cache = IntIntPair(base.k + dk, base.v + dv); + } + return cache.get(); + } + void setPrefixSource(bool val) { prefixSource = val; } bool getPrefixSource() const { return prefixSource; } @@ -7581,51 +7590,40 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { // Build tree of items std::vector items(uniqueItems.begin(), uniqueItems.end()); - int bufferSize = N * 2 * 20; + int bufferSize = N * 2 * 30; + DeltaTree* tree = (DeltaTree*)new uint8_t[bufferSize]; int builtSize = tree->build(bufferSize, &items[0], &items[items.size()], &prev, &next); ASSERT(builtSize <= bufferSize); - DeltaTree::Mirror r(tree, &prev, &next); - // Grow uniqueItems until tree is full, adding half of new items to toDelete - std::vector toDelete; - while (1) { - IntIntPair p = randomPair(); - auto nextP = p; // also check if next highest/lowest key is not in the set - nextP.v++; - auto prevP = p; - prevP.v--; - if (uniqueItems.count(p) == 0 && uniqueItems.count(nextP) == 0 && uniqueItems.count(prevP) == 0) { - if (!r.insert(p)) { - break; - }; - uniqueItems.insert(p); - if (deterministicRandom()->coinflip()) { - toDelete.push_back(p); - } - // printf("Inserted %s size=%d\n", items.back().toString().c_str(), tree->size()); - } - } - - ASSERT(tree->numItems > 2 * N); - ASSERT(tree->size() <= bufferSize); - - // Update items vector - items = std::vector(uniqueItems.begin(), uniqueItems.end()); + DeltaTree2* tree2 = (DeltaTree2*)new uint8_t[bufferSize]; + int builtSize2 = tree2->build(bufferSize, &items[0], &items[items.size()], &prev, &next); + ASSERT(builtSize2 <= bufferSize); + DeltaTree2::DecodeCache cache(prev, next); + DeltaTree2::Cursor cur2(&cache, tree2); auto printItems = [&] { for (int k = 0; k < items.size(); ++k) { - printf("%d %s\n", k, items[k].toString().c_str()); + printf("%d/%d %s\n", k + 1, items.size(), 
items[k].toString().c_str()); } }; - printf("Count=%d Size=%d InitialHeight=%d MaxHeight=%d\n", - (int)items.size(), - (int)tree->size(), - (int)tree->initialHeight, - (int)tree->maxHeight); - debug_printf("Data(%p): %s\n", tree, StringRef((uint8_t*)tree, tree->size()).toHexString().c_str()); + auto printTrees = [&] { + printf("DeltaTree: Count=%d Size=%d InitialHeight=%d MaxHeight=%d\n", + (int)tree->numItems, + (int)tree->size(), + (int)tree->initialHeight, + (int)tree->maxHeight); + debug_printf_always("Data(%p): %s\n", tree, StringRef((uint8_t*)tree, tree->size()).toHexString().c_str()); + + printf("DeltaTree2: Count=%d Size=%d InitialHeight=%d MaxHeight=%d\n", + (int)tree2->numItems, + (int)tree2->size(), + (int)tree2->initialHeight, + (int)tree2->maxHeight); + debug_printf_always("Data(%p): %s\n", tree2, StringRef((uint8_t*)tree2, tree2->size()).toHexString().c_str()); + }; // Iterate through items and tree forward and backward, verifying tree contents. auto scanAndVerify = [&]() { @@ -7669,56 +7667,148 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { } }; + // Iterate through items and tree forward and backward, verifying tree contents. 
+ auto scanAndVerify2 = [&]() { + printf("Verify tree contents.\n"); + + DeltaTree2::Cursor fwd(&cache, tree2); + DeltaTree2::Cursor rev(&cache, tree2); + + ASSERT(fwd.moveFirst()); + ASSERT(rev.moveLast()); + + for (int i = 0; i < items.size(); ++i) { + if (fwd.get() != items[i]) { + printItems(); + printf("forward iterator i=%d\n %s found\n %s expected\n", + i, + fwd.get().toString().c_str(), + items[i].toString().c_str()); + ASSERT(false); + } + if (rev.get() != items[items.size() - 1 - i]) { + printItems(); + printf("reverse iterator i=%d\n %s found\n %s expected\n", + i, + rev.get().toString().c_str(), + items[items.size() - 1 - i].toString().c_str()); + ASSERT(false); + } + + // Advance iterator, check scanning cursors for correct validity state + int j = i + 1; + bool end = j == items.size(); + + ASSERT(fwd.moveNext() == !end); + ASSERT(rev.movePrev() == !end); + ASSERT(fwd.valid() == !end); + ASSERT(rev.valid() == !end); + + if (end) { + break; + } + } + }; + + printItems(); + printTrees(); + // Verify tree contents scanAndVerify(); + scanAndVerify2(); + + // Grow uniqueItems until tree is full, adding half of new items to toDelete + std::vector toDelete; + while (1) { + IntIntPair p = randomPair(); + auto nextP = p; // also check if next highest/lowest key is not in the set + nextP.v++; + auto prevP = p; + prevP.v--; + if (uniqueItems.count(p) == 0 && uniqueItems.count(nextP) == 0 && uniqueItems.count(prevP) == 0) { + if (!r.insert(p)) { + break; + }; + uniqueItems.insert(p); + if (deterministicRandom()->coinflip()) { + toDelete.push_back(p); + } + // printf("Inserted %s size=%d\n", items.back().toString().c_str(), tree->size()); + } + } + + ASSERT(tree->numItems > 2 * N); + ASSERT(tree->size() <= bufferSize); + + // Update items vector + items = std::vector(uniqueItems.begin(), uniqueItems.end()); + + // Verify tree contents + scanAndVerify(); + scanAndVerify2(); // Create a new mirror, decoding the tree from scratch since insert() modified both the tree 
and the mirror r = DeltaTree::Mirror(tree, &prev, &next); + cache.clear(); scanAndVerify(); + scanAndVerify2(); // For each randomly selected new item to be deleted, delete it from the DeltaTree and from uniqueItems printf("Deleting some items\n"); for (auto p : toDelete) { uniqueItems.erase(p); + DeltaTree::Cursor c = r.getCursor(); ASSERT(c.seekLessThanOrEqual(p)); c.erase(); + + ASSERT(cur2.seekLessThanOrEqual(p)); + cur2.erase(); } // Update items vector items = std::vector(uniqueItems.begin(), uniqueItems.end()); // Verify tree contents after deletions scanAndVerify(); + scanAndVerify2(); printf("Verifying insert/erase behavior for existing items\n"); // Test delete/insert behavior for each item, making no net changes for (auto p : items) { // Insert existing should fail ASSERT(!r.insert(p)); + ASSERT(!cur2.insert(p)); // Erase existing should succeed ASSERT(r.erase(p)); + ASSERT(cur2.erase(p)); // Erase deleted should fail ASSERT(!r.erase(p)); + ASSERT(!cur2.erase(p)); // Insert deleted should succeed ASSERT(r.insert(p)); + ASSERT(cur2.insert(p)); // Insert existing should fail ASSERT(!r.insert(p)); + ASSERT(!cur2.insert(p)); } // Tree contents should still match items vector scanAndVerify(); + scanAndVerify2(); printf("Verifying seek behaviors\n"); DeltaTree::Cursor s = r.getCursor(); + DeltaTree2::Cursor s2(&cache, tree2); // SeekLTE to each element for (int i = 0; i < items.size(); ++i) { IntIntPair p = items[i]; IntIntPair q = p; + ASSERT(s.seekLessThanOrEqual(q)); if (s.get() != p) { printItems(); @@ -7728,12 +7818,23 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { p.toString().c_str()); ASSERT(false); } + + ASSERT(s2.seekLessThanOrEqual(q)); + if (s2.get() != p) { + printItems(); + printf("seekLessThanOrEqual(%s) found %s expected %s\n", + q.toString().c_str(), + s2.get().toString().c_str(), + p.toString().c_str()); + ASSERT(false); + } } // SeekGTE to each element for (int i = 0; i < items.size(); ++i) { IntIntPair p = items[i]; 
IntIntPair q = p; + ASSERT(s.seekGreaterThanOrEqual(q)); if (s.get() != p) { printItems(); @@ -7743,6 +7844,16 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { p.toString().c_str()); ASSERT(false); } + + ASSERT(s2.seekGreaterThanOrEqual(q)); + if (s2.get() != p) { + printItems(); + printf("seekGreaterThanOrEqual(%s) found %s expected %s\n", + q.toString().c_str(), + s2.get().toString().c_str(), + p.toString().c_str()); + ASSERT(false); + } } // SeekLTE to the next possible int pair value after each element to make sure the base element is found @@ -7751,6 +7862,7 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { IntIntPair p = items[i]; IntIntPair q = p; q.v++; + ASSERT(s.seekLessThanOrEqual(q)); if (s.get() != p) { printItems(); @@ -7760,6 +7872,16 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { p.toString().c_str()); ASSERT(false); } + + ASSERT(s2.seekLessThanOrEqual(q)); + if (s2.get() != p) { + printItems(); + printf("seekLessThanOrEqual(%s) found %s expected %s\n", + q.toString().c_str(), + s2.get().toString().c_str(), + p.toString().c_str()); + ASSERT(false); + } } // SeekGTE to the previous possible int pair value after each element to make sure the base element is found @@ -7768,6 +7890,7 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { IntIntPair p = items[i]; IntIntPair q = p; q.v--; + ASSERT(s.seekGreaterThanOrEqual(q)); if (s.get() != p) { printItems(); @@ -7777,6 +7900,16 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { p.toString().c_str()); ASSERT(false); } + + ASSERT(s2.seekGreaterThanOrEqual(q)); + if (s2.get() != p) { + printItems(); + printf("seekGreaterThanOrEqual(%s) found %s expected %s\n", + q.toString().c_str(), + s2.get().toString().c_str(), + p.toString().c_str()); + ASSERT(false); + } } // SeekLTE to each element N times, using every element as a hint @@ -7858,11 +7991,56 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { double(count) / elapsed 
/ 1e6); }; + auto skipSeekPerformance2 = [&](int jumpMax, bool old, bool useHint, int count) { + // Skip to a series of increasing items, jump by up to jumpMax units forward in the + // items, wrapping around to 0. + double start = timer(); + s2.moveFirst(); + auto first = s2; + int pos = 0; + for (int c = 0; c < count; ++c) { + int jump = deterministicRandom()->randomInt(0, jumpMax); + int newPos = pos + jump; + if (newPos >= items.size()) { + pos = 0; + newPos = jump; + s2 = first; + } + IntIntPair q = items[newPos]; + ++q.v; + if (old) { + if (useHint) { + // s.seekLessThanOrEqualOld(q, 0, &s, newPos - pos); + } else { + // s.seekLessThanOrEqualOld(q, 0, nullptr, 0); + } + } else { + if (useHint) { + // s.seekLessThanOrEqual(q, 0, &s, newPos - pos); + } else { + s2.seekLessThanOrEqual(q); + } + } + pos = newPos; + } + double elapsed = timer() - start; + printf("DeltaTree2 Seek/skip test, count=%d jumpMax=%d, items=%d, oldSeek=%d useHint=%d: Elapsed %f seconds " + "%.2f M/s\n", + count, + jumpMax, + items.size(), + old, + useHint, + elapsed, + double(count) / elapsed / 1e6); + }; + // Compare seeking to nearby elements with and without hints, using the old and new SeekLessThanOrEqual methods. // TODO: Once seekLessThanOrEqual() with a hint is as fast as seekLessThanOrEqualOld, remove it. + skipSeekPerformance(8, false, false, 80e6); + skipSeekPerformance2(8, false, false, 80e6); skipSeekPerformance(8, true, false, 80e6); skipSeekPerformance(8, true, true, 80e6); - skipSeekPerformance(8, false, false, 80e6); skipSeekPerformance(8, false, true, 80e6); // Repeatedly seek for one of a set of pregenerated random pairs and time it. From 701f05e513459b58f6748d4ce046d9f1b98f5b30 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Mon, 19 Apr 2021 01:18:48 -0700 Subject: [PATCH 02/42] Bug fix, recursive call to get() could cause rehashing so hash lookup must be redone afterward. 
--- fdbserver/DeltaTree.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index abc0896969..a64345cf5d 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -871,7 +871,7 @@ private: int deltaSize = item.writeDelta(node.delta(largeNodes), *base, commonPrefix); node.delta(largeNodes).setPrefixSource(prefixSourcePrev); - printf("Serialized %s to offset %d data: %s\n", + deltatree_printf("Serialized %s to offset %d data: %s\n", item.toString().c_str(), (uint8_t*)&node - (uint8_t*)this, StringRef((uint8_t*)&node.delta(largeNodes), deltaSize).toHexString().c_str()); @@ -1053,7 +1053,8 @@ public: return delta.apply(cache->arena, basePrev ? cache->lowerBound : cache->upperBound, c); } - return delta.apply(cache->arena, get(tree->nodeAt(baseOffset)), c); + T base = get(tree->nodeAt(baseOffset)); + return delta.apply(cache->arena, base, cache->get(tree->nodeOffset(n))); } const T get() const { return get(node); } From 9ab69b5cb143ece4dd6077708dcb478546220756 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Mon, 19 Apr 2021 03:12:30 -0700 Subject: [PATCH 03/42] RedwoodRecordRef support for DeltaTree2. --- fdbserver/DeltaTree.h | 9 +- fdbserver/VersionedBTree.actor.cpp | 207 ++++++++++++++++++++++++++++- 2 files changed, 207 insertions(+), 9 deletions(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index a64345cf5d..eafb1d664b 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -26,7 +26,8 @@ #include "fdbserver/Knobs.h" #include -#define deltatree_printf(args...) +#define deltatree_printf(...) +//#define deltatree_printf(...) 
printf(__VA_ARGS__) typedef uint64_t Word; // Get the number of prefix bytes that are the same between a and b, up to their common length of cl @@ -872,9 +873,9 @@ private: int deltaSize = item.writeDelta(node.delta(largeNodes), *base, commonPrefix); node.delta(largeNodes).setPrefixSource(prefixSourcePrev); deltatree_printf("Serialized %s to offset %d data: %s\n", - item.toString().c_str(), - (uint8_t*)&node - (uint8_t*)this, - StringRef((uint8_t*)&node.delta(largeNodes), deltaSize).toHexString().c_str()); + item.toString().c_str(), + (uint8_t*)&node - (uint8_t*)this, + StringRef((uint8_t*)&node.delta(largeNodes), deltaSize).toHexString().c_str()); // Continue writing after the serialized Delta. uint8_t* wptr = (uint8_t*)&node.delta(largeNodes) + deltaSize; diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 0ac28c0dc5..1cf5a68b84 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -2614,6 +2614,8 @@ struct RedwoodRecordRef { } } + typedef KeyRef Partial; + KeyValueRef toKeyValueRef() const { return KeyValueRef(key, value.get()); } // RedwoodRecordRefs are used for both internal and leaf pages of the BTree. @@ -2922,6 +2924,10 @@ struct RedwoodRecordRef { bool getDeleted() const { return flags & IS_DELETED; } + RedwoodRecordRef apply(const Partial& cache) { + return RedwoodRecordRef(cache, 0, hasValue() ? 
Optional(getValue()) : Optional()); + } + RedwoodRecordRef apply(const RedwoodRecordRef& base, Arena& arena) const { int keyPrefixLen = getKeyPrefixLength(); int keySuffixLen = getKeySuffixLength(); @@ -2954,6 +2960,13 @@ struct RedwoodRecordRef { return RedwoodRecordRef(k, v, value); } + RedwoodRecordRef apply(Arena& arena, const RedwoodRecordRef& base, Optional& cache) { + RedwoodRecordRef rec = apply(base, arena); + cache = rec.key; + + return rec; + } + int size() const { int size = 1 + getVersionDeltaSizeBytes(); switch (flags & LENGTHS_FORMAT) { @@ -3007,13 +3020,16 @@ struct RedwoodRecordRef { // its values, so the Reader does not require the original prev/next ancestors. struct DeltaValueOnly : Delta { RedwoodRecordRef apply(const RedwoodRecordRef& base, Arena& arena) const { - Optional value; + return RedwoodRecordRef(KeyRef(), 0, hasValue() ? Optional(getValue()) : Optional()); + } - if (hasValue()) { - value = getValue(); - } + RedwoodRecordRef apply(const Partial& cache) { + return RedwoodRecordRef(KeyRef(), 0, hasValue() ? Optional(getValue()) : Optional()); + } - return RedwoodRecordRef(StringRef(), 0, value); + RedwoodRecordRef apply(Arena& arena, const RedwoodRecordRef& base, Optional& cache) { + cache = KeyRef(); + return RedwoodRecordRef(KeyRef(), 0, hasValue() ? 
Optional(getValue()) : Optional()); } }; #pragma pack(pop) @@ -7565,6 +7581,187 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { return Void(); } +TEST_CASE("/redwood/correctness/unit/deltaTree/RedwoodRecordRef2") { + // Sanity check on delta tree node format + ASSERT(DeltaTree2::Node::headerSize(false) == 8); + ASSERT(DeltaTree2::Node::headerSize(true) == 16); + + const int N = deterministicRandom()->randomInt(200, 1000); + + RedwoodRecordRef prev; + RedwoodRecordRef next(LiteralStringRef("\xff\xff\xff\xff")); + + Arena arena; + std::set uniqueItems; + + // Add random items to uniqueItems until its size is N + while (uniqueItems.size() < N) { + std::string k = deterministicRandom()->randomAlphaNumeric(30); + std::string v = deterministicRandom()->randomAlphaNumeric(30); + RedwoodRecordRef rec; + rec.key = StringRef(arena, k); + rec.version = 0; // deterministicRandom()->coinflip() + // ? deterministicRandom()->randomInt64(0, std::numeric_limits::max()) + // : invalidVersion; + if (deterministicRandom()->coinflip()) { + rec.value = StringRef(arena, v); + } + if (uniqueItems.count(rec) == 0) { + uniqueItems.insert(rec); + } + } + std::vector items(uniqueItems.begin(), uniqueItems.end()); + + int bufferSize = N * 100; + bool largeTree = bufferSize > DeltaTree2::SmallSizeLimit; + DeltaTree2* tree = (DeltaTree2*)new uint8_t[bufferSize]; + + tree->build(bufferSize, &items[0], &items[items.size()], &prev, &next); + + printf("Count=%d Size=%d InitialHeight=%d largeTree=%d\n", + (int)items.size(), + (int)tree->size(), + (int)tree->initialHeight, + largeTree); + debug_printf("Data(%p): %s\n", tree, StringRef((uint8_t*)tree, tree->size()).toHexString().c_str()); + + DeltaTree2::DecodeCache cache(prev, next); + DeltaTree2::Cursor c(&cache, tree); + + // Test delete/insert behavior for each item, making no net changes + printf("Testing seek/delete/insert for existing keys with random values\n"); + ASSERT(tree->numItems == items.size()); + for (auto rec : 
items) { + // Insert existing should fail + ASSERT(!c.insert(rec)); + ASSERT(tree->numItems == items.size()); + + // Erase existing should succeed + ASSERT(c.erase(rec)); + ASSERT(tree->numItems == items.size() - 1); + + // Erase deleted should fail + ASSERT(!c.erase(rec)); + ASSERT(tree->numItems == items.size() - 1); + + // Insert deleted should succeed + ASSERT(c.insert(rec)); + ASSERT(tree->numItems == items.size()); + + // Insert existing should fail + ASSERT(!c.insert(rec)); + ASSERT(tree->numItems == items.size()); + } + + DeltaTree2::Cursor fwd = c; + DeltaTree2::Cursor rev = c; + + DeltaTree2::DecodeCache cacheValuesOnly(prev, next); + DeltaTree2::Cursor fwdValueOnly( + &cacheValuesOnly, (DeltaTree2*)tree); + + printf("Verifying tree contents using forward, reverse, and value-only iterators\n"); + ASSERT(fwd.moveFirst()); + ASSERT(fwdValueOnly.moveFirst()); + ASSERT(rev.moveLast()); + + int i = 0; + while (1) { + if (fwd.get() != items[i]) { + printf("forward iterator i=%d\n %s found\n %s expected\n", + i, + fwd.get().toString().c_str(), + items[i].toString().c_str()); + printf("Cursor: %s\n", fwd.toString().c_str()); + ASSERT(false); + } + if (rev.get() != items[items.size() - 1 - i]) { + printf("reverse iterator i=%d\n %s found\n %s expected\n", + i, + rev.get().toString().c_str(), + items[items.size() - 1 - i].toString().c_str()); + printf("Cursor: %s\n", rev.toString().c_str()); + ASSERT(false); + } + if (fwdValueOnly.get().value != items[i].value) { + printf("forward values-only iterator i=%d\n %s found\n %s expected\n", + i, + fwdValueOnly.get().toString().c_str(), + items[i].toString().c_str()); + printf("Cursor: %s\n", fwdValueOnly.toString().c_str()); + ASSERT(false); + } + ++i; + + bool more = fwd.moveNext(); + ASSERT(fwdValueOnly.moveNext() == more); + ASSERT(rev.movePrev() == more); + + ASSERT(fwd.valid() == more); + ASSERT(fwdValueOnly.valid() == more); + ASSERT(rev.valid() == more); + + if (!fwd.valid()) { + break; + } + } + ASSERT(i == 
items.size()); + + { + DeltaTree2::DecodeCache cache(prev, next); + DeltaTree2::Cursor c(&cache, tree); + + printf("Doing 20M random seeks using the same cursor from the same mirror.\n"); + double start = timer(); + + for (int i = 0; i < 20000000; ++i) { + const RedwoodRecordRef& query = items[deterministicRandom()->randomInt(0, items.size())]; + if (!c.seekLessThanOrEqual(query)) { + printf("Not found! query=%s\n", query.toString().c_str()); + ASSERT(false); + } + if (c.get() != query) { + printf("Found incorrect node! query=%s found=%s\n", + query.toString().c_str(), + c.get().toString().c_str()); + ASSERT(false); + } + } + double elapsed = timer() - start; + printf("Elapsed %f\n", elapsed); + } + + // { + // printf("Doing 5M random seeks using 10k random cursors, each from a different mirror.\n"); + // double start = timer(); + // std::vector::Mirror*> mirrors; + // std::vector::Cursor> cursors; + // for (int i = 0; i < 10000; ++i) { + // mirrors.push_back(new DeltaTree2::Mirror(tree, &prev, &next)); + // cursors.push_back(mirrors.back()->getCursor()); + // } + + // for (int i = 0; i < 5000000; ++i) { + // const RedwoodRecordRef& query = items[deterministicRandom()->randomInt(0, items.size())]; + // DeltaTree2::Cursor& c = cursors[deterministicRandom()->randomInt(0, cursors.size())]; + // if (!c.seekLessThanOrEqual(query)) { + // printf("Not found! query=%s\n", query.toString().c_str()); + // ASSERT(false); + // } + // if (c.get() != query) { + // printf("Found incorrect node! 
query=%s found=%s\n", + // query.toString().c_str(), + // c.get().toString().c_str()); + // ASSERT(false); + // } + // } + // double elapsed = timer() - start; + // printf("Elapsed %f\n", elapsed); + // } + + return Void(); +} + TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { const int N = 200; IntIntPair prev = { 1, 0 }; From b0ec76d4011a984951685954676d09376836b95d Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Thu, 22 Apr 2021 14:55:28 -0700 Subject: [PATCH 04/42] Test output improvements. --- fdbserver/DeltaTree.h | 5 ++++- fdbserver/VersionedBTree.actor.cpp | 19 +++++++++++++------ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index eafb1d664b..44d49f525d 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -994,7 +994,10 @@ struct DeltaTree2 { public: struct DecodeCache : FastAllocated { DecodeCache(const T& lowerBound = T(), const T& upperBound = T()) - : lowerBound(arena, lowerBound), upperBound(arena, upperBound) {} + : lowerBound(arena, lowerBound), upperBound(arena, upperBound) { + partials.reserve(10); + printf("size: %d\n", sizeof(OffsetPartial)); + } Arena arena; T lowerBound; diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 1cf5a68b84..c6b7b79fac 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -7139,9 +7139,7 @@ struct IntIntPair { IntIntPair apply(const Partial& cache) { return cache; } IntIntPair apply(Arena& arena, const IntIntPair& base, Optional& cache) { - if (!cache.present()) { - cache = IntIntPair(base.k + dk, base.v + dv); - } + cache = IntIntPair(base.k + dk, base.v + dv); return cache.get(); } @@ -7802,7 +7800,7 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { auto printItems = [&] { for (int k = 0; k < items.size(); ++k) { - printf("%d/%d %s\n", k + 1, items.size(), items[k].toString().c_str()); + debug_printf("%d/%d %s\n", k + 1, items.size(), 
items[k].toString().c_str()); } }; @@ -7812,14 +7810,14 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { (int)tree->size(), (int)tree->initialHeight, (int)tree->maxHeight); - debug_printf_always("Data(%p): %s\n", tree, StringRef((uint8_t*)tree, tree->size()).toHexString().c_str()); + debug_printf("Data(%p): %s\n", tree, StringRef((uint8_t*)tree, tree->size()).toHexString().c_str()); printf("DeltaTree2: Count=%d Size=%d InitialHeight=%d MaxHeight=%d\n", (int)tree2->numItems, (int)tree2->size(), (int)tree2->initialHeight, (int)tree2->maxHeight); - debug_printf_always("Data(%p): %s\n", tree2, StringRef((uint8_t*)tree2, tree2->size()).toHexString().c_str()); + debug_printf("Data(%p): %s\n", tree2, StringRef((uint8_t*)tree2, tree2->size()).toHexString().c_str()); }; // Iterate through items and tree forward and backward, verifying tree contents. @@ -7940,6 +7938,9 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { // Update items vector items = std::vector(uniqueItems.begin(), uniqueItems.end()); + printItems(); + printTrees(); + // Verify tree contents scanAndVerify(); scanAndVerify2(); @@ -7965,6 +7966,9 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { // Update items vector items = std::vector(uniqueItems.begin(), uniqueItems.end()); + printItems(); + printTrees(); + // Verify tree contents after deletions scanAndVerify(); scanAndVerify2(); @@ -7993,6 +7997,9 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { ASSERT(!cur2.insert(p)); } + printItems(); + printTrees(); + // Tree contents should still match items vector scanAndVerify(); scanAndVerify2(); From caf4b3c34597b5c3f15231cc3dd1bdd44ac5afff Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Fri, 23 Apr 2021 22:47:23 -0700 Subject: [PATCH 05/42] DeltaTree2 refactor. Nodes no longer contain parent offsets. DecodedCache no longer uses a hash but rather a vector of DecodedNodes, which Cursors reference by vector index. 
DecodedNodes contain parent node indexes which are populated on-demand, making storage in the serialized form no longer necessary. --- fdbserver/DeltaTree.h | 443 ++++++++++++++++++----------- fdbserver/VersionedBTree.actor.cpp | 33 ++- 2 files changed, 304 insertions(+), 172 deletions(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index 44d49f525d..df94fdd122 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -27,7 +27,7 @@ #include #define deltatree_printf(...) -//#define deltatree_printf(...) printf(__VA_ARGS__) +// #define deltatree_printf(...) printf(__VA_ARGS__) typedef uint64_t Word; // Get the number of prefix bytes that are the same between a and b, up to their common length of cl @@ -922,15 +922,11 @@ struct DeltaTree2 { struct { uint32_t leftChild; uint32_t rightChild; - uint32_t leftParent; - uint32_t rightParent; } largeOffsets; struct { uint16_t leftChild; uint16_t rightChild; - uint16_t leftParent; - uint16_t rightParent; } smallOffsets; }; @@ -945,12 +941,10 @@ struct DeltaTree2 { }; std::string toString(DeltaTree2* tree) const { - return format("Node{offset=%d leftChild=%d rightChild=%d leftParent=%d rightParent=%d delta=%s}", + return format("Node{offset=%d leftChild=%d rightChild=%d delta=%s}", tree->nodeOffset(this), getLeftChildOffset(tree->largeNodes), getRightChildOffset(tree->largeNodes), - getLeftParentOffset(tree->largeNodes), - getRightParentOffset(tree->largeNodes), delta(tree->largeNodes).toString().c_str()); } @@ -964,13 +958,9 @@ struct DeltaTree2 { void setRightChildOffset(bool large, int offset) { setMember(rightChild, offset); } void setLeftChildOffset(bool large, int offset) { setMember(leftChild, offset); } - void setRightParentOffset(bool large, int offset) { setMember(rightParent, offset); } - void setLeftParentOffset(bool large, int offset) { setMember(leftParent, offset); } int getRightChildOffset(bool large) const { return getMember(rightChild); } int getLeftChildOffset(bool large) const { 
return getMember(leftChild); } - int getRightParentOffset(bool large) const { return getMember(rightParent); } - int getLeftParentOffset(bool large) const { return getMember(leftParent); } int size(bool large) const { return delta(large).size() + headerSize(large); } #undef getMember @@ -980,33 +970,66 @@ struct DeltaTree2 { static constexpr int SmallSizeLimit = std::numeric_limits::max(); static constexpr int LargeTreePerNodeExtraOverhead = sizeof(Node::largeOffsets) - sizeof(Node::smallOffsets); -#pragma pack(pop) - int nodeOffset(const Node* n) const { return (uint8_t*)n - (uint8_t*)this; } Node* nodeAt(int offset) { return offset == 0 ? nullptr : (Node*)((uint8_t*)this + offset); } Node* root() { return numItems == 0 ? nullptr : (Node*)(this + 1); } + int rootOffset() { return sizeof(DeltaTree2); } int size() const { return sizeof(DeltaTree2) + nodeBytesUsed; } int capacity() const { return size() + nodeBytesFree; } - Node& newNode() { return *(Node*)((uint8_t*)this + size()); } - public: + struct DecodedNode { + DecodedNode(int nodeOffset, int leftParentIndex, int rightParentIndex) + : nodeOffset(nodeOffset), leftParentIndex(leftParentIndex), rightParentIndex(rightParentIndex), + leftChildIndex(-1), rightChildIndex(-1) {} + int nodeOffset; + int16_t leftParentIndex; + int16_t rightParentIndex; + int16_t leftChildIndex; + int16_t rightChildIndex; + Optional partial; + + Node* node(DeltaTree2* tree) const { return tree->nodeAt(nodeOffset); } + + std::string toString() { + return format("DecodedNode{nodeOffset=%d leftChildIndex=%d rightChildIndex=%d leftParentIndex=%d " + "rightParentIndex=%d}", + (int)nodeOffset, + (int)leftChildIndex, + (int)rightChildIndex, + (int)leftParentIndex, + (int)rightParentIndex); + } + }; +#pragma pack(pop) struct DecodeCache : FastAllocated { DecodeCache(const T& lowerBound = T(), const T& upperBound = T()) : lowerBound(arena, lowerBound), upperBound(arena, upperBound) { - partials.reserve(10); - printf("size: %d\n", 
sizeof(OffsetPartial)); + decodedNodes.reserve(10); + printf("DecodedNode size: %d\n", sizeof(DecodedNode)); } Arena arena; T lowerBound; T upperBound; - std::unordered_map> partials; - Optional& get(int offset) { return partials[offset]; } + + // Index 0 is always the root + std::vector decodedNodes; + + DecodedNode& get(int index) { return decodedNodes[index]; } + + template + int emplace_new(Args&&... args) { + int index = decodedNodes.size(); + decodedNodes.emplace_back(args...); + return index; + } + + bool empty() const { return decodedNodes.empty(); } void clear() { - partials.clear(); + decodedNodes.clear(); Arena a; lowerBound = T(a, lowerBound); upperBound = T(a, upperBound); @@ -1019,54 +1042,68 @@ public: // with other DeltaTrees which were incrementally modified to produce the the // tree that this cursor is referencing. struct Cursor { - Cursor() : cache(nullptr), node(nullptr) {} + Cursor() : cache(nullptr), nodeIndex(-1) {} - Cursor(DecodeCache* cache, DeltaTree2* tree) : cache(cache), tree(tree) { node = tree->root(); } + Cursor(DecodeCache* cache, DeltaTree2* tree, int nodeIndex = -1) + : cache(cache), tree(tree), nodeIndex(nodeIndex) {} + + int rootIndex() { + if (!cache->empty()) { + return 0; + } else if (tree->numItems != 0) { + return cache->emplace_new(tree->rootOffset(), -1, -1); + } + return -1; + } DeltaTree2* tree; DecodeCache* cache; - Node* node; + int nodeIndex; + + Node* node() const { return tree->nodeAt(cache->get(nodeIndex).nodeOffset); } std::string toString() const { - return format("Cursor{tree=%p cache=%p node=%s item=%s", - tree, - cache, - node == nullptr ? "null" : node->toString(tree).c_str(), - node == nullptr ? 
"" : get().toString().c_str()); + if (nodeIndex == -1) { + return format("Cursor{nodeIndex=-1}"); + } + return format("Cursor{item=%s nodeIndex=%d decodedNode=%s node=%s ", + get().toString().c_str(), + nodeIndex, + cache->get(nodeIndex).toString().c_str(), + node()->toString(tree).c_str()); } - bool valid() const { return node != nullptr; } + bool valid() const { return nodeIndex != -1; } // Get T for Node n, and provide to n's delta the base and local decode cache entries to use/modify - const T get(Node* n) const { - DeltaT& delta = n->delta(tree->largeNodes); + const T get(DecodedNode& decoded) const { + DeltaT& delta = decoded.node(tree)->delta(tree->largeNodes); - // If this node's cache is populated, then the delta can create T from that alone - Optional& c = cache->get(tree->nodeOffset(n)); - if (c.present()) { - return delta.apply(c.get()); + // If this node's cached partial is populated, then the delta can create T from that alone + if (decoded.partial.present()) { + return delta.apply(decoded.partial.get()); } // Otherwise, get the base T bool basePrev = delta.getPrefixSource(); - int baseOffset = - basePrev ? n->getLeftParentOffset(tree->largeNodes) : n->getRightParentOffset(tree->largeNodes); + int baseIndex = basePrev ? decoded.leftParentIndex : decoded.rightParentIndex; - // If baseOffset is 0, then base T is DecodeCache's lower or upper bound - if (baseOffset == 0) { - return delta.apply(cache->arena, basePrev ? cache->lowerBound : cache->upperBound, c); + // If baseOffset is -1, then base T is DecodeCache's lower or upper bound + if (baseIndex == -1) { + return delta.apply(cache->arena, basePrev ? 
cache->lowerBound : cache->upperBound, decoded.partial); } - T base = get(tree->nodeAt(baseOffset)); - return delta.apply(cache->arena, base, cache->get(tree->nodeOffset(n))); + // Otherwise, get the base T and apply the delta to it + T base = get(cache->get(baseIndex)); + return delta.apply(cache->arena, base, decoded.partial); } - const T get() const { return get(node); } + const T get() const { return get(cache->get(nodeIndex)); } - // const tT getOrUpperBound() const { return valid() ? node->item : *mirror->upperBound(); } + // const T getOrUpperBound() const { return valid() ? node->item : *mirror->upperBound(); } - bool operator==(const Cursor& rhs) const { return node == rhs.node; } - bool operator!=(const Cursor& rhs) const { return node != rhs.node; } + bool operator==(const Cursor& rhs) const { return nodeIndex == rhs.nodeIndex; } + bool operator!=(const Cursor& rhs) const { return nodeIndex != rhs.nodeIndex; } // The seek methods, of the form seek[Less|Greater][orEqual](...) are very similar. // They attempt move the cursor to the [Greatest|Least] item, based on the name of the function. @@ -1076,7 +1113,7 @@ public: template bool seekLessThan(Args... args) { int cmp = seek(args...); - if (cmp < 0 || (cmp == 0 && node != nullptr)) { + if (cmp < 0 || (cmp == 0 && nodeIndex != -1)) { movePrev(); } return _hideDeletedBackward(); @@ -1094,7 +1131,7 @@ public: template bool seekGreaterThan(Args... args) { int cmp = seek(args...); - if (cmp > 0 || (cmp == 0 && node != nullptr)) { + if (cmp > 0 || (cmp == 0 && nodeIndex != -1)) { moveNext(); } return _hideDeletedForward(); @@ -1109,39 +1146,92 @@ public: return _hideDeletedForward(); } + // Get the right child index for parentIndex + int getRightChildIndex(int parentIndex) { + DecodedNode* parent = &cache->get(parentIndex); + + // The cache may have a child index, but since cache covers multiple versions of a DeltaTree + // it can't be used unless the node in the tree has a child. 
+ int childOffset = parent->node(tree)->getRightChildOffset(tree->largeNodes); + + if (childOffset == 0) { + return -1; + } + + // parent has this child so return the index if it is in DecodedNode + if (parent->rightChildIndex != -1) { + return parent->rightChildIndex; + } + + // Create the child's DecodedNode and get its index + int childIndex = cache->emplace_new(childOffset, parentIndex, parent->rightParentIndex); + + // Set the index in the parent. The cache lookup is repeated because the cache has changed. + cache->get(parentIndex).rightChildIndex = childIndex; + return childIndex; + } + + // Get the left child index for parentIndex + int getLeftChildIndex(int parentIndex) { + DecodedNode* parent = &cache->get(parentIndex); + + // The cache may have a child index, but since cache covers multiple versions of a DeltaTree + // it can't be used unless the node in the tree has a child. + int childOffset = parent->node(tree)->getLeftChildOffset(tree->largeNodes); + + if (childOffset == 0) { + return -1; + } + + // parent has this child so return the index if it is in DecodedNode + if (parent->leftChildIndex != -1) { + return parent->leftChildIndex; + } + + // Create the child's DecodedNode and get its index + int childIndex = cache->emplace_new(childOffset, parent->leftParentIndex, parentIndex); + + // Set the index in the parent. The cache lookup is repeated because the cache has changed. + cache->get(parentIndex).leftChildIndex = childIndex; + return childIndex; + } + // seek() moves the cursor to a node containing s or the node that would be the parent of s if s were to be // added to the tree. If the tree was empty, the cursor will be invalid and the return value will be 0. // Otherwise, returns the result of s.compare(item at cursor position) // Does not skip/avoid deleted nodes. 
int seek(const T& s, int skipLen = 0) { - node = nullptr; + nodeIndex = -1; deltatree_printf("seek(%s) start %s\n", s.toString().c_str(), toString().c_str()); - Node* n = tree->root(); + int nIndex = rootIndex(); int cmp = 0; - while (n != nullptr) { - node = n; + while (nIndex != -1) { + nodeIndex = nIndex; cmp = s.compare(get(), skipLen); - deltatree_printf("seek(%s) move %s cmp=%d\n", s.toString().c_str(), toString().c_str(), cmp); + deltatree_printf("seek(%s) loop cmp=%d %s\n", s.toString().c_str(), cmp, toString().c_str()); if (cmp == 0) { break; } - n = (cmp > 0) ? tree->nodeAt(n->getRightChildOffset(tree->largeNodes)) - : tree->nodeAt(n->getLeftChildOffset(tree->largeNodes)); + if (cmp > 0) { + nIndex = getRightChildIndex(nIndex); + } else { + nIndex = getLeftChildIndex(nIndex); + } } return cmp; } bool moveFirst() { - Node* n = tree->root(); - node = n; + nodeIndex = -1; + int nIndex = rootIndex(); deltatree_printf("moveFirst start %s\n", toString().c_str()); - while (n != nullptr) { - n = tree->nodeAt(n->getLeftChildOffset(tree->largeNodes)); - if (n != nullptr) { - node = n; + while (nIndex != -1) { + nIndex = getLeftChildIndex(nIndex); + if (nIndex != -1) { + nodeIndex = nIndex; deltatree_printf("moveFirst move %s\n", toString().c_str()); } } @@ -1149,13 +1239,13 @@ public: } bool moveLast() { - Node* n = tree->root(); - node = n; + nodeIndex = -1; + int nIndex = rootIndex(); deltatree_printf("moveLast start %s\n", toString().c_str()); - while (n != nullptr) { - n = tree->nodeAt(n->getRightChildOffset(tree->largeNodes)); - if (n != nullptr) { - node = n; + while (nIndex != -1) { + nIndex = getRightChildIndex(nIndex); + if (nIndex != -1) { + nodeIndex = nIndex; deltatree_printf("moveLast move %s\n", toString().c_str()); } } @@ -1166,19 +1256,19 @@ public: void _moveNext() { deltatree_printf("_moveNext start %s\n", toString().c_str()); // Try to go right - Node* n = tree->nodeAt(node->getRightChildOffset(tree->largeNodes)); + int nIndex = 
getRightChildIndex(nodeIndex); // If we couldn't go right, then the answer is our next ancestor - if (n == nullptr) { - node = tree->nodeAt(node->getRightParentOffset(tree->largeNodes)); + if (nIndex == -1) { + nodeIndex = cache->get(nodeIndex).rightParentIndex; deltatree_printf("_moveNext move1 %s\n", toString().c_str()); } else { // Go left as far as possible do { - node = n; + nodeIndex = nIndex; deltatree_printf("_moveNext move2 %s\n", toString().c_str()); - n = tree->nodeAt(n->getLeftChildOffset(tree->largeNodes)); - } while (n != nullptr); + nIndex = getLeftChildIndex(nodeIndex); + } while (nIndex != -1); } } @@ -1186,18 +1276,18 @@ public: void _movePrev() { deltatree_printf("_movePrev start %s\n", toString().c_str()); // Try to go left - Node* n = tree->nodeAt(node->getLeftChildOffset(tree->largeNodes)); + int nIndex = getLeftChildIndex(nodeIndex); // If we couldn't go left, then the answer is our prev ancestor - if (n == nullptr) { - node = tree->nodeAt(node->getLeftParentOffset(tree->largeNodes)); + if (nIndex == -1) { + nodeIndex = cache->get(nodeIndex).leftParentIndex; deltatree_printf("_movePrev move1 %s\n", toString().c_str()); } else { // Go right as far as possible do { - node = n; + nodeIndex = nIndex; deltatree_printf("_movePrev move2 %s\n", toString().c_str()); - n = tree->nodeAt(n->getRightChildOffset(tree->largeNodes)); - } while (n != nullptr); + nIndex = getRightChildIndex(nodeIndex); + } while (nIndex != -1); } } @@ -1211,12 +1301,15 @@ public: return _hideDeletedBackward(); } - bool isErased() const { return node->delta(tree->largeNodes).getDeleted(); } + DeltaT& getDelta() const { return cache->get(nodeIndex).node(tree)->delta(tree->largeNodes); } + + bool isErased() const { return getDelta().getDeleted(); } // Erase current item by setting its deleted flag to true. // Tree header is updated if a change is made. + // Cursor is not moved, so now points to a node marked as deletd. 
void erase() { - auto& delta = node->delta(tree->largeNodes); + auto& delta = getDelta(); if (!delta.getDeleted()) { delta.setDeleted(true); --tree->numItems; @@ -1224,20 +1317,9 @@ public: } } - // Un-erase current item by setting its deleted flag to false. - // Tree header is updated if a change is made. - void unErase() { - auto& delta = node->delta(tree->largeNodes); - if (delta.getDeleted()) { - delta.setDeleted(false); - ++tree->numItems; - tree->nodeBytesDeleted -= (delta.size() + Node::headerSize(tree->largeNodes)); - } - } - // Erase k by setting its deleted flag to true. Returns true only if k existed bool erase(const T& k, int skipLen = 0) { - Cursor c = *this; + Cursor c(cache, tree, -1); if (c.seek(k, skipLen) == 0 && !c.isErased()) { c.erase(); return true; @@ -1253,78 +1335,130 @@ public: bool insert(const T& k, int skipLen = 0, int maxHeightAllowed = std::numeric_limits::max()) { deltatree_printf("insert %s\n", k.toString().c_str()); - if (tree->numItems == 0) { - return false; - } - - Cursor c = *this; + int nIndex = rootIndex(); + int parentIndex = nIndex; + DecodedNode* parentDecoded; + // Result of comparing node at parentIndex + int cmp = 0; + // Height of the inserted node int height = 0; - // TODO: Inline seek here to add height output - int cmp = c.seek(k, skipLen); - Node* parent = c.node; + // Find the parent to add the node to + // This is just seek but modifies parentIndex instead of nodeIndex and tracks the insertion height + deltatree_printf( + "insert(%s) start %s\n", k.toString().c_str(), Cursor(cache, tree, parentIndex).toString().c_str()); + while (nIndex != -1) { + ++height; + parentIndex = nIndex; + parentDecoded = &cache->get(parentIndex); + cmp = k.compare(get(*parentDecoded), skipLen); + deltatree_printf("insert(%s) moved cmp=%d %s\n", + k.toString().c_str(), + cmp, + Cursor(cache, tree, parentIndex).toString().c_str()); + + if (cmp == 0) { + break; + } + + if (cmp > 0) { + deltatree_printf("insert(%s) move right\n", 
k.toString().c_str()); + nIndex = getRightChildIndex(nIndex); + } else { + deltatree_printf("insert(%s) move left\n", k.toString().c_str()); + nIndex = getLeftChildIndex(nIndex); + } + } // If the item is found, mark it erased if it isn't already if (cmp == 0) { - if (c.isErased()) { - c.unErase(); + DeltaT& delta = tree->nodeAt(parentDecoded->nodeOffset)->delta(tree->largeNodes); + if (delta.getDeleted()) { + delta.setDeleted(false); + ++tree->numItems; + tree->nodeBytesDeleted -= (delta.size() + Node::headerSize(tree->largeNodes)); + deltatree_printf("insert(%s) deleted item restored %s\n", + k.toString().c_str(), + Cursor(cache, tree, parentIndex).toString().c_str()); return true; } + deltatree_printf("insert(%s) item exists %s\n", + k.toString().c_str(), + Cursor(cache, tree, parentIndex).toString().c_str()); return false; } - if (height > maxHeightAllowed) { + // If the tree was empty or the max insertion height is exceeded then fail + if (parentIndex == -1 || height > maxHeightAllowed) { return false; } - Node& child = tree->newNode(); - int childOffset = tree->nodeOffset(&child); - - // If k > c then k becomes c's right child + // Find the base to borrow from, see if the resulting delta fits into the tree + int leftBaseIndex, rightBaseIndex; bool addingRight = cmp > 0; - int leftParentOffset, rightParentOffset; - - // Point either the right or left child of c to the new node - // Set parent pointers for n if (addingRight) { - // parent is the new node's left parent since n is the right child of parent - leftParentOffset = tree->nodeOffset(parent); - rightParentOffset = parent->getRightParentOffset(tree->largeNodes); + leftBaseIndex = parentIndex; + rightBaseIndex = parentDecoded->rightParentIndex; } else { - // parent is the new node's right parent since n is the left child of parent - leftParentOffset = parent->getLeftParentOffset(tree->largeNodes); - rightParentOffset = tree->nodeOffset(parent); + leftBaseIndex = parentDecoded->leftParentIndex; + 
rightBaseIndex = parentIndex; } - T leftBase = leftParentOffset == 0 ? cache->lowerBound : get(tree->nodeAt(leftParentOffset)); - T rightBase = rightParentOffset == 0 ? cache->upperBound : get(tree->nodeAt(rightParentOffset)); + T leftBase = leftBaseIndex == -1 ? cache->lowerBound : get(cache->get(leftBaseIndex)); + T rightBase = rightBaseIndex == -1 ? cache->upperBound : get(cache->get(rightBaseIndex)); int common = leftBase.getCommonPrefixLen(rightBase, skipLen); int commonWithLeftParent = k.getCommonPrefixLen(leftBase, common); int commonWithRightParent = k.getCommonPrefixLen(rightBase, common); bool borrowFromLeft = commonWithLeftParent >= commonWithRightParent; - const T& base = borrowFromLeft ? leftBase : rightBase; - int commonPrefix = borrowFromLeft ? commonWithLeftParent : commonWithRightParent; - int deltaSize = k.deltaSize(base, commonPrefix, false); + const T* base; + int commonPrefix; + if (borrowFromLeft) { + base = &leftBase; + commonPrefix = commonWithLeftParent; + } else { + base = &rightBase; + commonPrefix = commonWithRightParent; + } + + int deltaSize = k.deltaSize(*base, commonPrefix, false); int nodeSpace = deltaSize + Node::headerSize(tree->largeNodes); if (nodeSpace > tree->nodeBytesFree) { return false; } - if (addingRight) { - parent->setRightChildOffset(tree->largeNodes, childOffset); - } else { - parent->setLeftChildOffset(tree->largeNodes, childOffset); - } - child.setLeftParentOffset(tree->largeNodes, leftParentOffset); - child.setRightParentOffset(tree->largeNodes, rightParentOffset); - child.setRightChildOffset(tree->largeNodes, 0); - child.setLeftChildOffset(tree->largeNodes, 0); + int childOffset = tree->size(); + Node* childNode = tree->nodeAt(childOffset); + childNode->setLeftChildOffset(tree->largeNodes, 0); + childNode->setRightChildOffset(tree->largeNodes, 0); - DeltaT& childDelta = child.delta(tree->largeNodes); - int written = k.writeDelta(childDelta, base, commonPrefix); + // Create the decoded node and link it to the 
parent + // Link the parent's decodednode to the child's decodednode + // Link the parent node in the tree to the new child node + // true if node is being added to right child + int childIndex = cache->emplace_new(childOffset, leftBaseIndex, rightBaseIndex); + + // Get a new parentDecoded pointer as the cache may have changed allocations + parentDecoded = &cache->get(parentIndex); + + if (addingRight) { + // Adding child to right of parent + parentDecoded->rightChildIndex = childIndex; + parentDecoded->node(tree)->setRightChildOffset(tree->largeNodes, childOffset); + } else { + // Adding child to left of parent + parentDecoded->leftChildIndex = childIndex; + parentDecoded->node(tree)->setLeftChildOffset(tree->largeNodes, childOffset); + } + + // Give k opportunity to populate its cache partial record + k.updateCache(cache->get(childIndex).partial, cache->arena); + + DeltaT& childDelta = childNode->delta(tree->largeNodes); + deltatree_printf("insert(%s) writing delta from %s\n", k.toString().c_str(), base->toString().c_str()); + int written = k.writeDelta(childDelta, *base, commonPrefix); ASSERT(deltaSize == written); childDelta.setPrefixSource(borrowFromLeft); @@ -1337,22 +1471,29 @@ public: tree->maxHeight = height; } + deltatree_printf("insert(%s) done parent=%s\n", + k.toString().c_str(), + Cursor(cache, tree, parentIndex).toString().c_str()); + deltatree_printf("insert(%s) done child=%s\n", + k.toString().c_str(), + Cursor(cache, tree, childIndex).toString().c_str()); + return true; } private: bool _hideDeletedBackward() { - while (node != nullptr && node->delta(tree->largeNodes).getDeleted()) { + while (nodeIndex != -1 && getDelta().getDeleted()) { _movePrev(); } - return node != nullptr; + return nodeIndex != -1; } bool _hideDeletedForward() { - while (node != nullptr && node->delta(tree->largeNodes).getDeleted()) { + while (nodeIndex != -1 && getDelta().getDeleted()) { _moveNext(); } - return node != nullptr; + return nodeIndex != -1; } }; @@ -1368,7 
+1509,7 @@ public: // The boundary leading to the new page acts as the last time we branched right if (count > 0) { nodeBytesUsed = buildSubtree( - *root(), begin, end, lowerBound, upperBound, 0, 0, lowerBound->getCommonPrefixLen(*upperBound, 0)); + *root(), begin, end, lowerBound, upperBound, lowerBound->getCommonPrefixLen(*upperBound, 0)); } else { nodeBytesUsed = 0; } @@ -1382,8 +1523,6 @@ private: const T* end, const T* leftParent, const T* rightParent, - int leftParentOffset, - int rightParentOffset, int subtreeCommon) { int count = end - begin; @@ -1424,14 +1563,7 @@ private: count, leftChildOffset); - wptr += buildSubtree(*(Node*)wptr, - begin, - begin + mid, - leftParent, - &item, - leftParentOffset, - nodeOffset(&node), - commonWithPrev); + wptr += buildSubtree(*(Node*)wptr, begin, begin + mid, leftParent, &item, commonWithPrev); } else { leftChildOffset = 0; } @@ -1446,22 +1578,13 @@ private: count, rightChildOffset); - wptr += buildSubtree(*(Node*)wptr, - begin + mid + 1, - end, - &item, - rightParent, - nodeOffset(&node), - rightParentOffset, - commonWithNext); + wptr += buildSubtree(*(Node*)wptr, begin + mid + 1, end, &item, rightParent, commonWithNext); } else { rightChildOffset = 0; } node.setLeftChildOffset(largeNodes, leftChildOffset); node.setRightChildOffset(largeNodes, rightChildOffset); - node.setLeftParentOffset(largeNodes, leftParentOffset); - node.setRightParentOffset(largeNodes, rightParentOffset); deltatree_printf("%p: Serialized %s as %s\n", this, item.toString().c_str(), node.toString(this).c_str()); diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index c6b7b79fac..b52cbf25d1 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -2616,6 +2616,8 @@ struct RedwoodRecordRef { typedef KeyRef Partial; + void updateCache(Optional cache, Arena& arena) const { cache = KeyRef(arena, key); } + KeyValueRef toKeyValueRef() const { return KeyValueRef(key, value.get()); } // 
RedwoodRecordRefs are used for both internal and leaf pages of the BTree. @@ -7128,6 +7130,8 @@ struct IntIntPair { IntIntPair(Arena& arena, const IntIntPair& toCopy) { *this = toCopy; } typedef IntIntPair Partial; + + void updateCache(Optional cache, Arena& arena) const {} struct Delta { bool prefixSource; bool deleted; @@ -7581,8 +7585,9 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { TEST_CASE("/redwood/correctness/unit/deltaTree/RedwoodRecordRef2") { // Sanity check on delta tree node format - ASSERT(DeltaTree2::Node::headerSize(false) == 8); - ASSERT(DeltaTree2::Node::headerSize(true) == 16); + ASSERT(DeltaTree2::Node::headerSize(false) == 4); + ASSERT(DeltaTree2::Node::headerSize(true) == 8); + ASSERT(sizeof(DeltaTree2::DecodedNode) == 28); const int N = deterministicRandom()->randomInt(200, 1000); @@ -7822,7 +7827,7 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { // Iterate through items and tree forward and backward, verifying tree contents. auto scanAndVerify = [&]() { - printf("Verify tree contents.\n"); + printf("Verify DeltaTree contents.\n"); DeltaTree::Cursor fwd = r.getCursor(); DeltaTree::Cursor rev = r.getCursor(); @@ -7864,7 +7869,7 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { // Iterate through items and tree forward and backward, verifying tree contents. 
auto scanAndVerify2 = [&]() { - printf("Verify tree contents.\n"); + printf("Verify DeltaTree2 contents.\n"); DeltaTree2::Cursor fwd(&cache, tree2); DeltaTree2::Cursor rev(&cache, tree2); @@ -7914,16 +7919,19 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { // Grow uniqueItems until tree is full, adding half of new items to toDelete std::vector toDelete; - while (1) { + int maxInsert = 9999999; + bool shouldBeFull = false; + while (maxInsert-- > 0) { IntIntPair p = randomPair(); - auto nextP = p; // also check if next highest/lowest key is not in the set - nextP.v++; - auto prevP = p; - prevP.v--; - if (uniqueItems.count(p) == 0 && uniqueItems.count(nextP) == 0 && uniqueItems.count(prevP) == 0) { - if (!r.insert(p)) { + // Insert record if it, its predecessor, and its successor are not present. + // Test data is intentionally sparse to test finding each record with a directional + // seek from each adjacent possible but not present record. + if (uniqueItems.count(p) == 0 && uniqueItems.count(IntIntPair(p.k, p.v - 1)) == 0 && uniqueItems.count(IntIntPair(p.k, p.v + 1)) == 0) { + if (!cur2.insert(p)) { + shouldBeFull = true; break; }; + ASSERT(r.insert(p)); uniqueItems.insert(p); if (deterministicRandom()->coinflip()) { toDelete.push_back(p); @@ -7932,7 +7940,8 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { } } - ASSERT(tree->numItems > 2 * N); + // If the tree refused to insert an item, the count should be at least 2*N + ASSERT(!shouldBeFull || tree->numItems > 2 * N); ASSERT(tree->size() <= bufferSize); // Update items vector From 65cfd312215da6f059ef0095c204c3110a605a29 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Fri, 23 Apr 2021 23:38:40 -0700 Subject: [PATCH 06/42] Added memory-only option for redwood set test. 
--- fdbserver/VersionedBTree.actor.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index b52cbf25d1..9d5a00235c 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -8787,6 +8787,7 @@ TEST_CASE(":/redwood/performance/set") { state int concurrentScans = params.getInt("concurrentScans").orDefault(64); state int seeks = params.getInt("seeks").orDefault(1000000); state int scans = params.getInt("scans").orDefault(20000); + state bool pagerMemoryOnly = params.getInt("pagerMemoryOnly").orDefault(0); printf("pageSize: %d\n", pageSize); printf("pageCacheBytes: %" PRId64 "\n", pageCacheBytes); @@ -8815,7 +8816,7 @@ TEST_CASE(":/redwood/performance/set") { deleteFile(fileName); } - DWALPager* pager = new DWALPager(pageSize, fileName, pageCacheBytes, remapCleanupWindow); + DWALPager* pager = new DWALPager(pageSize, fileName, pageCacheBytes, remapCleanupWindow, pagerMemoryOnly); state VersionedBTree* btree = new VersionedBTree(pager, fileName); wait(btree->init()); printf("Initialized. StorageBytes=%s\n", btree->getStorageBytes().toString().c_str()); From d208d3f3ecb8e8e8530d1ef28c11893e1b289d93 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Mon, 26 Apr 2021 23:14:04 -0700 Subject: [PATCH 07/42] Bug fixes, moveFirst/Last didn't handle tree size of 1 correctly. 
--- fdbserver/DeltaTree.h | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index df94fdd122..221145eb62 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -1003,11 +1003,11 @@ public: } }; #pragma pack(pop) - struct DecodeCache : FastAllocated { + struct DecodeCache : FastAllocated, ReferenceCounted { DecodeCache(const T& lowerBound = T(), const T& upperBound = T()) : lowerBound(arena, lowerBound), upperBound(arena, upperBound) { decodedNodes.reserve(10); - printf("DecodedNode size: %d\n", sizeof(DecodedNode)); + deltatree_printf("DecodedNode size: %d\n", sizeof(DecodedNode)); } Arena arena; @@ -1100,7 +1100,7 @@ public: const T get() const { return get(cache->get(nodeIndex)); } - // const T getOrUpperBound() const { return valid() ? node->item : *mirror->upperBound(); } + const T getOrUpperBound() const { return valid() ? get() : cache->upperBound; } bool operator==(const Cursor& rhs) const { return nodeIndex == rhs.nodeIndex; } bool operator!=(const Cursor& rhs) const { return nodeIndex != rhs.nodeIndex; } @@ -1229,11 +1229,9 @@ public: int nIndex = rootIndex(); deltatree_printf("moveFirst start %s\n", toString().c_str()); while (nIndex != -1) { + nodeIndex = nIndex; + deltatree_printf("moveFirst moved %s\n", toString().c_str()); nIndex = getLeftChildIndex(nIndex); - if (nIndex != -1) { - nodeIndex = nIndex; - deltatree_printf("moveFirst move %s\n", toString().c_str()); - } } return _hideDeletedForward(); } @@ -1243,11 +1241,9 @@ public: int nIndex = rootIndex(); deltatree_printf("moveLast start %s\n", toString().c_str()); while (nIndex != -1) { + nodeIndex = nIndex; + deltatree_printf("moveLast moved %s\n", toString().c_str()); nIndex = getRightChildIndex(nIndex); - if (nIndex != -1) { - nodeIndex = nIndex; - deltatree_printf("moveLast move %s\n", toString().c_str()); - } } return _hideDeletedBackward(); } From 1d947bff2d6c3c4e7558f54bdcae7db776c12cfc Mon Sep 17 
00:00:00 2001 From: Steve Atherton Date: Mon, 17 May 2021 00:00:15 -0700 Subject: [PATCH 08/42] Initial pass at getting BTree to compile with DeltaTree2. Does not work since the Cursor contract has changed. --- fdbserver/VersionedBTree.actor.cpp | 213 +++++++++++++++-------------- 1 file changed, 111 insertions(+), 102 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 9d5a00235c..9b1c06489e 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -3018,7 +3018,7 @@ struct RedwoodRecordRef { } }; - // Using this class as an alternative for Delta enables reading a DeltaTree while only decoding + // Using this class as an alternative for Delta enables reading a DeltaTree2 while only decoding // its values, so the Reader does not require the original prev/next ancestors. struct DeltaValueOnly : Delta { RedwoodRecordRef apply(const RedwoodRecordRef& base, Arena& arena) const { @@ -3152,8 +3152,8 @@ struct RedwoodRecordRef { }; struct BTreePage { - typedef DeltaTree BinaryTree; - typedef DeltaTree ValueTree; + typedef DeltaTree2 BinaryTree; + typedef DeltaTree2 ValueTree; #pragma pack(push, 1) struct { @@ -3171,11 +3171,10 @@ struct BTreePage { BinaryTree& tree() { return *(BinaryTree*)(this + 1); } - const BinaryTree& tree() const { return *(const BinaryTree*)(this + 1); } + BinaryTree& tree() const { return *(BinaryTree*)(this + 1); } - const ValueTree& valueTree() const { return *(const ValueTree*)(this + 1); } + ValueTree& valueTree() const { return *(ValueTree*)(this + 1); } - // TODO: boundaries are for decoding, but upper std::string toString(bool write, BTreePageIDRef id, Version ver, @@ -3197,8 +3196,8 @@ struct BTreePage { if (tree().numItems > 0) { // This doesn't use the cached reader for the page because it is only for debugging purposes, // a cached reader may not exist - BinaryTree::Mirror reader(&tree(), lowerBound, upperBound); - BinaryTree::Cursor c = 
reader.getCursor(); + BinaryTree::DecodeCache cache(*lowerBound, *upperBound); + BinaryTree::Cursor c(&cache, &tree()); c.moveFirst(); ASSERT(c.valid()); @@ -3226,7 +3225,7 @@ struct BTreePage { // Out of range entries are actually okay now and the result of subtree deletion followed by // incremental insertions of records in the deleted range being added to an adjacent subtree // which is logically expanded encompass the deleted range but still is using the original - // subtree boundaries as DeltaTree boundaries. + // subtree boundaries as DeltaTree2 boundaries. // ASSERT(!anyOutOfRange); } } catch (Error& e) { @@ -3249,7 +3248,8 @@ static void makeEmptyRoot(Reference page) { } BTreePage::BinaryTree::Cursor getCursor(const Reference& page) { - return ((BTreePage::BinaryTree::Mirror*)page->userData)->getCursor(); + return BTreePage::BinaryTree::Cursor((BTreePage::BinaryTree::DecodeCache*)page->userData, + &((BTreePage*)page->begin())->tree()); } struct BoundaryRefAndPage { @@ -3498,8 +3498,8 @@ public: // Iterate over page entries, skipping key decoding using BTreePage::ValueTree which uses // RedwoodRecordRef::DeltaValueOnly as the delta type type to skip key decoding - BTreePage::ValueTree::Mirror reader(&btPage.valueTree(), &dbBegin, &dbEnd); - auto c = reader.getCursor(); + BTreePage::ValueTree::DecodeCache cache(dbBegin, dbEnd); + BTreePage::ValueTree::Cursor c(&cache, &btPage.valueTree()); ASSERT(c.moveFirst()); Version v = entry.version; while (1) { @@ -3930,7 +3930,7 @@ private: int count; // Number of records added to the page int pageSize; // Page size required to hold a BTreePage of the added records, which is a multiple of blockSize int bytesLeft; // Bytes in pageSize that are unused by the BTreePage so far - bool largeDeltaTree; // Whether or not the DeltaTree in the generated page is in the 'large' size range + bool largeDeltaTree; // Whether or not the tree in the generated page is in the 'large' size range int blockSize; // Base block size by which 
pageSize can be incremented int blockCount; // The number of blocks in pageSize int kvBytes; // The amount of user key/value bytes added to the page @@ -4361,13 +4361,17 @@ private: metrics.pageReadExt += (id.size() - 1); if (!forLazyClear && page->userData == nullptr) { - debug_printf("readPage() Creating Mirror for %s @%" PRId64 " lower=%s upper=%s\n", + debug_printf("readPage() Creating DecodeCache for %s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), snapshot->getVersion(), lowerBound->toString(false).c_str(), upperBound->toString(false).c_str()); - page->userData = new BTreePage::BinaryTree::Mirror(&pTreePage->tree(), lowerBound, upperBound); - page->userDataDestructor = [](void* ptr) { delete (BTreePage::BinaryTree::Mirror*)ptr; }; + + BTreePage::BinaryTree::DecodeCache* cache = + new BTreePage::BinaryTree::DecodeCache(*lowerBound, *upperBound); + cache->addref(); + page->userData = cache; + page->userDataDestructor = [](void* cache) { ((BTreePage::BinaryTree::DecodeCache*)cache)->delref(); }; } if (!forLazyClear) { @@ -4432,16 +4436,15 @@ private: return newID; } - // Copy page and initialize a Mirror for reading it. 
+ // Copy page to a new page which shares the same DecodeCache with the old page Reference cloneForUpdate(Reference page) { Reference newPage = page->cloneContents(); - auto oldMirror = (const BTreePage::BinaryTree::Mirror*)page->userData; - auto newBTPage = (BTreePage*)newPage->mutate(); + BTreePage::BinaryTree::DecodeCache* cache = (BTreePage::BinaryTree::DecodeCache*)page->userData; + cache->addref(); + newPage->userData = cache; + newPage->userDataDestructor = [](void* cache) { ((BTreePage::BinaryTree::DecodeCache*)cache)->delref(); }; - newPage->userData = - new BTreePage::BinaryTree::Mirror(&newBTPage->tree(), oldMirror->lowerBound(), oldMirror->upperBound()); - newPage->userDataDestructor = [](void* ptr) { delete (BTreePage::BinaryTree::Mirror*)ptr; }; return newPage; } @@ -4453,12 +4456,12 @@ private: // Subtree clears can cause the boundaries for decoding the page to be more restrictive than the subtree's // logical boundaries. When a subtree is fully cleared, the link to it is replaced with a null link, but // the key boundary remains in tact to support decoding of the previous subtree. - const RedwoodRecordRef* subtreeLowerBound; - const RedwoodRecordRef* subtreeUpperBound; + RedwoodRecordRef subtreeLowerBound; + RedwoodRecordRef subtreeUpperBound; // The lower/upper bound for decoding the root of the subtree - const RedwoodRecordRef* decodeLowerBound; - const RedwoodRecordRef* decodeUpperBound; + RedwoodRecordRef decodeLowerBound; + RedwoodRecordRef decodeUpperBound; bool boundariesNormal() const { // If the decode upper boundary is the subtree upper boundary the pointers will be the same @@ -4466,7 +4469,7 @@ private: // that the keys are the same. This happens for the first remaining subtree of an internal page // after the prior subtree(s) were cleared. 
return (decodeUpperBound == subtreeUpperBound) && - (decodeLowerBound == subtreeLowerBound || decodeLowerBound->sameExceptValue(*subtreeLowerBound)); + (decodeLowerBound == subtreeLowerBound || decodeLowerBound.sameExceptValue(subtreeLowerBound)); } // The record range of the subtree slice is cBegin to cEnd @@ -4495,7 +4498,7 @@ private: // The upper boundary expected, if any, by the last child in either [cBegin, cEnd) or newLinks // If the last record in the range has a null link then this will be null. - const RedwoodRecordRef* expectedUpperBound; + Optional expectedUpperBound; bool inPlaceUpdate; @@ -4505,7 +4508,7 @@ private: void cleared() { inPlaceUpdate = false; childrenChanged = true; - expectedUpperBound = nullptr; + expectedUpperBound.reset(); } // Page was updated in-place through edits and written to maybeNewID @@ -4519,9 +4522,9 @@ private: metrics.modifyItemCount += btPage->tree().numItems; // The boundaries can't have changed, but the child page link may have. - if (maybeNewID != decodeLowerBound->getChildPage()) { + if (maybeNewID != decodeLowerBound.getChildPage()) { // Add page's decode lower bound to newLinks set without its child page, intially - newLinks.push_back_deep(newLinks.arena(), decodeLowerBound->withoutValue()); + newLinks.push_back_deep(newLinks.arena(), decodeLowerBound.withoutValue()); // Set the child page ID, which has already been allocated in result.arena() newLinks.back().setChildPage(maybeNewID); @@ -4542,7 +4545,7 @@ private: // If the replacement records ended on a non-null child page, then the expect upper bound is // the subtree upper bound since that is what would have been used for the page(s) rebuild, // otherwise it is null. - expectedUpperBound = newLinks.back().value.present() ? subtreeUpperBound : nullptr; + expectedUpperBound = newLinks.back().value.present() ? 
subtreeUpperBound : Optional(); } // Get the first record for this range AFTER applying whatever changes were made @@ -4553,7 +4556,7 @@ private: } return &newLinks.front(); } - return decodeLowerBound; + return &decodeLowerBound; } std::string toString() const { @@ -4564,12 +4567,12 @@ private: childrenChanged && newLinks.empty(), childrenChanged, inPlaceUpdate); - s += format("SubtreeLower: %s\n", subtreeLowerBound->toString(false).c_str()); - s += format(" DecodeLower: %s\n", decodeLowerBound->toString(false).c_str()); - s += format(" DecodeUpper: %s\n", decodeUpperBound->toString(false).c_str()); - s += format("SubtreeUpper: %s\n", subtreeUpperBound->toString(false).c_str()); + s += format("SubtreeLower: %s\n", subtreeLowerBound.toString(false).c_str()); + s += format(" DecodeLower: %s\n", decodeLowerBound.toString(false).c_str()); + s += format(" DecodeUpper: %s\n", decodeUpperBound.toString(false).c_str()); + s += format("SubtreeUpper: %s\n", subtreeUpperBound.toString(false).c_str()); s += format("expectedUpperBound: %s\n", - expectedUpperBound ? expectedUpperBound->toString(false).c_str() : "(null)"); + expectedUpperBound.present() ? 
expectedUpperBound.get().toString(false).c_str() : "(null)"); for (int i = 0; i < newLinks.size(); ++i) { s += format(" %i: %s\n", i, newLinks[i].toString(false).c_str()); } @@ -4580,19 +4583,19 @@ private: struct InternalPageModifier { InternalPageModifier() {} - InternalPageModifier(BTreePage* p, BTreePage::BinaryTree::Mirror* m, bool updating, ParentInfo* parentInfo) - : btPage(p), m(m), updating(updating), changesMade(false), parentInfo(parentInfo) {} + InternalPageModifier(BTreePage* p, BTreePage::BinaryTree::Cursor& c, bool updating, ParentInfo* parentInfo) + : btPage(p), c(c), updating(updating), changesMade(false), parentInfo(parentInfo) {} bool updating; BTreePage* btPage; - BTreePage::BinaryTree::Mirror* m; + BTreePage::BinaryTree::Cursor c; Standalone> rebuild; bool changesMade; ParentInfo* parentInfo; bool empty() const { if (updating) { - return m->tree->numItems == 0; + return c.tree->numItems == 0; } else { return rebuild.empty(); } @@ -4608,14 +4611,14 @@ private: const RedwoodRecordRef& rec = recs[i]; debug_printf("internal page (updating) insert: %s\n", rec.toString(false).c_str()); - if (!m->insert(rec)) { + if (!c.insert(rec)) { debug_printf("internal page: failed to insert %s, switching to rebuild\n", rec.toString(false).c_str()); // Update failed, so populate rebuild vector with everything up to but not including end, which // may include items from recs that were already added. 
auto c = end; if (c.moveFirst()) { - rebuild.reserve(rebuild.arena(), c.mirror->tree->numItems); + rebuild.reserve(rebuild.arena(), c.tree->numItems); while (c != end) { debug_printf(" internal page rebuild: add %s\n", c.get().toString(false).c_str()); rebuild.push_back(rebuild.arena(), c.get()); @@ -4688,7 +4691,7 @@ private: } else { if (u.inPlaceUpdate) { - for (auto id : u.decodeLowerBound->getChildPage()) { + for (auto id : u.decodeLowerBound.getChildPage()) { parentInfo->pageUpdated(id); } } @@ -4697,11 +4700,11 @@ private: } // If there is an expected upper boundary for the next range after u - if (u.expectedUpperBound != nullptr) { + if (u.expectedUpperBound.present()) { // Then if it does not match the next boundary then insert a dummy record - if (nextBoundary == nullptr || - (nextBoundary != u.expectedUpperBound && !nextBoundary->sameExceptValue(*u.expectedUpperBound))) { - RedwoodRecordRef rec = u.expectedUpperBound->withoutValue(); + if (nextBoundary == nullptr || (nextBoundary != &u.expectedUpperBound.get() && + !nextBoundary->sameExceptValue(u.expectedUpperBound.get()))) { + RedwoodRecordRef rec = u.expectedUpperBound.get().withoutValue(); debug_printf("applyUpdate adding dummy record %s\n", rec.toString(false).c_str()); insert(u.cEnd, { &rec, 1 }); changesMade = true; @@ -4748,7 +4751,7 @@ private: state FlowLock::Releaser readLock(*commitReadLock); state bool fromCache = false; state Reference page = wait( - readPage(snapshot, rootID, update->decodeLowerBound, update->decodeUpperBound, false, false, &fromCache)); + readPage(snapshot, rootID, &update->decodeLowerBound, &update->decodeUpperBound, false, false, &fromCache)); readLock.release(); state BTreePage* btPage = (BTreePage*)page->begin(); @@ -4762,7 +4765,6 @@ private: // If trying to update the page and the page reference points into the cache, // we need to clone it so we don't modify the original version of the page. 
- // TODO: Refactor DeltaTree::Mirror so it can be shared between different versions of pages if (tryToUpdate && fromCache) { page = self->cloneForUpdate(page); btPage = (BTreePage*)page->begin(); @@ -4772,7 +4774,8 @@ private: debug_printf( "%s commitSubtree(): %s\n", context.c_str(), - btPage->toString(false, rootID, snapshot->getVersion(), update->decodeLowerBound, update->decodeUpperBound) + btPage + ->toString(false, rootID, snapshot->getVersion(), &update->decodeLowerBound, &update->decodeUpperBound) .c_str()); state BTreePage::BinaryTree::Cursor cursor = getCursor(page); @@ -4829,7 +4832,7 @@ private: // - there actually is a change (whether a set or a clear, old records are to be removed) // - either this is not the first boundary or it is but its key matches our lower bound key bool applyBoundaryChange = mBegin.mutation().boundaryChanged && - (!firstMutationBoundary || mBegin.key() == update->subtreeLowerBound->key); + (!firstMutationBoundary || mBegin.key() == update->subtreeLowerBound.key); firstMutationBoundary = false; // Iterate over records for the mutation boundary key, keep them unless the boundary key was changed or @@ -4875,7 +4878,7 @@ private: // If updating, add to the page, else add to the output set if (updating) { - if (cursor.mirror->insert(rec, update->skipLen, maxHeightAllowed)) { + if (cursor.insert(rec, update->skipLen, maxHeightAllowed)) { btPage->kvBytes += rec.kvBytes(); debug_printf( "%s Inserted %s [mutation, boundary start]\n", context.c_str(), rec.toString().c_str()); @@ -4996,9 +4999,9 @@ private: writeVersion = self->getLastCommittedVersion() + 1; if (updating) { - const BTreePage::BinaryTree& deltaTree = btPage->tree(); + const BTreePage::BinaryTree& DeltaTree2 = btPage->tree(); // If the tree is now empty, delete the page - if (deltaTree.numItems == 0) { + if (DeltaTree2.numItems == 0) { update->cleared(); self->freeBTreePage(rootID, writeVersion); debug_printf("%s Page updates cleared all entries, returning %s\n", @@ 
-5029,8 +5032,8 @@ private: // Rebuild new page(s). state Standalone> entries = wait(writePages(self, - update->subtreeLowerBound, - update->subtreeUpperBound, + &update->subtreeLowerBound, + &update->subtreeUpperBound, merged, btPage->height, writeVersion, @@ -5061,7 +5064,7 @@ private: // Subtree lower boundary is this page's subtree lower bound or cursor u.cBegin = cursor; - u.decodeLowerBound = &cursor.get(); + u.decodeLowerBound = cursor.get(); if (first) { u.subtreeLowerBound = update->subtreeLowerBound; first = false; @@ -5072,7 +5075,7 @@ private: // mBegin is either at or greater than subtreeLowerBound->key, which was the subtreeUpperBound->key // for the previous subtree slice. But we need it to be at or *before* subtreeLowerBound->key // so if mBegin.key() is not exactly the subtree lower bound key then decrement it. - if (mBegin.key() != u.subtreeLowerBound->key) { + if (mBegin.key() != u.subtreeLowerBound.key) { --mBegin; } } @@ -5083,14 +5086,14 @@ private: // The decode upper bound is always the next key after the child link, or the decode upper bound for // this page if (cursor.moveNext()) { - u.decodeUpperBound = &cursor.get(); + u.decodeUpperBound = cursor.get(); // If cursor record has a null child page then it exists only to preserve a previous // subtree boundary that is now needed for reading the subtree at cBegin. if (!cursor.get().value.present()) { // If the upper bound is provided by a dummy record in [cBegin, cEnd) then there is no // requirement on the next subtree range or the parent page to have a specific upper boundary // for decoding the subtree. 
- u.expectedUpperBound = nullptr; + u.expectedUpperBound.reset(); cursor.moveNext(); // If there is another record after the null child record, it must have a child page value ASSERT(!cursor.valid() || cursor.get().value.present()); @@ -5101,12 +5104,12 @@ private: u.decodeUpperBound = update->decodeUpperBound; u.expectedUpperBound = update->decodeUpperBound; } - u.subtreeUpperBound = cursor.valid() ? &cursor.get() : update->subtreeUpperBound; + u.subtreeUpperBound = cursor.valid() ? cursor.get() : update->subtreeUpperBound; u.cEnd = cursor; u.skipLen = 0; // TODO: set this // Find the mutation buffer range that includes all changes to the range described by u - mEnd = mutationBuffer->lower_bound(u.subtreeUpperBound->key); + mEnd = mutationBuffer->lower_bound(u.subtreeUpperBound.key); // If the mutation range described by mBegin extends to mEnd, then see if the part of that range // that overlaps with u's subtree range is being fully cleared or fully unchanged. @@ -5121,12 +5124,12 @@ private: if (range.clearAfterBoundary) { // If the mutation range after the boundary key is cleared, then the mutation boundary key must // be cleared or must be different than the subtree lower bound key so that it doesn't matter - uniform = range.boundaryCleared() || mutationBoundaryKey != u.subtreeLowerBound->key; + uniform = range.boundaryCleared() || mutationBoundaryKey != u.subtreeLowerBound.key; } else { // If the mutation range after the boundary key is unchanged, then the mutation boundary key // must be also unchanged or must be different than the subtree lower bound key so that it // doesn't matter - uniform = !range.boundaryChanged || mutationBoundaryKey != u.subtreeLowerBound->key; + uniform = !range.boundaryChanged || mutationBoundaryKey != u.subtreeLowerBound.key; } // If u's subtree is either all cleared or all unchanged @@ -5135,8 +5138,9 @@ private: // include sibling subtrees also covered by (mBegin, mEnd) so we can not recurse to those, too. 
// If the cursor is valid, u.subtreeUpperBound is the cursor's position, which is >= mEnd.key(). // If equal, no range expansion is possible. - if (cursor.valid() && mEnd.key() != u.subtreeUpperBound->key) { - cursor.seekLessThanOrEqual(mEnd.key(), update->skipLen, &cursor, 1); + if (cursor.valid() && mEnd.key() != u.subtreeUpperBound.key) { + // TODO: If cursor hints are available, use (cursor, 1) + cursor.seekLessThanOrEqual(mEnd.key(), update->skipLen); // If this seek moved us ahead, to something other than cEnd, then update subtree range // boundaries @@ -5149,7 +5153,7 @@ private: } u.cEnd = cursor; - u.subtreeUpperBound = &cursor.get(); + u.subtreeUpperBound = cursor.get(); u.skipLen = 0; // TODO: set this // The new decode upper bound is either cEnd or the record before it if it has no child @@ -5158,8 +5162,8 @@ private: c.movePrev(); ASSERT(c.valid()); if (!c.get().value.present()) { - u.decodeUpperBound = &c.get(); - u.expectedUpperBound = nullptr; + u.decodeUpperBound = c.get(); + u.expectedUpperBound.reset(); } else { u.decodeUpperBound = u.subtreeUpperBound; u.expectedUpperBound = u.subtreeUpperBound; @@ -5173,7 +5177,7 @@ private: u.cleared(); auto c = u.cBegin; while (c != u.cEnd) { - const RedwoodRecordRef& rec = c.get(); + RedwoodRecordRef rec = c.get(); if (rec.value.present()) { if (btPage->height == 2) { debug_printf("%s: freeing child page in cleared subtree range: %s\n", @@ -5224,7 +5228,7 @@ private: // Note: parentInfo could be invalid after a wait and must be re-initialized. // All uses below occur before waits so no reinitialization is done. state ParentInfo* parentInfo = &self->childUpdateTracker[rootID.front()]; - state InternalPageModifier m(btPage, cursor.mirror, tryToUpdate, parentInfo); + state InternalPageModifier m(btPage, cursor, tryToUpdate, parentInfo); // Apply the possible changes for each subtree range recursed to, except the last one. 
// For each range, the expected next record, if any, is checked against the first boundary @@ -5242,7 +5246,7 @@ private: context.c_str(), m.changesMade, update->toString().c_str()); - m.applyUpdate(*slices.back(), m.changesMade ? update->subtreeUpperBound : update->decodeUpperBound); + m.applyUpdate(*slices.back(), m.changesMade ? &update->subtreeUpperBound : &update->decodeUpperBound); state bool detachChildren = (parentInfo->count > 2); state bool forceUpdate = false; @@ -5260,10 +5264,10 @@ private: // Copy the page before modification if the page references the cache if (fromCache) { page = self->cloneForUpdate(page); - cursor = getCursor(page); btPage = (BTreePage*)page->begin(); m.btPage = btPage; - m.m = cursor.mirror; + cursor.tree = &btPage->tree(); + m.c.tree = cursor.tree; fromCache = false; } } @@ -5328,8 +5332,8 @@ private: ->toString(false, newID, snapshot->getVersion(), - update->decodeLowerBound, - update->decodeUpperBound) + &update->decodeLowerBound, + &update->decodeUpperBound) .c_str()); update->updatedInPlace(newID, btPage, newID.size() * self->m_blockSize); @@ -5370,8 +5374,8 @@ private: Standalone> newChildEntries = wait(writePages(self, - update->subtreeLowerBound, - update->subtreeUpperBound, + &update->subtreeLowerBound, + &update->subtreeUpperBound, m.rebuild, btPage->height, writeVersion, @@ -5421,15 +5425,15 @@ private: state Standalone rootPageID = self->m_header.root.get(); state InternalPageSliceUpdate all; state RedwoodRecordRef rootLink = dbBegin.withPageID(rootPageID); - all.subtreeLowerBound = &rootLink; - all.decodeLowerBound = &rootLink; - all.subtreeUpperBound = &dbEnd; - all.decodeUpperBound = &dbEnd; + all.subtreeLowerBound = rootLink; + all.decodeLowerBound = rootLink; + all.subtreeUpperBound = dbEnd; + all.decodeUpperBound = dbEnd; all.skipLen = 0; - MutationBuffer::const_iterator mBegin = mutations->upper_bound(all.subtreeLowerBound->key); + MutationBuffer::const_iterator mBegin = 
mutations->upper_bound(all.subtreeLowerBound.key); --mBegin; - MutationBuffer::const_iterator mEnd = mutations->lower_bound(all.subtreeUpperBound->key); + MutationBuffer::const_iterator mEnd = mutations->lower_bound(all.subtreeUpperBound.key); wait(commitSubtree(self, self->m_pager->getReadSnapshot(latestVersion), @@ -5528,9 +5532,11 @@ public: ASSERT(!isLeaf()); BTreePage::BinaryTree::Cursor next = cursor; next.moveNext(); - const RedwoodRecordRef& rec = cursor.get(); + // TODO this should fail!!! + RedwoodRecordRef rec = cursor.get(); BTreePageIDRef id = rec.getChildPage(); - Future> child = readPage(pager, id, &rec, &next.getOrUpperBound()); + const RedwoodRecordRef upper = next.getOrUpperBound(); + Future> child = readPage(pager, id, &rec, &upper); // Read ahead siblings at level 2 // TODO: Application of readAheadBytes is not taking into account the size of the current page or any @@ -5605,7 +5611,7 @@ public: // Returns true if cursor position is present() and has an effective version <= v bool validAtVersion(Version v) { return valid() && pageCursor->cursor.get().version <= v; } - const RedwoodRecordRef& get() const { return pageCursor->cursor.get(); } + const RedwoodRecordRef get() const { return pageCursor->cursor.get(); } // Ensure that pageCursor is not shared with other cursors so we can modify it void ensureUnshared() { @@ -5800,7 +5806,7 @@ public: return r; } - const RedwoodRecordRef& get() { return path.back().cursor.get(); } + const RedwoodRecordRef get() { return path.back().cursor.get(); } bool inRoot() const { return path.size() == 1; } @@ -5809,6 +5815,7 @@ public: PathEntry& back() { return path.back(); } void popPath() { path.pop_back(); } +#error These can't be references anymore Future pushPage(BTreePageIDRef id, const RedwoodRecordRef& lowerBound, const RedwoodRecordRef& upperBound) { @@ -5820,7 +5827,7 @@ public: } Future pushPage(BTreePage::BinaryTree::Cursor c) { - const RedwoodRecordRef& rec = c.get(); + RedwoodRecordRef rec = c.get(); 
auto next = c; next.moveNext(); BTreePageIDRef id = rec.getChildPage(); @@ -5854,7 +5861,7 @@ public: auto& entry = self->path.back(); if (entry.btPage()->isLeaf()) { int cmp = entry.cursor.seek(query); - self->valid = entry.cursor.valid() && !entry.cursor.node->isDeleted(); + self->valid = entry.cursor.valid() && !entry.cursor.isErased(); debug_printf("seek(%s, %d) loop exit cmp=%d cursor=%s\n", query.toString().c_str(), prefetchBytes, @@ -7047,7 +7054,7 @@ ACTOR Future verify(VersionedBTree* btree, state Reference cur = btree->readAtVersion(v); debug_printf("Verifying entire key range at version %" PRId64 "\n", v); - if (deterministicRandom()->coinflip()) { + if (false) { fRangeAll = verifyRange(btree, LiteralStringRef(""), LiteralStringRef("\xff\xff"), v, written, pErrorCount); } else { @@ -7062,7 +7069,7 @@ ACTOR Future verify(VersionedBTree* btree, Key end = randomKV().key; debug_printf( "Verifying range (%s, %s) at version %" PRId64 "\n", toString(begin).c_str(), toString(end).c_str(), v); - if (deterministicRandom()->coinflip()) { + if (false) { fRangeRandom = verifyRange(btree, begin, end, v, written, pErrorCount); } else { fRangeRandom = verifyRangeBTreeCursor(btree, begin, end, v, written, pErrorCount); @@ -7072,7 +7079,7 @@ ACTOR Future verify(VersionedBTree* btree, } debug_printf("Verifying seeks to each changed key at version %" PRId64 "\n", v); - if (deterministicRandom()->coinflip()) { + if (false) { fSeekAll = seekAll(btree, v, written, pErrorCount); } else { fSeekAll = seekAllBTreeCursor(btree, v, written, pErrorCount); @@ -7406,8 +7413,8 @@ TEST_CASE("/redwood/correctness/unit/RedwoodRecordRef") { TEST_CASE("/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { // Sanity check on delta tree node format - ASSERT(DeltaTree::Node::headerSize(false) == 4); - ASSERT(DeltaTree::Node::headerSize(true) == 8); + ASSERT(DeltaTree2::Node::headerSize(false) == 4); + ASSERT(DeltaTree2::Node::headerSize(true) == 8); const int N = 
deterministicRandom()->randomInt(200, 1000); @@ -7436,8 +7443,8 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { std::vector items(uniqueItems.begin(), uniqueItems.end()); int bufferSize = N * 100; - bool largeTree = bufferSize > DeltaTree::SmallSizeLimit; - DeltaTree* tree = (DeltaTree*)new uint8_t[bufferSize]; + bool largeTree = bufferSize > DeltaTree2::SmallSizeLimit; + DeltaTree2* tree = (DeltaTree2*)new uint8_t[bufferSize]; tree->build(bufferSize, &items[0], &items[items.size()], &prev, &next); @@ -7792,7 +7799,7 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { std::vector items(uniqueItems.begin(), uniqueItems.end()); int bufferSize = N * 2 * 30; - DeltaTree* tree = (DeltaTree*)new uint8_t[bufferSize]; + DeltaTree2* tree = (DeltaTree2*)new uint8_t[bufferSize]; int builtSize = tree->build(bufferSize, &items[0], &items[items.size()], &prev, &next); ASSERT(builtSize <= bufferSize); DeltaTree::Mirror r(tree, &prev, &next); @@ -7960,7 +7967,7 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { scanAndVerify(); scanAndVerify2(); - // For each randomly selected new item to be deleted, delete it from the DeltaTree and from uniqueItems + // For each randomly selected new item to be deleted, delete it from the DeltaTree2 and from uniqueItems printf("Deleting some items\n"); for (auto p : toDelete) { uniqueItems.erase(p); @@ -8639,7 +8646,9 @@ TEST_CASE("/redwood/correctness/btree") { // Create new promise stream and start the verifier again committedVersions = PromiseStream(); verifyTask = verify(btree, committedVersions.getFuture(), &written, &errorCount, serialTest); - randomTask = randomReader(btree) || btree->getError(); + if(!serialTest) { + randomTask = randomReader(btree) || btree->getError(); + } committedVersions.send(v); } From d155482f5fca56b520f930212147dede6e615370 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Mon, 17 May 2021 01:28:01 -0700 Subject: [PATCH 09/42] Remove the legacy IVersionedStore / 
IStoreCursor classes and implementations as they are no longer useful or efficient, respectively. BTreeCursor can be used far more efficiently to access the BTree. --- fdbserver/CMakeLists.txt | 1 - fdbserver/IVersionedStore.h | 80 --- fdbserver/VersionedBTree.actor.cpp | 853 ++--------------------------- 3 files changed, 54 insertions(+), 880 deletions(-) delete mode 100644 fdbserver/IVersionedStore.h diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index d80254097a..430d92fe96 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -27,7 +27,6 @@ set(FDBSERVER_SRCS IKeyValueContainer.h IKeyValueStore.h IPager.h - IVersionedStore.h KeyValueStoreCompressTestData.actor.cpp KeyValueStoreMemory.actor.cpp KeyValueStoreRocksDB.actor.cpp diff --git a/fdbserver/IVersionedStore.h b/fdbserver/IVersionedStore.h deleted file mode 100644 index 3651aa76a0..0000000000 --- a/fdbserver/IVersionedStore.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * IVersionedStore.h - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef FDBSERVER_IVERSIONEDSTORE_H -#define FDBSERVER_IVERSIONEDSTORE_H -#pragma once - -#include "fdbserver/IKeyValueStore.h" - -#include "flow/flow.h" -#include "fdbclient/FDBTypes.h" - -class IStoreCursor { -public: - virtual Future findEqual(KeyRef key) = 0; - virtual Future findFirstEqualOrGreater(KeyRef key, int prefetchBytes = 0) = 0; - virtual Future findLastLessOrEqual(KeyRef key, int prefetchBytes = 0) = 0; - virtual Future next() = 0; - virtual Future prev() = 0; - - virtual bool isValid() = 0; - virtual KeyRef getKey() = 0; - virtual ValueRef getValue() = 0; - - virtual void addref() = 0; - virtual void delref() = 0; -}; - -class IVersionedStore : public IClosable { -public: - virtual KeyValueStoreType getType() const = 0; - virtual bool supportsMutation(int op) const = 0; // If this returns true, then mutate(op, ...) may be called - virtual StorageBytes getStorageBytes() const = 0; - - // Writes are provided in an ordered stream. - // A write is considered part of (a change leading to) the version determined by the previous call to - // setWriteVersion() A write shall not become durable until the following call to commit() begins, and shall be - // durable once the following call to commit() returns - virtual void set(KeyValueRef keyValue) = 0; - virtual void clear(KeyRangeRef range) = 0; - virtual void mutate(int op, StringRef param1, StringRef param2) = 0; - virtual void setWriteVersion(Version) = 0; // The write version must be nondecreasing - virtual void setOldestVersion(Version v) = 0; // Set oldest readable version to be used in next commit - virtual Version getOldestVersion() const = 0; // Get oldest readable version - virtual Future commit() = 0; - - virtual Future init() = 0; - virtual Version getLatestVersion() const = 0; - - // readAtVersion() may only be called on a version which has previously been passed to setWriteVersion() and never - // previously passed - // to forgetVersion. 
The returned results when violating this precondition are unspecified; the store is not - // required to be able to detect violations. - // The returned read cursor provides a consistent snapshot of the versioned store, corresponding to all the writes - // done with write versions less - // than or equal to the given version. - // If readAtVersion() is called on the *current* write version, the given read cursor MAY reflect subsequent writes - // at the same - // write version, OR it may represent a snapshot as of the call to readAtVersion(). - virtual Reference readAtVersion(Version) = 0; -}; - -#endif diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 9b1c06489e..f7560a464d 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -19,7 +19,6 @@ */ #include "flow/flow.h" -#include "fdbserver/IVersionedStore.h" #include "fdbserver/IPager.h" #include "fdbclient/Tuple.h" #include "flow/serialize.h" @@ -3262,8 +3261,6 @@ struct BoundaryRefAndPage { } }; -#define NOT_IMPLEMENTED UNSTOPPABLE_ASSERT(false) - #pragma pack(push, 1) template struct InPlaceArray { @@ -3289,7 +3286,7 @@ struct InPlaceArray { }; #pragma pack(pop) -class VersionedBTree final : public IVersionedStore { +class VersionedBTree { public: // The first possible internal record possible in the tree static RedwoodRecordRef dbBegin; @@ -3381,9 +3378,9 @@ public: // All async opts on the btree are based on pager reads, writes, and commits, so // we can mostly forward these next few functions to the pager - Future getError() override { return m_pager->getError(); } + Future getError() { return m_pager->getError(); } - Future onClosed() override { return m_pager->onClosed(); } + Future onClosed() { return m_pager->onClosed(); } void close_impl(bool dispose) { auto* pager = m_pager; @@ -3394,26 +3391,24 @@ public: pager->close(); } - void dispose() override { return close_impl(true); } + void dispose() { return close_impl(true); } - 
void close() override { return close_impl(false); } + void close() { return close_impl(false); } - KeyValueStoreType getType() const override { NOT_IMPLEMENTED; } - bool supportsMutation(int op) const override { NOT_IMPLEMENTED; } - StorageBytes getStorageBytes() const override { return m_pager->getStorageBytes(); } + StorageBytes getStorageBytes() const { return m_pager->getStorageBytes(); } // Writes are provided in an ordered stream. // A write is considered part of (a change leading to) the version determined by the previous call to // setWriteVersion() A write shall not become durable until the following call to commit() begins, and shall be // durable once the following call to commit() returns - void set(KeyValueRef keyValue) override { + void set(KeyValueRef keyValue) { ++g_redwoodMetrics.opSet; g_redwoodMetrics.opSetKeyBytes += keyValue.key.size(); g_redwoodMetrics.opSetValueBytes += keyValue.value.size(); m_pBuffer->insert(keyValue.key).mutation().setBoundaryValue(m_pBuffer->copyToArena(keyValue.value)); } - void clear(KeyRangeRef clearedRange) override { + void clear(KeyRangeRef clearedRange) { // Optimization for single key clears to create just one mutation boundary instead of two if (clearedRange.begin.size() == clearedRange.end.size() - 1 && clearedRange.end[clearedRange.end.size() - 1] == 0 && clearedRange.end.startsWith(clearedRange.begin)) { @@ -3432,13 +3427,11 @@ public: m_pBuffer->erase(iBegin, iEnd); } - void mutate(int op, StringRef param1, StringRef param2) override { NOT_IMPLEMENTED; } + void setOldestVersion(Version v) { m_newOldestVersion = v; } - void setOldestVersion(Version v) override { m_newOldestVersion = v; } + Version getOldestVersion() const { return m_pager->getOldestVersion(); } - Version getOldestVersion() const override { return m_pager->getOldestVersion(); } - - Version getLatestVersion() const override { + Version getLatestVersion() const { if (m_writeVersion != invalidVersion) return m_writeVersion; return 
m_pager->getLatestVersion(); @@ -3592,7 +3585,7 @@ public: return Void(); } - Future init() override { return m_init; } + Future init() { return m_init; } virtual ~VersionedBTree() { // This probably shouldn't be called directly (meaning deleting an instance directly) but it should be safe, @@ -3602,20 +3595,8 @@ public: m_latestCommit.cancel(); } - Reference readAtVersion(Version v) override { - // Only committed versions can be read. - ASSERT(v <= m_lastCommittedVersion); - Reference snapshot = m_pager->getReadSnapshot(v); - - // This is a ref because snapshot will continue to hold the metakey value memory - KeyRef m = snapshot->getMetaKey(); - - // Currently all internal records generated in the write path are at version 0 - return Reference(new Cursor(snapshot, ((MetaKey*)m.begin())->root.get(), (Version)0)); - } - // Must be nondecreasing - void setWriteVersion(Version v) override { + void setWriteVersion(Version v) { ASSERT(v > m_lastCommittedVersion); // If there was no current mutation buffer, create one in the buffer map and update m_pBuffer if (m_pBuffer == nullptr) { @@ -3629,7 +3610,7 @@ public: m_writeVersion = v; } - Future commit() override { + Future commit() { if (m_pBuffer == nullptr) return m_latestCommit; return commit_impl(this); @@ -5499,277 +5480,9 @@ private: } public: - // InternalCursor is for seeking to and iterating over the leaf-level RedwoodRecordRef records in the tree. - // The records could represent multiple values for the same key at different versions, including a non-present value - // representing a clear. Currently, however, all records are at version 0 and no clears are present in the tree. - struct InternalCursor { - private: - // Each InternalCursor's position is represented by a reference counted PageCursor, which links - // to its parent PageCursor, up to a PageCursor representing a cursor on the root page. 
- // PageCursors can be shared by many InternalCursors, making InternalCursor copying low overhead - struct PageCursor : ReferenceCounted, FastAllocated { - Reference parent; - BTreePageIDRef pageID; // Only needed for debugging purposes - Reference page; - BTreePage::BinaryTree::Cursor cursor; - - // id will normally reference memory owned by the parent, which is okay because a reference to the parent - // will be held in the cursor - PageCursor(BTreePageIDRef id, Reference page, Reference parent = {}) - : pageID(id), page(page), parent(parent), cursor(getCursor(page)) {} - - PageCursor(const PageCursor& toCopy) - : parent(toCopy.parent), pageID(toCopy.pageID), page(toCopy.page), cursor(toCopy.cursor) {} - - // Convenience method for copying a PageCursor - Reference copy() const { return makeReference(*this); } - - const BTreePage* btPage() const { return (const BTreePage*)page->begin(); } - - bool isLeaf() const { return btPage()->isLeaf(); } - - Future> getChild(Reference pager, int readAheadBytes = 0) { - ASSERT(!isLeaf()); - BTreePage::BinaryTree::Cursor next = cursor; - next.moveNext(); - // TODO this should fail!!! - RedwoodRecordRef rec = cursor.get(); - BTreePageIDRef id = rec.getChildPage(); - const RedwoodRecordRef upper = next.getOrUpperBound(); - Future> child = readPage(pager, id, &rec, &upper); - - // Read ahead siblings at level 2 - // TODO: Application of readAheadBytes is not taking into account the size of the current page or any - // of the adjacent pages it is preloading. 
- if (readAheadBytes > 0 && btPage()->height == 2 && next.valid()) { - do { - debug_printf("preloading %s %d bytes left\n", - ::toString(next.get().getChildPage()).c_str(), - readAheadBytes); - // If any part of the page was already loaded then stop - if (next.get().value.present()) { - preLoadPage(pager.getPtr(), next.get().getChildPage()); - readAheadBytes -= page->size(); - } - } while (readAheadBytes > 0 && next.moveNext()); - } - - return map(child, [=](Reference page) { - return makeReference(id, page, Reference::addRef(this)); - }); - } - - std::string toString() const { - return format("%s, %s", - ::toString(pageID).c_str(), - cursor.valid() ? cursor.get().toString(isLeaf()).c_str() : ""); - } - }; - - Standalone rootPageID; - Reference pager; - Reference pageCursor; - - public: - InternalCursor() {} - - InternalCursor(Reference pager, BTreePageIDRef root) : pager(pager), rootPageID(root) {} - - std::string toString() const { - std::string r; - - Reference c = pageCursor; - int maxDepth = 0; - while (c) { - c = c->parent; - ++maxDepth; - } - - c = pageCursor; - int depth = maxDepth; - while (c) { - r = format("[%d/%d: %s] ", depth--, maxDepth, c->toString().c_str()) + r; - c = c->parent; - } - return r; - } - - // Returns true if cursor position is a valid leaf page record - bool valid() const { return pageCursor && pageCursor->isLeaf() && pageCursor->cursor.valid(); } - - // Returns true if cursor position is valid() and has a present record value - bool present() const { return valid() && pageCursor->cursor.get().value.present(); } - - // Returns true if cursor position is present() and has an effective version <= v - bool presentAtVersion(Version v) { return present() && pageCursor->cursor.get().version <= v; } - - // This is to enable an optimization for the case where all internal records are at the - // same version and there are no implicit clears - // *this MUST be valid() - bool presentAtExactVersion(Version v) const { return present() && 
pageCursor->cursor.get().version == v; } - - // Returns true if cursor position is present() and has an effective version <= v - bool validAtVersion(Version v) { return valid() && pageCursor->cursor.get().version <= v; } - - const RedwoodRecordRef get() const { return pageCursor->cursor.get(); } - - // Ensure that pageCursor is not shared with other cursors so we can modify it - void ensureUnshared() { - if (!pageCursor->isSoleOwner()) { - pageCursor = pageCursor->copy(); - } - } - - Future moveToRoot() { - // If pageCursor exists follow parent links to the root - if (pageCursor) { - while (pageCursor->parent) { - pageCursor = pageCursor->parent; - } - return Void(); - } - - // Otherwise read the root page - Future> root = readPage(pager, rootPageID, &dbBegin, &dbEnd); - return map(root, [=](Reference p) { - pageCursor = makeReference(rootPageID, p); - return Void(); - }); - } - - ACTOR Future seekLessThan_impl(InternalCursor* self, RedwoodRecordRef query, int prefetchBytes) { - Future f = self->moveToRoot(); - // f will almost always be ready - if (!f.isReady()) { - wait(f); - } - - self->ensureUnshared(); - loop { - bool isLeaf = self->pageCursor->isLeaf(); - bool success = self->pageCursor->cursor.seekLessThan(query); - - // Skip backwards over internal page entries that do not link to child pages - if (!isLeaf) { - // While record has no value, move again - while (success && !self->pageCursor->cursor.get().value.present()) { - success = self->pageCursor->cursor.movePrev(); - } - } - - if (success) { - // If we found a record < query at a leaf page then return success - if (isLeaf) { - return true; - } - - Reference child = wait(self->pageCursor->getChild(self->pager, prefetchBytes)); - self->pageCursor = child; - } else { - // No records < query on this page, so move to immediate previous record at leaf level - bool success = wait(self->move(false)); - return success; - } - } - } - - Future seekLessThan(RedwoodRecordRef query, int prefetchBytes) { - return 
seekLessThan_impl(this, query, prefetchBytes); - } - - ACTOR Future move_impl(InternalCursor* self, bool forward) { - // Try to move pageCursor, if it fails to go parent, repeat until it works or root cursor can't be moved - while (1) { - self->ensureUnshared(); - bool success = self->pageCursor->cursor.valid() && - (forward ? self->pageCursor->cursor.moveNext() : self->pageCursor->cursor.movePrev()); - - // Skip over internal page entries that do not link to child pages - if (!self->pageCursor->isLeaf()) { - // While record has no value, move again - while (success && !self->pageCursor->cursor.get().value.present()) { - success = forward ? self->pageCursor->cursor.moveNext() : self->pageCursor->cursor.movePrev(); - } - } - - // Stop if successful or there's no parent to move to - if (success || !self->pageCursor->parent) { - break; - } - - // Move to parent - self->pageCursor = self->pageCursor->parent; - } - - // If pageCursor not valid we've reached an end of the tree - if (!self->pageCursor->cursor.valid()) { - return false; - } - - // While not on a leaf page, move down to get to one. - while (!self->pageCursor->isLeaf()) { - // Skip over internal page entries that do not link to child pages - while (!self->pageCursor->cursor.get().value.present()) { - bool success = forward ? self->pageCursor->cursor.moveNext() : self->pageCursor->cursor.movePrev(); - if (!success) { - return false; - } - } - - Reference child = wait(self->pageCursor->getChild(self->pager)); - forward ? child->cursor.moveFirst() : child->cursor.moveLast(); - self->pageCursor = child; - } - - return true; - } - - Future move(bool forward) { return move_impl(this, forward); } - - // Move to the first or last record of the database. 
- ACTOR Future move_end(InternalCursor* self, bool begin) { - Future f = self->moveToRoot(); - - // f will almost always be ready - if (!f.isReady()) { - wait(f); - } - - self->ensureUnshared(); - - loop { - // Move to first or last record in the page - bool success = begin ? self->pageCursor->cursor.moveFirst() : self->pageCursor->cursor.moveLast(); - - // Skip over internal page entries that do not link to child pages - if (!self->pageCursor->isLeaf()) { - // While record has no value, move past it - while (success && !self->pageCursor->cursor.get().value.present()) { - success = begin ? self->pageCursor->cursor.moveNext() : self->pageCursor->cursor.movePrev(); - } - } - - // If it worked, return true if we've reached a leaf page otherwise go to the next child - if (success) { - if (self->pageCursor->isLeaf()) { - return true; - } - - Reference child = wait(self->pageCursor->getChild(self->pager)); - self->pageCursor = child; - } else { - return false; - } - } - } - - Future moveFirst() { return move_end(this, true); } - Future moveLast() { return move_end(this, false); } - }; - - // Cursor designed for short lifespans. - // Holds references to all pages touched. - // All record references returned from it are valid until the cursor is destroyed. + // Cursor into BTree which enables seeking and iteration in the BTree as a whole, or + // iteration within a specific page and movement across levels for more efficient access. + // Cursor record's memory is only guaranteed to be valid until cursor moves to a different page. 
class BTreeCursor { public: struct PathEntry { @@ -5788,6 +5501,7 @@ public: public: BTreeCursor() {} + bool intialized() const { return pager.isValid(); } bool isValid() const { return valid; } std::string toString() const { @@ -5815,7 +5529,7 @@ public: PathEntry& back() { return path.back(); } void popPath() { path.pop_back(); } -#error These can't be references anymore +#warning These can't be references anymore Future pushPage(BTreePageIDRef id, const RedwoodRecordRef& lowerBound, const RedwoodRecordRef& upperBound) { @@ -6010,206 +5724,6 @@ public: return cursor->init(this, snapshot, ((MetaKey*)m.begin())->root.get()); } - - // Cursor is for reading and interating over user visible KV pairs at a specific version - // KeyValueRefs returned become invalid once the cursor is moved - class Cursor : public IStoreCursor, public ReferenceCounted, public FastAllocated, NonCopyable { - public: - Cursor(Reference pageSource, BTreePageIDRef root, Version internalRecordVersion) - : m_version(internalRecordVersion), m_cur1(pageSource, root), m_cur2(m_cur1) {} - - void addref() override { ReferenceCounted::addref(); } - void delref() override { ReferenceCounted::delref(); } - - private: - Version m_version; - // If kv is valid - // - kv.key references memory held by cur1 - // - If cur1 points to a non split KV pair - // - kv.value references memory held by cur1 - // - cur2 points to the next internal record after cur1 - // Else - // - kv.value references memory in arena - // - cur2 points to the first internal record of the split KV pair - InternalCursor m_cur1; - InternalCursor m_cur2; - Arena m_arena; - Optional m_kv; - - public: - Future findEqual(KeyRef key) override { return find_impl(this, key, 0); } - Future findFirstEqualOrGreater(KeyRef key, int prefetchBytes) override { - return find_impl(this, key, 1, prefetchBytes); - } - Future findLastLessOrEqual(KeyRef key, int prefetchBytes) override { - return find_impl(this, key, -1, prefetchBytes); - } - - Future next() 
override { return move(this, true); } - Future prev() override { return move(this, false); } - - bool isValid() override { return m_kv.present(); } - - KeyRef getKey() override { return m_kv.get().key; } - - ValueRef getValue() override { return m_kv.get().value; } - - std::string toString(bool includePaths = true) const { - std::string r; - r += format("Cursor(%p) ver: %" PRId64 " ", this, m_version); - if (m_kv.present()) { - r += format( - " KV: '%s' -> '%s'", m_kv.get().key.printable().c_str(), m_kv.get().value.printable().c_str()); - } else { - r += " KV: "; - } - if (includePaths) { - r += format("\n Cur1: %s", m_cur1.toString().c_str()); - r += format("\n Cur2: %s", m_cur2.toString().c_str()); - } else { - if (m_cur1.valid()) { - r += format("\n Cur1: %s", m_cur1.get().toString().c_str()); - } - if (m_cur2.valid()) { - r += format("\n Cur2: %s", m_cur2.get().toString().c_str()); - } - } - - return r; - } - - private: - // find key in tree closest to or equal to key (at this cursor's version) - // for less than or equal use cmp < 0 - // for greater than or equal use cmp > 0 - // for equal use cmp == 0 - ACTOR static Future find_impl(Cursor* self, KeyRef key, int cmp, int prefetchBytes = 0) { - state RedwoodRecordRef query(key, self->m_version + 1); - self->m_kv.reset(); - - wait(success(self->m_cur1.seekLessThan(query, prefetchBytes))); - debug_printf("find%sE(%s): %s\n", - cmp > 0 ? "GT" : (cmp == 0 ? "" : "LT"), - query.toString().c_str(), - self->toString().c_str()); - - // If we found the target key with a present value then return it as it is valid for any cmp type - if (self->m_cur1.present() && self->m_cur1.get().key == key) { - debug_printf("Target key found. 
Cursor: %s\n", self->toString().c_str()); - self->m_kv = self->m_cur1.get().toKeyValueRef(); - return Void(); - } - - // If cmp type is Equal and we reached here, we didn't find it - if (cmp == 0) { - return Void(); - } - - // cmp mode is GreaterThanOrEqual, so if we've reached here an equal key was not found and cur1 either - // points to a lesser key or is invalid. - if (cmp > 0) { - // If cursor is invalid, query was less than the first key in database so go to the first record - if (!self->m_cur1.valid()) { - bool valid = wait(self->m_cur1.moveFirst()); - if (!valid) { - self->m_kv.reset(); - return Void(); - } - } else { - // Otherwise, move forward until we find a key greater than the target key. - // If multiversion data is present, the next record could have the same key as the initial - // record found but be at a newer version. - loop { - bool valid = wait(self->m_cur1.move(true)); - if (!valid) { - self->m_kv.reset(); - return Void(); - } - - if (self->m_cur1.get().key > key) { - break; - } - } - } - - // Get the next present key at the target version. Handles invalid cursor too. - wait(self->next()); - } else if (cmp < 0) { - // cmp mode is LessThanOrEqual. An equal key to the target key was already checked above, and the - // search was for LessThan query, so cur1 is already in the right place. 
- if (!self->m_cur1.valid()) { - self->m_kv.reset(); - return Void(); - } - - // Move to previous present kv pair at the target version - wait(self->prev()); - } - - return Void(); - } - - ACTOR static Future move(Cursor* self, bool fwd) { - debug_printf("Cursor::move(%d): Start %s\n", fwd, self->toString().c_str()); - ASSERT(self->m_cur1.valid()); - - // If kv is present then the key/version at cur1 was already returned so move to a new key - // Move cur1 until failure or a new key is found, keeping prior record visited in cur2 - if (self->m_kv.present()) { - ASSERT(self->m_cur1.valid()); - loop { - self->m_cur2 = self->m_cur1; - debug_printf("Cursor::move(%d): Advancing cur1 %s\n", fwd, self->toString().c_str()); - bool valid = wait(self->m_cur1.move(fwd)); - if (!valid || self->m_cur1.get().key != self->m_cur2.get().key) { - break; - } - } - } - - // Given two consecutive cursors c1 and c2, c1 represents a returnable record if - // c1 is present at exactly version v - // OR - // c1 is.presentAtVersion(v) && (!c2.validAtVersion() || c2.get().key != c1.get().key()) - // Note the distinction between 'present' and 'valid'. 
Present means the value for the key - // exists at the version (but could be the empty string) while valid just means the internal - // record is in effect at that version but it could indicate that the key was cleared and - // no longer exists from the user's perspective at that version - if (self->m_cur1.valid()) { - self->m_cur2 = self->m_cur1; - debug_printf("Cursor::move(%d): Advancing cur2 %s\n", fwd, self->toString().c_str()); - wait(success(self->m_cur2.move(true))); - } - - while (self->m_cur1.valid()) { - - if (self->m_cur1.get().version == self->m_version || - (self->m_cur1.presentAtVersion(self->m_version) && - (!self->m_cur2.validAtVersion(self->m_version) || - self->m_cur2.get().key != self->m_cur1.get().key))) { - self->m_kv = self->m_cur1.get().toKeyValueRef(); - return Void(); - } - - if (fwd) { - // Moving forward, move cur2 forward and keep cur1 pointing to the prior (predecessor) record - debug_printf("Cursor::move(%d): Moving forward %s\n", fwd, self->toString().c_str()); - self->m_cur1 = self->m_cur2; - wait(success(self->m_cur2.move(true))); - } else { - // Moving backward, move cur1 backward and keep cur2 pointing to the prior (successor) record - debug_printf("Cursor::move(%d): Moving backward %s\n", fwd, self->toString().c_str()); - self->m_cur2 = self->m_cur1; - wait(success(self->m_cur1.move(false))); - } - } - - debug_printf("Cursor::move(%d): Exit, end of db reached. Cursor = %s\n", fwd, self->toString().c_str()); - self->m_kv.reset(); - - return Void(); - } - }; }; #include "fdbserver/art_impl.h" @@ -6221,7 +5735,6 @@ class KeyValueStoreRedwoodUnversioned : public IKeyValueStore { public: KeyValueStoreRedwoodUnversioned(std::string filePrefix, UID logID) : m_filePrefix(filePrefix), m_concurrentReads(new FlowLock(SERVER_KNOBS->REDWOOD_KVSTORE_CONCURRENT_READS)) { - // TODO: This constructor should really just take an IVersionedStore int pageSize = BUGGIFY ? 
deterministicRandom()->randomInt(1000, 4096 * 4) : SERVER_KNOBS->REDWOOD_DEFAULT_PAGE_SIZE; @@ -6334,13 +5847,13 @@ public: // we can bypass the bounds check for each key in the leaf if the entire leaf is in range // > because both query end and page upper bound are exclusive of the query results and page contents, // respectively - bool boundsCheck = leafCursor.upperBound() > keys.end; + bool checkBounds = leafCursor.cache->upperBound > keys.end; // Whether or not any results from this page were added to results bool usedPage = false; while (leafCursor.valid()) { KeyValueRef kv = leafCursor.get().toKeyValueRef(); - if (boundsCheck && kv.key.compare(keys.end) >= 0) { + if (checkBounds && kv.key.compare(keys.end) >= 0) { break; } accumulatedBytes += kv.expectedSize(); @@ -6355,7 +5868,7 @@ public: // If the page was used, results must depend on the ArenaPage arena and the Mirror arena. // This must be done after visiting all the results in case the Mirror arena changes. if (usedPage) { - result.arena().dependsOn(leafCursor.mirror->arena); + result.arena().dependsOn(leafCursor.cache->arena); result.arena().dependsOn(cur.back().page->getArena()); } @@ -6376,13 +5889,13 @@ public: // we can bypass the bounds check for each key in the leaf if the entire leaf is in range // < because both query begin and page lower bound are inclusive of the query results and page contents, // respectively - bool boundsCheck = leafCursor.lowerBound() < keys.begin; + bool checkBounds = leafCursor.cache->lowerBound < keys.begin; // Whether or not any results from this page were added to results bool usedPage = false; while (leafCursor.valid()) { KeyValueRef kv = leafCursor.get().toKeyValueRef(); - if (boundsCheck && kv.key.compare(keys.begin) < 0) { + if (checkBounds && kv.key.compare(keys.begin) < 0) { break; } accumulatedBytes += kv.expectedSize(); @@ -6397,7 +5910,7 @@ public: // If the page was used, results must depend on the ArenaPage arena and the Mirror arena. 
// This must be done after visiting all the results in case the Mirror arena changes. if (usedPage) { - result.arena().dependsOn(leafCursor.mirror->arena); + result.arena().dependsOn(leafCursor.cache->arena); result.arena().dependsOn(cur.back().page->getArena()); } @@ -6612,7 +6125,7 @@ ACTOR Future verifyRangeBTreeCursor(VersionedBTree* btree, ASSERT(errors == 0); results.push_back(results.arena(), cur.get().toKeyValueRef()); - results.arena().dependsOn(cur.back().cursor.mirror->arena); + results.arena().dependsOn(cur.back().cursor.cache->arena); results.arena().dependsOn(cur.back().page->getArena()); wait(cur.moveNext()); @@ -6709,255 +6222,6 @@ ACTOR Future verifyRangeBTreeCursor(VersionedBTree* btree, return errors; } -ACTOR Future verifyRange(VersionedBTree* btree, - Key start, - Key end, - Version v, - std::map, Optional>* written, - int* pErrorCount) { - state int errors = 0; - if (end <= start) - end = keyAfter(start); - - state std::map, Optional>::const_iterator i = - written->lower_bound(std::make_pair(start.toString(), 0)); - state std::map, Optional>::const_iterator iEnd = - written->upper_bound(std::make_pair(end.toString(), 0)); - state std::map, Optional>::const_iterator iLast; - - state Reference cur = btree->readAtVersion(v); - debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Start cur=%p\n", - v, - start.printable().c_str(), - end.printable().c_str(), - cur.getPtr()); - - // Randomly use the cursor for something else first. - if (deterministicRandom()->coinflip()) { - state Key randomKey = randomKV().key; - debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Dummy seek to '%s'\n", - v, - start.printable().c_str(), - end.printable().c_str(), - randomKey.toString().c_str()); - wait(deterministicRandom()->coinflip() ? 
cur->findFirstEqualOrGreater(randomKey) - : cur->findLastLessOrEqual(randomKey)); - } - - debug_printf( - "VerifyRange(@%" PRId64 ", %s, %s): Actual seek\n", v, start.printable().c_str(), end.printable().c_str()); - wait(cur->findFirstEqualOrGreater(start)); - - state std::vector results; - - while (cur->isValid() && cur->getKey() < end) { - // Find the next written kv pair that would be present at this version - while (1) { - iLast = i; - if (i == iEnd) - break; - ++i; - - if (iLast->first.second <= v && iLast->second.present() && - (i == iEnd || i->first.first != iLast->first.first || i->first.second > v)) { - debug_printf("VerifyRange(@%" PRId64 ", %s, %s) Found key in written map: %s\n", - v, - start.printable().c_str(), - end.printable().c_str(), - iLast->first.first.c_str()); - break; - } - } - - if (iLast == iEnd) { - ++errors; - ++*pErrorCount; - printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs nothing in written map.\n", - v, - start.printable().c_str(), - end.printable().c_str(), - cur->getKey().toString().c_str()); - break; - } - - if (cur->getKey() != iLast->first.first) { - ++errors; - ++*pErrorCount; - printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' but expected '%s'\n", - v, - start.printable().c_str(), - end.printable().c_str(), - cur->getKey().toString().c_str(), - iLast->first.first.c_str()); - break; - } - if (cur->getValue() != iLast->second.get()) { - ++errors; - ++*pErrorCount; - printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' has tree value '%s' but expected '%s'\n", - v, - start.printable().c_str(), - end.printable().c_str(), - cur->getKey().toString().c_str(), - cur->getValue().toString().c_str(), - iLast->second.get().c_str()); - break; - } - - ASSERT(errors == 0); - - results.push_back(KeyValue(KeyValueRef(cur->getKey(), cur->getValue()))); - wait(cur->next()); - } - - // Make sure there are no further written kv pairs that would be present at this version. 
- while (1) { - iLast = i; - if (i == iEnd) - break; - ++i; - if (iLast->first.second <= v && iLast->second.present() && - (i == iEnd || i->first.first != iLast->first.first || i->first.second > v)) - break; - } - - if (iLast != iEnd) { - ++errors; - ++*pErrorCount; - printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree range ended but written has @%" PRId64 " '%s'\n", - v, - start.printable().c_str(), - end.printable().c_str(), - iLast->first.second, - iLast->first.first.c_str()); - } - - debug_printf( - "VerifyRangeReverse(@%" PRId64 ", %s, %s): start\n", v, start.printable().c_str(), end.printable().c_str()); - - // Randomly use a new cursor at the same version for the reverse range read, if the version is still available for - // opening new cursors - if (v >= btree->getOldestVersion() && deterministicRandom()->coinflip()) { - cur = btree->readAtVersion(v); - } - - // Now read the range from the tree in reverse order and compare to the saved results - wait(cur->findLastLessOrEqual(end)); - if (cur->isValid() && cur->getKey() == end) - wait(cur->prev()); - - state std::vector::const_reverse_iterator r = results.rbegin(); - - while (cur->isValid() && cur->getKey() >= start) { - if (r == results.rend()) { - ++errors; - ++*pErrorCount; - printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs nothing in written map.\n", - v, - start.printable().c_str(), - end.printable().c_str(), - cur->getKey().toString().c_str()); - break; - } - - if (cur->getKey() != r->key) { - ++errors; - ++*pErrorCount; - printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' but expected '%s'\n", - v, - start.printable().c_str(), - end.printable().c_str(), - cur->getKey().toString().c_str(), - r->key.toString().c_str()); - break; - } - if (cur->getValue() != r->value) { - ++errors; - ++*pErrorCount; - printf("VerifyRangeReverse(@%" PRId64 - ", %s, %s) ERROR: Tree key '%s' has tree value '%s' but expected '%s'\n", - v, - start.printable().c_str(), - 
end.printable().c_str(), - cur->getKey().toString().c_str(), - cur->getValue().toString().c_str(), - r->value.toString().c_str()); - break; - } - - ++r; - wait(cur->prev()); - } - - if (r != results.rend()) { - ++errors; - ++*pErrorCount; - printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree range ended but written has '%s'\n", - v, - start.printable().c_str(), - end.printable().c_str(), - r->key.toString().c_str()); - } - - return errors; -} - -// Verify the result of point reads for every set or cleared key at the given version -ACTOR Future seekAll(VersionedBTree* btree, - Version v, - std::map, Optional>* written, - int* pErrorCount) { - state std::map, Optional>::const_iterator i = written->cbegin(); - state std::map, Optional>::const_iterator iEnd = written->cend(); - state int errors = 0; - state Reference cur = btree->readAtVersion(v); - - while (i != iEnd) { - state std::string key = i->first.first; - state Version ver = i->first.second; - if (ver == v) { - state Optional val = i->second; - debug_printf("Verifying @%" PRId64 " '%s'\n", ver, key.c_str()); - state Arena arena; - wait(cur->findEqual(KeyRef(arena, key))); - - if (val.present()) { - if (!(cur->isValid() && cur->getKey() == key && cur->getValue() == val.get())) { - ++errors; - ++*pErrorCount; - if (!cur->isValid()) - printf("Verify ERROR: key_not_found: '%s' -> '%s' @%" PRId64 "\n", - key.c_str(), - val.get().c_str(), - ver); - else if (cur->getKey() != key) - printf("Verify ERROR: key_incorrect: found '%s' expected '%s' @%" PRId64 "\n", - cur->getKey().toString().c_str(), - key.c_str(), - ver); - else if (cur->getValue() != val.get()) - printf("Verify ERROR: value_incorrect: for '%s' found '%s' expected '%s' @%" PRId64 "\n", - cur->getKey().toString().c_str(), - cur->getValue().toString().c_str(), - val.get().c_str(), - ver); - } - } else { - if (cur->isValid() && cur->getKey() == key) { - ++errors; - ++*pErrorCount; - printf("Verify ERROR: cleared_key_found: '%s' -> '%s' @%" PRId64 
"\n", - key.c_str(), - cur->getValue().toString().c_str(), - ver); - } - } - } - ++i; - } - return errors; -} - // Verify the result of point reads for every set or cleared key at the given version ACTOR Future seekAllBTreeCursor(VersionedBTree* btree, Version v, @@ -7023,9 +6287,6 @@ ACTOR Future verify(VersionedBTree* btree, std::map, Optional>* written, int* pErrorCount, bool serial) { - state Future fRangeAll; - state Future fRangeRandom; - state Future fSeekAll; // Queue of committed versions still readable from btree state std::deque committedVersions; @@ -7050,40 +6311,30 @@ ACTOR Future verify(VersionedBTree* btree, v = committedVersions[deterministicRandom()->randomInt(0, committedVersions.size())]; debug_printf("Using committed version %" PRId64 "\n", v); + // Get a cursor at v so that v doesn't get expired between the possibly serial steps below. - state Reference cur = btree->readAtVersion(v); + state VersionedBTree::BTreeCursor cur; + wait(btree->initBTreeCursor(&cur, v)); debug_printf("Verifying entire key range at version %" PRId64 "\n", v); - if (false) { - fRangeAll = - verifyRange(btree, LiteralStringRef(""), LiteralStringRef("\xff\xff"), v, written, pErrorCount); - } else { - fRangeAll = verifyRangeBTreeCursor( - btree, LiteralStringRef(""), LiteralStringRef("\xff\xff"), v, written, pErrorCount); - } + state Future fRangeAll = verifyRangeBTreeCursor( + btree, LiteralStringRef(""), LiteralStringRef("\xff\xff"), v, written, pErrorCount); if (serial) { wait(success(fRangeAll)); } Key begin = randomKV().key; Key end = randomKV().key; + debug_printf( "Verifying range (%s, %s) at version %" PRId64 "\n", toString(begin).c_str(), toString(end).c_str(), v); - if (false) { - fRangeRandom = verifyRange(btree, begin, end, v, written, pErrorCount); - } else { - fRangeRandom = verifyRangeBTreeCursor(btree, begin, end, v, written, pErrorCount); - } + state Future fRangeRandom = verifyRangeBTreeCursor(btree, begin, end, v, written, pErrorCount); if (serial) { 
wait(success(fRangeRandom)); } debug_printf("Verifying seeks to each changed key at version %" PRId64 "\n", v); - if (false) { - fSeekAll = seekAll(btree, v, written, pErrorCount); - } else { - fSeekAll = seekAllBTreeCursor(btree, v, written, pErrorCount); - } + state Future fSeekAll = seekAllBTreeCursor(btree, v, written, pErrorCount); if (serial) { wait(success(fSeekAll)); } @@ -7106,19 +6357,20 @@ ACTOR Future verify(VersionedBTree* btree, // Does a random range read, doesn't trap/report errors ACTOR Future randomReader(VersionedBTree* btree) { try { - state Reference cur; + state VersionedBTree::BTreeCursor cur; + loop { wait(yield()); - if (!cur || deterministicRandom()->random01() > .01) { - Version v = btree->getLastCommittedVersion(); - cur = btree->readAtVersion(v); + if (!cur.intialized() || deterministicRandom()->random01() > .01) { + wait(btree->initBTreeCursor(&cur, btree->getLastCommittedVersion())); } state KeyValue kv = randomKV(10, 0); - wait(cur->findFirstEqualOrGreater(kv.key)); + wait(cur.seekGTE(kv.key, 0)); state int c = deterministicRandom()->randomInt(0, 100); - while (cur->isValid() && c-- > 0) { - wait(success(cur->next())); + state bool direction = deterministicRandom()->coinflip(); + while (cur.isValid() && c-- > 0) { + wait(success(direction ? 
cur.moveNext() : cur.movePrev())); wait(yield()); } } @@ -8646,7 +7898,7 @@ TEST_CASE("/redwood/correctness/btree") { // Create new promise stream and start the verifier again committedVersions = PromiseStream(); verifyTask = verify(btree, committedVersions.getFuture(), &written, &errorCount, serialTest); - if(!serialTest) { + if (!serialTest) { randomTask = randomReader(btree) || btree->getError(); } committedVersions.send(v); @@ -8692,10 +7944,11 @@ ACTOR Future randomSeeks(VersionedBTree* btree, int count, char firstChar, state int c = 0; state double readStart = timer(); printf("Executing %d random seeks\n", count); - state Reference cur = btree->readAtVersion(readVer); + state VersionedBTree::BTreeCursor cur; + wait(btree->initBTreeCursor(&cur, readVer)); while (c < count) { state Key k = randomString(20, firstChar, lastChar); - wait(success(cur->findFirstEqualOrGreater(k))); + wait(cur.seekGTE(k, 0)); ++c; } double elapsed = timer() - readStart; @@ -8713,20 +7966,22 @@ ACTOR Future randomScans(VersionedBTree* btree, state int c = 0; state double readStart = timer(); printf("Executing %d random scans\n", count); - state Reference cur = btree->readAtVersion(readVer); + state VersionedBTree::BTreeCursor cur; + wait(btree->initBTreeCursor(&cur, readVer)); + state bool adaptive = readAhead < 0; state int totalScanBytes = 0; while (c++ < count) { state Key k = randomString(20, firstChar, lastChar); - wait(success(cur->findFirstEqualOrGreater(k, readAhead))); + wait(cur.seekGTE(k, readAhead)); if (adaptive) { readAhead = totalScanBytes / c; } state int w = width; - while (w > 0 && cur->isValid()) { - totalScanBytes += cur->getKey().size(); - totalScanBytes += cur->getValue().size(); - wait(cur->next()); + state bool direction = deterministicRandom()->coinflip(); + while (w > 0 && cur.isValid()) { + totalScanBytes += cur.get().expectedSize(); + wait(success(direction ? 
cur.moveNext() : cur.movePrev())); --w; } } From 8ef516ead21b9955ce456453fa0eb158bc0e2402 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Tue, 18 May 2021 01:33:11 -0700 Subject: [PATCH 10/42] Bug fixes from bad search/replace. DeltaTree2::Cursor now keeps current decoded item as a member instead of calculating it on demand in get(). --- fdbserver/DeltaTree.h | 31 +++++++++++++++++++++++------- fdbserver/VersionedBTree.actor.cpp | 6 +++--- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index 221145eb62..6794847e7c 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -1044,8 +1044,7 @@ public: struct Cursor { Cursor() : cache(nullptr), nodeIndex(-1) {} - Cursor(DecodeCache* cache, DeltaTree2* tree, int nodeIndex = -1) - : cache(cache), tree(tree), nodeIndex(nodeIndex) {} + Cursor(DecodeCache* cache, DeltaTree2* tree) : cache(cache), tree(tree), nodeIndex(-1) {} int rootIndex() { if (!cache->empty()) { @@ -1059,6 +1058,7 @@ public: DeltaTree2* tree; DecodeCache* cache; int nodeIndex; + T item; Node* node() const { return tree->nodeAt(cache->get(nodeIndex).nodeOffset); } @@ -1066,8 +1066,9 @@ public: if (nodeIndex == -1) { return format("Cursor{nodeIndex=-1}"); } - return format("Cursor{item=%s nodeIndex=%d decodedNode=%s node=%s ", - get().toString().c_str(), + return format("Cursor{item=%s indexItem=%s nodeIndex=%d decodedNode=%s node=%s ", + item.toString().c_str(), + get(cache->get(nodeIndex)).toString().c_str(), nodeIndex, cache->get(nodeIndex).toString().c_str(), node()->toString(tree).c_str()); @@ -1098,7 +1099,11 @@ public: return delta.apply(cache->arena, base, decoded.partial); } - const T get() const { return get(cache->get(nodeIndex)); } + private: + inline void updateItem() { item = get(cache->get(nodeIndex)); } + + public: + const T& get() const { return item; } const T getOrUpperBound() const { return valid() ? 
get() : cache->upperBound; } @@ -1208,6 +1213,7 @@ public: while (nIndex != -1) { nodeIndex = nIndex; + updateItem(); cmp = s.compare(get(), skipLen); deltatree_printf("seek(%s) loop cmp=%d %s\n", s.toString().c_str(), cmp, toString().c_str()); if (cmp == 0) { @@ -1230,6 +1236,7 @@ public: deltatree_printf("moveFirst start %s\n", toString().c_str()); while (nIndex != -1) { nodeIndex = nIndex; + updateItem(); deltatree_printf("moveFirst moved %s\n", toString().c_str()); nIndex = getLeftChildIndex(nIndex); } @@ -1242,6 +1249,7 @@ public: deltatree_printf("moveLast start %s\n", toString().c_str()); while (nIndex != -1) { nodeIndex = nIndex; + updateItem(); deltatree_printf("moveLast moved %s\n", toString().c_str()); nIndex = getRightChildIndex(nIndex); } @@ -1257,11 +1265,15 @@ public: // If we couldn't go right, then the answer is our next ancestor if (nIndex == -1) { nodeIndex = cache->get(nodeIndex).rightParentIndex; + if (nodeIndex != -1) { + updateItem(); + } deltatree_printf("_moveNext move1 %s\n", toString().c_str()); } else { // Go left as far as possible do { nodeIndex = nIndex; + updateItem(); deltatree_printf("_moveNext move2 %s\n", toString().c_str()); nIndex = getLeftChildIndex(nodeIndex); } while (nIndex != -1); @@ -1276,11 +1288,15 @@ public: // If we couldn't go left, then the answer is our prev ancestor if (nIndex == -1) { nodeIndex = cache->get(nodeIndex).leftParentIndex; + if (nodeIndex != -1) { + updateItem(); + } deltatree_printf("_movePrev move1 %s\n", toString().c_str()); } else { // Go right as far as possible do { nodeIndex = nIndex; + updateItem(); deltatree_printf("_movePrev move2 %s\n", toString().c_str()); nIndex = getRightChildIndex(nodeIndex); } while (nIndex != -1); @@ -1303,7 +1319,7 @@ public: // Erase current item by setting its deleted flag to true. // Tree header is updated if a change is made. - // Cursor is not moved, so now points to a node marked as deletd. + // Cursor is then moved forward to the next non-deleted node. 
void erase() { auto& delta = getDelta(); if (!delta.getDeleted()) { @@ -1311,11 +1327,12 @@ public: --tree->numItems; tree->nodeBytesDeleted += (delta.size() + Node::headerSize(tree->largeNodes)); } + moveNext(); } // Erase k by setting its deleted flag to true. Returns true only if k existed bool erase(const T& k, int skipLen = 0) { - Cursor c(cache, tree, -1); + Cursor c(cache, tree); if (c.seek(k, skipLen) == 0 && !c.isErased()) { c.erase(); return true; diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index f7560a464d..69f98903da 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -6695,8 +6695,8 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { std::vector items(uniqueItems.begin(), uniqueItems.end()); int bufferSize = N * 100; - bool largeTree = bufferSize > DeltaTree2::SmallSizeLimit; - DeltaTree2* tree = (DeltaTree2*)new uint8_t[bufferSize]; + bool largeTree = bufferSize > DeltaTree::SmallSizeLimit; + DeltaTree* tree = (DeltaTree*)new uint8_t[bufferSize]; tree->build(bufferSize, &items[0], &items[items.size()], &prev, &next); @@ -7051,7 +7051,7 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { std::vector items(uniqueItems.begin(), uniqueItems.end()); int bufferSize = N * 2 * 30; - DeltaTree2* tree = (DeltaTree2*)new uint8_t[bufferSize]; + DeltaTree* tree = (DeltaTree*)new uint8_t[bufferSize]; int builtSize = tree->build(bufferSize, &items[0], &items[items.size()], &prev, &next); ASSERT(builtSize <= bufferSize); DeltaTree::Mirror r(tree, &prev, &next); From a6f7d37a256851f763eab4059d46ee52a6731cb1 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Tue, 18 May 2021 01:58:30 -0700 Subject: [PATCH 11/42] Bug fixes related to DeltaTree2::Cursor contract being different from DeltaTree::Cursor. 
--- fdbserver/DeltaTree.h | 8 ++++- fdbserver/VersionedBTree.actor.cpp | 52 ++++++++++++++---------------- 2 files changed, 32 insertions(+), 28 deletions(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index 6794847e7c..02a276ec7b 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -1103,9 +1103,15 @@ public: inline void updateItem() { item = get(cache->get(nodeIndex)); } public: + // Get the item at the cursor + // Behavior is undefined if the cursor is not valid. + // If the cursor is moved, the reference object returned will be modified to + // the cursor's new current item. const T& get() const { return item; } - const T getOrUpperBound() const { return valid() ? get() : cache->upperBound; } + // If the cursor is valid, return a reference to the cursor's internal T. + // Otherwise, returns a reference to the cache's upper boundary. + const T& getOrUpperBound() const { return valid() ? get() : cache->upperBound; } bool operator==(const Cursor& rhs) const { return nodeIndex == rhs.nodeIndex; } bool operator!=(const Cursor& rhs) const { return nodeIndex != rhs.nodeIndex; } diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 69f98903da..f50d4368d0 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -3177,8 +3177,8 @@ struct BTreePage { std::string toString(bool write, BTreePageIDRef id, Version ver, - const RedwoodRecordRef* lowerBound, - const RedwoodRecordRef* upperBound) const { + const RedwoodRecordRef& lowerBound, + const RedwoodRecordRef& upperBound) const { std::string r; r += format("BTreePage op=%s %s @%" PRId64 " ptr=%p height=%d count=%d kvBytes=%d\n lowerBound: %s\n upperBound: %s\n", @@ -3189,13 +3189,13 @@ struct BTreePage { height, (int)tree().numItems, (int)kvBytes, - lowerBound->toString(false).c_str(), - upperBound->toString(false).c_str()); + lowerBound.toString(false).c_str(), + upperBound.toString(false).c_str()); try { if 
(tree().numItems > 0) { // This doesn't use the cached reader for the page because it is only for debugging purposes, // a cached reader may not exist - BinaryTree::DecodeCache cache(*lowerBound, *upperBound); + BinaryTree::DecodeCache cache(lowerBound, upperBound); BinaryTree::Cursor c(&cache, &tree()); c.moveFirst(); @@ -3206,8 +3206,8 @@ struct BTreePage { r += " "; r += c.get().toString(height == 1); - bool tooLow = c.get().withoutValue() < lowerBound->withoutValue(); - bool tooHigh = c.get().withoutValue() >= upperBound->withoutValue(); + bool tooLow = c.get().withoutValue() < lowerBound.withoutValue(); + bool tooHigh = c.get().withoutValue() >= upperBound.withoutValue(); if (tooLow || tooHigh) { anyOutOfRange = true; if (tooLow) { @@ -3472,7 +3472,7 @@ public: } // Start reading the page, without caching entries.push_back( - std::make_pair(q.get(), self->readPage(snapshot, q.get().pageID, nullptr, nullptr, true, false))); + std::make_pair(q.get(), self->readPage(snapshot, q.get().pageID, dbBegin, dbEnd, true, false))); --toPop; } @@ -4296,8 +4296,8 @@ private: ACTOR static Future> readPage(Reference snapshot, BTreePageIDRef id, - const RedwoodRecordRef* lowerBound, - const RedwoodRecordRef* upperBound, + RedwoodRecordRef lowerBound, + RedwoodRecordRef upperBound, bool forLazyClear = false, bool cacheable = true, bool* fromCache = nullptr) { @@ -4305,8 +4305,8 @@ private: debug_printf("readPage() op=read %s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), snapshot->getVersion(), - lowerBound->toString(false).c_str(), - upperBound->toString(false).c_str()); + lowerBound.toString(false).c_str(), + upperBound.toString(false).c_str()); } else { debug_printf( "readPage() op=readForDeferredClear %s @%" PRId64 " \n", toString(id).c_str(), snapshot->getVersion()); @@ -4345,11 +4345,10 @@ private: debug_printf("readPage() Creating DecodeCache for %s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), snapshot->getVersion(), - 
lowerBound->toString(false).c_str(), - upperBound->toString(false).c_str()); + lowerBound.toString(false).c_str(), + upperBound.toString(false).c_str()); - BTreePage::BinaryTree::DecodeCache* cache = - new BTreePage::BinaryTree::DecodeCache(*lowerBound, *upperBound); + BTreePage::BinaryTree::DecodeCache* cache = new BTreePage::BinaryTree::DecodeCache(lowerBound, upperBound); cache->addref(); page->userData = cache; page->userDataDestructor = [](void* cache) { ((BTreePage::BinaryTree::DecodeCache*)cache)->delref(); }; @@ -4732,7 +4731,7 @@ private: state FlowLock::Releaser readLock(*commitReadLock); state bool fromCache = false; state Reference page = wait( - readPage(snapshot, rootID, &update->decodeLowerBound, &update->decodeUpperBound, false, false, &fromCache)); + readPage(snapshot, rootID, update->decodeLowerBound, update->decodeUpperBound, false, false, &fromCache)); readLock.release(); state BTreePage* btPage = (BTreePage*)page->begin(); @@ -4755,8 +4754,7 @@ private: debug_printf( "%s commitSubtree(): %s\n", context.c_str(), - btPage - ->toString(false, rootID, snapshot->getVersion(), &update->decodeLowerBound, &update->decodeUpperBound) + btPage->toString(false, rootID, snapshot->getVersion(), update->decodeLowerBound, update->decodeUpperBound) .c_str()); state BTreePage::BinaryTree::Cursor cursor = getCursor(page); @@ -5313,8 +5311,8 @@ private: ->toString(false, newID, snapshot->getVersion(), - &update->decodeLowerBound, - &update->decodeUpperBound) + update->decodeLowerBound, + update->decodeUpperBound) .c_str()); update->updatedInPlace(newID, btPage, newID.size() * self->m_blockSize); @@ -5529,23 +5527,23 @@ public: PathEntry& back() { return path.back(); } void popPath() { path.pop_back(); } -#warning These can't be references anymore Future pushPage(BTreePageIDRef id, const RedwoodRecordRef& lowerBound, const RedwoodRecordRef& upperBound) { - - return map(readPage(pager, id, &lowerBound, &upperBound), [this, id](Reference p) { + // The boundary 
RedwoodRecordRefs are shallow copied to readPage()'s argument / actor state variables, + // and the arenas for them must be kept alive by the higher path entries which contain ArenaPage + // references. + return map(readPage(pager, id, lowerBound, upperBound), [this, id](Reference p) { path.push_back({ p, getCursor(p) }); return Void(); }); } Future pushPage(BTreePage::BinaryTree::Cursor c) { - RedwoodRecordRef rec = c.get(); auto next = c; next.moveNext(); - BTreePageIDRef id = rec.getChildPage(); - return pushPage(id, rec, next.getOrUpperBound()); + BTreePageIDRef id = c.get().getChildPage(); + return pushPage(id, c.get(), next.getOrUpperBound()); } Future init(VersionedBTree* btree_in, Reference pager_in, BTreePageIDRef root) { From a58ac622ed70b9fb5450be725c850b50a0a2086a Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Tue, 18 May 2021 14:33:04 -0700 Subject: [PATCH 12/42] Bug fix in test data generation for IntIntPair DeltaTree unit test. --- fdbserver/VersionedBTree.actor.cpp | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index f50d4368d0..223e712159 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -7024,12 +7024,19 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/RedwoodRecordRef2") { TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { const int N = 200; - IntIntPair prev = { 1, 0 }; - IntIntPair next = { 10000, 10000 }; + IntIntPair lowerBound = { 0, 0 }; + IntIntPair upperBound = { 1000, 1000 }; state std::function randomPair = [&]() { - return IntIntPair( - { deterministicRandom()->randomInt(prev.k, next.k), deterministicRandom()->randomInt(prev.v, next.v) }); + // Generate a pair >= lowerBound and < upperBound + int k = deterministicRandom()->randomInt(lowerBound.k, upperBound.k + 1); + int v = deterministicRandom()->randomInt(lowerBound.v, upperBound.v); + + // Only generate even 
values so the tests below can approach and find each + // key with a directional seek of the adjacent absent value on either side. + v -= v % 2; + + return IntIntPair(k, v); }; // Build a set of N unique items, where no consecutive items are in the set, a requirement of the seek behavior tests. @@ -7050,14 +7057,14 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { int bufferSize = N * 2 * 30; DeltaTree* tree = (DeltaTree*)new uint8_t[bufferSize]; - int builtSize = tree->build(bufferSize, &items[0], &items[items.size()], &prev, &next); + int builtSize = tree->build(bufferSize, &items[0], &items[items.size()], &lowerBound, &upperBound); ASSERT(builtSize <= bufferSize); - DeltaTree::Mirror r(tree, &prev, &next); + DeltaTree::Mirror r(tree, &lowerBound, &upperBound); DeltaTree2* tree2 = (DeltaTree2*)new uint8_t[bufferSize]; - int builtSize2 = tree2->build(bufferSize, &items[0], &items[items.size()], &prev, &next); + int builtSize2 = tree2->build(bufferSize, &items[0], &items[items.size()], &lowerBound, &upperBound); ASSERT(builtSize2 <= bufferSize); - DeltaTree2::DecodeCache cache(prev, next); + DeltaTree2::DecodeCache cache(lowerBound, upperBound); DeltaTree2::Cursor cur2(&cache, tree2); auto printItems = [&] { @@ -7212,7 +7219,7 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { scanAndVerify2(); // Create a new mirror, decoding the tree from scratch since insert() modified both the tree and the mirror - r = DeltaTree::Mirror(tree, &prev, &next); + r = DeltaTree::Mirror(tree, &lowerBound, &upperBound); cache.clear(); scanAndVerify(); scanAndVerify2(); From 8e7a97f495ced70c608b0ff755bfcd26100d3120 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Wed, 19 May 2021 02:09:07 -0700 Subject: [PATCH 13/42] Bug fix: BTreeCursor::init() did not clear path. 
--- fdbserver/VersionedBTree.actor.cpp | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 223e712159..7018659ef2 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -4780,10 +4780,9 @@ private: bool updating = tryToUpdate; bool changesMade = false; - // Couldn't make changes in place, so now do a linear merge and build new pages. state Standalone> merged; - auto switchToLinearMerge = [&]() { + // Couldn't make changes in place, so now do a linear merge and build new pages. updating = false; auto c = cursor; c.moveFirst(); @@ -5486,6 +5485,9 @@ public: struct PathEntry { Reference page; BTreePage::BinaryTree::Cursor cursor; +#if REDWOOD_DEBUG + Standalone id; +#endif const BTreePage* btPage() const { return (BTreePage*)page->begin(); }; }; @@ -5505,9 +5507,14 @@ public: std::string toString() const { std::string r = format("{ptr=%p %s ", this, ::toString(pager->getVersion()).c_str()); for (int i = 0; i < path.size(); ++i) { - r += format("[%d/%d: %s] ", - i + 1, - path.size(), + std::string id = ""; +#if REDWOOD_DEBUG + id = ::toString(path[i].id); +#endif + r += format("[Level=%d ID=%s ptr=%p Cursor=%s] ", + path[i].btPage()->height, + id.c_str(), + path[i].page->begin(), path[i].cursor.valid() ? path[i].cursor.get().toString(path[i].btPage()->isLeaf()).c_str() : ""); } @@ -5533,8 +5540,13 @@ public: // The boundary RedwoodRecordRefs are shallow copied to readPage()'s argument / actor state variables, // and the arenas for them must be kept alive by the higher path entries which contain ArenaPage // references. 
- return map(readPage(pager, id, lowerBound, upperBound), [this, id](Reference p) { + debug_printf("pushPage(%s) first cursor=%s\n", ::toString(id).c_str(), toString().c_str()); + return map(readPage(pager, id, lowerBound, upperBound), [=](Reference p) { +#if REDWOOD_DEBUG + path.push_back({ p, getCursor(p), id }); +#else path.push_back({ p, getCursor(p) }); +#endif return Void(); }); } @@ -5546,9 +5558,11 @@ public: return pushPage(id, c.get(), next.getOrUpperBound()); } + // Initialize or reinitialize cursor Future init(VersionedBTree* btree_in, Reference pager_in, BTreePageIDRef root) { btree = btree_in; pager = pager_in; + path.clear(); path.reserve(6); valid = false; return pushPage(root, dbBegin, dbEnd); @@ -5676,6 +5690,7 @@ public: if (self->path.size() == 1) { self->valid = false; + debug_printf("move%s() exit cursor=%s\n", forward ? "Next" : "Prev", self->toString().c_str()); return Void(); } From 751bac22712be1b6cbdb0ff3212e87d4f44f02d5 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Thu, 20 May 2021 02:08:17 -0700 Subject: [PATCH 14/42] Write path no longer uses non-caching reads because it is no longer necessary to avoid a page copy. Page copies are only done just before an actual change is made. --- fdbserver/DeltaTree.h | 12 ++ fdbserver/VersionedBTree.actor.cpp | 178 +++++++++++++++++++++-------- 2 files changed, 144 insertions(+), 46 deletions(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index 02a276ec7b..acef72fbc7 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -1046,6 +1046,10 @@ public: Cursor(DecodeCache* cache, DeltaTree2* tree) : cache(cache), tree(tree), nodeIndex(-1) {} + Cursor(DecodeCache* cache, DeltaTree2* tree, int nodeIndex) : cache(cache), tree(tree), nodeIndex(nodeIndex) { + updateItem(); + } + int rootIndex() { if (!cache->empty()) { return 0; @@ -1109,6 +1113,13 @@ public: // the cursor's new current item. 
const T& get() const { return item; } + void switchTree(DeltaTree2* newTree) { + tree = newTree; + if (nodeIndex != -1) { + updateItem(); + } + } + // If the cursor is valid, return a reference to the cursor's internal T. // Otherwise, returns a reference to the cache's upper boundary. const T& getOrUpperBound() const { return valid() ? get() : cache->upperBound; } @@ -1351,6 +1362,7 @@ public: // Returns true if successful, false if k does not fit in the space available // or if k is already in the tree (and was not already deleted). // Insertion on an empty tree returns false as well. + // Insert does NOT change the cursor position. bool insert(const T& k, int skipLen = 0, int maxHeightAllowed = std::numeric_limits::max()) { deltatree_printf("insert %s\n", k.toString().c_str()); diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 7018659ef2..b6e102e5bd 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -4388,6 +4388,18 @@ private: state BTreePageIDRef newID; newID.resize(*arena, oldID.size()); + if (REDWOOD_DEBUG) { + BTreePage* btPage = (BTreePage*)page->begin(); + BTreePage::BinaryTree::DecodeCache* cache = (BTreePage::BinaryTree::DecodeCache*)page->userData; + debug_printf( + "updateBTreePage(%s, %s) %s\n", + ::toString(oldID).c_str(), + ::toString(writeVersion).c_str(), + cache == nullptr + ? 
"" + : btPage->toString(true, oldID, writeVersion, cache->lowerBound, cache->upperBound).c_str()); + } + if (oldID.size() == 1) { LogicalPageID id = wait(self->m_pager->atomicUpdatePage(oldID.front(), page, writeVersion)); newID.front() = id; @@ -4417,7 +4429,7 @@ private: } // Copy page to a new page which shares the same DecodeCache with the old page - Reference cloneForUpdate(Reference page) { + static Reference clonePageForUpdate(Reference page) { Reference newPage = page->cloneContents(); BTreePage::BinaryTree::DecodeCache* cache = (BTreePage::BinaryTree::DecodeCache*)page->userData; @@ -4425,6 +4437,7 @@ private: newPage->userData = cache; newPage->userDataDestructor = [](void* cache) { ((BTreePage::BinaryTree::DecodeCache*)cache)->delref(); }; + debug_printf("cloneForUpdate(%p -> %p size=%d\n", page->begin(), newPage->begin(), page->size()); return newPage; } @@ -4563,37 +4576,62 @@ private: struct InternalPageModifier { InternalPageModifier() {} - InternalPageModifier(BTreePage* p, BTreePage::BinaryTree::Cursor& c, bool updating, ParentInfo* parentInfo) - : btPage(p), c(c), updating(updating), changesMade(false), parentInfo(parentInfo) {} + InternalPageModifier(Reference p, + BTreePage::BinaryTree::Cursor& c, + bool updating, + ParentInfo* parentInfo) + : page(p), clonedPage(false), cursor(c), updating(updating), changesMade(false), parentInfo(parentInfo) {} + // Whether updating the existing page is allowed bool updating; - BTreePage* btPage; - BTreePage::BinaryTree::Cursor c; + Reference page; + + // Whether or not page has been cloned for update + bool clonedPage; + + BTreePage::BinaryTree::Cursor cursor; Standalone> rebuild; + + // Whether there are any changes to the page, either made in place or staged in rebuild bool changesMade; ParentInfo* parentInfo; + BTreePage* btPage() { return (BTreePage*)page->begin(); } + bool empty() const { if (updating) { - return c.tree->numItems == 0; + return cursor.tree->numItems == 0; } else { return rebuild.empty(); 
} } + void cloneForUpdate() { + if (!clonedPage) { + page = clonePageForUpdate(page); + cursor.switchTree(&btPage()->tree()); + clonedPage = true; + } + } + // end is the cursor position of the first record of the unvisited child link range, which // is needed if the insert requires switching from update to rebuild mode. void insert(BTreePage::BinaryTree::Cursor end, const VectorRef& recs) { int i = 0; if (updating) { + // Update must be done in the new tree, not the original tree where the end cursor will be from + end.tree = cursor.tree; + end.switchTree(cursor.tree); + // TODO: insert recs in a random order to avoid new subtree being entirely right child links while (i != recs.size()) { const RedwoodRecordRef& rec = recs[i]; debug_printf("internal page (updating) insert: %s\n", rec.toString(false).c_str()); - if (!c.insert(rec)) { + if (!cursor.insert(rec)) { debug_printf("internal page: failed to insert %s, switching to rebuild\n", rec.toString(false).c_str()); + // Update failed, so populate rebuild vector with everything up to but not including end, which // may include items from recs that were already added. auto c = end; @@ -4608,7 +4646,7 @@ private: updating = false; break; } - btPage->kvBytes += rec.kvBytes(); + btPage()->kvBytes += rec.kvBytes(); ++i; } } @@ -4651,11 +4689,20 @@ private: if (u.childrenChanged) { if (updating) { auto c = u.cBegin; + + if (c != u.cEnd) { + cloneForUpdate(); + // must point c to the tree to erase from + c.tree = cursor.tree; + c.switchTree(cursor.tree); + } + while (c != u.cEnd) { debug_printf("internal page (updating) erasing: %s\n", c.get().toString(false).c_str()); - btPage->kvBytes -= c.get().kvBytes(); + btPage()->kvBytes -= c.get().kvBytes(); c.erase(); } + // [cBegin, cEnd) is now erased, and cBegin is invalid, so cEnd represents the end // of the range that comes before any part of newLinks that can't be added if there // is not enough space. 
@@ -4670,6 +4717,9 @@ private: changesMade = true; } else { + // If this was an in-place update, where the child page IDs do not change, notify the + // parentInfo that those pages have been updated so it can possibly eliminate their + // second writes later. if (u.inPlaceUpdate) { for (auto id : u.decodeLowerBound.getChildPage()) { parentInfo->pageUpdated(id); @@ -4686,6 +4736,8 @@ private: !nextBoundary->sameExceptValue(u.expectedUpperBound.get()))) { RedwoodRecordRef rec = u.expectedUpperBound.get().withoutValue(); debug_printf("applyUpdate adding dummy record %s\n", rec.toString(false).c_str()); + + cloneForUpdate(); insert(u.cEnd, { &rec, 1 }); changesMade = true; } @@ -4729,11 +4781,15 @@ private: state Reference commitReadLock = self->m_commitReadLock; wait(commitReadLock->take()); state FlowLock::Releaser readLock(*commitReadLock); - state bool fromCache = false; - state Reference page = wait( - readPage(snapshot, rootID, update->decodeLowerBound, update->decodeUpperBound, false, false, &fromCache)); + state Reference page = + wait(readPage(snapshot, rootID, update->decodeLowerBound, update->decodeUpperBound, false, true)); readLock.release(); + // If in-place modification to the page is done, a copy of the page will be made in pageCopy + // and the cursor will be pointed to it. The original page variable must stay in scope because + // there could be RedwoodRecordRefs referencing its arenas. + state Reference pageCopy; + state BTreePage* btPage = (BTreePage*)page->begin(); ASSERT(isLeaf == btPage->isLeaf()); g_redwoodMetrics.level(btPage->height).pageCommitStart += 1; @@ -4741,16 +4797,9 @@ private: // TODO: Decide if it is okay to update if the subtree boundaries are expanded. It can result in // records in a DeltaTree being outside its decode boundary range, which isn't actually invalid // though it is awkward to reason about. 
+ // TryToUpdate indicates insert and erase operations should be tried on the existing page first state bool tryToUpdate = btPage->tree().numItems > 0 && update->boundariesNormal(); - // If trying to update the page and the page reference points into the cache, - // we need to clone it so we don't modify the original version of the page. - if (tryToUpdate && fromCache) { - page = self->cloneForUpdate(page); - btPage = (BTreePage*)page->begin(); - fromCache = false; - } - debug_printf( "%s commitSubtree(): %s\n", context.c_str(), @@ -4834,6 +4883,15 @@ private: debug_printf("%s Erasing %s [existing, boundary start]\n", context.c_str(), cursor.get().toString().c_str()); + + // Copy page for modification if not already copied + if (!pageCopy.isValid()) { + pageCopy = clonePageForUpdate(page); + btPage = (BTreePage*)pageCopy->begin(); + cursor.tree = &btPage->tree(); + cursor.switchTree(&btPage->tree()); + } + btPage->kvBytes -= cursor.get().kvBytes(); cursor.erase(); } else { @@ -4856,6 +4914,14 @@ private: // If updating, add to the page, else add to the output set if (updating) { + // Copy page for modification if not already copied + if (!pageCopy.isValid()) { + pageCopy = clonePageForUpdate(page); + btPage = (BTreePage*)pageCopy->begin(); + cursor.tree = &btPage->tree(); + cursor.switchTree(&btPage->tree()); + } + if (cursor.insert(rec, update->skipLen, maxHeightAllowed)) { btPage->kvBytes += rec.kvBytes(); debug_printf( @@ -4911,6 +4977,15 @@ private: debug_printf("%s Erasing %s [existing, boundary start]\n", context.c_str(), cursor.get().toString().c_str()); + + // Copy page for modification if not already copied + if (!pageCopy.isValid()) { + pageCopy = clonePageForUpdate(page); + btPage = (BTreePage*)pageCopy->begin(); + cursor.tree = &btPage->tree(); + cursor.switchTree(&btPage->tree()); + } + btPage->kvBytes -= cursor.get().kvBytes(); cursor.erase(); changesMade = true; @@ -4947,6 +5022,15 @@ private: "%s Erasing %s and beyond [existing, matches changed 
upper mutation boundary]\n", context.c_str(), cursor.get().toString().c_str()); + + // Copy page for modification if not already copied + if (!pageCopy.isValid()) { + pageCopy = clonePageForUpdate(page); + btPage = (BTreePage*)pageCopy->begin(); + cursor.tree = &btPage->tree(); + cursor.switchTree(&btPage->tree()); + } + btPage->kvBytes -= cursor.get().kvBytes(); cursor.erase(); } else { @@ -4988,7 +5072,7 @@ private: } else { // Otherwise update it. BTreePageIDRef newID = wait(self->updateBTreePage( - self, rootID, &update->newLinks.arena(), page.castTo(), writeVersion)); + self, rootID, &update->newLinks.arena(), pageCopy.castTo(), writeVersion)); update->updatedInPlace(newID, btPage, newID.size() * self->m_blockSize); debug_printf( @@ -5206,13 +5290,13 @@ private: // Note: parentInfo could be invalid after a wait and must be re-initialized. // All uses below occur before waits so no reinitialization is done. state ParentInfo* parentInfo = &self->childUpdateTracker[rootID.front()]; - state InternalPageModifier m(btPage, cursor, tryToUpdate, parentInfo); + state InternalPageModifier modifier(page, cursor, tryToUpdate, parentInfo); // Apply the possible changes for each subtree range recursed to, except the last one. // For each range, the expected next record, if any, is checked against the first boundary // of the next range, if any. for (int i = 0, iEnd = slices.size() - 1; i < iEnd; ++i) { - m.applyUpdate(*slices[i], slices[i + 1]->getFirstBoundary()); + modifier.applyUpdate(*slices[i], slices[i + 1]->getFirstBoundary()); } // The expected next record for the final range is checked against one of the upper boundaries passed to @@ -5222,39 +5306,41 @@ private: // sole purpose of adding a dummy upper bound record. debug_printf("%s Applying final child range update. changesMade=%d Parent update is: %s\n", context.c_str(), - m.changesMade, + modifier.changesMade, update->toString().c_str()); - m.applyUpdate(*slices.back(), m.changesMade ? 
&update->subtreeUpperBound : &update->decodeUpperBound); + modifier.applyUpdate(*slices.back(), + modifier.changesMade ? &update->subtreeUpperBound : &update->decodeUpperBound); state bool detachChildren = (parentInfo->count > 2); state bool forceUpdate = false; // If no changes were made, but we should rewrite it to point directly to remapped child pages - if (!m.changesMade && detachChildren) { + if (!modifier.changesMade && detachChildren) { debug_printf( "%s Internal page forced rewrite because at least %d children have been updated in-place.\n", context.c_str(), parentInfo->count); - forceUpdate = true; - if (!m.updating) { - m.updating = true; - // Copy the page before modification if the page references the cache - if (fromCache) { - page = self->cloneForUpdate(page); - btPage = (BTreePage*)page->begin(); - m.btPage = btPage; - cursor.tree = &btPage->tree(); - m.c.tree = cursor.tree; - fromCache = false; - } - } + forceUpdate = true; + modifier.updating = true; + + // Make sure the modifier cloned the page so we can update the child links in-place below. 
+ modifier.cloneForUpdate(); + ++g_redwoodMetrics.level(btPage->height).forceUpdate; } + // If the modifier cloned the page for updating, then update our local pageCopy, btPage, and cursor + if (modifier.clonedPage) { + pageCopy = modifier.page; + btPage = modifier.btPage(); + cursor.tree = modifier.cursor.tree; + cursor.switchTree(modifier.cursor.tree); + } + // If page contents have changed - if (m.changesMade || forceUpdate) { - if (m.empty()) { + if (modifier.changesMade || forceUpdate) { + if (modifier.empty()) { update->cleared(); debug_printf("%s All internal page children were deleted so deleting this page too, returning %s\n", context.c_str(), @@ -5262,7 +5348,7 @@ private: self->freeBTreePage(rootID, writeVersion); self->childUpdateTracker.erase(rootID.front()); } else { - if (m.updating) { + if (modifier.updating) { // Page was updated in place (or being forced to be updated in place to update child page ids) debug_printf( "%s Internal page modified in-place tryToUpdate=%d forceUpdate=%d detachChildren=%d\n", @@ -5301,7 +5387,7 @@ private: } BTreePageIDRef newID = wait(self->updateBTreePage( - self, rootID, &update->newLinks.arena(), page.castTo(), writeVersion)); + self, rootID, &update->newLinks.arena(), pageCopy.castTo(), writeVersion)); debug_printf( "%s commitSubtree(): Internal page updated in-place at version %s, new contents: %s\n", context.c_str(), @@ -5325,7 +5411,7 @@ private: if (detachChildren) { auto& stats = g_redwoodMetrics.level(btPage->height); - for (auto& rec : m.rebuild) { + for (auto& rec : modifier.rebuild) { if (rec.value.present()) { BTreePageIDRef oldPages = rec.getChildPage(); BTreePageIDRef newPages; @@ -5336,7 +5422,7 @@ private: if (newID != invalidLogicalPageID) { // Rebuild record values reference original page memory so make a copy if (newPages.empty()) { - newPages = BTreePageIDRef(m.rebuild.arena(), oldPages); + newPages = BTreePageIDRef(modifier.rebuild.arena(), oldPages); rec.setChildPage(newPages); } debug_printf("%s 
Detach updated %u -> %u\n", context.c_str(), p, newID); @@ -5354,7 +5440,7 @@ private: wait(writePages(self, &update->subtreeLowerBound, &update->subtreeUpperBound, - m.rebuild, + modifier.rebuild, btPage->height, writeVersion, rootID)); From a9cf0a2471e371e58d17df101784a764d55094fa Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Thu, 20 May 2021 02:12:32 -0700 Subject: [PATCH 15/42] Removed unnecessary cursor member from InternalPageModifier. Changed BTreePage::tree() methods to return a pointer instead of a reference since >90% of usages want a pointer. --- fdbserver/VersionedBTree.actor.cpp | 78 ++++++++++++++---------------- 1 file changed, 36 insertions(+), 42 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index b6e102e5bd..03f56e6cc5 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -3162,17 +3162,17 @@ struct BTreePage { #pragma pack(pop) int size() const { - auto& t = tree(); - return (uint8_t*)&t - (uint8_t*)this + t.size(); + const BinaryTree* t = tree(); + return (uint8_t*)t - (uint8_t*)this + t->size(); } bool isLeaf() const { return height == 1; } - BinaryTree& tree() { return *(BinaryTree*)(this + 1); } + BinaryTree* tree() { return (BinaryTree*)(this + 1); } - BinaryTree& tree() const { return *(BinaryTree*)(this + 1); } + BinaryTree* tree() const { return (BinaryTree*)(this + 1); } - ValueTree& valueTree() const { return *(ValueTree*)(this + 1); } + ValueTree* valueTree() const { return (ValueTree*)(this + 1); } std::string toString(bool write, BTreePageIDRef id, @@ -3187,16 +3187,16 @@ struct BTreePage { ver, this, height, - (int)tree().numItems, + (int)tree()->numItems, (int)kvBytes, lowerBound.toString(false).c_str(), upperBound.toString(false).c_str()); try { - if (tree().numItems > 0) { + if (tree()->numItems > 0) { // This doesn't use the cached reader for the page because it is only for debugging purposes, // a cached reader may not exist 
BinaryTree::DecodeCache cache(lowerBound, upperBound); - BinaryTree::Cursor c(&cache, &tree()); + BinaryTree::Cursor c(&cache, tree()); c.moveFirst(); ASSERT(c.valid()); @@ -3243,12 +3243,12 @@ static void makeEmptyRoot(Reference page) { BTreePage* btpage = (BTreePage*)page->begin(); btpage->height = 1; btpage->kvBytes = 0; - btpage->tree().build(page->size(), nullptr, nullptr, nullptr, nullptr); + btpage->tree()->build(page->size(), nullptr, nullptr, nullptr, nullptr); } BTreePage::BinaryTree::Cursor getCursor(const Reference& page) { return BTreePage::BinaryTree::Cursor((BTreePage::BinaryTree::DecodeCache*)page->userData, - &((BTreePage*)page->begin())->tree()); + ((BTreePage*)page->begin())->tree()); } struct BoundaryRefAndPage { @@ -3492,7 +3492,7 @@ public: // Iterate over page entries, skipping key decoding using BTreePage::ValueTree which uses // RedwoodRecordRef::DeltaValueOnly as the delta type type to skip key decoding BTreePage::ValueTree::DecodeCache cache(dbBegin, dbEnd); - BTreePage::ValueTree::Cursor c(&cache, &btPage.valueTree()); + BTreePage::ValueTree::Cursor c(&cache, btPage.valueTree()); ASSERT(c.moveFirst()); Version v = entry.version; while (1) { @@ -4173,7 +4173,7 @@ private: pageUpperBound.toString(false).c_str()); int deltaTreeSpace = p.pageSize - sizeof(BTreePage); - state int written = btPage->tree().build( + state int written = btPage->tree()->build( deltaTreeSpace, &entries[p.startIndex], &entries[endIndex], &pageLowerBound, &pageUpperBound); if (written > deltaTreeSpace) { @@ -4512,7 +4512,7 @@ private: metrics.pageModifyExt += (maybeNewID.size() - 1); metrics.modifyFillPct += (double)btPage->size() / capacity; metrics.modifyStoredPct += (double)btPage->kvBytes / capacity; - metrics.modifyItemCount += btPage->tree().numItems; + metrics.modifyItemCount += btPage->tree()->numItems; // The boundaries can't have changed, but the child page link may have. 
if (maybeNewID != decodeLowerBound.getChildPage()) { @@ -4576,11 +4576,8 @@ private: struct InternalPageModifier { InternalPageModifier() {} - InternalPageModifier(Reference p, - BTreePage::BinaryTree::Cursor& c, - bool updating, - ParentInfo* parentInfo) - : page(p), clonedPage(false), cursor(c), updating(updating), changesMade(false), parentInfo(parentInfo) {} + InternalPageModifier(Reference p, bool updating, ParentInfo* parentInfo) + : page(p), clonedPage(false), updating(updating), changesMade(false), parentInfo(parentInfo) {} // Whether updating the existing page is allowed bool updating; @@ -4589,18 +4586,17 @@ private: // Whether or not page has been cloned for update bool clonedPage; - BTreePage::BinaryTree::Cursor cursor; Standalone> rebuild; // Whether there are any changes to the page, either made in place or staged in rebuild bool changesMade; ParentInfo* parentInfo; - BTreePage* btPage() { return (BTreePage*)page->begin(); } + BTreePage* btPage() const { return (BTreePage*)page->begin(); } bool empty() const { if (updating) { - return cursor.tree->numItems == 0; + return btPage()->tree()->numItems == 0; } else { return rebuild.empty(); } @@ -4609,7 +4605,6 @@ private: void cloneForUpdate() { if (!clonedPage) { page = clonePageForUpdate(page); - cursor.switchTree(&btPage()->tree()); clonedPage = true; } } @@ -4620,15 +4615,15 @@ private: int i = 0; if (updating) { // Update must be done in the new tree, not the original tree where the end cursor will be from - end.tree = cursor.tree; - end.switchTree(cursor.tree); + end.tree = btPage()->tree(); + end.switchTree(btPage()->tree()); // TODO: insert recs in a random order to avoid new subtree being entirely right child links while (i != recs.size()) { const RedwoodRecordRef& rec = recs[i]; debug_printf("internal page (updating) insert: %s\n", rec.toString(false).c_str()); - if (!cursor.insert(rec)) { + if (!end.insert(rec)) { debug_printf("internal page: failed to insert %s, switching to rebuild\n", 
rec.toString(false).c_str()); @@ -4693,8 +4688,8 @@ private: if (c != u.cEnd) { cloneForUpdate(); // must point c to the tree to erase from - c.tree = cursor.tree; - c.switchTree(cursor.tree); + c.tree = btPage()->tree(); + c.switchTree(btPage()->tree()); } while (c != u.cEnd) { @@ -4798,7 +4793,7 @@ private: // records in a DeltaTree being outside its decode boundary range, which isn't actually invalid // though it is awkward to reason about. // TryToUpdate indicates insert and erase operations should be tried on the existing page first - state bool tryToUpdate = btPage->tree().numItems > 0 && update->boundariesNormal(); + state bool tryToUpdate = btPage->tree()->numItems > 0 && update->boundariesNormal(); debug_printf( "%s commitSubtree(): %s\n", @@ -4888,8 +4883,8 @@ private: if (!pageCopy.isValid()) { pageCopy = clonePageForUpdate(page); btPage = (BTreePage*)pageCopy->begin(); - cursor.tree = &btPage->tree(); - cursor.switchTree(&btPage->tree()); + cursor.tree = btPage->tree(); + cursor.switchTree(btPage->tree()); } btPage->kvBytes -= cursor.get().kvBytes(); @@ -4918,8 +4913,8 @@ private: if (!pageCopy.isValid()) { pageCopy = clonePageForUpdate(page); btPage = (BTreePage*)pageCopy->begin(); - cursor.tree = &btPage->tree(); - cursor.switchTree(&btPage->tree()); + cursor.tree = btPage->tree(); + cursor.switchTree(btPage->tree()); } if (cursor.insert(rec, update->skipLen, maxHeightAllowed)) { @@ -4982,8 +4977,8 @@ private: if (!pageCopy.isValid()) { pageCopy = clonePageForUpdate(page); btPage = (BTreePage*)pageCopy->begin(); - cursor.tree = &btPage->tree(); - cursor.switchTree(&btPage->tree()); + cursor.tree = btPage->tree(); + cursor.switchTree(btPage->tree()); } btPage->kvBytes -= cursor.get().kvBytes(); @@ -5027,8 +5022,8 @@ private: if (!pageCopy.isValid()) { pageCopy = clonePageForUpdate(page); btPage = (BTreePage*)pageCopy->begin(); - cursor.tree = &btPage->tree(); - cursor.switchTree(&btPage->tree()); + cursor.tree = btPage->tree(); + 
cursor.switchTree(btPage->tree()); } btPage->kvBytes -= cursor.get().kvBytes(); @@ -5061,9 +5056,8 @@ private: writeVersion = self->getLastCommittedVersion() + 1; if (updating) { - const BTreePage::BinaryTree& DeltaTree2 = btPage->tree(); // If the tree is now empty, delete the page - if (DeltaTree2.numItems == 0) { + if (cursor.tree->numItems == 0) { update->cleared(); self->freeBTreePage(rootID, writeVersion); debug_printf("%s Page updates cleared all entries, returning %s\n", @@ -5280,7 +5274,7 @@ private: context.c_str(), btPage->size(), btPage->height, - btPage->tree().numItems, + btPage->tree()->numItems, slices.size(), recursions.size()); @@ -5290,7 +5284,7 @@ private: // Note: parentInfo could be invalid after a wait and must be re-initialized. // All uses below occur before waits so no reinitialization is done. state ParentInfo* parentInfo = &self->childUpdateTracker[rootID.front()]; - state InternalPageModifier modifier(page, cursor, tryToUpdate, parentInfo); + state InternalPageModifier modifier(page, tryToUpdate, parentInfo); // Apply the possible changes for each subtree range recursed to, except the last one. // For each range, the expected next record, if any, is checked against the first boundary @@ -5334,8 +5328,8 @@ private: if (modifier.clonedPage) { pageCopy = modifier.page; btPage = modifier.btPage(); - cursor.tree = modifier.cursor.tree; - cursor.switchTree(modifier.cursor.tree); + cursor.tree = btPage->tree(); + cursor.switchTree(btPage->tree()); } // If page contents have changed From e2c3d2d10842b8e3907f3c852715db3511a802ae Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Thu, 20 May 2021 02:27:05 -0700 Subject: [PATCH 16/42] Removed redundant calls to Cursor::switchTree() since there are no cases where it matters if get() references the old tree's value until the cursor is moved.
--- fdbserver/VersionedBTree.actor.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 03f56e6cc5..d8fb77e7e1 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -4616,7 +4616,6 @@ private: if (updating) { // Update must be done in the new tree, not the original tree where the end cursor will be from end.tree = btPage()->tree(); - end.switchTree(btPage()->tree()); // TODO: insert recs in a random order to avoid new subtree being entirely right child links while (i != recs.size()) { @@ -4689,7 +4688,6 @@ private: cloneForUpdate(); // must point c to the tree to erase from c.tree = btPage()->tree(); - c.switchTree(btPage()->tree()); } while (c != u.cEnd) { @@ -4884,7 +4882,6 @@ private: pageCopy = clonePageForUpdate(page); btPage = (BTreePage*)pageCopy->begin(); cursor.tree = btPage->tree(); - cursor.switchTree(btPage->tree()); } btPage->kvBytes -= cursor.get().kvBytes(); @@ -4914,7 +4911,6 @@ private: pageCopy = clonePageForUpdate(page); btPage = (BTreePage*)pageCopy->begin(); cursor.tree = btPage->tree(); - cursor.switchTree(btPage->tree()); } if (cursor.insert(rec, update->skipLen, maxHeightAllowed)) { @@ -4978,7 +4974,6 @@ private: pageCopy = clonePageForUpdate(page); btPage = (BTreePage*)pageCopy->begin(); cursor.tree = btPage->tree(); - cursor.switchTree(btPage->tree()); } btPage->kvBytes -= cursor.get().kvBytes(); @@ -5023,7 +5018,6 @@ private: pageCopy = clonePageForUpdate(page); btPage = (BTreePage*)pageCopy->begin(); cursor.tree = btPage->tree(); - cursor.switchTree(btPage->tree()); } btPage->kvBytes -= cursor.get().kvBytes(); @@ -5329,7 +5323,6 @@ private: pageCopy = modifier.page; btPage = modifier.btPage(); cursor.tree = btPage->tree(); - cursor.switchTree(btPage->tree()); } // If page contents have changed From 4ee27919ad070d068e92733cfc41f847a2c51e14 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Thu, 20 May 2021 03:12:40 
-0700 Subject: [PATCH 17/42] Print size of RedwoodRecordRef in unit test. --- fdbserver/VersionedBTree.actor.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index d8fb77e7e1..8ef90eac39 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -6608,6 +6608,8 @@ TEST_CASE("/redwood/correctness/unit/RedwoodRecordRef") { ASSERT(RedwoodRecordRef::Delta::VersionDeltaSizes[2] == 6); ASSERT(RedwoodRecordRef::Delta::VersionDeltaSizes[3] == 8); + printf("sizeof(RedwoodRecordRef) = %d\n", sizeof(RedwoodRecordRef)); + // Test pageID stuff. { LogicalPageID ids[] = { 1, 5 }; From 9c7ec8d6cd68b5a48d6186c4906c93c16e7365cd Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Thu, 20 May 2021 03:29:07 -0700 Subject: [PATCH 18/42] Removed RedwoodRecordRef::version since in the current design all record versions within a single BTree snapshot are the same. --- fdbserver/VersionedBTree.actor.cpp | 182 +++++++---------------------- 1 file changed, 43 insertions(+), 139 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 8ef90eac39..c06584ff43 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -2604,10 +2604,9 @@ std::string toString(BTreePageIDRef id) { struct RedwoodRecordRef { typedef uint8_t byte; - RedwoodRecordRef(KeyRef key = KeyRef(), Version ver = 0, Optional value = {}) - : key(key), version(ver), value(value) {} + RedwoodRecordRef(KeyRef key = KeyRef(), Optional value = {}) : key(key), value(value) {} - RedwoodRecordRef(Arena& arena, const RedwoodRecordRef& toCopy) : key(arena, toCopy.key), version(toCopy.version) { + RedwoodRecordRef(Arena& arena, const RedwoodRecordRef& toCopy) : key(arena, toCopy.key) { if (toCopy.value.present()) { value = ValueRef(arena, toCopy.value.get()); } @@ -2636,20 +2635,19 @@ struct RedwoodRecordRef { } inline RedwoodRecordRef 
withPageID(BTreePageIDRef id) const { - return RedwoodRecordRef(key, version, ValueRef((const uint8_t*)id.begin(), id.size() * sizeof(LogicalPageID))); + return RedwoodRecordRef(key, ValueRef((const uint8_t*)id.begin(), id.size() * sizeof(LogicalPageID))); } - inline RedwoodRecordRef withoutValue() const { return RedwoodRecordRef(key, version); } + inline RedwoodRecordRef withoutValue() const { return RedwoodRecordRef(key); } inline RedwoodRecordRef withMaxPageID() const { - return RedwoodRecordRef(key, version, StringRef((uint8_t*)&maxPageID, sizeof(maxPageID))); + return RedwoodRecordRef(key, StringRef((uint8_t*)&maxPageID, sizeof(maxPageID))); } // Truncate (key, version, part) tuple to len bytes. void truncate(int len) { ASSERT(len <= key.size()); key = key.substr(0, len); - version = 0; } // Find the common key prefix between two records, assuming that the first skipLen bytes are the same @@ -2664,10 +2662,7 @@ struct RedwoodRecordRef { int cmp = key.compareSuffix(rhs.key, keySkip); if (cmp == 0) { - cmp = version - rhs.version; - if (cmp == 0) { - cmp = value.compare(rhs.value); - } + cmp = value.compare(rhs.value); } return cmp; } @@ -2678,14 +2673,11 @@ struct RedwoodRecordRef { return (key.size() == k.size()) && (key.substr(skipLen) == k.substr(skipLen)); } - bool sameExceptValue(const RedwoodRecordRef& rhs, int skipLen = 0) const { - return sameUserKey(rhs.key, skipLen) && version == rhs.version; - } + bool sameExceptValue(const RedwoodRecordRef& rhs, int skipLen = 0) const { return sameUserKey(rhs.key, skipLen); } // TODO: Use SplitStringRef (unless it ends up being slower) KeyRef key; Optional value; - Version version; int expectedSize() const { return key.expectedSize() + value.expectedSize(); } int kvBytes() const { return expectedSize(); } @@ -2769,8 +2761,7 @@ struct RedwoodRecordRef { PREFIX_SOURCE_PREV = 0x80, IS_DELETED = 0x40, HAS_VALUE = 0x20, - HAS_VERSION = 0x10, - VERSION_DELTA_SIZE = 0xC, + // 3 unused bits LENGTHS_FORMAT = 0x03 }; @@ 
-2848,61 +2839,6 @@ struct RedwoodRecordRef { StringRef getValue() const { return StringRef(data() + getKeySuffixLength(), getValueLength()); } - bool hasVersion() const { return flags & HAS_VERSION; } - - int getVersionDeltaSizeBytes() const { - int code = (flags & VERSION_DELTA_SIZE) >> 2; - return VersionDeltaSizes[code]; - } - - static int getVersionDeltaSizeBytes(Version d) { - if (d == 0) { - return 0; - } else if (d == (int32_t)d) { - return sizeof(int32_t); - } else if (d == (d & int48_t::MASK)) { - return sizeof(int48_t); - } - return sizeof(int64_t); - } - - int getVersionDelta(const uint8_t* r) const { - int code = (flags & VERSION_DELTA_SIZE) >> 2; - switch (code) { - case 0: - return 0; - case 1: - return *(int32_t*)r; - case 2: - return ((int64_t) static_cast(reinterpret_cast(r)->high) << 16) | - (((int48_t*)r)->low & 0xFFFF); - case 3: - default: - return *(int64_t*)r; - } - } - - // Version delta size should be 0 before calling - int setVersionDelta(Version d, uint8_t* w) { - flags |= HAS_VERSION; - if (d == 0) { - return 0; - } else if (d == (int32_t)d) { - flags |= 1 << 2; - *(uint32_t*)w = d; - return sizeof(uint32_t); - } else if (d == (d & int48_t::MASK)) { - flags |= 2 << 2; - ((int48_t*)w)->high = d >> 16; - ((int48_t*)w)->low = d; - return sizeof(int48_t); - } else { - flags |= 3 << 2; - *(int64_t*)w = d; - return sizeof(int64_t); - } - } - bool hasValue() const { return flags & HAS_VALUE; } void setPrefixSource(bool val) { @@ -2926,7 +2862,7 @@ struct RedwoodRecordRef { bool getDeleted() const { return flags & IS_DELETED; } RedwoodRecordRef apply(const Partial& cache) { - return RedwoodRecordRef(cache, 0, hasValue() ? Optional(getValue()) : Optional()); + return RedwoodRecordRef(cache, hasValue() ? 
Optional(getValue()) : Optional()); } RedwoodRecordRef apply(const RedwoodRecordRef& base, Arena& arena) const { @@ -2953,12 +2889,7 @@ struct RedwoodRecordRef { value = r.readString(valueLen); } - Version v = 0; - if (hasVersion()) { - v = base.version + getVersionDelta(r.rptr); - } - - return RedwoodRecordRef(k, v, value); + return RedwoodRecordRef(k, value); } RedwoodRecordRef apply(Arena& arena, const RedwoodRecordRef& base, Optional& cache) { @@ -2969,7 +2900,7 @@ struct RedwoodRecordRef { } int size() const { - int size = 1 + getVersionDeltaSizeBytes(); + int size = 1; switch (flags & LENGTHS_FORMAT) { case 0: return size + sizeof(LengthFormat0) + LengthFormat0.suffixLength + LengthFormat0.valueLength; @@ -2994,9 +2925,6 @@ struct RedwoodRecordRef { if (hasValue()) { flagString += "HasValue|"; } - if (hasVersion()) { - flagString += "HasVersion|"; - } int lengthFormat = flags & LENGTHS_FORMAT; Reader r(data()); @@ -3005,13 +2933,12 @@ struct RedwoodRecordRef { int valueLen = getValueLength(); return format("lengthFormat: %d totalDeltaSize: %d flags: %s prefixLen: %d keySuffixLen: %d " - "versionDeltaSizeBytes: %d valueLen %d raw: %s", + "valueLen %d raw: %s", lengthFormat, size(), flagString.c_str(), prefixLen, keySuffixLen, - getVersionDeltaSizeBytes(), valueLen, StringRef((const uint8_t*)this, size()).toHexString().c_str()); } @@ -3021,16 +2948,16 @@ struct RedwoodRecordRef { // its values, so the Reader does not require the original prev/next ancestors. struct DeltaValueOnly : Delta { RedwoodRecordRef apply(const RedwoodRecordRef& base, Arena& arena) const { - return RedwoodRecordRef(KeyRef(), 0, hasValue() ? Optional(getValue()) : Optional()); + return RedwoodRecordRef(KeyRef(), hasValue() ? Optional(getValue()) : Optional()); } RedwoodRecordRef apply(const Partial& cache) { - return RedwoodRecordRef(KeyRef(), 0, hasValue() ? Optional(getValue()) : Optional()); + return RedwoodRecordRef(KeyRef(), hasValue() ? 
Optional(getValue()) : Optional()); } RedwoodRecordRef apply(Arena& arena, const RedwoodRecordRef& base, Optional& cache) { cache = KeyRef(); - return RedwoodRecordRef(KeyRef(), 0, hasValue() ? Optional(getValue()) : Optional()); + return RedwoodRecordRef(KeyRef(), hasValue() ? Optional(getValue()) : Optional()); } }; #pragma pack(pop) @@ -3055,16 +2982,13 @@ struct RedwoodRecordRef { int valueLen = value.present() ? value.get().size() : 0; int formatType; - int versionBytes; if (worstCaseOverhead) { formatType = Delta::determineLengthFormat(key.size(), key.size(), valueLen); - versionBytes = version == 0 ? 0 : Delta::getVersionDeltaSizeBytes(version << 1); } else { formatType = Delta::determineLengthFormat(prefixLen, keySuffixLen, valueLen); - versionBytes = version == 0 ? 0 : Delta::getVersionDeltaSizeBytes(version - base.version); } - return 1 + Delta::LengthFormatSizes[formatType] + keySuffixLen + valueLen + versionBytes; + return 1 + Delta::LengthFormatSizes[formatType] + keySuffixLen + valueLen; } // commonPrefix between *this and base can be passed if known @@ -3114,10 +3038,6 @@ struct RedwoodRecordRef { wptr = value.get().copyTo(wptr); } - if (version != 0) { - wptr += d.setVersionDelta(version - base.version, wptr); - } - return wptr - (uint8_t*)&d; } @@ -3136,7 +3056,7 @@ struct RedwoodRecordRef { std::string toString(bool leaf = true) const { std::string r; - r += format("'%s'@%" PRId64 " => ", key.printable().c_str(), version); + r += format("'%s' => ", key.printable().c_str()); if (value.present()) { if (leaf) { r += format("'%s'", kvformat(value.get()).c_str()); @@ -3352,7 +3272,7 @@ public: #pragma pack(push, 1) struct MetaKey { - static constexpr int FORMAT_VERSION = 8; + static constexpr int FORMAT_VERSION = 9; // This serves as the format version for the entire tree, individual pages will not be versioned uint16_t formatVersion; uint8_t height; @@ -3682,14 +3602,14 @@ private: inline bool equalToSet(ValueRef val) { return isSet() && value == val; 
} - inline RedwoodRecordRef toRecord(KeyRef userKey, Version version) const { + inline RedwoodRecordRef toRecord(KeyRef userKey) const { // No point in serializing an atomic op, it needs to be coalesced to a real value. ASSERT(!isAtomicOp()); if (isClear()) - return RedwoodRecordRef(userKey, version); + return RedwoodRecordRef(userKey); - return RedwoodRecordRef(userKey, version, value); + return RedwoodRecordRef(userKey, value); } std::string toString() const { return format("op=%d val='%s'", op, printable(value).c_str()); } @@ -4901,7 +4821,7 @@ private: // Clears of this key will have been processed above by not being erased from the updated page or // excluded from the merge output if (applyBoundaryChange && mBegin.mutation().boundarySet()) { - RedwoodRecordRef rec(mBegin.key(), 0, mBegin.mutation().boundaryValue.get()); + RedwoodRecordRef rec(mBegin.key(), mBegin.mutation().boundaryValue.get()); changesMade = true; // If updating, add to the page, else add to the output set @@ -6327,7 +6247,7 @@ ACTOR Future seekAllBTreeCursor(VersionedBTree* btree, state Optional val = i->second; debug_printf("Verifying @%" PRId64 " '%s'\n", ver, key.c_str()); state Arena arena; - wait(cur.seekGTE(RedwoodRecordRef(KeyRef(arena, key), 0), 0)); + wait(cur.seekGTE(RedwoodRecordRef(KeyRef(arena, key)), 0)); bool foundKey = cur.isValid() && cur.get().key == key; bool hasValue = foundKey && cur.get().value.present(); @@ -6587,13 +6507,6 @@ RedwoodRecordRef randomRedwoodRecordRef(const std::string& keyBuffer, const std: rec.value = StringRef((uint8_t*)valueBuffer.data(), deterministicRandom()->randomInt(0, valueBuffer.size())); } - int versionIntSize = deterministicRandom()->randomInt(0, 8) * 8; - if (versionIntSize > 0) { - --versionIntSize; - int64_t max = ((int64_t)1 << versionIntSize) - 1; - rec.version = deterministicRandom()->randomInt64(0, max); - } - return rec; } @@ -6624,35 +6537,35 @@ TEST_CASE("/redwood/correctness/unit/RedwoodRecordRef") { 
ASSERT(r2.getChildPage().begin() != id.begin()); } - deltaTest(RedwoodRecordRef(LiteralStringRef(""), 0, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef(""), 0, LiteralStringRef(""))); + deltaTest(RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef("abc"), 0, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef("abc"), 0, LiteralStringRef(""))); + deltaTest(RedwoodRecordRef(LiteralStringRef("abc"), LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef("abc"), LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef("abc"), 0, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef("abcd"), 0, LiteralStringRef(""))); + deltaTest(RedwoodRecordRef(LiteralStringRef("abc"), LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef("abcd"), LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef("abcd"), 2, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef("abc"), 2, LiteralStringRef(""))); + deltaTest(RedwoodRecordRef(LiteralStringRef("abcd"), LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef("abc"), LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(std::string(300, 'k'), 2, std::string(1e6, 'v')), - RedwoodRecordRef(std::string(300, 'k'), 2, LiteralStringRef(""))); + deltaTest(RedwoodRecordRef(std::string(300, 'k'), std::string(1e6, 'v')), + RedwoodRecordRef(std::string(300, 'k'), LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef(""), 2, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef(""))); + deltaTest(RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef(""), 0xffff, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef(""))); + 
deltaTest(RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef(""), 0xffff, LiteralStringRef(""))); + deltaTest(RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef(""), 0xffffff, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef(""))); + deltaTest(RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef(""), 0xffffff, LiteralStringRef(""))); + deltaTest(RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef(""))); Arena mem; double start; @@ -6692,9 +6605,6 @@ TEST_CASE("/redwood/correctness/unit/RedwoodRecordRef") { rec1.key = LiteralStringRef("alksdfjaklsdfjlkasdjflkasdjfklajsdflk;ajsdflkajdsflkjadsf1"); rec2.key = LiteralStringRef("alksdfjaklsdfjlkasdjflkasdjfklajsdflk;ajsdflkajdsflkjadsf234"); - rec1.version = deterministicRandom()->randomInt64(0, std::numeric_limits::max()); - rec2.version = deterministicRandom()->randomInt64(0, std::numeric_limits::max()); - start = timer(); total = 0; count = 100e6; @@ -6770,9 +6680,6 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { std::string v = deterministicRandom()->randomAlphaNumeric(30); RedwoodRecordRef rec; rec.key = StringRef(arena, k); - rec.version = deterministicRandom()->coinflip() - ? 
deterministicRandom()->randomInt64(0, std::numeric_limits::max()) - : invalidVersion; if (deterministicRandom()->coinflip()) { rec.value = StringRef(arena, v); } @@ -6950,9 +6857,6 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/RedwoodRecordRef2") { std::string v = deterministicRandom()->randomAlphaNumeric(30); RedwoodRecordRef rec; rec.key = StringRef(arena, k); - rec.version = 0; // deterministicRandom()->coinflip() - // ? deterministicRandom()->randomInt64(0, std::numeric_limits::max()) - // : invalidVersion; if (deterministicRandom()->coinflip()) { rec.value = StringRef(arena, v); } From fa7a73071f14b550d42ef0738370b9bdfed43966 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Thu, 20 May 2021 19:33:55 -0700 Subject: [PATCH 19/42] Fixed memory leak, InternalPageSliceUpdates require destruction. --- fdbserver/VersionedBTree.actor.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index c06584ff43..ae0ed871b7 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -4363,7 +4363,7 @@ private: // Each call to commitSubtree() will pass most of its arguments via a this structure because the caller // will need access to these parameters after commitSubtree() is done. - struct InternalPageSliceUpdate { + struct InternalPageSliceUpdate : public FastAllocated { // The logical range for the subtree's contents. Due to subtree clears, these boundaries may not match // the lower/upper bounds needed to decode the page. 
// Subtree clears can cause the boundaries for decoding the page to be more restrictive than the subtree's @@ -5017,16 +5017,15 @@ private: } else { // Internal Page std::vector> recursions; - state std::vector slices; - state Arena arena; + state std::vector> slices; cursor.moveFirst(); bool first = true; while (cursor.valid()) { - InternalPageSliceUpdate& u = *new (arena) InternalPageSliceUpdate(); - slices.push_back(&u); + slices.emplace_back(new InternalPageSliceUpdate()); + InternalPageSliceUpdate& u = *slices.back(); // At this point we should never be at a null child page entry because the first entry of a page // can't be null and this loop will skip over null entries that come after non-null entries. From 96f14a714f256cee5d73c557428480534de7a7a7 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Fri, 21 May 2021 23:11:37 -0700 Subject: [PATCH 20/42] Fixed memory leak, DecodeCache reference count was initialized incorrectly. Streamlined perf unit test a bit. --- fdbserver/VersionedBTree.actor.cpp | 82 ++++++++++-------------------- 1 file changed, 28 insertions(+), 54 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index ae0ed871b7..ca4b464359 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -4269,7 +4269,6 @@ private: upperBound.toString(false).c_str()); BTreePage::BinaryTree::DecodeCache* cache = new BTreePage::BinaryTree::DecodeCache(lowerBound, upperBound); - cache->addref(); page->userData = cache; page->userDataDestructor = [](void* cache) { ((BTreePage::BinaryTree::DecodeCache*)cache)->delref(); }; } @@ -7941,7 +7940,6 @@ ACTOR Future randomSeeks(VersionedBTree* btree, int count, char firstChar, state Version readVer = btree->getLatestVersion(); state int c = 0; state double readStart = timer(); - printf("Executing %d random seeks\n", count); state VersionedBTree::BTreeCursor cur; wait(btree->initBTreeCursor(&cur, readVer)); while (c < count) { @@ -7963,7 
+7961,6 @@ ACTOR Future randomScans(VersionedBTree* btree, state Version readVer = btree->getLatestVersion(); state int c = 0; state double readStart = timer(); - printf("Executing %d random scans\n", count); state VersionedBTree::BTreeCursor cur; wait(btree->initBTreeCursor(&cur, readVer)); @@ -8023,9 +8020,6 @@ TEST_CASE(":/redwood/correctness/pager/cow") { TEST_CASE(":/redwood/performance/set") { state SignalableActorCollection actors; - g_redwoodMetricsActor = Void(); // Prevent trace event metrics from starting - g_redwoodMetrics.clear(); - state std::string fileName = params.get("fileName").orDefault("unittest.redwood"); state int pageSize = params.getInt("pageSize").orDefault(SERVER_KNOBS->REDWOOD_DEFAULT_PAGE_SIZE); state int64_t pageCacheBytes = params.getInt("pageCacheBytes").orDefault(FLOW_KNOBS->PAGE_CACHE_4K); @@ -8050,6 +8044,7 @@ TEST_CASE(":/redwood/performance/set") { state int seeks = params.getInt("seeks").orDefault(1000000); state int scans = params.getInt("scans").orDefault(20000); state bool pagerMemoryOnly = params.getInt("pagerMemoryOnly").orDefault(0); + state bool traceMetrics = params.getInt("traceMetrics").orDefault(0); printf("pageSize: %d\n", pageSize); printf("pageCacheBytes: %" PRId64 "\n", pageCacheBytes); @@ -8073,6 +8068,12 @@ TEST_CASE(":/redwood/performance/set") { printf("openExisting: %d\n", openExisting); printf("insertRecords: %d\n", insertRecords); + // If using stdout for metrics, prevent trace event metrics logger from starting + if (!traceMetrics) { + g_redwoodMetricsActor = Void(); + g_redwoodMetrics.clear(); + } + if (!openExisting) { printf("Deleting old test data\n"); deleteFile(fileName); @@ -8145,7 +8146,9 @@ TEST_CASE(":/redwood/performance/set") { double* pIntervalStart = &intervalStart; commit = map(btree->commit(), [=](Void result) { - printf("Committed:\n%s\n", g_redwoodMetrics.toString(true).c_str()); + if (!traceMetrics) { + printf("%s\n", g_redwoodMetrics.toString(true).c_str()); + } double elapsed = 
timer() - *pIntervalStart; printf("Committed %d keyValueBytes in %d records in %f seconds, %.2f MB/s\n", kvb, @@ -8169,56 +8172,27 @@ TEST_CASE(":/redwood/performance/set") { printf("StorageBytes=%s\n", btree->getStorageBytes().toString().c_str()); } - printf("Warming cache with seeks\n"); - for (int x = 0; x < concurrentSeeks; ++x) { - actors.add(randomSeeks(btree, seeks / concurrentSeeks, firstKeyChar, lastKeyChar)); + if (scans > 0) { + printf("Parallel scans, count=%d, concurrency=%d, no readAhead ...\n", scans, concurrentScans); + for (int x = 0; x < concurrentScans; ++x) { + actors.add(randomScans(btree, scans / concurrentScans, 50, 0, firstKeyChar, lastKeyChar)); + } + wait(actors.signalAndReset()); + if (!traceMetrics) { + printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); + } } - wait(actors.signalAndReset()); - printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); - printf("Serial scans with adaptive readAhead...\n"); - actors.add(randomScans(btree, scans, 50, -1, firstKeyChar, lastKeyChar)); - wait(actors.signalAndReset()); - printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); - - printf("Serial scans with readAhead 3 pages...\n"); - actors.add(randomScans(btree, scans, 50, 12000, firstKeyChar, lastKeyChar)); - wait(actors.signalAndReset()); - printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); - - printf("Serial scans with readAhead 2 pages...\n"); - actors.add(randomScans(btree, scans, 50, 8000, firstKeyChar, lastKeyChar)); - wait(actors.signalAndReset()); - printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); - - printf("Serial scans with readAhead 1 page...\n"); - actors.add(randomScans(btree, scans, 50, 4000, firstKeyChar, lastKeyChar)); - wait(actors.signalAndReset()); - printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); - - printf("Serial scans...\n"); - actors.add(randomScans(btree, scans, 50, 0, firstKeyChar, lastKeyChar)); - wait(actors.signalAndReset()); - 
printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); - - printf("Parallel scans, concurrency=%d, no readAhead ...\n", concurrentScans); - for (int x = 0; x < concurrentScans; ++x) { - actors.add(randomScans(btree, scans / concurrentScans, 50, 0, firstKeyChar, lastKeyChar)); + if (seeks > 0) { + printf("Parallel seeks, count=%d, concurrency=%d ...\n", seeks, concurrentSeeks); + for (int x = 0; x < concurrentSeeks; ++x) { + actors.add(randomSeeks(btree, seeks / concurrentSeeks, firstKeyChar, lastKeyChar)); + } + wait(actors.signalAndReset()); + if (!traceMetrics) { + printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); + } } - wait(actors.signalAndReset()); - printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); - - printf("Serial seeks...\n"); - actors.add(randomSeeks(btree, seeks, firstKeyChar, lastKeyChar)); - wait(actors.signalAndReset()); - printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); - - printf("Parallel seeks, concurrency=%d ...\n", concurrentSeeks); - for (int x = 0; x < concurrentSeeks; ++x) { - actors.add(randomSeeks(btree, seeks / concurrentSeeks, firstKeyChar, lastKeyChar)); - } - wait(actors.signalAndReset()); - printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); Future closedFuture = btree->onClosed(); btree->close(); From 6cc78458564bf2ae149398974b78f01be4fb8d65 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sat, 22 May 2021 17:00:17 -0700 Subject: [PATCH 21/42] Restore non-caching reads on the commit path, probably temporarily, to remove this as a variable before/after the switch to DeltaTree2. 
--- fdbserver/VersionedBTree.actor.cpp | 32 ++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index ca4b464359..700a807b08 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -4495,8 +4495,8 @@ private: struct InternalPageModifier { InternalPageModifier() {} - InternalPageModifier(Reference p, bool updating, ParentInfo* parentInfo) - : page(p), clonedPage(false), updating(updating), changesMade(false), parentInfo(parentInfo) {} + InternalPageModifier(Reference p, bool alreadyCloned, bool updating, ParentInfo* parentInfo) + : page(p), clonedPage(alreadyCloned), updating(updating), changesMade(false), parentInfo(parentInfo) {} // Whether updating the existing page is allowed bool updating; @@ -4693,14 +4693,20 @@ private: state Reference commitReadLock = self->m_commitReadLock; wait(commitReadLock->take()); state FlowLock::Releaser readLock(*commitReadLock); - state Reference page = - wait(readPage(snapshot, rootID, update->decodeLowerBound, update->decodeUpperBound, false, true)); + state bool fromCache = false; + state Reference page = wait( + readPage(snapshot, rootID, update->decodeLowerBound, update->decodeUpperBound, false, false, &fromCache)); readLock.release(); - // If in-place modification to the page is done, a copy of the page will be made in pageCopy - // and the cursor will be pointed to it. The original page variable must stay in scope because - // there could be RedwoodRecordRefs referencing its arenas. - state Reference pageCopy; + // If the page exists in the cache, it must be copied before modification. + // That copy will be referenced by pageCopy, as page must stay in scope in case anything references its + // memory and it gets evicted from the cache. + // If the page is not in the cache, then no copy is needed so we will initialize pageCopy to page + state Reference pageCopy = fromCache ? 
Reference() : page; + + if (!fromCache) { + pageCopy = page; + } state BTreePage* btPage = (BTreePage*)page->begin(); ASSERT(isLeaf == btPage->isLeaf()); @@ -5193,10 +5199,16 @@ private: wait(waitForAll(recursions)); debug_printf("%s Recursions done, processing slice updates.\n", context.c_str()); - // Note: parentInfo could be invalid after a wait and must be re-initialized. + // ParentInfo could be invalid after a wait and must be re-initialized. // All uses below occur before waits so no reinitialization is done. state ParentInfo* parentInfo = &self->childUpdateTracker[rootID.front()]; - state InternalPageModifier modifier(page, tryToUpdate, parentInfo); + + // InternalPageModifier takes the results of the recursive commitSubtree() calls in order + // and makes changes to page as needed, copying as needed, and generating an array from + // which to build new page(s) if modification is not possible or not allowed. + // If pageCopy is already set it was initialized to page above so the modifier doesn't need + // to copy it + state InternalPageModifier modifier(page, pageCopy.isValid(), tryToUpdate, parentInfo); // Apply the possible changes for each subtree range recursed to, except the last one. // For each range, the expected next record, if any, is checked against the first boundary From 0c94a25c489ec9ee3e3df9d8c2432ff03f7442f9 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sun, 23 May 2021 16:33:29 -0700 Subject: [PATCH 22/42] Prioritized cache eviction of old page versions and freed pages. 
--- fdbserver/VersionedBTree.actor.cpp | 39 ++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 700a807b08..481b2b612f 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -1136,6 +1136,16 @@ public: return nullptr; } + // If index is in cache, move it to the front of the eviction order + void prioritizeEviction(const IndexType& index) { + auto i = cache.find(index); + if (i != cache.end()) { + auto ei = evictionOrder.iterator_to(i->second); + evictionOrder.erase(ei); + evictionOrder.push_front(i->second); + } + } + // Try to evict the item at index from cache // Returns true if item is evicted or was not present in cache bool tryEvict(const IndexType& index) { @@ -1148,7 +1158,7 @@ public: ++g_redwoodMetrics.pagerEvictUnhit; } evictionOrder.erase(evictionOrder.iterator_to(toEvict)); - cache.erase(toEvict.index); + cache.erase(i); return true; } @@ -1170,6 +1180,7 @@ public: evictionOrder.push_back(entry); } } else { + // Otherwise it was a cache miss if (!noMiss) { ++g_redwoodMetrics.pagerCacheMiss; } @@ -1197,8 +1208,8 @@ public: toString(index).c_str()); if (!toEvict.item.evictable()) { - evictionOrder.erase(evictionOrder.iterator_to(toEvict)); - evictionOrder.push_back(toEvict); + // shift the front to the back + evictionOrder.shift_forward(1); ++g_redwoodMetrics.pagerEvictFail; break; } else { @@ -1216,8 +1227,7 @@ public: return entry.item; } - // Clears the cache, saving the entries, and then waits for eachWaits for each item to be evictable and evicts it. - // The cache should not be Evicts all evictable entries + // Clears the cache, saving the entries to second cache, then waits for each item to be evictable and evicts it. 
ACTOR static Future clear_impl(ObjectCache* self) { state ObjectCache::CacheT cache; state EvictionOrderT evictionOrder; @@ -1673,7 +1683,14 @@ public: // TODO: Possibly limit size of remap queue since it must be recovered on cold start RemappedPage r{ v, pageID, newPageID }; remapQueue.pushBack(r); - remappedPages[pageID][v] = newPageID; + auto& versionedMap = remappedPages[pageID]; + + // An update page is unlikely to have its old version read again soon, so prioritize its cache eviction + // If the versioned map is empty for this page then the prior version of the page is at stored at the + // PhysicalPageID pageID, otherwise it is the last mapped value in the version-ordered map. + pageCache.prioritizeEviction(versionedMap.empty() ? pageID : versionedMap.rbegin()->second); + versionedMap[v] = newPageID; + debug_printf("DWALPager(%s) pushed %s\n", filename.c_str(), RemappedPage(r).toString().c_str()); return pageID; }); @@ -1682,7 +1699,7 @@ public: return f; } - void freeUnmappedPage(LogicalPageID pageID, Version v) { + void freeUnmappedPage(PhysicalPageID pageID, Version v) { // If v is older than the oldest version still readable then mark pageID as free as of the next commit if (v < effectiveOldestVersion()) { debug_printf("DWALPager(%s) op=freeNow %s @%" PRId64 " oldestVersion=%" PRId64 "\n", @@ -1700,6 +1717,9 @@ public: pLastCommittedHeader->oldestVersion); delayedFreeList.pushBack({ v, pageID }); } + + // A freed page is unlikely to be read again soon so prioritize its cache eviction + pageCache.prioritizeEviction(pageID); } LogicalPageID detachRemappedPage(LogicalPageID pageID, Version v) override { @@ -1751,6 +1771,11 @@ public: v, pLastCommittedHeader->oldestVersion); remapQueue.pushBack(RemappedPage{ v, pageID, invalidLogicalPageID }); + + // A freed page is unlikely to be read again soon so prioritize its cache eviction + PhysicalPageID previousPhysicalPage = i->second.rbegin()->second; + pageCache.prioritizeEviction(previousPhysicalPage); + 
i->second[v] = invalidLogicalPageID; return; } From 7f411934b4877597d80c1148dc775dc4c26f7f51 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Mon, 24 May 2021 01:51:22 -0700 Subject: [PATCH 23/42] Rare simulation-only bug fix. A single-page BTree with too small of a page size and one gigantic value can result in a root page list that is too large to fit in the hardcoded MetaKey size. --- fdbserver/VersionedBTree.actor.cpp | 70 +++++++++++++++++------------- 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 481b2b612f..cc88f48b06 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -3227,7 +3227,8 @@ struct InPlaceArray { memcpy(begin(), v.begin(), sizeof(T) * v.size()); } - int extraSize() const { return count * sizeof(T); } + int size() const { return count; } + int sizeBytes() const { return count * sizeof(T); } }; #pragma pack(pop) @@ -3297,14 +3298,14 @@ public: #pragma pack(push, 1) struct MetaKey { - static constexpr int FORMAT_VERSION = 9; + static constexpr int FORMAT_VERSION = 10; // This serves as the format version for the entire tree, individual pages will not be versioned uint16_t formatVersion; uint8_t height; LazyClearQueueT::QueueState lazyDeleteQueue; InPlaceArray root; - KeyRef asKeyRef() const { return KeyRef((uint8_t*)this, sizeof(MetaKey) + root.extraSize()); } + KeyRef asKeyRef() const { return KeyRef((uint8_t*)this, sizeof(MetaKey) + root.sizeBytes()); } void fromKeyRef(KeyRef k) { memcpy(this, k.begin(), k.size()); @@ -3312,9 +3313,9 @@ public: } std::string toString() { - return format("{height=%d formatVersion=%d root=%s lazyDeleteQueue=%s}", - (int)height, + return format("{formatVersion=%d height=%d root=%s lazyDeleteQueue=%s}", (int)formatVersion, + (int)height, ::toString(root.get()).c_str(), lazyDeleteQueue.toString().c_str()); } @@ -3388,7 +3389,8 @@ public: VersionedBTree(IPager2* pager, std::string 
name) : m_pager(pager), m_writeVersion(invalidVersion), m_lastCommittedVersion(invalidVersion), m_pBuffer(nullptr), - m_commitReadLock(new FlowLock(SERVER_KNOBS->REDWOOD_COMMIT_CONCURRENT_READS)), m_name(name) { + m_commitReadLock(new FlowLock(SERVER_KNOBS->REDWOOD_COMMIT_CONCURRENT_READS)), m_name(name), m_pHeader(nullptr), + m_headerSpace(0) { m_lazyClearActor = 0; m_init = init_impl(this); @@ -3491,6 +3493,10 @@ public: ACTOR static Future init_impl(VersionedBTree* self) { wait(self->m_pager->init()); + // TODO: Get actual max MetaKey size limit from Pager + self->m_headerSpace = self->m_pager->getUsablePageSize(); + self->m_pHeader = (MetaKey*)new uint8_t[self->m_headerSpace]; + self->m_blockSize = self->m_pager->getUsablePageSize(); state Version latest = self->m_pager->getLatestVersion(); self->m_newOldestVersion = self->m_pager->getOldestVersion(); @@ -3500,12 +3506,12 @@ public: state Key meta = self->m_pager->getMetaKey(); if (meta.size() == 0) { - self->m_header.formatVersion = MetaKey::FORMAT_VERSION; + self->m_pHeader->formatVersion = MetaKey::FORMAT_VERSION; LogicalPageID id = wait(self->m_pager->newPageID()); BTreePageIDRef newRoot((LogicalPageID*)&id, 1); debug_printf("new root %s\n", toString(newRoot).c_str()); - self->m_header.root.set(newRoot, sizeof(headerSpace) - sizeof(m_header)); - self->m_header.height = 1; + self->m_pHeader->root.set(newRoot, self->m_headerSpace - sizeof(MetaKey)); + self->m_pHeader->height = 1; ++latest; Reference page = self->m_pager->newPageBuffer(); makeEmptyRoot(page); @@ -3514,16 +3520,16 @@ public: LogicalPageID newQueuePage = wait(self->m_pager->newPageID()); self->m_lazyClearQueue.create(self->m_pager, newQueuePage, "LazyClearQueue"); - self->m_header.lazyDeleteQueue = self->m_lazyClearQueue.getState(); - self->m_pager->setMetaKey(self->m_header.asKeyRef()); + self->m_pHeader->lazyDeleteQueue = self->m_lazyClearQueue.getState(); + self->m_pager->setMetaKey(self->m_pHeader->asKeyRef()); wait(self->m_pager->commit()); 
debug_printf("Committed initial commit.\n"); } else { - self->m_header.fromKeyRef(meta); - self->m_lazyClearQueue.recover(self->m_pager, self->m_header.lazyDeleteQueue, "LazyClearQueueRecovered"); + self->m_pHeader->fromKeyRef(meta); + self->m_lazyClearQueue.recover(self->m_pager, self->m_pHeader->lazyDeleteQueue, "LazyClearQueueRecovered"); } - debug_printf("Recovered btree at version %" PRId64 ": %s\n", latest, self->m_header.toString().c_str()); + debug_printf("Recovered btree at version %" PRId64 ": %s\n", latest, self->m_pHeader->toString().c_str()); self->m_lastCommittedVersion = latest; self->m_lazyClearActor = incrementalLazyClear(self); @@ -3538,6 +3544,10 @@ public: // uncommitted writes so it should not be committed. m_init.cancel(); m_latestCommit.cancel(); + + if (m_pHeader != nullptr) { + delete[](uint8_t*) m_pHeader; + } } // Must be nondecreasing @@ -3595,8 +3605,8 @@ public: ASSERT(s.numPages == 1); // The btree should now be a single non-oversized root page. - ASSERT(self->m_header.height == 1); - ASSERT(self->m_header.root.count == 1); + ASSERT(self->m_pHeader->height == 1); + ASSERT(self->m_pHeader->root.count == 1); // From the pager's perspective the only pages that should be in use are the btree root and // the previously mentioned lazy delete queue page. @@ -3833,12 +3843,9 @@ private: std::unordered_map parents; ParentInfoMapT childUpdateTracker; - // MetaKey changes size so allocate space for it to expand into. FIXME: Steve is fixing this to be dynamically - // sized. - union { - uint8_t headerSpace[sizeof(MetaKey) + sizeof(LogicalPageID) * 200]; - MetaKey m_header; - }; + // MetaKey has a variable size, it can be as large as m_headerSpace + MetaKey* m_pHeader; + int m_headerSpace; LazyClearQueueT m_lazyClearQueue; Future m_lazyClearActor; @@ -4215,7 +4222,7 @@ private: // While there are multiple child pages for this version we must write new tree levels. 
while (records.size() > 1) { - self->m_header.height = ++height; + self->m_pHeader->height = ++height; Standalone> newRecords = wait(writePages(self, &dbBegin, &dbEnd, records, height, version, BTreePageIDRef())); debug_printf("Wrote a new root level at version %" PRId64 " height %d size %lu pages\n", @@ -4335,7 +4342,7 @@ private: if (REDWOOD_DEBUG) { BTreePage* btPage = (BTreePage*)page->begin(); BTreePage::BinaryTree::DecodeCache* cache = (BTreePage::BinaryTree::DecodeCache*)page->userData; - debug_printf( + debug_printf_always( "updateBTreePage(%s, %s) %s\n", ::toString(oldID).c_str(), ::toString(writeVersion).c_str(), @@ -5428,7 +5435,7 @@ private: state Version latestVersion = self->m_pager->getLatestVersion(); debug_printf("%s: pager latestVersion %" PRId64 "\n", self->m_name.c_str(), latestVersion); - state Standalone rootPageID = self->m_header.root.get(); + state Standalone rootPageID = self->m_pHeader->root.get(); state InternalPageSliceUpdate all; state RedwoodRecordRef rootLink = dbBegin.withPageID(rootPageID); all.subtreeLowerBound = rootLink; @@ -5445,7 +5452,7 @@ private: self->m_pager->getReadSnapshot(latestVersion), mutations, rootPageID, - self->m_header.height == 1, + self->m_pHeader->height == 1, mBegin, mEnd, &all)); @@ -5457,7 +5464,7 @@ private: LogicalPageID newRootID = wait(self->m_pager->newPageID()); Reference page = self->m_pager->newPageBuffer(); makeEmptyRoot(page); - self->m_header.height = 1; + self->m_pHeader->height = 1; self->m_pager->updatePage(newRootID, page); rootPageID = BTreePageIDRef((LogicalPageID*)&newRootID, 1); } else { @@ -5467,13 +5474,14 @@ private: } else { // If the new root level's size is not 1 then build new root level(s) Standalone> newRootPage = - wait(buildNewRoot(self, latestVersion, newRootLevel, self->m_header.height)); + wait(buildNewRoot(self, latestVersion, newRootLevel, self->m_pHeader->height)); rootPageID = newRootPage.front().getChildPage(); } } } - self->m_header.root.set(rootPageID, 
sizeof(headerSpace) - sizeof(m_header)); + debug_printf("new root %s\n", toString(rootPageID).c_str()); + self->m_pHeader->root.set(rootPageID, self->m_headerSpace - sizeof(MetaKey)); self->m_lazyClearStop = true; wait(success(self->m_lazyClearActor)); @@ -5482,10 +5490,10 @@ private: self->m_pager->setCommitVersion(writeVersion); wait(self->m_lazyClearQueue.flush()); - self->m_header.lazyDeleteQueue = self->m_lazyClearQueue.getState(); + self->m_pHeader->lazyDeleteQueue = self->m_lazyClearQueue.getState(); debug_printf("Setting metakey\n"); - self->m_pager->setMetaKey(self->m_header.asKeyRef()); + self->m_pager->setMetaKey(self->m_pHeader->asKeyRef()); debug_printf("%s: Committing pager %" PRId64 "\n", self->m_name.c_str(), writeVersion); wait(self->m_pager->commit()); From 3451c2242f43af7ed1140469a265f6f935226623 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Tue, 25 May 2021 01:31:50 -0700 Subject: [PATCH 24/42] DeltaTree2::Cursor now reconstructs current item on-demand, caches it in an Optional member, and does not initialize it when a cursor is copied. --- fdbserver/DeltaTree.h | 43 ++++++++++++++++++------------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index acef72fbc7..76bdf51b8f 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -1046,9 +1046,10 @@ public: Cursor(DecodeCache* cache, DeltaTree2* tree) : cache(cache), tree(tree), nodeIndex(-1) {} - Cursor(DecodeCache* cache, DeltaTree2* tree, int nodeIndex) : cache(cache), tree(tree), nodeIndex(nodeIndex) { - updateItem(); - } + Cursor(DecodeCache* cache, DeltaTree2* tree, int nodeIndex) : cache(cache), tree(tree), nodeIndex(nodeIndex) {} + + // Copy constructor does not copy item because normally a copied cursor will be immediately moved. 
+ Cursor(const Cursor& c) : cache(c.cache), tree(c.tree), nodeIndex(c.nodeIndex) {} int rootIndex() { if (!cache->empty()) { @@ -1062,7 +1063,7 @@ public: DeltaTree2* tree; DecodeCache* cache; int nodeIndex; - T item; + mutable Optional item; Node* node() const { return tree->nodeAt(cache->get(nodeIndex).nodeOffset); } @@ -1071,7 +1072,7 @@ public: return format("Cursor{nodeIndex=-1}"); } return format("Cursor{item=%s indexItem=%s nodeIndex=%d decodedNode=%s node=%s ", - item.toString().c_str(), + item.present() ? item.get().toString().c_str() : "", get(cache->get(nodeIndex)).toString().c_str(), nodeIndex, cache->get(nodeIndex).toString().c_str(), @@ -1103,23 +1104,20 @@ public: return delta.apply(cache->arena, base, decoded.partial); } - private: - inline void updateItem() { item = get(cache->get(nodeIndex)); } - public: // Get the item at the cursor // Behavior is undefined if the cursor is not valid. // If the cursor is moved, the reference object returned will be modified to // the cursor's new current item. - const T& get() const { return item; } - - void switchTree(DeltaTree2* newTree) { - tree = newTree; - if (nodeIndex != -1) { - updateItem(); + const T& get() const { + if (!item.present()) { + item = get(cache->get(nodeIndex)); } + return item.get(); } + void switchTree(DeltaTree2* newTree) { tree = newTree; } + // If the cursor is valid, return a reference to the cursor's internal T. // Otherwise, returns a reference to the cache's upper boundary. const T& getOrUpperBound() const { return valid() ? get() : cache->upperBound; } @@ -1224,13 +1222,14 @@ public: // Does not skip/avoid deleted nodes. 
int seek(const T& s, int skipLen = 0) { nodeIndex = -1; + item.reset(); deltatree_printf("seek(%s) start %s\n", s.toString().c_str(), toString().c_str()); int nIndex = rootIndex(); int cmp = 0; while (nIndex != -1) { nodeIndex = nIndex; - updateItem(); + item.reset(); cmp = s.compare(get(), skipLen); deltatree_printf("seek(%s) loop cmp=%d %s\n", s.toString().c_str(), cmp, toString().c_str()); if (cmp == 0) { @@ -1249,11 +1248,11 @@ public: bool moveFirst() { nodeIndex = -1; + item.reset(); int nIndex = rootIndex(); deltatree_printf("moveFirst start %s\n", toString().c_str()); while (nIndex != -1) { nodeIndex = nIndex; - updateItem(); deltatree_printf("moveFirst moved %s\n", toString().c_str()); nIndex = getLeftChildIndex(nIndex); } @@ -1262,11 +1261,11 @@ public: bool moveLast() { nodeIndex = -1; + item.reset(); int nIndex = rootIndex(); deltatree_printf("moveLast start %s\n", toString().c_str()); while (nIndex != -1) { nodeIndex = nIndex; - updateItem(); deltatree_printf("moveLast moved %s\n", toString().c_str()); nIndex = getRightChildIndex(nIndex); } @@ -1276,21 +1275,18 @@ public: // Try to move to next node, sees deleted nodes. void _moveNext() { deltatree_printf("_moveNext start %s\n", toString().c_str()); + item.reset(); // Try to go right int nIndex = getRightChildIndex(nodeIndex); // If we couldn't go right, then the answer is our next ancestor if (nIndex == -1) { nodeIndex = cache->get(nodeIndex).rightParentIndex; - if (nodeIndex != -1) { - updateItem(); - } deltatree_printf("_moveNext move1 %s\n", toString().c_str()); } else { // Go left as far as possible do { nodeIndex = nIndex; - updateItem(); deltatree_printf("_moveNext move2 %s\n", toString().c_str()); nIndex = getLeftChildIndex(nodeIndex); } while (nIndex != -1); @@ -1300,20 +1296,17 @@ public: // Try to move to previous node, sees deleted nodes. 
void _movePrev() { deltatree_printf("_movePrev start %s\n", toString().c_str()); + item.reset(); // Try to go left int nIndex = getLeftChildIndex(nodeIndex); // If we couldn't go left, then the answer is our prev ancestor if (nIndex == -1) { nodeIndex = cache->get(nodeIndex).leftParentIndex; - if (nodeIndex != -1) { - updateItem(); - } deltatree_printf("_movePrev move1 %s\n", toString().c_str()); } else { // Go right as far as possible do { nodeIndex = nIndex; - updateItem(); deltatree_printf("_movePrev move2 %s\n", toString().c_str()); nIndex = getRightChildIndex(nodeIndex); } while (nIndex != -1); From b8af2950c85a7d5e910b7b4c72fe74c65a981ca3 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Wed, 26 May 2021 00:21:15 -0700 Subject: [PATCH 25/42] Remove some leftover Version field remnants, update RedwoodRecordRef comments. --- fdbserver/VersionedBTree.actor.cpp | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index cc88f48b06..ace7b9f5a5 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -2752,35 +2752,25 @@ struct RedwoodRecordRef { } LengthFormat3; }; - struct int48_t { - static constexpr int64_t MASK = 0xFFFFFFFFFFFFLL; - int32_t high; - int16_t low; - }; - static constexpr int LengthFormatSizes[] = { sizeof(LengthFormat0), sizeof(LengthFormat1), sizeof(LengthFormat2), sizeof(LengthFormat3) }; - static constexpr int VersionDeltaSizes[] = { 0, sizeof(int32_t), sizeof(int48_t), sizeof(int64_t) }; // Serialized Format // // Flags - 1 byte // 1 bit - borrow source is prev ancestor (otherwise next ancestor) // 1 bit - item is deleted - // 1 bit - has value (different from zero-length value, if 0 value len will be 0) - // 1 bits - has nonzero version - // 2 bits - version delta integer size code, maps to 0, 4, 6, 8 + // 1 bit - has value (different from a zero-length value, which is still a value) + // 3 unused 
bits // 2 bits - length fields format // // Length fields using 3 to 8 bytes total depending on length fields format // // Byte strings - // Key suffix bytes // Value bytes - // Version delta bytes - // + // Key suffix bytes enum EFlags { PREFIX_SOURCE_PREV = 0x80, @@ -2790,6 +2780,7 @@ struct RedwoodRecordRef { LENGTHS_FORMAT = 0x03 }; + // Figure out which length format must be used for the given lengths static inline int determineLengthFormat(int prefixLength, int suffixLength, int valueLength) { // Large prefix or suffix length, which should be rare, is format 3 if (prefixLength > 0xFF || suffixLength > 0xFF) { @@ -6559,11 +6550,6 @@ TEST_CASE("/redwood/correctness/unit/RedwoodRecordRef") { ASSERT(RedwoodRecordRef::Delta::LengthFormatSizes[2] == 6); ASSERT(RedwoodRecordRef::Delta::LengthFormatSizes[3] == 8); - ASSERT(RedwoodRecordRef::Delta::VersionDeltaSizes[0] == 0); - ASSERT(RedwoodRecordRef::Delta::VersionDeltaSizes[1] == 4); - ASSERT(RedwoodRecordRef::Delta::VersionDeltaSizes[2] == 6); - ASSERT(RedwoodRecordRef::Delta::VersionDeltaSizes[3] == 8); - printf("sizeof(RedwoodRecordRef) = %d\n", sizeof(RedwoodRecordRef)); // Test pageID stuff. From f95d592db854c741e882e31b8b77f4b851b00c0b Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Wed, 26 May 2021 01:21:02 -0700 Subject: [PATCH 26/42] Optimized record delta decoding / applying a bit, changed substring order to (value, keySuffix) since value is needed more frequently. Added DeltaTree2 item type requirement to create T from base's cached partial item instead of full record. --- fdbserver/DeltaTree.h | 13 ++++-- fdbserver/VersionedBTree.actor.cpp | 72 +++++++++++++++--------------- 2 files changed, 46 insertions(+), 39 deletions(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index 76bdf51b8f..b4678691c0 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -1099,9 +1099,16 @@ public: return delta.apply(cache->arena, basePrev ? 
cache->lowerBound : cache->upperBound, decoded.partial); } - // Otherwise, get the base T and apply the delta to it - T base = get(cache->get(baseIndex)); - return delta.apply(cache->arena, base, decoded.partial); + // Otherwise, get the base's decoded node + DecodedNode& baseDecoded = cache->get(baseIndex); + + // If the base's partial is present, apply delta to it to get result + if (baseDecoded.partial.present()) { + return delta.apply(cache->arena, baseDecoded.partial.get(), decoded.partial); + } + + // Otherwise apply delta to base T + return delta.apply(cache->arena, get(baseDecoded), decoded.partial); } public: diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index ace7b9f5a5..93025f3976 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -2707,19 +2707,6 @@ struct RedwoodRecordRef { int expectedSize() const { return key.expectedSize() + value.expectedSize(); } int kvBytes() const { return expectedSize(); } - class Reader { - public: - Reader(const void* ptr) : rptr((const byte*)ptr) {} - - const byte* rptr; - - StringRef readString(int len) { - StringRef s(rptr, len); - rptr += len; - return s; - } - }; - #pragma pack(push, 1) struct Delta { @@ -2851,9 +2838,9 @@ struct RedwoodRecordRef { } } - StringRef getKeySuffix() const { return StringRef(data(), getKeySuffixLength()); } + StringRef getKeySuffix() const { return StringRef(data() + getValueLength(), getKeySuffixLength()); } - StringRef getValue() const { return StringRef(data() + getKeySuffixLength(), getValueLength()); } + StringRef getValue() const { return StringRef(data(), getValueLength()); } bool hasValue() const { return flags & HAS_VALUE; } @@ -2877,42 +2864,55 @@ struct RedwoodRecordRef { bool getDeleted() const { return flags & IS_DELETED; } - RedwoodRecordRef apply(const Partial& cache) { - return RedwoodRecordRef(cache, hasValue() ? 
Optional(getValue()) : Optional()); - } - + // DeltaTree interface RedwoodRecordRef apply(const RedwoodRecordRef& base, Arena& arena) const { int keyPrefixLen = getKeyPrefixLength(); int keySuffixLen = getKeySuffixLength(); int valueLen = hasValue() ? getValueLength() : 0; + byte* pData = data(); StringRef k; - - Reader r(data()); // If there is a key suffix, reconstitute the complete key into a contiguous string if (keySuffixLen > 0) { - StringRef keySuffix = r.readString(keySuffixLen); k = makeString(keyPrefixLen + keySuffixLen, arena); memcpy(mutateString(k), base.key.begin(), keyPrefixLen); - memcpy(mutateString(k) + keyPrefixLen, keySuffix.begin(), keySuffixLen); + memcpy(mutateString(k) + keyPrefixLen, pData + valueLen, keySuffixLen); } else { // Otherwise just reference the base key's memory k = base.key.substr(0, keyPrefixLen); } - Optional value; - if (hasValue()) { - value = r.readString(valueLen); - } + return RedwoodRecordRef(k, hasValue() ? ValueRef(pData, valueLen) : Optional()); + } - return RedwoodRecordRef(k, value); + // DeltaTree interface + RedwoodRecordRef apply(const Partial& cache) { + return RedwoodRecordRef(cache, hasValue() ? Optional(getValue()) : Optional()); + } + + RedwoodRecordRef apply(Arena& arena, const Partial& baseKey, Optional& cache) { + int keyPrefixLen = getKeyPrefixLength(); + int keySuffixLen = getKeySuffixLength(); + int valueLen = hasValue() ? getValueLength() : 0; + byte* pData = data(); + + StringRef k; + // If there is a key suffix, reconstitute the complete key into a contiguous string + if (keySuffixLen > 0) { + k = makeString(keyPrefixLen + keySuffixLen, arena); + memcpy(mutateString(k), baseKey.begin(), keyPrefixLen); + memcpy(mutateString(k) + keyPrefixLen, pData + valueLen, keySuffixLen); + } else { + // Otherwise just reference the base key's memory + k = baseKey.substr(0, keyPrefixLen); + } + cache = k; + + return RedwoodRecordRef(k, hasValue() ? 
ValueRef(pData, valueLen) : Optional()); } RedwoodRecordRef apply(Arena& arena, const RedwoodRecordRef& base, Optional& cache) { - RedwoodRecordRef rec = apply(base, arena); - cache = rec.key; - - return rec; + return apply(arena, base.key, cache); } int size() const { @@ -2943,7 +2943,6 @@ struct RedwoodRecordRef { } int lengthFormat = flags & LENGTHS_FORMAT; - Reader r(data()); int prefixLen = getKeyPrefixLength(); int keySuffixLen = getKeySuffixLength(); int valueLen = getValueLength(); @@ -3046,14 +3045,15 @@ struct RedwoodRecordRef { } uint8_t* wptr = d.data(); - // Write key suffix string - wptr = keySuffix.copyTo(wptr); // Write value bytes - if (value.present()) { + if (valueLen > 0) { wptr = value.get().copyTo(wptr); } + // Write key suffix string + wptr = keySuffix.copyTo(wptr); + return wptr - (uint8_t*)&d; } From 345a484ce7a52446a60a295b903855ebf9aea2c7 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Wed, 26 May 2021 23:37:46 -0700 Subject: [PATCH 27/42] Prevent rehashing by reserving cache size limit in cache map. --- fdbserver/VersionedBTree.actor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 93025f3976..c1e2f64004 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -1121,6 +1121,7 @@ public: void setSizeLimit(int n) { ASSERT(n > 0); sizeLimit = n; + cache.reserve(n); } // Get the object for i if it exists, else return nullptr. From b4c446bc8ba259749b78028abb8974608933fa95 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Wed, 26 May 2021 23:38:12 -0700 Subject: [PATCH 28/42] Remove unused variable. 
--- fdbserver/VersionedBTree.actor.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index c1e2f64004..bb0f144049 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -3832,7 +3832,6 @@ private: Future m_init; std::string m_name; int m_blockSize; - std::unordered_map parents; ParentInfoMapT childUpdateTracker; // MetaKey has a variable size, it can be as large as m_headerSpace From a485ae12150de89aee354829595bc1a96d7e8f03 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Wed, 2 Jun 2021 16:12:11 -0700 Subject: [PATCH 29/42] Log pagerMemoryOnly in test config. --- fdbserver/VersionedBTree.actor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index bb0f144049..5343b5ae7f 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -8077,6 +8077,7 @@ TEST_CASE(":/redwood/performance/set") { state bool pagerMemoryOnly = params.getInt("pagerMemoryOnly").orDefault(0); state bool traceMetrics = params.getInt("traceMetrics").orDefault(0); + printf("pagerMemoryOnly: %d\n", pagerMemoryOnly); printf("pageSize: %d\n", pageSize); printf("pageCacheBytes: %" PRId64 "\n", pageCacheBytes); printf("trailingIntegerIndexRange: %d\n", nodeCount); From f2904dadf3aaead38c0b6b75d7c30ef82fddc0ea Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Thu, 3 Jun 2021 02:25:57 -0700 Subject: [PATCH 30/42] Reading BTree pages no longer requires boundary records, as they are not needed if the page is already cached. 
--- fdbserver/DeltaTree.h | 12 ++++ fdbserver/VersionedBTree.actor.cpp | 95 +++++++++++++++--------------- 2 files changed, 59 insertions(+), 48 deletions(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index b4678691c0..0a810414ef 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -1051,6 +1051,18 @@ public: // Copy constructor does not copy item because normally a copied cursor will be immediately moved. Cursor(const Cursor& c) : cache(c.cache), tree(c.tree), nodeIndex(c.nodeIndex) {} + Cursor next() const { + Cursor c = *this; + c.moveNext(); + return c; + } + + Cursor previous() const { + Cursor c = *this; + c.movePrev(); + return c; + } + int rootIndex() { if (!cache->empty()) { return 0; diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 5343b5ae7f..92dfbca4f4 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -3183,11 +3183,6 @@ static void makeEmptyRoot(Reference page) { btpage->tree()->build(page->size(), nullptr, nullptr, nullptr, nullptr); } -BTreePage::BinaryTree::Cursor getCursor(const Reference& page) { - return BTreePage::BinaryTree::Cursor((BTreePage::BinaryTree::DecodeCache*)page->userData, - ((BTreePage*)page->begin())->tree()); -} - struct BoundaryRefAndPage { Standalone lowerBound; Reference firstPage; @@ -3410,8 +3405,7 @@ public: break; } // Start reading the page, without caching - entries.push_back( - std::make_pair(q.get(), self->readPage(snapshot, q.get().pageID, dbBegin, dbEnd, true, false))); + entries.push_back(std::make_pair(q.get(), self->readPage(snapshot, q.get().pageID, true, false))); --toPop; } @@ -4239,23 +4233,15 @@ private: ACTOR static Future> readPage(Reference snapshot, BTreePageIDRef id, - RedwoodRecordRef lowerBound, - RedwoodRecordRef upperBound, bool forLazyClear = false, bool cacheable = true, bool* fromCache = nullptr) { - if (!forLazyClear) { - debug_printf("readPage() op=read %s @%" PRId64 " lower=%s 
upper=%s\n", - toString(id).c_str(), - snapshot->getVersion(), - lowerBound.toString(false).c_str(), - upperBound.toString(false).c_str()); - } else { - debug_printf( - "readPage() op=readForDeferredClear %s @%" PRId64 " \n", toString(id).c_str(), snapshot->getVersion()); - } wait(yield()); + debug_printf("readPage() op=read%s %s @%" PRId64 "\n", + forLazyClear ? "ForDeferredClear" : "", + toString(id).c_str(), + snapshot->getVersion()); state Reference page; @@ -4284,24 +4270,37 @@ private: metrics.pageRead += 1; metrics.pageReadExt += (id.size() - 1); - if (!forLazyClear && page->userData == nullptr) { - debug_printf("readPage() Creating DecodeCache for %s @%" PRId64 " lower=%s upper=%s\n", - toString(id).c_str(), - snapshot->getVersion(), - lowerBound.toString(false).c_str(), - upperBound.toString(false).c_str()); + return std::move(page); + } + + // Get cursor into a BTree node, creating decode cache from boundaries if needed + static BTreePage::BinaryTree::Cursor getCursor(Reference page, + const RedwoodRecordRef& lowerBound, + const RedwoodRecordRef& upperBound) { + if (page->userData == nullptr) { + debug_printf("Creating DecodeCache for ptr=%p lower=%s upper=%s\n", + page->begin(), + lowerBound.toString().c_str(), + upperBound.toString().c_str()); BTreePage::BinaryTree::DecodeCache* cache = new BTreePage::BinaryTree::DecodeCache(lowerBound, upperBound); page->userData = cache; page->userDataDestructor = [](void* cache) { ((BTreePage::BinaryTree::DecodeCache*)cache)->delref(); }; } - if (!forLazyClear) { - debug_printf("readPage() %s\n", - pTreePage->toString(false, id, snapshot->getVersion(), lowerBound, upperBound).c_str()); + return BTreePage::BinaryTree::Cursor((BTreePage::BinaryTree::DecodeCache*)page->userData, + ((BTreePage*)page->begin())->tree()); + } + + // Get cursor into a BTree node from a child link + static BTreePage::BinaryTree::Cursor getCursor(const Reference& page, + const BTreePage::BinaryTree::Cursor& link) { + if (page->userData == 
nullptr) { + return getCursor(page, link.get(), link.next().getOrUpperBound()); } - return std::move(page); + return BTreePage::BinaryTree::Cursor((BTreePage::BinaryTree::DecodeCache*)page->userData, + ((BTreePage*)page->begin())->tree()); } static void preLoadPage(IPagerSnapshot* snapshot, BTreePageIDRef id) { @@ -4717,8 +4716,7 @@ private: wait(commitReadLock->take()); state FlowLock::Releaser readLock(*commitReadLock); state bool fromCache = false; - state Reference page = wait( - readPage(snapshot, rootID, update->decodeLowerBound, update->decodeUpperBound, false, false, &fromCache)); + state Reference page = wait(readPage(snapshot, rootID, false, false, &fromCache)); readLock.release(); // If the page exists in the cache, it must be copied before modification. @@ -4747,7 +4745,8 @@ private: btPage->toString(false, rootID, snapshot->getVersion(), update->decodeLowerBound, update->decodeUpperBound) .c_str()); - state BTreePage::BinaryTree::Cursor cursor = getCursor(page); + state BTreePage::BinaryTree::Cursor cursor = + update->cBegin.valid() ? getCursor(page, update->cBegin) : getCursor(page, dbBegin, dbEnd); if (REDWOOD_DEBUG) { debug_printf("%s ---------MUTATION BUFFER SLICE ---------------------\n", context.c_str()); @@ -5561,28 +5560,28 @@ public: PathEntry& back() { return path.back(); } void popPath() { path.pop_back(); } - Future pushPage(BTreePageIDRef id, - const RedwoodRecordRef& lowerBound, - const RedwoodRecordRef& upperBound) { - // The boundary RedwoodRecordRefs are shallow copied to readPage()'s argument / actor state variables, - // and the arenas for them must be kept alive by the higher path entries which contain ArenaPage - // references. 
- debug_printf("pushPage(%s) first cursor=%s\n", ::toString(id).c_str(), toString().c_str()); - return map(readPage(pager, id, lowerBound, upperBound), [=](Reference p) { + Future pushPage(const BTreePage::BinaryTree::Cursor& link) { + debug_printf("pushPage(link=%s)\n", link.get().toString(false).c_str()); + return map(readPage(pager, link.get().getChildPage()), [=](Reference p) { #if REDWOOD_DEBUG - path.push_back({ p, getCursor(p), id }); + path.push_back({ p, getCursor(p, link), link.get().getChildPage() }); #else - path.push_back({ p, getCursor(p) }); + path.push_back({ p, getCursor(p, link) }); #endif return Void(); }); } - Future pushPage(BTreePage::BinaryTree::Cursor c) { - auto next = c; - next.moveNext(); - BTreePageIDRef id = c.get().getChildPage(); - return pushPage(id, c.get(), next.getOrUpperBound()); + Future pushPage(BTreePageIDRef id) { + debug_printf("pushPage(root=%s)\n", ::toString(id).c_str()); + return map(readPage(pager, id), [=](Reference p) { +#if REDWOOD_DEBUG + path.push_back({ p, getCursor(p, dbBegin, dbEnd), id }); +#else + path.push_back({ p, getCursor(p, dbBegin, dbEnd) }); +#endif + return Void(); + }); } // Initialize or reinitialize cursor @@ -5592,7 +5591,7 @@ public: path.clear(); path.reserve(6); valid = false; - return pushPage(root, dbBegin, dbEnd); + return pushPage(root); } // Seeks cursor to query if it exists, the record before or after it, or an undefined and invalid From 3af0bea46cd5ca43956b5f355d54bd54ab1ff32f Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Thu, 3 Jun 2021 02:28:26 -0700 Subject: [PATCH 31/42] Removed or reduced several yields because they are called too often. 
--- fdbserver/VersionedBTree.actor.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 92dfbca4f4..bc6000aab8 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -4044,6 +4044,7 @@ private: // Lower bound of the page being added to state RedwoodRecordRef pageLowerBound = lowerBound->withoutValue(); state RedwoodRecordRef pageUpperBound; + state int sinceYield = 0; state int pageIndex; @@ -4172,7 +4173,10 @@ private: } } - wait(yield()); + if (++sinceYield > 100) { + sinceYield = 0; + wait(yield()); + } if (REDWOOD_DEBUG) { auto& p = pagesToBuild[pageIndex]; @@ -4237,7 +4241,6 @@ private: bool cacheable = true, bool* fromCache = nullptr) { - wait(yield()); debug_printf("readPage() op=read%s %s @%" PRId64 "\n", forLazyClear ? "ForDeferredClear" : "", toString(id).c_str(), @@ -8124,11 +8127,10 @@ TEST_CASE(":/redwood/performance/set") { printf("Starting.\n"); state double intervalStart = timer(); state double start = intervalStart; + state int sinceYield = 0; if (insertRecords) { while (kvBytesTotal < kvBytesTarget) { - wait(yield()); - Version lastVer = btree->getLatestVersion(); state Version version = lastVer + 1; btree->setWriteVersion(version); @@ -8158,7 +8160,10 @@ TEST_CASE(":/redwood/performance/set") { ++recordsThisCommit; } - wait(yield()); + if (++sinceYield >= 100) { + sinceYield = 0; + wait(yield()); + } } if (kvBytesThisCommit >= maxKVBytesPerCommit || recordsThisCommit >= maxRecordsPerCommit) { From 46c4f6fd4799ffd21eb9e4d5aa5d061d596434d5 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sun, 6 Jun 2021 02:04:20 -0700 Subject: [PATCH 32/42] Added yields to prevent stack overflows from too many callbacks when queue operations accumulate waiting on IO. 
--- fdbserver/VersionedBTree.actor.cpp | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index bc6000aab8..c1c03f12f1 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -516,6 +516,14 @@ public: ++self->queue->numEntries; if (mustWait || needNewPage) { + // Prevent possible stack overflow if too many waiters which require no IO are queued up + // Using static because multiple Cursors can be involved + static int sinceYield = 0; + if (++sinceYield == 1000) { + sinceYield = 0; + wait(yield()); + } + self->mutex.release(); } @@ -551,10 +559,18 @@ public: wait(success(self->nextPageReader)); } - Optional result = wait(self->readNext(upperBound, true)); + state Optional result = wait(self->readNext(upperBound, true)); // If this actor instance locked the mutex, then unlock it. if (!locked) { + // Prevent possible stack overflow if too many waiters which require no IO are queued up + // Using static because multiple Cursors can be involved + static int sinceYield = 0; + if (++sinceYield == 1000) { + sinceYield = 0; + wait(yield()); + } + debug_printf("FIFOQueue::Cursor(%s) waitThenReadNext unlocking mutex\n", self->toString().c_str()); self->mutex.release(); } @@ -2116,6 +2132,7 @@ public: break; } + // Yield to prevent slow task in case no IO waits are encountered if (++sinceYield >= 100) { sinceYield = 0; wait(yield()); From d94929f08db8b9c83d76c18747652b6199c665e3 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Mon, 7 Jun 2021 02:53:05 -0700 Subject: [PATCH 33/42] Added FlowMutex, a low overhead replacement for FlowLocks with a budget of 1. Replaced mutex in FIFOQueue::Cursor with FlowMutex to reduce overhead. 
--- fdbserver/VersionedBTree.actor.cpp | 216 +++++++++++++++++++++++++---- 1 file changed, 188 insertions(+), 28 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index c1c03f12f1..9e0d7b3648 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -37,6 +37,40 @@ #include #include +// A low-overhead FIFO mutex made with no internal queue structure (no list, deque, vector, etc) +// The lock is implemented as a Promise, which is returned to callers in a convenient wrapper +// called Lock. +// +// Usage: +// Lock lock = wait(mutex.take()); +// lock.release(); // Next waiter will get the lock, OR +// lock.error(e); // Next waiter will get e, future waiters will see broken_promise OR +// lock = Lock(); // Or let Lock and any copies go out of scope. All waiters will see broken_promise. +struct FlowMutex { + FlowMutex() { lastPromise.send(Void()); } + + bool available() { return lastPromise.isSet(); } + + struct Lock { + void release() { promise.send(Void()); } + + void error(Error e = broken_promise()) { promise.sendError(e); } + + // This is exposed in case the caller wants to use/copy it directly + Promise promise; + }; + + Future take() { + Lock newLock; + Future f = lastPromise.isSet() ? newLock : tag(lastPromise.getFuture(), newLock); + lastPromise = newLock.promise; + return f; + } + +private: + Promise lastPromise; +}; + #define REDWOOD_DEBUG 0 // Only print redwood debug statements for a certain address. Useful in simulation with many redwood processes to reduce @@ -282,8 +316,7 @@ public: // This exists because writing the queue returns void, not a future Future writeOperations; - FlowLock mutex; - Future killMutex; + FlowMutex mutex; Cursor() : mode(NONE) {} @@ -294,14 +327,6 @@ public: int readOffset = 0, LogicalPageID endPage = invalidLogicalPageID) { queue = q; - - // If the pager gets an error, which includes shutdown, kill the mutex so any waiters can no longer run. 
- // This avoids having every mutex wait also wait on pagerError. - killMutex = map(ready(queue->pagerError), [=](Void e) { - mutex.kill(); - return Void(); - }); - mode = m; firstPageIDWritten = invalidLogicalPageID; offset = readOffset; @@ -379,14 +404,14 @@ public: #pragma pack(pop) // Returns true if the mutex cannot be immediately taken. - bool isBusy() { return mutex.activePermits() != 0; } + bool isBusy() { return !mutex.available(); } // Wait for all operations started before now to be ready, which is done by // obtaining and releasing the mutex. Future notBusy() { return isBusy() ? map(mutex.take(), - [&](Void) { - mutex.release(); + [&](FlowMutex::Lock lock) { + lock.release(); return Void(); }) : Void(); @@ -473,6 +498,7 @@ public: ACTOR static Future write_impl(Cursor* self, T item) { ASSERT(self->mode == WRITE); + state FlowMutex::Lock lock; state bool mustWait = self->isBusy(); state int bytesNeeded = Codec::bytesNeeded(item); state bool needNewPage = @@ -486,7 +512,8 @@ public: // If we have to wait for the mutex because it's busy, or we need a new page, then wait for the mutex. if (mustWait || needNewPage) { - wait(self->mutex.take()); + FlowMutex::Lock _lock = wait(self->mutex.take()); + lock = _lock; // If we had to wait because the mutex was busy, then update needNewPage as another writer // would have changed the cursor state @@ -524,7 +551,7 @@ public: wait(yield()); } - self->mutex.release(); + lock.release(); } return Void(); @@ -545,12 +572,15 @@ public: // Only mutex holders will wait on the page read. 
ACTOR static Future> waitThenReadNext(Cursor* self, Optional upperBound, - bool locked, + FlowMutex::Lock* lock, bool load) { - // Lock the mutex if it wasn't already - if (!locked) { + state FlowMutex::Lock localLock; + + // Lock the mutex if it wasn't already locked, so we didn't get a lock pointer + if (lock == nullptr) { debug_printf("FIFOQueue::Cursor(%s) waitThenReadNext locking mutex\n", self->toString().c_str()); - wait(self->mutex.take()); + FlowMutex::Lock newLock = wait(self->mutex.take()); + localLock = newLock; } if (load) { @@ -559,10 +589,10 @@ public: wait(success(self->nextPageReader)); } - state Optional result = wait(self->readNext(upperBound, true)); + state Optional result = wait(self->readNext(upperBound, &localLock)); - // If this actor instance locked the mutex, then unlock it. - if (!locked) { + // If a lock was not passed in, so this actor locked the mutex above, then unlock it + if (lock == nullptr) { // Prevent possible stack overflow if too many waiters which require no IO are queued up // Using static because multiple Cursors can be involved static int sinceYield = 0; @@ -572,7 +602,7 @@ public: } debug_printf("FIFOQueue::Cursor(%s) waitThenReadNext unlocking mutex\n", self->toString().c_str()); - self->mutex.release(); + localLock.release(); } return result; @@ -581,15 +611,15 @@ public: // Read the next item at the cursor (if < upperBound), moving to a new page first if the current page is // exhausted If locked is true, this call owns the mutex, which would have been locked by readNext() before a // recursive call - Future> readNext(const Optional& upperBound = {}, bool locked = false) { + Future> readNext(const Optional& upperBound = {}, FlowMutex::Lock* lock = nullptr) { if ((mode != POP && mode != READONLY) || pageID == invalidLogicalPageID || pageID == endPageID) { debug_printf("FIFOQueue::Cursor(%s) readNext returning nothing\n", toString().c_str()); return Optional(); } - // If we don't own the mutex and it's not available 
then acquire it - if (!locked && isBusy()) { - return waitThenReadNext(this, upperBound, false, false); + // If we don't have a lock and the mutex isn't available then acquire it + if (lock == nullptr && isBusy()) { + return waitThenReadNext(this, upperBound, lock, false); } // We now know pageID is valid and should be used, but page might not point to it yet @@ -605,7 +635,7 @@ public: } if (!nextPageReader.isReady()) { - return waitThenReadNext(this, upperBound, locked, true); + return waitThenReadNext(this, upperBound, lock, true); } page = nextPageReader.get(); @@ -8724,3 +8754,133 @@ TEST_CASE("!/redwood/performance/randomRangeScans") { return Void(); } + +constexpr double mutexTestDelay = 0.00001; + +ACTOR Future mutexTest(int id, FlowMutex* mutex, int n, bool allowError, bool* verbose) { + while (n-- > 0) { + state double d = deterministicRandom()->random01() * mutexTestDelay; + if (*verbose) { + printf("%d:%d wait %f while unlocked\n", id, n, d); + } + wait(delay(d)); + + if (*verbose) { + printf("%d:%d locking\n", id, n); + } + state FlowMutex::Lock lock = wait(mutex->take()); + if (*verbose) { + printf("%d:%d locked\n", id, n); + } + + d = deterministicRandom()->random01() * mutexTestDelay; + if (*verbose) { + printf("%d:%d wait %f while locked\n", id, n, d); + } + wait(delay(d)); + + // On the last iteration, send an error or drop the lock if allowError is true + if (n == 0 && allowError) { + if (deterministicRandom()->coinflip()) { + // Send explicit error + if (*verbose) { + printf("%d:%d sending error\n", id, n); + } + lock.error(end_of_stream()); + } else { + // Do nothing + if (*verbose) { + printf("%d:%d dropping promise, returning without unlock\n", id, n); + } + } + } else { + if (*verbose) { + printf("%d:%d unlocking\n", id, n); + } + lock.release(); + } + } + + if (*verbose) { + printf("%d Returning\n", id); + } + return Void(); +} + +TEST_CASE("/flow/FlowMutex") { + state int count = 100000; + + // Default verboseness + state bool 
verboseSetting = false; + // Useful for debugging, enable verbose mode for this iteration number + state int verboseTestIteration = -1; + + try { + state bool verbose = verboseSetting || count == verboseTestIteration; + + while (--count > 0) { + if (count % 1000 == 0) { + printf("%d tests left\n", count); + } + + state FlowMutex mutex; + state std::vector> tests; + + state bool allowErrors = deterministicRandom()->coinflip(); + if (verbose) { + printf("\nTesting allowErrors=%d\n", allowErrors); + } + + state Optional error; + + try { + for (int i = 0; i < 10; ++i) { + tests.push_back(mutexTest(i, &mutex, 10, allowErrors, &verbose)); + } + wait(waitForAll(tests)); + + if (allowErrors) { + if (verbose) { + printf("Final wait in case error was injected by the last actor to finish\n"); + } + wait(success(mutex.take())); + } + } catch (Error& e) { + if (verbose) { + printf("Caught error %s\n", e.what()); + } + error = e; + + // Wait for all actors still running to finish their waits and try to take the mutex + if (verbose) { + printf("Waiting for completions\n"); + } + wait(delay(2 * mutexTestDelay)); + + if (verbose) { + printf("Future end states:\n"); + } + // All futures should be ready, some with errors. + bool allReady = true; + for (int i = 0; i < tests.size(); ++i) { + auto f = tests[i]; + if (verbose) { + printf( + " %d: %s\n", i, f.isReady() ? (f.isError() ? f.getError().what() : "done") : "not ready"); + } + allReady = allReady && f.isReady(); + } + ASSERT(allReady); + } + + // If an error was caused, one should have been detected. + // Otherwise, no errors should be detected. + ASSERT(error.present() == allowErrors); + } + } catch (Error& e) { + printf("Error at count=%d\n", count + 1); + ASSERT(false); + } + + return Void(); +} From 72e077e69bea729e49a8516d7c0fbda265b0c83d Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Mon, 7 Jun 2021 03:41:43 -0700 Subject: [PATCH 34/42] Remove obsolete knobs. 
--- fdbserver/Knobs.cpp | 4 ---- fdbserver/Knobs.h | 4 ---- 2 files changed, 8 deletions(-) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index d87660a85e..74189e22a7 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -266,10 +266,6 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi init( DD_REMOVE_STORE_ENGINE_DELAY, 60.0 ); if( randomize && BUGGIFY ) DD_REMOVE_STORE_ENGINE_DELAY = deterministicRandom()->random01() * 60.0; - // Redwood Storage Engine - init( PREFIX_TREE_IMMEDIATE_KEY_SIZE_LIMIT, 30 ); - init( PREFIX_TREE_IMMEDIATE_KEY_SIZE_MIN, 0 ); - // KeyValueStore SQLITE init( CLEAR_BUFFER_SIZE, 20000 ); init( READ_VALUE_TIME_ESTIMATE, .00005 ); diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 8b2f8963f1..2ded8f312b 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -222,10 +222,6 @@ public: double DD_FAILURE_TIME; double DD_ZERO_HEALTHY_TEAM_DELAY; - // Redwood Storage Engine - int PREFIX_TREE_IMMEDIATE_KEY_SIZE_LIMIT; - int PREFIX_TREE_IMMEDIATE_KEY_SIZE_MIN; - // KeyValueStore SQLITE int CLEAR_BUFFER_SIZE; double READ_VALUE_TIME_ESTIMATE; From 293559bb615be15160eea94342623eab791bca9d Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Mon, 7 Jun 2021 03:48:17 -0700 Subject: [PATCH 35/42] Moved FlowMutex to genericactors. --- fdbserver/VersionedBTree.actor.cpp | 34 ------------------------------ flow/genericactors.actor.h | 34 ++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 9e0d7b3648..06d96b0bcc 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -37,40 +37,6 @@ #include #include -// A low-overhead FIFO mutex made with no internal queue structure (no list, deque, vector, etc) -// The lock is implemented as a Promise, which is returned to callers in a convenient wrapper -// called Lock. 
-// -// Usage: -// Lock lock = wait(mutex.take()); -// lock.release(); // Next waiter will get the lock, OR -// lock.error(e); // Next waiter will get e, future waiters will see broken_promise OR -// lock = Lock(); // Or let Lock and any copies go out of scope. All waiters will see broken_promise. -struct FlowMutex { - FlowMutex() { lastPromise.send(Void()); } - - bool available() { return lastPromise.isSet(); } - - struct Lock { - void release() { promise.send(Void()); } - - void error(Error e = broken_promise()) { promise.sendError(e); } - - // This is exposed in case the caller wants to use/copy it directly - Promise promise; - }; - - Future take() { - Lock newLock; - Future f = lastPromise.isSet() ? newLock : tag(lastPromise.getFuture(), newLock); - lastPromise = newLock.promise; - return f; - } - -private: - Promise lastPromise; -}; - #define REDWOOD_DEBUG 0 // Only print redwood debug statements for a certain address. Useful in simulation with many redwood processes to reduce diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index a4b67f6fdf..38d9d79323 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -1271,6 +1271,40 @@ Future waitOrError(Future f, Future errorSignal) { } } +// A low-overhead FIFO mutex made with no internal queue structure (no list, deque, vector, etc) +// The lock is implemented as a Promise, which is returned to callers in a convenient wrapper +// called Lock. +// +// Usage: +// Lock lock = wait(mutex.take()); +// lock.release(); // Next waiter will get the lock, OR +// lock.error(e); // Next waiter will get e, future waiters will see broken_promise OR +// lock = Lock(); // Or let Lock and any copies go out of scope. All waiters will see broken_promise. 
+struct FlowMutex { + FlowMutex() { lastPromise.send(Void()); } + + bool available() { return lastPromise.isSet(); } + + struct Lock { + void release() { promise.send(Void()); } + + void error(Error e = broken_promise()) { promise.sendError(e); } + + // This is exposed in case the caller wants to use/copy it directly + Promise promise; + }; + + Future take() { + Lock newLock; + Future f = lastPromise.isSet() ? newLock : tag(lastPromise.getFuture(), newLock); + lastPromise = newLock.promise; + return f; + } + +private: + Promise lastPromise; +}; + struct FlowLock : NonCopyable, public ReferenceCounted { // FlowLock implements a nonblocking critical section: there can be only a limited number of clients executing code // between wait(take()) and release(). Not thread safe. take() returns only when the number of holders of the lock From adcf126bfac3bfc73deaeb7d5c5901a649542d8c Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Mon, 7 Jun 2021 04:37:03 -0700 Subject: [PATCH 36/42] Removed commit read FlowLock because it costs too much overhead, will need another way to throttle. Removed readPage() fromCache argument as it is no longer useful. --- fdbserver/IPager.h | 10 ++---- fdbserver/Knobs.cpp | 1 - fdbserver/Knobs.h | 1 - fdbserver/VersionedBTree.actor.cpp | 54 ++++++------------------------ 4 files changed, 12 insertions(+), 54 deletions(-) diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index 3514dd3a06..76b08d8313 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -125,10 +125,7 @@ public: class IPagerSnapshot { public: - virtual Future> getPhysicalPage(LogicalPageID pageID, - bool cacheable, - bool nohit, - bool* fromCache = nullptr) = 0; + virtual Future> getPhysicalPage(LogicalPageID pageID, bool cacheable, bool nohit) = 0; virtual bool tryEvictPage(LogicalPageID id) = 0; virtual Version getVersion() const = 0; @@ -180,10 +177,7 @@ public: // Cacheable indicates that the page should be added to the page cache (if applicable?) 
as a result of this read. // NoHit indicates that the read should not be considered a cache hit, such as when preloading pages that are // considered likely to be needed soon. - virtual Future> readPage(LogicalPageID pageID, - bool cacheable = true, - bool noHit = false, - bool* fromCache = nullptr) = 0; + virtual Future> readPage(LogicalPageID pageID, bool cacheable = true, bool noHit = false) = 0; // Get a snapshot of the metakey and all pages as of the version v which must be >= getOldestVersion() // Note that snapshots at any version may still see the results of updatePage() calls. diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 74189e22a7..249060e9f0 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -707,7 +707,6 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi init( REDWOOD_DEFAULT_PAGE_SIZE, 4096 ); init( REDWOOD_KVSTORE_CONCURRENT_READS, 64 ); - init( REDWOOD_COMMIT_CONCURRENT_READS, 64 ); init( REDWOOD_PAGE_REBUILD_MAX_SLACK, 0.33 ); init( REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES, 10 ); init( REDWOOD_LAZY_CLEAR_MIN_PAGES, 0 ); diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 2ded8f312b..289da507eb 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -641,7 +641,6 @@ public: int REDWOOD_DEFAULT_PAGE_SIZE; // Page size for new Redwood files int REDWOOD_KVSTORE_CONCURRENT_READS; // Max number of simultaneous point or range reads in progress. 
- int REDWOOD_COMMIT_CONCURRENT_READS; // Max number of concurrent reads done to support commit operations double REDWOOD_PAGE_REBUILD_MAX_SLACK; // When rebuilding pages, max slack to allow in page int REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES; // Number of pages to try to pop from the lazy delete queue and process at // once diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 06d96b0bcc..784d194998 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -1858,21 +1858,12 @@ public: // Reads the most recent version of pageID, either previously committed or written using updatePage() // in the current commit - // If cacheable is false then if fromCache is valid it will be set to true if the page is from cache, otherwise - // false. If cacheable is true, fromCache is ignored as the result is automatically from cache by virtue of being - // cacheable. - Future> readPage(LogicalPageID pageID, - bool cacheable, - bool noHit = false, - bool* fromCache = nullptr) override { + Future> readPage(LogicalPageID pageID, bool cacheable, bool noHit = false) override { // Use cached page if present, without triggering a cache hit. 
// Otherwise, read the page and return it but don't add it to the cache if (!cacheable) { debug_printf("DWALPager(%s) op=readUncached %s\n", filename.c_str(), toString(pageID).c_str()); PageCacheEntry* pCacheEntry = pageCache.getIfExists(pageID); - if (fromCache != nullptr) { - *fromCache = pCacheEntry != nullptr; - } if (pCacheEntry != nullptr) { debug_printf("DWALPager(%s) op=readUncachedHit %s\n", filename.c_str(), toString(pageID).c_str()); @@ -1926,13 +1917,9 @@ public: return (PhysicalPageID)pageID; } - Future> readPageAtVersion(LogicalPageID logicalID, - Version v, - bool cacheable, - bool noHit, - bool* fromCache) { + Future> readPageAtVersion(LogicalPageID logicalID, Version v, bool cacheable, bool noHit) { PhysicalPageID physicalID = getPhysicalPageID(logicalID, v); - return readPage(physicalID, cacheable, noHit, fromCache); + return readPage(physicalID, cacheable, noHit); } // Get snapshot as of the most recent committed version of the pager @@ -2473,14 +2460,11 @@ public: : pager(pager), metaKey(meta), version(version), expired(expiredFuture) {} ~DWALPagerSnapshot() override {} - Future> getPhysicalPage(LogicalPageID pageID, - bool cacheable, - bool noHit, - bool* fromCache) override { + Future> getPhysicalPage(LogicalPageID pageID, bool cacheable, bool noHit) override { if (expired.isError()) { throw expired.getError(); } - return map(pager->readPageAtVersion(pageID, version, cacheable, noHit, fromCache), + return map(pager->readPageAtVersion(pageID, version, cacheable, noHit), [=](Reference p) { return Reference(std::move(p)); }); } @@ -3389,8 +3373,7 @@ public: VersionedBTree(IPager2* pager, std::string name) : m_pager(pager), m_writeVersion(invalidVersion), m_lastCommittedVersion(invalidVersion), m_pBuffer(nullptr), - m_commitReadLock(new FlowLock(SERVER_KNOBS->REDWOOD_COMMIT_CONCURRENT_READS)), m_name(name), m_pHeader(nullptr), - m_headerSpace(0) { + m_name(name), m_pHeader(nullptr), m_headerSpace(0) { m_lazyClearActor = 0; m_init = 
init_impl(this); @@ -3834,7 +3817,6 @@ private: Version m_writeVersion; Version m_lastCommittedVersion; Version m_newOldestVersion; - Reference m_commitReadLock; Future m_latestCommit; Future m_init; std::string m_name; @@ -4251,8 +4233,7 @@ private: ACTOR static Future> readPage(Reference snapshot, BTreePageIDRef id, bool forLazyClear = false, - bool cacheable = true, - bool* fromCache = nullptr) { + bool cacheable = true) { debug_printf("readPage() op=read%s %s @%" PRId64 "\n", forLazyClear ? "ForDeferredClear" : "", @@ -4262,7 +4243,7 @@ private: state Reference page; if (id.size() == 1) { - Reference p = wait(snapshot->getPhysicalPage(id.front(), cacheable, false, fromCache)); + Reference p = wait(snapshot->getPhysicalPage(id.front(), cacheable, false)); page = std::move(p); } else { ASSERT(!id.empty()); @@ -4273,11 +4254,6 @@ private: std::vector> pages = wait(getAll(reads)); // TODO: Cache reconstituted super pages somehow, perhaps with help from the Pager. page = ArenaPage::concatPages(pages); - - // In the current implementation, SuperPages are never present in the cache - if (fromCache != nullptr) { - *fromCache = false; - } } debug_printf("readPage() op=readComplete %s @%" PRId64 " \n", toString(id).c_str(), snapshot->getVersion()); @@ -4726,24 +4702,14 @@ private: debug_printf("%s -------------------------------------\n", context.c_str()); } + state Reference page = wait(readPage(snapshot, rootID, false, false)); state Version writeVersion = self->getLastCommittedVersion() + 1; - state Reference commitReadLock = self->m_commitReadLock; - wait(commitReadLock->take()); - state FlowLock::Releaser readLock(*commitReadLock); - state bool fromCache = false; - state Reference page = wait(readPage(snapshot, rootID, false, false, &fromCache)); - readLock.release(); - // If the page exists in the cache, it must be copied before modification. 
// That copy will be referenced by pageCopy, as page must stay in scope in case anything references its // memory and it gets evicted from the cache. // If the page is not in the cache, then no copy is needed so we will initialize pageCopy to page - state Reference pageCopy = fromCache ? Reference() : page; - - if (!fromCache) { - pageCopy = page; - } + state Reference pageCopy; state BTreePage* btPage = (BTreePage*)page->begin(); ASSERT(isLeaf == btPage->isLeaf()); From f7554b8fcbc78704274ff7c6c12561b1c2e47cc1 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Tue, 8 Jun 2021 01:55:29 -0700 Subject: [PATCH 37/42] Move FlowMutex unit test to FlowTests. --- fdbrpc/FlowTests.actor.cpp | 130 +++++++++++++++++++++++++++++ fdbserver/VersionedBTree.actor.cpp | 129 ---------------------------- 2 files changed, 130 insertions(+), 129 deletions(-) diff --git a/fdbrpc/FlowTests.actor.cpp b/fdbrpc/FlowTests.actor.cpp index 40e4ed1c52..f16cfb1ec3 100644 --- a/fdbrpc/FlowTests.actor.cpp +++ b/fdbrpc/FlowTests.actor.cpp @@ -1484,3 +1484,133 @@ TEST_CASE("/flow/flow/PromiseStream/move2") { ASSERT(movedTracker.copied == 0); return Void(); } + +constexpr double mutexTestDelay = 0.00001; + +ACTOR Future mutexTest(int id, FlowMutex* mutex, int n, bool allowError, bool* verbose) { + while (n-- > 0) { + state double d = deterministicRandom()->random01() * mutexTestDelay; + if (*verbose) { + printf("%d:%d wait %f while unlocked\n", id, n, d); + } + wait(delay(d)); + + if (*verbose) { + printf("%d:%d locking\n", id, n); + } + state FlowMutex::Lock lock = wait(mutex->take()); + if (*verbose) { + printf("%d:%d locked\n", id, n); + } + + d = deterministicRandom()->random01() * mutexTestDelay; + if (*verbose) { + printf("%d:%d wait %f while locked\n", id, n, d); + } + wait(delay(d)); + + // On the last iteration, send an error or drop the lock if allowError is true + if (n == 0 && allowError) { + if (deterministicRandom()->coinflip()) { + // Send explicit error + if (*verbose) { + 
printf("%d:%d sending error\n", id, n); + } + lock.error(end_of_stream()); + } else { + // Do nothing + if (*verbose) { + printf("%d:%d dropping promise, returning without unlock\n", id, n); + } + } + } else { + if (*verbose) { + printf("%d:%d unlocking\n", id, n); + } + lock.release(); + } + } + + if (*verbose) { + printf("%d Returning\n", id); + } + return Void(); +} + +TEST_CASE("/flow/flow/FlowMutex") { + state int count = 100000; + + // Default verboseness + state bool verboseSetting = false; + // Useful for debugging, enable verbose mode for this iteration number + state int verboseTestIteration = -1; + + try { + state bool verbose = verboseSetting || count == verboseTestIteration; + + while (--count > 0) { + if (count % 1000 == 0) { + printf("%d tests left\n", count); + } + + state FlowMutex mutex; + state std::vector> tests; + + state bool allowErrors = deterministicRandom()->coinflip(); + if (verbose) { + printf("\nTesting allowErrors=%d\n", allowErrors); + } + + state Optional error; + + try { + for (int i = 0; i < 10; ++i) { + tests.push_back(mutexTest(i, &mutex, 10, allowErrors, &verbose)); + } + wait(waitForAll(tests)); + + if (allowErrors) { + if (verbose) { + printf("Final wait in case error was injected by the last actor to finish\n"); + } + wait(success(mutex.take())); + } + } catch (Error& e) { + if (verbose) { + printf("Caught error %s\n", e.what()); + } + error = e; + + // Wait for all actors still running to finish their waits and try to take the mutex + if (verbose) { + printf("Waiting for completions\n"); + } + wait(delay(2 * mutexTestDelay)); + + if (verbose) { + printf("Future end states:\n"); + } + // All futures should be ready, some with errors. + bool allReady = true; + for (int i = 0; i < tests.size(); ++i) { + auto f = tests[i]; + if (verbose) { + printf( + " %d: %s\n", i, f.isReady() ? (f.isError() ? 
f.getError().what() : "done") : "not ready"); + } + allReady = allReady && f.isReady(); + } + ASSERT(allReady); + } + + // If an error was caused, one should have been detected. + // Otherwise, no errors should be detected. + ASSERT(error.present() == allowErrors); + } + } catch (Error& e) { + printf("Error at count=%d\n", count + 1); + ASSERT(false); + } + + return Void(); +} diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 784d194998..f5a864f90c 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -8687,132 +8687,3 @@ TEST_CASE("!/redwood/performance/randomRangeScans") { return Void(); } -constexpr double mutexTestDelay = 0.00001; - -ACTOR Future mutexTest(int id, FlowMutex* mutex, int n, bool allowError, bool* verbose) { - while (n-- > 0) { - state double d = deterministicRandom()->random01() * mutexTestDelay; - if (*verbose) { - printf("%d:%d wait %f while unlocked\n", id, n, d); - } - wait(delay(d)); - - if (*verbose) { - printf("%d:%d locking\n", id, n); - } - state FlowMutex::Lock lock = wait(mutex->take()); - if (*verbose) { - printf("%d:%d locked\n", id, n); - } - - d = deterministicRandom()->random01() * mutexTestDelay; - if (*verbose) { - printf("%d:%d wait %f while locked\n", id, n, d); - } - wait(delay(d)); - - // On the last iteration, send an error or drop the lock if allowError is true - if (n == 0 && allowError) { - if (deterministicRandom()->coinflip()) { - // Send explicit error - if (*verbose) { - printf("%d:%d sending error\n", id, n); - } - lock.error(end_of_stream()); - } else { - // Do nothing - if (*verbose) { - printf("%d:%d dropping promise, returning without unlock\n", id, n); - } - } - } else { - if (*verbose) { - printf("%d:%d unlocking\n", id, n); - } - lock.release(); - } - } - - if (*verbose) { - printf("%d Returning\n", id); - } - return Void(); -} - -TEST_CASE("/flow/FlowMutex") { - state int count = 100000; - - // Default verboseness - state bool 
verboseSetting = false; - // Useful for debugging, enable verbose mode for this iteration number - state int verboseTestIteration = -1; - - try { - state bool verbose = verboseSetting || count == verboseTestIteration; - - while (--count > 0) { - if (count % 1000 == 0) { - printf("%d tests left\n", count); - } - - state FlowMutex mutex; - state std::vector> tests; - - state bool allowErrors = deterministicRandom()->coinflip(); - if (verbose) { - printf("\nTesting allowErrors=%d\n", allowErrors); - } - - state Optional error; - - try { - for (int i = 0; i < 10; ++i) { - tests.push_back(mutexTest(i, &mutex, 10, allowErrors, &verbose)); - } - wait(waitForAll(tests)); - - if (allowErrors) { - if (verbose) { - printf("Final wait in case error was injected by the last actor to finish\n"); - } - wait(success(mutex.take())); - } - } catch (Error& e) { - if (verbose) { - printf("Caught error %s\n", e.what()); - } - error = e; - - // Wait for all actors still running to finish their waits and try to take the mutex - if (verbose) { - printf("Waiting for completions\n"); - } - wait(delay(2 * mutexTestDelay)); - - if (verbose) { - printf("Future end states:\n"); - } - // All futures should be ready, some with errors. - bool allReady = true; - for (int i = 0; i < tests.size(); ++i) { - auto f = tests[i]; - if (verbose) { - printf( - " %d: %s\n", i, f.isReady() ? (f.isError() ? f.getError().what() : "done") : "not ready"); - } - allReady = allReady && f.isReady(); - } - ASSERT(allReady); - } - - // If an error was caused, one should have been detected. - // Otherwise, no errors should be detected. 
- ASSERT(error.present() == allowErrors); - } - } catch (Error& e) { - printf("Error at count=%d\n", count + 1); - ASSERT(false); - } - - return Void(); -} From b39d4af91a4e6c8c0b78e2371eae3b9613a4ad5e Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Tue, 8 Jun 2021 02:45:08 -0700 Subject: [PATCH 38/42] Redwood KVS wrapper now shares the same error Promise as the Pager, so the FlowLock in the read actors no longer needs to be reference counted. --- fdbserver/VersionedBTree.actor.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index f5a864f90c..ba720dae1f 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -1371,7 +1371,8 @@ public: std::string filename, int64_t pageCacheSizeBytes, Version remapCleanupWindow, - bool memoryOnly = false) + bool memoryOnly = false, + Promise errorPromise = {}) : desiredPageSize(desiredPageSize), filename(filename), pHeader(nullptr), pageCacheBytes(pageCacheSizeBytes), memoryOnly(memoryOnly), remapCleanupWindow(remapCleanupWindow) { @@ -5755,7 +5756,7 @@ RedwoodRecordRef VersionedBTree::dbEnd(LiteralStringRef("\xff\xff\xff\xff\xff")) class KeyValueStoreRedwoodUnversioned : public IKeyValueStore { public: KeyValueStoreRedwoodUnversioned(std::string filePrefix, UID logID) - : m_filePrefix(filePrefix), m_concurrentReads(new FlowLock(SERVER_KNOBS->REDWOOD_KVSTORE_CONCURRENT_READS)) { + : m_filePrefix(filePrefix), m_concurrentReads(SERVER_KNOBS->REDWOOD_KVSTORE_CONCURRENT_READS) { int pageSize = BUGGIFY ? deterministicRandom()->randomInt(1000, 4096 * 4) : SERVER_KNOBS->REDWOOD_DEFAULT_PAGE_SIZE; @@ -5767,7 +5768,7 @@ public: Version remapCleanupWindow = BUGGIFY ? 
deterministicRandom()->randomInt64(0, 1000) : SERVER_KNOBS->REDWOOD_REMAP_CLEANUP_WINDOW; - IPager2* pager = new DWALPager(pageSize, filePrefix, pageCacheBytes, remapCleanupWindow); + IPager2* pager = new DWALPager(pageSize, filePrefix, pageCacheBytes, remapCleanupWindow, false, m_error); m_tree = new VersionedBTree(pager, filePrefix); m_init = catchError(init_impl(this)); } @@ -5843,9 +5844,8 @@ public: state VersionedBTree::BTreeCursor cur; wait(self->m_tree->initBTreeCursor(&cur, self->m_tree->getLastCommittedVersion())); - state Reference readLock = self->m_concurrentReads; - wait(readLock->take()); - state FlowLock::Releaser releaser(*readLock); + wait(self->m_concurrentReads.take()); + state FlowLock::Releaser releaser(self->m_concurrentReads); ++g_redwoodMetrics.opGetRange; state RangeResult result; @@ -5959,9 +5959,8 @@ public: state VersionedBTree::BTreeCursor cur; wait(self->m_tree->initBTreeCursor(&cur, self->m_tree->getLastCommittedVersion())); - state Reference readLock = self->m_concurrentReads; - wait(readLock->take()); - state FlowLock::Releaser releaser(*readLock); + wait(self->m_concurrentReads.take()); + state FlowLock::Releaser releaser(self->m_concurrentReads); ++g_redwoodMetrics.opGet; wait(cur.seekGTE(key, 0)); @@ -5999,7 +5998,7 @@ private: Future m_init; Promise m_closed; Promise m_error; - Reference m_concurrentReads; + FlowLock m_concurrentReads; template inline Future catchError(Future f) { From 64429097bf2a7802e83f0c458a88cfc3f55df448 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Tue, 8 Jun 2021 18:56:18 -0700 Subject: [PATCH 39/42] Bump pager and btree format versions because there have been format changes. 
--- fdbserver/VersionedBTree.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index ba720dae1f..62113040c8 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -2346,7 +2346,7 @@ private: #pragma pack(push, 1) // Header is the format of page 0 of the database struct Header { - static constexpr int FORMAT_VERSION = 2; + static constexpr int FORMAT_VERSION = 3; uint16_t formatVersion; uint32_t pageSize; int64_t pageCount; @@ -3283,7 +3283,7 @@ public: #pragma pack(push, 1) struct MetaKey { - static constexpr int FORMAT_VERSION = 10; + static constexpr int FORMAT_VERSION = 11; // This serves as the format version for the entire tree, individual pages will not be versioned uint16_t formatVersion; uint8_t height; From 0253463a9f185a3af76871789edd5a5c6f882259 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Wed, 9 Jun 2021 15:41:30 -0700 Subject: [PATCH 40/42] Remove redundant "or" Co-authored-by: Andrew Noyes --- flow/genericactors.actor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 38d9d79323..7d6f521fbe 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -1278,7 +1278,7 @@ Future waitOrError(Future f, Future errorSignal) { // Usage: // Lock lock = wait(mutex.take()); // lock.release(); // Next waiter will get the lock, OR -// lock.error(e); // Next waiter will get e, future waiters will see broken_promise OR +// lock.error(e); // Next waiter will get e, future waiters will see broken_promise // lock = Lock(); // Or let Lock and any copies go out of scope. All waiters will see broken_promise. 
struct FlowMutex { FlowMutex() { lastPromise.send(Void()); } From 69f7c7cba2ea56ec0f45cdfd2e35b618f4c66d62 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Wed, 9 Jun 2021 21:18:58 -0700 Subject: [PATCH 41/42] Make deltatree debug toggle easier to use. --- fdbserver/DeltaTree.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index 0a810414ef..ead4c92109 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -26,8 +26,13 @@ #include "fdbserver/Knobs.h" #include +#define DELTATREE_DEBUG 0 + +#if DELTATREE_DEBUG +#define deltatree_printf(...) printf(__VA_ARGS__) +#else #define deltatree_printf(...) -// #define deltatree_printf(...) printf(__VA_ARGS__) +#endif typedef uint64_t Word; // Get the number of prefix bytes that are the same between a and b, up to their common length of cl From 8cbc26d43658477070d1fba83ae0d48b3c99c651 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Thu, 10 Jun 2021 02:29:17 -0700 Subject: [PATCH 42/42] Added documentation for DeltaTree2. --- fdbserver/DeltaTree.h | 89 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 87 insertions(+), 2 deletions(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index ead4c92109..7f2b1ae723 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -905,7 +905,73 @@ private: } }; -// ------------------------------------------------------------------ +// DeltaTree2 is a memory mappable binary tree of T objects such that each node's item is +// stored as a Delta which can reproduce the node's T item given either +// - The node's greatest lesser ancestor, called the "left parent" +// - The node's least greater ancestor, called the "right parent" +// One of these ancestors will also happen to be the node's direct parent. 
+// +// The Delta type is intended to make use of ordered prefix compression and borrow all +// available prefix bytes from the ancestor T which shares the most prefix bytes with +// the item T being encoded. If T is implemented properly, this results in perfect +// prefix compression while performing O(log n) comparisons for a seek. +// +// T requirements +// +// Must be compatible with Standalone and must implement the following additional things: +// +// // Return the common prefix length between *this and T +// // skipLen is a hint, representing the length that is already known to be common. +// int getCommonPrefixLen(const T& other, int skipLen) const; +// +// // Compare *this to rhs, returns < 0 for less than, 0 for equal, > 0 for greater than +// // skipLen is a hint, representing the length that is already known to be common. +// int compare(const T &rhs, int skipLen) const; +// +// // Writes to d a delta which can create *this from base +// // commonPrefix is a hint, representing the length that is already known to be common. +// // DeltaT's size need not be static, for more details see below. +// void writeDelta(DeltaT &d, const T &base, int commonPrefix) const; +// +// // Returns the size in bytes of the DeltaT required to recreate *this from base +// int deltaSize(const T &base) const; +// +// // A type which represents the parts of T that either borrowed from the base T +// // or can be borrowed by other T's using the first T as a base +// // Partials must allocate any heap storage in the provided Arena for any operation. 
+// // typedef Partial; +// +// // Update cache with the Partial for *this, storing any heap memory for the Partial in arena +// void updateCache(Optional cache, Arena& arena) const; +// +// // For debugging, return a useful human-readable string representation of *this +// std::string toString() const; +// +// DeltaT requirements +// +// DeltaT can be variable sized, larger than sizeof(DeltaT), and implement the following: +// +// // Returns the size in bytes of this specific DeltaT instance +// int size(); +// +// // Apply *this to base and return the resulting T +// // Store the Partial for T into cache, allocating any heap memory for the Partial in arena +// T apply(Arena& arena, const T& base, Optional& cache); +// +// // Recreate T from *this and the Partial for T +// T apply(const T::Partial& cache); +// +// // Set or retrieve a boolean flag representing which base ancestor the DeltaT is to be applied to +// void setPrefixSource(bool val); +// bool getPrefixSource() const; +// +// // Set or retrieve a boolean flag representing that a DeltaTree node has been erased +// void setDeleted(bool val); +// bool getDeleted() const; +// +// // For debugging, return a useful human-readable string representation of *this +// std::string toString() const; +// #pragma pack(push, 1) template struct DeltaTree2 { @@ -921,8 +987,11 @@ struct DeltaTree2 { uint8_t maxHeight; // Maximum height of tree after any insertion. Value of 0 means no insertions done. bool largeNodes; // Node size, can be calculated as capacity > SmallSizeLimit but it will be used a lot }; + + // Node is not fixed size.
Most node methods require the context of whether the node is in small or large + // offset mode, passed as a boolean struct Node { - // Offsets are relative to the start of the tree + // Offsets are relative to the start of the DeltaTree union { struct { uint32_t leftChild; @@ -984,6 +1053,16 @@ struct DeltaTree2 { int capacity() const { return size() + nodeBytesFree; } public: + // DecodedNode represents a Node of a DeltaTree and its T::Partial. + // DecodedNodes are created on-demand, as DeltaTree Nodes are visited by a Cursor. + // DecodedNodes link together to form a binary tree with the same Node relationships as their + // corresponding DeltaTree Nodes. Additionally, DecodedNodes store links to their left and + // right ancestors which correspond to possible base Nodes on which the Node's Delta is based. + // + // DecodedNode links are not pointers, but rather indices to be looked up in the DecodeCache + // defined below. An index value of -1 is uninitialized, meaning it is not yet known whether + // the corresponding DeltaTree Node link is non-null in any version of the DeltaTree which is + // using or has used the DecodeCache. struct DecodedNode { DecodedNode(int nodeOffset, int leftParentIndex, int rightParentIndex) : nodeOffset(nodeOffset), leftParentIndex(leftParentIndex), rightParentIndex(rightParentIndex), @@ -1008,6 +1087,12 @@ public: } }; #pragma pack(pop) + + // The DecodeCache is a reference counted structure that stores DecodedNodes by an integer index + // and can be shared across a series of updated copies of a DeltaTree. + // + // DecodedNodes are stored in a contiguous vector, which sometimes must be expanded, so care + // must be taken to resolve DecodedNode pointers again after the DecodeCache has new entries added. struct DecodeCache : FastAllocated, ReferenceCounted { DecodeCache(const T& lowerBound = T(), const T& upperBound = T()) : lowerBound(arena, lowerBound), upperBound(arena, upperBound) {