diff --git a/fdbrpc/FlowTests.actor.cpp b/fdbrpc/FlowTests.actor.cpp
index 40e4ed1c52..f16cfb1ec3 100644
--- a/fdbrpc/FlowTests.actor.cpp
+++ b/fdbrpc/FlowTests.actor.cpp
@@ -1484,3 +1484,133 @@ TEST_CASE("/flow/flow/PromiseStream/move2") {
 	ASSERT(movedTracker.copied == 0);
 	return Void();
 }
+
+constexpr double mutexTestDelay = 0.00001;
+
+ACTOR Future<Void> mutexTest(int id, FlowMutex* mutex, int n, bool allowError, bool* verbose) {
+	while (n-- > 0) {
+		state double d = deterministicRandom()->random01() * mutexTestDelay;
+		if (*verbose) {
+			printf("%d:%d wait %f while unlocked\n", id, n, d);
+		}
+		wait(delay(d));
+
+		if (*verbose) {
+			printf("%d:%d locking\n", id, n);
+		}
+		state FlowMutex::Lock lock = wait(mutex->take());
+		if (*verbose) {
+			printf("%d:%d locked\n", id, n);
+		}
+
+		d = deterministicRandom()->random01() * mutexTestDelay;
+		if (*verbose) {
+			printf("%d:%d wait %f while locked\n", id, n, d);
+		}
+		wait(delay(d));
+
+		// On the last iteration, send an error or drop the lock if allowError is true
+		if (n == 0 && allowError) {
+			if (deterministicRandom()->coinflip()) {
+				// Send explicit error
+				if (*verbose) {
+					printf("%d:%d sending error\n", id, n);
+				}
+				lock.error(end_of_stream());
+			} else {
+				// Do nothing
+				if (*verbose) {
+					printf("%d:%d dropping promise, returning without unlock\n", id, n);
+				}
+			}
+		} else {
+			if (*verbose) {
+				printf("%d:%d unlocking\n", id, n);
+			}
+			lock.release();
+		}
+	}
+
+	if (*verbose) {
+		printf("%d Returning\n", id);
+	}
+	return Void();
+}
+
+TEST_CASE("/flow/flow/FlowMutex") {
+	state int count = 100000;
+
+	// Default verboseness
+	state bool verboseSetting = false;
+	// Useful for debugging, enable verbose mode for this iteration number
+	state int verboseTestIteration = -1;
+
+	try {
+		state bool verbose = verboseSetting || count == verboseTestIteration;
+
+		while (--count > 0) {
+			if (count % 1000 == 0) {
+				printf("%d tests left\n", count);
+			}
+
+			state FlowMutex mutex;
+			state std::vector<Future<Void>> tests;
+
+			state bool allowErrors = deterministicRandom()->coinflip();
+			if (verbose) {
+				printf("\nTesting allowErrors=%d\n", allowErrors);
+			}
+
+			state Optional<Error> error;
+
+			try {
+				for (int i = 0; i < 10; ++i) {
+					tests.push_back(mutexTest(i, &mutex, 10, allowErrors, &verbose));
+				}
+				wait(waitForAll(tests));
+
+				if (allowErrors) {
+					if (verbose) {
+						printf("Final wait in case error was injected by the last actor to finish\n");
+					}
+					wait(success(mutex.take()));
+				}
+			} catch (Error& e) {
+				if (verbose) {
+					printf("Caught error %s\n", e.what());
+				}
+				error = e;
+
+				// Wait for all actors still running to finish their waits and try to take the mutex
+				if (verbose) {
+					printf("Waiting for completions\n");
+				}
+				wait(delay(2 * mutexTestDelay));
+
+				if (verbose) {
+					printf("Future end states:\n");
+				}
+				// All futures should be ready, some with errors.
+				bool allReady = true;
+				for (int i = 0; i < tests.size(); ++i) {
+					auto f = tests[i];
+					if (verbose) {
+						printf(
+						    "  %d: %s\n", i, f.isReady() ? (f.isError() ? f.getError().what() : "done") : "not ready");
+					}
+					allReady = allReady && f.isReady();
+				}
+				ASSERT(allReady);
+			}
+
+			// If an error was caused, one should have been detected.
+			// Otherwise, no errors should be detected.
+			ASSERT(error.present() == allowErrors);
+		}
+	} catch (Error& e) {
+		printf("Error at count=%d\n", count + 1);
+		ASSERT(false);
+	}
+
+	return Void();
+}
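The test above exercises FlowMutex end to end. Distilled, the locking pattern it validates is the following (a minimal sketch using only operations that appear in the test: take(), Lock::release(), and Lock::error()):

```cpp
// Sketch: canonical FlowMutex usage as exercised by the test above.
ACTOR Future<Void> criticalSection(FlowMutex* mutex, bool fail) {
	// Ownership is represented by the Lock object returned from take()
	state FlowMutex::Lock lock = wait(mutex->take());

	// ... do work while holding the mutex ...

	if (fail) {
		lock.error(end_of_stream()); // the next waiter's take() will throw this error
	} else {
		lock.release(); // hand ownership to the next waiter, if any
	}
	return Void();
}
```

Note that simply dropping the Lock without releasing it (the "do nothing" branch in the test) is also a terminal state the implementation must tolerate.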
diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt
index d80254097a..430d92fe96 100644
--- a/fdbserver/CMakeLists.txt
+++ b/fdbserver/CMakeLists.txt
@@ -27,7 +27,6 @@ set(FDBSERVER_SRCS
   IKeyValueContainer.h
   IKeyValueStore.h
   IPager.h
-  IVersionedStore.h
   KeyValueStoreCompressTestData.actor.cpp
   KeyValueStoreMemory.actor.cpp
   KeyValueStoreRocksDB.actor.cpp
diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h
index f9ddd465b6..7f2b1ae723 100644
--- a/fdbserver/DeltaTree.h
+++ b/fdbserver/DeltaTree.h
@@ -26,6 +26,14 @@
 #include "fdbserver/Knobs.h"
 #include <string.h>
 
+#define DELTATREE_DEBUG 0
+
+#if DELTATREE_DEBUG
+#define deltatree_printf(...) printf(__VA_ARGS__)
+#else
+#define deltatree_printf(...)
+#endif
+
 typedef uint64_t Word;
 // Get the number of prefix bytes that are the same between a and b, up to their common length of cl
 static inline int commonPrefixLength(uint8_t const* ap, uint8_t const* bp, int cl) {
@@ -198,10 +206,6 @@ struct DeltaTree {
 				smallOffsets.left = offset;
 			}
 		}
-
-		int size(bool large) const {
-			return delta(large).size() + (large ? sizeof(smallOffsets) : sizeof(largeOffsets));
-		}
 	};
 
 	static constexpr int SmallSizeLimit = std::numeric_limits<uint16_t>::max();
@@ -356,8 +360,6 @@ public:
 		Mirror(const void* treePtr = nullptr, const T* lowerBound = nullptr, const T* upperBound = nullptr)
 		  : tree((DeltaTree*)treePtr), lower(lowerBound), upper(upperBound) {
-			// TODO: Remove these copies into arena and require users of Mirror to keep prev and next alive during its
-			// lifetime
 			lower = new (arena) T(arena, *lower);
 			upper = new (arena) T(arena, *upper);
 
@@ -875,7 +877,10 @@ private:
 		int deltaSize = item.writeDelta(node.delta(largeNodes), *base, commonPrefix);
 		node.delta(largeNodes).setPrefixSource(prefixSourcePrev);
 
-		// printf("Serialized %s to %p\n", item.toString().c_str(), &root.delta(largeNodes));
+		deltatree_printf("Serialized %s to offset %d data: %s\n",
+		                 item.toString().c_str(),
+		                 (uint8_t*)&node - (uint8_t*)this,
+		                 StringRef((uint8_t*)&node.delta(largeNodes), deltaSize).toHexString().c_str());
 
 		// Continue writing after the serialized Delta.
 		uint8_t* wptr = (uint8_t*)&node.delta(largeNodes) + deltaSize;
@@ -899,3 +904,823 @@ private:
 		return wptr - (uint8_t*)&node;
 	}
 };
+
+// DeltaTree2 is a memory mappable binary tree of T objects such that each node's item is
+// stored as a Delta which can reproduce the node's T item given either
+//   - The node's greatest lesser ancestor, called the "left parent"
+//   - The node's least greater ancestor, called the "right parent"
+// One of these ancestors will also happen to be the node's direct parent.
+//
+// The Delta type is intended to make use of ordered prefix compression and borrow all
+// available prefix bytes from the ancestor T which shares the most prefix bytes with
+// the item T being encoded.  If T is implemented properly, this results in perfect
+// prefix compression while performing O(log n) comparisons for a seek.
+//
+// T requirements
+//
+//    Must be compatible with Standalone<T> and must implement the following additional things:
+//
+//    // Return the common prefix length between *this and T
+//    // skipLen is a hint, representing the length that is already known to be common.
+//    int getCommonPrefixLen(const T& other, int skipLen) const;
+//
+//    // Compare *this to rhs, returns < 0 for less than, 0 for equal, > 0 for greater than
+//    // skipLen is a hint, representing the length that is already known to be common.
+//    int compare(const T &rhs, int skipLen) const;
+//
+//    // Writes to d a delta which can create *this from base
+//    // commonPrefix is a hint, representing the length that is already known to be common.
+//    // DeltaT's size need not be static, for more details see below.
+//    void writeDelta(DeltaT &d, const T &base, int commonPrefix) const;
+//
+//    // Returns the size in bytes of the DeltaT required to recreate *this from base
+//    int deltaSize(const T &base) const;
+//
+//    // A type which represents the parts of T that either borrowed from the base T
+//    // or can be borrowed by other T's using the first T as a base
+//    // Partials must allocate any heap storage in the provided Arena for any operation.
+//    typedef Partial;
+//
+//    // Update cache with the Partial for *this, storing any heap memory for the Partial in arena
+//    void updateCache(Optional<Partial> cache, Arena& arena) const;
+//
+//    // For debugging, return a useful human-readable string representation of *this
+//    std::string toString() const;
+//
+// DeltaT requirements
+//
+//    DeltaT can be variable sized, larger than sizeof(DeltaT), and implement the following:
+//
+//    // Returns the size in bytes of this specific DeltaT instance
+//    int size();
+//
+//    // Apply *this to base and return the resulting T
+//    // Store the Partial for T into cache, allocating any heap memory for the Partial in arena
+//    T apply(Arena& arena, const T& base, Optional<T::Partial>& cache);
+//
+//    // Recreate T from *this and the Partial for T
+//    T apply(const T::Partial& cache);
+//
+//    // Set or retrieve a boolean flag representing which base ancestor the DeltaT is to be applied to
+//    void setPrefixSource(bool val);
+//    bool getPrefixSource() const;
+//
+//    // Set or retrieve a boolean flag representing that a DeltaTree node has been erased
+//    void setDeleted(bool val);
+//    bool getDeleted() const;
+//
+//    // For debugging, return a useful human-readable string representation of *this
+//    std::string toString() const;
+//
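To make the two contracts concrete, a minimal T/DeltaT pair could look like the sketch below. IntRecord is hypothetical and purely illustrative; it is fixed-size and performs no actual prefix borrowing, unlike real implementations such as RedwoodRecordRef later in this patch (Standalone-related requirements are reduced to an Arena copy constructor here):

```cpp
// Illustrative only: a fixed-size, no-compression record satisfying the contracts above.
struct IntRecord {
	int64_t value;

	IntRecord(int64_t v = 0) : value(v) {}
	IntRecord(Arena& arena, const IntRecord& other) : value(other.value) {} // Standalone-style copy

	typedef int64_t Partial; // trivially, the whole decoded value

	struct Delta {
		int64_t diff;
		uint8_t flags; // bit 0: prefixSource, bit 1: deleted

		int size() const { return sizeof(Delta); }
		IntRecord apply(Arena& arena, const IntRecord& base, Optional<Partial>& cache) {
			cache = base.value + diff; // populate the Partial as a side effect
			return IntRecord(cache.get());
		}
		IntRecord apply(const Partial& cache) { return IntRecord(cache); }
		void setPrefixSource(bool val) { flags = val ? (flags | 1) : (flags & ~1); }
		bool getPrefixSource() const { return flags & 1; }
		void setDeleted(bool val) { flags = val ? (flags | 2) : (flags & ~2); }
		bool getDeleted() const { return flags & 2; }
		std::string toString() const { return std::to_string(diff); }
	};

	int getCommonPrefixLen(const IntRecord& other, int skipLen) const { return 0; }
	int compare(const IntRecord& rhs, int skipLen) const { return value < rhs.value ? -1 : value > rhs.value; }
	void writeDelta(Delta& d, const IntRecord& base, int commonPrefix) const { d.diff = value - base.value; }
	int deltaSize(const IntRecord& base) const { return sizeof(Delta); }
	void updateCache(Optional<Partial> cache, Arena& arena) const { cache = value; }
	std::string toString() const { return std::to_string(value); }
};
```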
+#pragma pack(push, 1)
+template <typename T, typename DeltaT = typename T::Delta>
+struct DeltaTree2 {
+	typedef typename T::Partial Partial;
+
+	struct {
+		uint16_t numItems; // Number of items in the tree.
+		uint32_t nodeBytesUsed; // Bytes used by nodes (everything after the tree header)
+		uint32_t nodeBytesFree; // Bytes left at end of tree to expand into
+		uint32_t nodeBytesDeleted; // Delta bytes deleted from tree.  Note that some of these bytes could be borrowed
+		                           // by descendents.
+		uint8_t initialHeight; // Height of tree as originally built
+		uint8_t maxHeight; // Maximum height of tree after any insertion.  Value of 0 means no insertions done.
+		bool largeNodes; // Node size, can be calculated as capacity > SmallSizeLimit but it will be used a lot
+	};
+
+	// Node is not fixed size.  Most node methods require the context of whether the node is in small or large
+	// offset mode, passed as a boolean
+	struct Node {
+		// Offsets are relative to the start of the DeltaTree
+		union {
+			struct {
+				uint32_t leftChild;
+				uint32_t rightChild;
+
+			} largeOffsets;
+			struct {
+				uint16_t leftChild;
+				uint16_t rightChild;
+			} smallOffsets;
+		};
+
+		static int headerSize(bool large) { return large ? sizeof(largeOffsets) : sizeof(smallOffsets); }
+
+		// Delta is located after the offsets, which differs by node size
+		DeltaT& delta(bool large) { return large ? *(DeltaT*)(&largeOffsets + 1) : *(DeltaT*)(&smallOffsets + 1); };
+
+		// Delta is located after the offsets, which differs by node size
+		const DeltaT& delta(bool large) const {
+			return large ? *(DeltaT*)(&largeOffsets + 1) : *(DeltaT*)(&smallOffsets + 1);
+		};
+
+		std::string toString(DeltaTree2* tree) const {
+			return format("Node{offset=%d leftChild=%d rightChild=%d delta=%s}",
+			              tree->nodeOffset(this),
+			              getLeftChildOffset(tree->largeNodes),
+			              getRightChildOffset(tree->largeNodes),
+			              delta(tree->largeNodes).toString().c_str());
+		}
+
+#define getMember(m) (large ? largeOffsets.m : smallOffsets.m)
+#define setMember(m, v)                                                                                               \
+	if (large) {                                                                                                      \
+		largeOffsets.m = v;                                                                                           \
+	} else {                                                                                                          \
+		smallOffsets.m = v;                                                                                           \
+	}
+
+		void setRightChildOffset(bool large, int offset) { setMember(rightChild, offset); }
+		void setLeftChildOffset(bool large, int offset) { setMember(leftChild, offset); }
+
+		int getRightChildOffset(bool large) const { return getMember(rightChild); }
+		int getLeftChildOffset(bool large) const { return getMember(leftChild); }
+
+		int size(bool large) const { return delta(large).size() + headerSize(large); }
+#undef getMember
+#undef setMember
+	};
+
+	static constexpr int SmallSizeLimit = std::numeric_limits<uint16_t>::max();
+	static constexpr int LargeTreePerNodeExtraOverhead = sizeof(Node::largeOffsets) - sizeof(Node::smallOffsets);
+
+	int nodeOffset(const Node* n) const { return (uint8_t*)n - (uint8_t*)this; }
+	Node* nodeAt(int offset) { return offset == 0 ? nullptr : (Node*)((uint8_t*)this + offset); }
+	Node* root() { return numItems == 0 ? nullptr : (Node*)(this + 1); }
+	int rootOffset() { return sizeof(DeltaTree2); }
+
+	int size() const { return sizeof(DeltaTree2) + nodeBytesUsed; }
+	int capacity() const { return size() + nodeBytesFree; }
+
+public:
+	// DecodedNode represents a Node of a DeltaTree and its T::Partial.
+	// DecodedNodes are created on-demand, as DeltaTree Nodes are visited by a Cursor.
+	// DecodedNodes link together to form a binary tree with the same Node relationships as their
+	// corresponding DeltaTree Nodes.  Additionally, DecodedNodes store links to their left and
+	// right ancestors which correspond to possible base Nodes on which the Node's Delta is based.
+	//
+	// DecodedNode links are not pointers, but rather indices to be looked up in the DecodeCache
+	// defined below.  An index value of -1 is uninitialized, meaning it is not yet known whether
+	// the corresponding DeltaTree Node link is non-null in any version of the DeltaTree which is
+	// using or has used the DecodeCache.
+	struct DecodedNode {
+		DecodedNode(int nodeOffset, int leftParentIndex, int rightParentIndex)
+		  : nodeOffset(nodeOffset), leftParentIndex(leftParentIndex), rightParentIndex(rightParentIndex),
+		    leftChildIndex(-1), rightChildIndex(-1) {}
+		int nodeOffset;
+		int16_t leftParentIndex;
+		int16_t rightParentIndex;
+		int16_t leftChildIndex;
+		int16_t rightChildIndex;
+		Optional<Partial> partial;
+
+		Node* node(DeltaTree2* tree) const { return tree->nodeAt(nodeOffset); }
+
+		std::string toString() {
+			return format("DecodedNode{nodeOffset=%d leftChildIndex=%d rightChildIndex=%d leftParentIndex=%d "
+			              "rightParentIndex=%d}",
+			              (int)nodeOffset,
+			              (int)leftChildIndex,
+			              (int)rightChildIndex,
+			              (int)leftParentIndex,
+			              (int)rightParentIndex);
+		}
+	};
+#pragma pack(pop)
+
+	// The DecodeCache is a reference counted structure that stores DecodedNodes by an integer index
+	// and can be shared across a series of updated copies of a DeltaTree.
+	//
+	// DecodedNodes are stored in a contiguous vector, which sometimes must be expanded, so care
+	// must be taken to resolve DecodedNode pointers again after the DecodeCache has new entries added.
+	struct DecodeCache : FastAllocated<DecodeCache>, ReferenceCounted<DecodeCache> {
+		DecodeCache(const T& lowerBound = T(), const T& upperBound = T())
+		  : lowerBound(arena, lowerBound), upperBound(arena, upperBound) {
+			decodedNodes.reserve(10);
+			deltatree_printf("DecodedNode size: %d\n", sizeof(DecodedNode));
+		}
+
+		Arena arena;
+		T lowerBound;
+		T upperBound;
+
+		// Index 0 is always the root
+		std::vector<DecodedNode> decodedNodes;
+
+		DecodedNode& get(int index) { return decodedNodes[index]; }
+
+		template <class... Args>
+		int emplace_new(Args&&... args) {
+			int index = decodedNodes.size();
+			decodedNodes.emplace_back(args...);
+			return index;
+		}
+
+		bool empty() const { return decodedNodes.empty(); }
+
+		void clear() {
+			decodedNodes.clear();
+			Arena a;
+			lowerBound = T(a, lowerBound);
+			upperBound = T(a, upperBound);
+			arena = a;
+		}
+	};
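A sketch of the sharing pattern this enables (hypothetical usage; treeV1 and treeV2 stand for two images of the same logical tree, e.g. a page before and after an in-place update, and lowerBound/upperBound are the tree's boundary records):

```cpp
// One DecodeCache serves every version of the tree built from the same bounds,
// so Partials decoded while reading treeV1 are reused when reading treeV2.
auto cache = makeReference<DeltaTree2<RedwoodRecordRef>::DecodeCache>(lowerBound, upperBound);

DeltaTree2<RedwoodRecordRef>::Cursor cursor(cache.getPtr(), treeV1);
cursor.moveFirst();        // populates DecodedNodes and Partials in the shared cache
cursor.switchTree(treeV2); // node offsets are stable across versions, so decoded state stays valid
```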
+	// Cursor provides a way to seek into a DeltaTree and iterate over its contents
+	// The cursor needs a DeltaTree pointer and a DecodeCache, which can be shared
+	// with other DeltaTrees which were incrementally modified to produce the
+	// tree that this cursor is referencing.
+	struct Cursor {
+		Cursor() : cache(nullptr), nodeIndex(-1) {}
+
+		Cursor(DecodeCache* cache, DeltaTree2* tree) : cache(cache), tree(tree), nodeIndex(-1) {}
+
+		Cursor(DecodeCache* cache, DeltaTree2* tree, int nodeIndex) : cache(cache), tree(tree), nodeIndex(nodeIndex) {}
+
+		// Copy constructor does not copy item because normally a copied cursor will be immediately moved.
+		Cursor(const Cursor& c) : cache(c.cache), tree(c.tree), nodeIndex(c.nodeIndex) {}
+
+		Cursor next() const {
+			Cursor c = *this;
+			c.moveNext();
+			return c;
+		}
+
+		Cursor previous() const {
+			Cursor c = *this;
+			c.movePrev();
+			return c;
+		}
+
+		int rootIndex() {
+			if (!cache->empty()) {
+				return 0;
+			} else if (tree->numItems != 0) {
+				return cache->emplace_new(tree->rootOffset(), -1, -1);
+			}
+			return -1;
+		}
+
+		DeltaTree2* tree;
+		DecodeCache* cache;
+		int nodeIndex;
+		mutable Optional<T> item;
+
+		Node* node() const { return tree->nodeAt(cache->get(nodeIndex).nodeOffset); }
+
+		std::string toString() const {
+			if (nodeIndex == -1) {
+				return format("Cursor{nodeIndex=-1}");
+			}
+			return format("Cursor{item=%s indexItem=%s nodeIndex=%d decodedNode=%s node=%s ",
+			              item.present() ? item.get().toString().c_str() : "",
+			              get(cache->get(nodeIndex)).toString().c_str(),
+			              nodeIndex,
+			              cache->get(nodeIndex).toString().c_str(),
+			              node()->toString(tree).c_str());
+		}
+
+		bool valid() const { return nodeIndex != -1; }
+
+		// Get T for Node n, and provide to n's delta the base and local decode cache entries to use/modify
+		const T get(DecodedNode& decoded) const {
+			DeltaT& delta = decoded.node(tree)->delta(tree->largeNodes);
+
+			// If this node's cached partial is populated, then the delta can create T from that alone
+			if (decoded.partial.present()) {
+				return delta.apply(decoded.partial.get());
+			}
+
+			// Otherwise, get the base T
+			bool basePrev = delta.getPrefixSource();
+			int baseIndex = basePrev ? decoded.leftParentIndex : decoded.rightParentIndex;
+
+			// If baseIndex is -1, then base T is DecodeCache's lower or upper bound
+			if (baseIndex == -1) {
+				return delta.apply(cache->arena, basePrev ? cache->lowerBound : cache->upperBound, decoded.partial);
+			}
+
+			// Otherwise, get the base's decoded node
+			DecodedNode& baseDecoded = cache->get(baseIndex);
+
+			// If the base's partial is present, apply delta to it to get result
+			if (baseDecoded.partial.present()) {
+				return delta.apply(cache->arena, baseDecoded.partial.get(), decoded.partial);
+			}
+
+			// Otherwise apply delta to base T
+			return delta.apply(cache->arena, get(baseDecoded), decoded.partial);
+		}
+
+	public:
+		// Get the item at the cursor
+		// Behavior is undefined if the cursor is not valid.
+		// If the cursor is moved, the reference object returned will be modified to
+		// the cursor's new current item.
+		const T& get() const {
+			if (!item.present()) {
+				item = get(cache->get(nodeIndex));
+			}
+			return item.get();
+		}
+
+		void switchTree(DeltaTree2* newTree) { tree = newTree; }
+
+		// If the cursor is valid, return a reference to the cursor's internal T.
+		// Otherwise, returns a reference to the cache's upper boundary.
+		const T& getOrUpperBound() const { return valid() ? get() : cache->upperBound; }
+
+		bool operator==(const Cursor& rhs) const { return nodeIndex == rhs.nodeIndex; }
+		bool operator!=(const Cursor& rhs) const { return nodeIndex != rhs.nodeIndex; }
+
+		// The seek methods, of the form seek[Less|Greater][orEqual](...), are very similar.
+		// They attempt to move the cursor to the [Greatest|Least] item, based on the name of the function.
+		// They will not "see" erased records.
+		// If successful, they return true; if not, they return false while making the cursor invalid.
+		// These methods forward arguments to the seek() overloads, see those for argument descriptions.
+		template <typename... Args>
+		bool seekLessThan(Args... args) {
+			int cmp = seek(args...);
+			if (cmp < 0 || (cmp == 0 && nodeIndex != -1)) {
+				movePrev();
+			}
+			return _hideDeletedBackward();
+		}
+
+		template <typename... Args>
+		bool seekLessThanOrEqual(Args... args) {
+			int cmp = seek(args...);
+			if (cmp < 0) {
+				movePrev();
+			}
+			return _hideDeletedBackward();
+		}
+
+		template <typename... Args>
+		bool seekGreaterThan(Args... args) {
+			int cmp = seek(args...);
+			if (cmp > 0 || (cmp == 0 && nodeIndex != -1)) {
+				moveNext();
+			}
+			return _hideDeletedForward();
+		}
+
+		template <typename... Args>
+		bool seekGreaterThanOrEqual(Args... args) {
+			int cmp = seek(args...);
+			if (cmp > 0) {
+				moveNext();
+			}
+			return _hideDeletedForward();
+		}
+
+		// Get the right child index for parentIndex
+		int getRightChildIndex(int parentIndex) {
+			DecodedNode* parent = &cache->get(parentIndex);
+
+			// The cache may have a child index, but since cache covers multiple versions of a DeltaTree
+			// it can't be used unless the node in the tree has a child.
+			int childOffset = parent->node(tree)->getRightChildOffset(tree->largeNodes);
+
+			if (childOffset == 0) {
+				return -1;
+			}
+
+			// parent has this child so return the index if it is in DecodedNode
+			if (parent->rightChildIndex != -1) {
+				return parent->rightChildIndex;
+			}
+
+			// Create the child's DecodedNode and get its index
+			int childIndex = cache->emplace_new(childOffset, parentIndex, parent->rightParentIndex);
+
+			// Set the index in the parent.  The cache lookup is repeated because the cache has changed.
+			cache->get(parentIndex).rightChildIndex = childIndex;
+			return childIndex;
+		}
+
+		// Get the left child index for parentIndex
+		int getLeftChildIndex(int parentIndex) {
+			DecodedNode* parent = &cache->get(parentIndex);
+
+			// The cache may have a child index, but since cache covers multiple versions of a DeltaTree
+			// it can't be used unless the node in the tree has a child.
+ int childOffset = parent->node(tree)->getLeftChildOffset(tree->largeNodes); + + if (childOffset == 0) { + return -1; + } + + // parent has this child so return the index if it is in DecodedNode + if (parent->leftChildIndex != -1) { + return parent->leftChildIndex; + } + + // Create the child's DecodedNode and get its index + int childIndex = cache->emplace_new(childOffset, parent->leftParentIndex, parentIndex); + + // Set the index in the parent. The cache lookup is repeated because the cache has changed. + cache->get(parentIndex).leftChildIndex = childIndex; + return childIndex; + } + + // seek() moves the cursor to a node containing s or the node that would be the parent of s if s were to be + // added to the tree. If the tree was empty, the cursor will be invalid and the return value will be 0. + // Otherwise, returns the result of s.compare(item at cursor position) + // Does not skip/avoid deleted nodes. + int seek(const T& s, int skipLen = 0) { + nodeIndex = -1; + item.reset(); + deltatree_printf("seek(%s) start %s\n", s.toString().c_str(), toString().c_str()); + int nIndex = rootIndex(); + int cmp = 0; + + while (nIndex != -1) { + nodeIndex = nIndex; + item.reset(); + cmp = s.compare(get(), skipLen); + deltatree_printf("seek(%s) loop cmp=%d %s\n", s.toString().c_str(), cmp, toString().c_str()); + if (cmp == 0) { + break; + } + + if (cmp > 0) { + nIndex = getRightChildIndex(nIndex); + } else { + nIndex = getLeftChildIndex(nIndex); + } + } + + return cmp; + } + + bool moveFirst() { + nodeIndex = -1; + item.reset(); + int nIndex = rootIndex(); + deltatree_printf("moveFirst start %s\n", toString().c_str()); + while (nIndex != -1) { + nodeIndex = nIndex; + deltatree_printf("moveFirst moved %s\n", toString().c_str()); + nIndex = getLeftChildIndex(nIndex); + } + return _hideDeletedForward(); + } + + bool moveLast() { + nodeIndex = -1; + item.reset(); + int nIndex = rootIndex(); + deltatree_printf("moveLast start %s\n", toString().c_str()); + while (nIndex != -1) { + nodeIndex = nIndex; + deltatree_printf("moveLast moved %s\n", toString().c_str()); + nIndex = getRightChildIndex(nIndex); + } + return _hideDeletedBackward(); + } + + // Try to move to next node, sees deleted nodes. + void _moveNext() { + deltatree_printf("_moveNext start %s\n", toString().c_str()); + item.reset(); + // Try to go right + int nIndex = getRightChildIndex(nodeIndex); + + // If we couldn't go right, then the answer is our next ancestor + if (nIndex == -1) { + nodeIndex = cache->get(nodeIndex).rightParentIndex; + deltatree_printf("_moveNext move1 %s\n", toString().c_str()); + } else { + // Go left as far as possible + do { + nodeIndex = nIndex; + deltatree_printf("_moveNext move2 %s\n", toString().c_str()); + nIndex = getLeftChildIndex(nodeIndex); + } while (nIndex != -1); + } + } + + // Try to move to previous node, sees deleted nodes. 
+ void _movePrev() { + deltatree_printf("_movePrev start %s\n", toString().c_str()); + item.reset(); + // Try to go left + int nIndex = getLeftChildIndex(nodeIndex); + // If we couldn't go left, then the answer is our prev ancestor + if (nIndex == -1) { + nodeIndex = cache->get(nodeIndex).leftParentIndex; + deltatree_printf("_movePrev move1 %s\n", toString().c_str()); + } else { + // Go right as far as possible + do { + nodeIndex = nIndex; + deltatree_printf("_movePrev move2 %s\n", toString().c_str()); + nIndex = getRightChildIndex(nodeIndex); + } while (nIndex != -1); + } + } + + bool moveNext() { + _moveNext(); + return _hideDeletedForward(); + } + + bool movePrev() { + _movePrev(); + return _hideDeletedBackward(); + } + + DeltaT& getDelta() const { return cache->get(nodeIndex).node(tree)->delta(tree->largeNodes); } + + bool isErased() const { return getDelta().getDeleted(); } + + // Erase current item by setting its deleted flag to true. + // Tree header is updated if a change is made. + // Cursor is then moved forward to the next non-deleted node. + void erase() { + auto& delta = getDelta(); + if (!delta.getDeleted()) { + delta.setDeleted(true); + --tree->numItems; + tree->nodeBytesDeleted += (delta.size() + Node::headerSize(tree->largeNodes)); + } + moveNext(); + } + + // Erase k by setting its deleted flag to true. Returns true only if k existed + bool erase(const T& k, int skipLen = 0) { + Cursor c(cache, tree); + if (c.seek(k, skipLen) == 0 && !c.isErased()) { + c.erase(); + return true; + } + return false; + } + + // Try to insert k into the DeltaTree, updating byte counts and initialHeight if they + // have changed (they won't if k already exists in the tree but was deleted). + // Returns true if successful, false if k does not fit in the space available + // or if k is already in the tree (and was not already deleted). + // Insertion on an empty tree returns false as well. + // Insert does NOT change the cursor position. 
+		bool insert(const T& k, int skipLen = 0, int maxHeightAllowed = std::numeric_limits<int>::max()) {
+			deltatree_printf("insert %s\n", k.toString().c_str());
+
+			int nIndex = rootIndex();
+			int parentIndex = nIndex;
+			DecodedNode* parentDecoded;
+			// Result of comparing node at parentIndex
+			int cmp = 0;
+			// Height of the inserted node
+			int height = 0;
+
+			// Find the parent to add the node to
+			// This is just seek but modifies parentIndex instead of nodeIndex and tracks the insertion height
+			deltatree_printf(
+			    "insert(%s) start %s\n", k.toString().c_str(), Cursor(cache, tree, parentIndex).toString().c_str());
+			while (nIndex != -1) {
+				++height;
+				parentIndex = nIndex;
+				parentDecoded = &cache->get(parentIndex);
+				cmp = k.compare(get(*parentDecoded), skipLen);
+				deltatree_printf("insert(%s) moved cmp=%d %s\n",
+				                 k.toString().c_str(),
+				                 cmp,
+				                 Cursor(cache, tree, parentIndex).toString().c_str());
+
+				if (cmp == 0) {
+					break;
+				}
+
+				if (cmp > 0) {
+					deltatree_printf("insert(%s) move right\n", k.toString().c_str());
+					nIndex = getRightChildIndex(nIndex);
+				} else {
+					deltatree_printf("insert(%s) move left\n", k.toString().c_str());
+					nIndex = getLeftChildIndex(nIndex);
+				}
+			}
+
+			// If the item is found, mark it erased if it isn't already
+			if (cmp == 0) {
+				DeltaT& delta = tree->nodeAt(parentDecoded->nodeOffset)->delta(tree->largeNodes);
+				if (delta.getDeleted()) {
+					delta.setDeleted(false);
+					++tree->numItems;
+					tree->nodeBytesDeleted -= (delta.size() + Node::headerSize(tree->largeNodes));
+					deltatree_printf("insert(%s) deleted item restored %s\n",
+					                 k.toString().c_str(),
+					                 Cursor(cache, tree, parentIndex).toString().c_str());
+					return true;
+				}
+				deltatree_printf("insert(%s) item exists %s\n",
+				                 k.toString().c_str(),
+				                 Cursor(cache, tree, parentIndex).toString().c_str());
+				return false;
+			}
+
+			// If the tree was empty or the max insertion height is exceeded then fail
+			if (parentIndex == -1 || height > maxHeightAllowed) {
+				return false;
+			}
+
+			// Find the best base to borrow from, see if the resulting delta fits into the tree
+			int leftBaseIndex, rightBaseIndex;
+			bool addingRight = cmp > 0;
+			if (addingRight) {
+				leftBaseIndex = parentIndex;
+				rightBaseIndex = parentDecoded->rightParentIndex;
+			} else {
+				leftBaseIndex = parentDecoded->leftParentIndex;
+				rightBaseIndex = parentIndex;
+			}
+
+			T leftBase = leftBaseIndex == -1 ? cache->lowerBound : get(cache->get(leftBaseIndex));
+			T rightBase = rightBaseIndex == -1 ?
cache->upperBound : get(cache->get(rightBaseIndex)); + + int common = leftBase.getCommonPrefixLen(rightBase, skipLen); + int commonWithLeftParent = k.getCommonPrefixLen(leftBase, common); + int commonWithRightParent = k.getCommonPrefixLen(rightBase, common); + bool borrowFromLeft = commonWithLeftParent >= commonWithRightParent; + + const T* base; + int commonPrefix; + if (borrowFromLeft) { + base = &leftBase; + commonPrefix = commonWithLeftParent; + } else { + base = &rightBase; + commonPrefix = commonWithRightParent; + } + + int deltaSize = k.deltaSize(*base, commonPrefix, false); + int nodeSpace = deltaSize + Node::headerSize(tree->largeNodes); + + if (nodeSpace > tree->nodeBytesFree) { + return false; + } + + int childOffset = tree->size(); + Node* childNode = tree->nodeAt(childOffset); + childNode->setLeftChildOffset(tree->largeNodes, 0); + childNode->setRightChildOffset(tree->largeNodes, 0); + + // Create the decoded node and link it to the parent + // Link the parent's decodednode to the child's decodednode + // Link the parent node in the tree to the new child node + // true if node is being added to right child + int childIndex = cache->emplace_new(childOffset, leftBaseIndex, rightBaseIndex); + + // Get a new parentDecoded pointer as the cache may have changed allocations + parentDecoded = &cache->get(parentIndex); + + if (addingRight) { + // Adding child to right of parent + parentDecoded->rightChildIndex = childIndex; + parentDecoded->node(tree)->setRightChildOffset(tree->largeNodes, childOffset); + } else { + // Adding child to left of parent + parentDecoded->leftChildIndex = childIndex; + parentDecoded->node(tree)->setLeftChildOffset(tree->largeNodes, childOffset); + } + + // Give k opportunity to populate its cache partial record + k.updateCache(cache->get(childIndex).partial, cache->arena); + + DeltaT& childDelta = childNode->delta(tree->largeNodes); + deltatree_printf("insert(%s) writing delta from %s\n", k.toString().c_str(), base->toString().c_str()); + int written = k.writeDelta(childDelta, *base, commonPrefix); + ASSERT(deltaSize == written); + childDelta.setPrefixSource(borrowFromLeft); + + tree->nodeBytesUsed += nodeSpace; + tree->nodeBytesFree -= nodeSpace; + ++tree->numItems; + + // Update max height of the tree if necessary + if (height > tree->maxHeight) { + tree->maxHeight = height; + } + + deltatree_printf("insert(%s) done parent=%s\n", + k.toString().c_str(), + Cursor(cache, tree, parentIndex).toString().c_str()); + deltatree_printf("insert(%s) done child=%s\n", + k.toString().c_str(), + Cursor(cache, tree, childIndex).toString().c_str()); + + return true; + } + + private: + bool _hideDeletedBackward() { + while (nodeIndex != -1 && getDelta().getDeleted()) { + _movePrev(); + } + return nodeIndex != -1; + } + + bool _hideDeletedForward() { + while (nodeIndex != -1 && getDelta().getDeleted()) { + _moveNext(); + } + return nodeIndex != -1; + } + }; + + // Returns number of bytes written + int build(int spaceAvailable, const T* begin, const T* end, const T* lowerBound, const T* upperBound) { + largeNodes = spaceAvailable > SmallSizeLimit; + int count = end - begin; + numItems = count; + nodeBytesDeleted = 0; + initialHeight = (uint8_t)log2(count) + 1; + maxHeight = 0; + + // The boundary leading to the new page acts as the last time we branched right + if (count > 0) { + nodeBytesUsed = buildSubtree( + *root(), begin, end, lowerBound, upperBound, lowerBound->getCommonPrefixLen(*upperBound, 0)); + } else { + nodeBytesUsed = 0; + } + nodeBytesFree = spaceAvailable - 
size();
+		return size();
+	}
+
+private:
+	int buildSubtree(Node& node,
+	                 const T* begin,
+	                 const T* end,
+	                 const T* leftParent,
+	                 const T* rightParent,
+	                 int subtreeCommon) {
+
+		int count = end - begin;
+
+		// Find key to be stored in root
+		int mid = perfectSubtreeSplitPointCached(count);
+		const T& item = begin[mid];
+
+		int commonWithPrev = item.getCommonPrefixLen(*leftParent, subtreeCommon);
+		int commonWithNext = item.getCommonPrefixLen(*rightParent, subtreeCommon);
+
+		bool prefixSourcePrev;
+		int commonPrefix;
+		const T* base;
+		if (commonWithPrev >= commonWithNext) {
+			prefixSourcePrev = true;
+			commonPrefix = commonWithPrev;
+			base = leftParent;
+		} else {
+			prefixSourcePrev = false;
+			commonPrefix = commonWithNext;
+			base = rightParent;
+		}
+
+		int deltaSize = item.writeDelta(node.delta(largeNodes), *base, commonPrefix);
+		node.delta(largeNodes).setPrefixSource(prefixSourcePrev);
+
+		// Continue writing after the serialized Delta.
+		uint8_t* wptr = (uint8_t*)&node.delta(largeNodes) + deltaSize;
+
+		int leftChildOffset;
+		// Serialize left subtree
+		if (count > 1) {
+			leftChildOffset = wptr - (uint8_t*)this;
+			deltatree_printf("%p: offset=%d count=%d serialize left subtree leftChildOffset=%d\n",
+			                 this,
+			                 nodeOffset(&node),
+			                 count,
+			                 leftChildOffset);
+
+			wptr += buildSubtree(*(Node*)wptr, begin, begin + mid, leftParent, &item, commonWithPrev);
+		} else {
+			leftChildOffset = 0;
+		}
+
+		int rightChildOffset;
+		// Serialize right subtree
+		if (count > 2) {
+			rightChildOffset = wptr - (uint8_t*)this;
+			deltatree_printf("%p: offset=%d count=%d serialize right subtree rightChildOffset=%d\n",
+			                 this,
+			                 nodeOffset(&node),
+			                 count,
+			                 rightChildOffset);
+
+			wptr += buildSubtree(*(Node*)wptr, begin + mid + 1, end, &item, rightParent, commonWithNext);
+		} else {
+			rightChildOffset = 0;
+		}
+
+		node.setLeftChildOffset(largeNodes, leftChildOffset);
+		node.setRightChildOffset(largeNodes, rightChildOffset);
+
+		deltatree_printf("%p: Serialized %s as %s\n", this, item.toString().c_str(), node.toString(this).c_str());
+
+		return wptr - (uint8_t*)&node;
+	}
+};
diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h
index 4520a0caf8..aea7bbf9d2 100644
--- a/fdbserver/IPager.h
+++ b/fdbserver/IPager.h
@@ -128,10 +128,7 @@ public:
 
 class IPagerSnapshot {
 public:
-	virtual Future<Reference<const ArenaPage>> getPhysicalPage(LogicalPageID pageID,
-	                                                           bool cacheable,
-	                                                           bool nohit,
-	                                                           bool* fromCache = nullptr) = 0;
+	virtual Future<Reference<const ArenaPage>> getPhysicalPage(LogicalPageID pageID, bool cacheable, bool nohit) = 0;
 
 	virtual bool tryEvictPage(LogicalPageID id) = 0;
 	virtual Version getVersion() const = 0;
@@ -191,10 +188,7 @@ public:
 	// Cacheable indicates that the page should be added to the page cache (if applicable?) as a result of this read.
 	// NoHit indicates that the read should not be considered a cache hit, such as when preloading pages that are
 	// considered likely to be needed soon.
-	virtual Future<Reference<ArenaPage>> readPage(LogicalPageID pageID,
-	                                              bool cacheable = true,
-	                                              bool noHit = false,
-	                                              bool* fromCache = nullptr) = 0;
+	virtual Future<Reference<ArenaPage>> readPage(LogicalPageID pageID, bool cacheable = true, bool noHit = false) = 0;
 
 	virtual Future<Reference<ArenaPage>> readExtent(LogicalPageID pageID) = 0;
 	virtual void releaseExtentReadLock() = 0;
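With the fromCache out-parameter removed, cache-hit accounting is fully internal to the pager and call sites reduce to the plain form below (a sketch inside an actor; 'snapshot' is an assumed IPagerSnapshot reference):

```cpp
// Preload a page without counting it as a cache hit.
Reference<const ArenaPage> page = wait(snapshot->getPhysicalPage(pageID, true /*cacheable*/, true /*noHit*/));
```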
diff --git a/fdbserver/IVersionedStore.h b/fdbserver/IVersionedStore.h
deleted file mode 100644
index 3651aa76a0..0000000000
--- a/fdbserver/IVersionedStore.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * IVersionedStore.h
- *
- * This source file is part of the FoundationDB open source project
- *
- * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef FDBSERVER_IVERSIONEDSTORE_H
-#define FDBSERVER_IVERSIONEDSTORE_H
-#pragma once
-
-#include "fdbserver/IKeyValueStore.h"
-
-#include "flow/flow.h"
-#include "fdbclient/FDBTypes.h"
-
-class IStoreCursor {
-public:
-	virtual Future<Void> findEqual(KeyRef key) = 0;
-	virtual Future<Void> findFirstEqualOrGreater(KeyRef key, int prefetchBytes = 0) = 0;
-	virtual Future<Void> findLastLessOrEqual(KeyRef key, int prefetchBytes = 0) = 0;
-	virtual Future<Void> next() = 0;
-	virtual Future<Void> prev() = 0;
-
-	virtual bool isValid() = 0;
-	virtual KeyRef getKey() = 0;
-	virtual ValueRef getValue() = 0;
-
-	virtual void addref() = 0;
-	virtual void delref() = 0;
-};
-
-class IVersionedStore : public IClosable {
-public:
-	virtual KeyValueStoreType getType() const = 0;
-	virtual bool supportsMutation(int op) const = 0; // If this returns true, then mutate(op, ...) may be called
-	virtual StorageBytes getStorageBytes() const = 0;
-
-	// Writes are provided in an ordered stream.
-	// A write is considered part of (a change leading to) the version determined by the previous call to
-	// setWriteVersion(). A write shall not become durable until the following call to commit() begins, and shall be
-	// durable once the following call to commit() returns.
-	virtual void set(KeyValueRef keyValue) = 0;
-	virtual void clear(KeyRangeRef range) = 0;
-	virtual void mutate(int op, StringRef param1, StringRef param2) = 0;
-	virtual void setWriteVersion(Version) = 0; // The write version must be nondecreasing
-	virtual void setOldestVersion(Version v) = 0; // Set oldest readable version to be used in next commit
-	virtual Version getOldestVersion() const = 0; // Get oldest readable version
-	virtual Future<Void> commit() = 0;
-
-	virtual Future<Void> init() = 0;
-	virtual Version getLatestVersion() const = 0;
-
-	// readAtVersion() may only be called on a version which has previously been passed to setWriteVersion() and never
-	// previously passed to forgetVersion. The returned results when violating this precondition are unspecified; the
-	// store is not required to be able to detect violations.
-	// The returned read cursor provides a consistent snapshot of the versioned store, corresponding to all the writes
-	// done with write versions less than or equal to the given version.
-	// If readAtVersion() is called on the *current* write version, the given read cursor MAY reflect subsequent writes
-	// at the same write version, OR it may represent a snapshot as of the call to readAtVersion().
-	virtual Reference<IStoreCursor> readAtVersion(Version) = 0;
-};
-
-#endif
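For context, the write protocol the deleted interface prescribed looked like this (a sketch built only from the methods above; 'store' is an assumed IVersionedStore reference inside an actor):

```cpp
store->setWriteVersion(v);             // write versions must be nondecreasing
store->set(KeyValueRef(key, value));   // these writes belong to version v
store->clear(KeyRangeRef(begin, end));
wait(store->commit());                 // durable once commit() returns
Reference<IStoreCursor> cursor = store->readAtVersion(v); // snapshot of writes at versions <= v
```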
diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp
index ae25b82a69..4ccb08b9f6 100644
--- a/fdbserver/Knobs.cpp
+++ b/fdbserver/Knobs.cpp
@@ -268,10 +268,6 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
 	init( DD_REMOVE_STORE_ENGINE_DELAY, 60.0 ); if( randomize && BUGGIFY ) DD_REMOVE_STORE_ENGINE_DELAY = deterministicRandom()->random01() * 60.0;
 
-	// Redwood Storage Engine
-	init( PREFIX_TREE_IMMEDIATE_KEY_SIZE_LIMIT, 30 );
-	init( PREFIX_TREE_IMMEDIATE_KEY_SIZE_MIN, 0 );
-
 	// KeyValueStore SQLITE
 	init( CLEAR_BUFFER_SIZE, 20000 );
 	init( READ_VALUE_TIME_ESTIMATE, .00005 );
@@ -714,9 +710,8 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
 	init( REDWOOD_DEFAULT_PAGE_SIZE, 4096 );
 	init( REDWOOD_DEFAULT_EXTENT_SIZE, 32 * 1024 * 1024 );
 	init( REDWOOD_DEFAULT_EXTENT_READ_SIZE, 1024 * 1024 );
-	init( REDWOOD_KVSTORE_CONCURRENT_READS, 64 );
-	init( REDWOOD_COMMIT_CONCURRENT_READS, 64 );
 	init( REDWOOD_EXTENT_CONCURRENT_READS, 4 );
+	init( REDWOOD_KVSTORE_CONCURRENT_READS, 64 );
 	init( REDWOOD_PAGE_REBUILD_MAX_SLACK, 0.33 );
 	init( REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES, 10 );
 	init( REDWOOD_LAZY_CLEAR_MIN_PAGES, 0 );
diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h
index 1705bf084b..de39ecf209 100644
--- a/fdbserver/Knobs.h
+++ b/fdbserver/Knobs.h
@@ -222,10 +222,6 @@ public:
 	double DD_FAILURE_TIME;
 	double DD_ZERO_HEALTHY_TEAM_DELAY;
 
-	// Redwood Storage Engine
-	int PREFIX_TREE_IMMEDIATE_KEY_SIZE_LIMIT;
-	int PREFIX_TREE_IMMEDIATE_KEY_SIZE_MIN;
-
 	// KeyValueStore SQLITE
 	int CLEAR_BUFFER_SIZE;
 	double READ_VALUE_TIME_ESTIMATE;
@@ -646,9 +642,8 @@ public:
 	int REDWOOD_DEFAULT_PAGE_SIZE; // Page size for new Redwood files
 	int REDWOOD_DEFAULT_EXTENT_SIZE; // Extent size for new Redwood files
 	int REDWOOD_DEFAULT_EXTENT_READ_SIZE; // Extent read size for Redwood files
-	int REDWOOD_KVSTORE_CONCURRENT_READS; // Max number of simultaneous point or range reads in progress.
-	int REDWOOD_COMMIT_CONCURRENT_READS; // Max number of concurrent reads done to support commit operations
 	int REDWOOD_EXTENT_CONCURRENT_READS; // Max number of simultaneous extent disk reads in progress.
+	int REDWOOD_KVSTORE_CONCURRENT_READS; // Max number of simultaneous point or range reads in progress.
 	double REDWOOD_PAGE_REBUILD_MAX_SLACK; // When rebuilding pages, max slack to allow in page
 	int REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES; // Number of pages to try to pop from the lazy delete queue and process at
 	                                         // once
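The VersionedBTree changes that follow replace the FIFOQueue cursor's FlowLock, plus the killMutex watchdog that invalidated it on pager error, with a FlowMutex, whose Lock objects can carry errors to waiters directly. A sketch of why the watchdog becomes unnecessary (doWrite() is a hypothetical IO step; FlowMutex semantics as in the FlowTests change above):

```cpp
// Sketch: error hand-off through the Lock replaces the old killMutex watchdog.
ACTOR Future<Void> guardedWrite(FlowMutex* mutex) {
	state FlowMutex::Lock lock = wait(mutex->take());
	try {
		wait(doWrite()); // hypothetical IO while holding the mutex
		lock.release();
	} catch (Error& e) {
		lock.error(e); // every queued waiter's take() now throws e
		throw;
	}
	return Void();
}
```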
diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp
index 99c463470b..e3a18d26af 100644
--- a/fdbserver/VersionedBTree.actor.cpp
+++ b/fdbserver/VersionedBTree.actor.cpp
@@ -19,7 +19,6 @@
  */
 
 #include "flow/flow.h"
-#include "fdbserver/IVersionedStore.h"
 #include "fdbserver/IPager.h"
 #include "fdbclient/Tuple.h"
 #include "flow/serialize.h"
@@ -345,8 +344,7 @@ public:
 		// This exists because writing the queue returns void, not a future
 		Future<Void> writeOperations;
 
-		FlowLock mutex;
-		Future<Void> killMutex;
+		FlowMutex mutex;
 
 		Cursor() : mode(NONE) {}
 
@@ -360,14 +358,6 @@ public:
 		          int readOffset = 0,
 		          LogicalPageID prevExtentEndPageID = invalidLogicalPageID) {
 			queue = q;
-
-			// If the pager gets an error, which includes shutdown, kill the mutex so any waiters can no longer run.
-			// This avoids having every mutex wait also wait on pagerError.
-			killMutex = map(ready(queue->pagerError), [=](Void e) {
-				mutex.kill();
-				return Void();
-			});
-
 			mode = m;
 			firstPageIDWritten = invalidLogicalPageID;
 			offset = readOffset;
@@ -447,14 +437,14 @@ public:
 		}
 
 		// Returns true if the mutex cannot be immediately taken.
-		bool isBusy() { return mutex.activePermits() != 0; }
+		bool isBusy() { return !mutex.available(); }
 
 		// Wait for all operations started before now to be ready, which is done by
 		// obtaining and releasing the mutex.
 		Future<Void> notBusy() {
 			return isBusy() ? map(mutex.take(),
-			                      [&](Void) {
-				                      mutex.release();
+			                      [&](FlowMutex::Lock lock) {
+				                      lock.release();
 				                      return Void();
 			                      })
 			                : Void();
@@ -600,6 +590,7 @@ public:
 	ACTOR static Future<Void> write_impl(Cursor* self, T item) {
 		ASSERT(self->mode == WRITE);
 
+		state FlowMutex::Lock lock;
 		state bool mustWait = self->isBusy();
 		state int bytesNeeded = Codec::bytesNeeded(item);
 		state bool needNewPage =
@@ -621,7 +612,8 @@ public:
 
 		// If we have to wait for the mutex because it's busy, or we need a new page, then wait for the mutex.
 		if (mustWait || needNewPage) {
-			wait(self->mutex.take());
+			FlowMutex::Lock _lock = wait(self->mutex.take());
+			lock = _lock;
 
 			// If we had to wait because the mutex was busy, then update needNewPage as another writer
 			// would have changed the cursor state
@@ -682,7 +674,15 @@ public:
 		++self->queue->numEntries;
 
 		if (mustWait || needNewPage) {
-			self->mutex.release();
+			// Prevent possible stack overflow if too many waiters which require no IO are queued up
+			// Using static because multiple Cursors can be involved
+			static int sinceYield = 0;
+			if (++sinceYield == 1000) {
+				sinceYield = 0;
+				wait(yield());
+			}
+
+			lock.release();
 		}
 
 		return Void();
@@ -703,12 +703,15 @@ public:
 	// Only mutex holders will wait on the page read.
 	ACTOR static Future<Optional<T>> waitThenReadNext(Cursor* self,
 	                                                  Optional<T> upperBound,
-	                                                  bool locked,
+	                                                  FlowMutex::Lock* lock,
 	                                                  bool load) {
-		// Lock the mutex if it wasn't already
-		if (!locked) {
+		state FlowMutex::Lock localLock;
+
+		// Lock the mutex if it wasn't already locked, i.e. we didn't get a lock pointer
+		if (lock == nullptr) {
 			debug_printf("FIFOQueue::Cursor(%s) waitThenReadNext locking mutex\n", self->toString().c_str());
-			wait(self->mutex.take());
+			FlowMutex::Lock newLock = wait(self->mutex.take());
+			localLock = newLock;
 		}
 
 		if (load) {
@@ -717,12 +720,20 @@ public:
 			wait(success(self->nextPageReader));
 		}
 
-		Optional<T> result = wait(self->readNext(upperBound, true));
+		state Optional<T> result = wait(self->readNext(upperBound, &localLock));
+
+		// If a lock was not passed in, so this actor locked the mutex above, then unlock it
+		if (lock == nullptr) {
+			// Prevent possible stack overflow if too many waiters which require no IO are queued up
+			// Using static because multiple Cursors can be involved
+			static int sinceYield = 0;
+			if (++sinceYield == 1000) {
+				sinceYield = 0;
+				wait(yield());
+			}
 
-		// If this actor instance locked the mutex, then unlock it.
-		if (!locked) {
 			debug_printf("FIFOQueue::Cursor(%s) waitThenReadNext unlocking mutex\n", self->toString().c_str());
-			self->mutex.release();
+			localLock.release();
 		}
 
 		return result;
@@ -731,15 +742,15 @@ public:
 	// Read the next item at the cursor (if < upperBound), moving to a new page first if the current page is
 	// exhausted. If a lock is passed in, this call owns the mutex, which would have been locked by readNext() before
 	// a recursive call.
-	Future<Optional<T>> readNext(const Optional<T>& upperBound = {}, bool locked = false) {
+	Future<Optional<T>> readNext(const Optional<T>& upperBound = {}, FlowMutex::Lock* lock = nullptr) {
 		if ((mode != POP && mode != READONLY) || pageID == invalidLogicalPageID || pageID == endPageID) {
 			debug_printf("FIFOQueue::Cursor(%s) readNext returning nothing\n", toString().c_str());
 			return Optional<T>();
 		}
 
-		// If we don't own the mutex and it's not available then acquire it
-		if (!locked && isBusy()) {
-			return waitThenReadNext(this, upperBound, false, false);
+		// If we don't have a lock and the mutex isn't available then acquire it
+		if (lock == nullptr && isBusy()) {
+			return waitThenReadNext(this, upperBound, lock, false);
 		}
 
 		// We now know pageID is valid and should be used, but page might not point to it yet
@@ -755,7 +766,7 @@ public:
 		}
 
 		if (!nextPageReader.isReady()) {
-			return waitThenReadNext(this, upperBound, locked, true);
+			return waitThenReadNext(this, upperBound, lock, true);
 		}
 
 		page = nextPageReader.get();
@@ -1507,6 +1518,7 @@ public:
 	void setSizeLimit(int n) {
 		ASSERT(n > 0);
 		sizeLimit = n;
+		cache.reserve(n);
 	}
 
 	// Get the object for i if it exists, else return nullptr.
@@ -1522,6 +1534,16 @@ public:
 		return nullptr;
 	}
 
+	// If index is in cache, move it to the front of the eviction order
+	void prioritizeEviction(const IndexType& index) {
+		auto i = cache.find(index);
+		if (i != cache.end()) {
+			auto ei = evictionOrder.iterator_to(i->second);
+			evictionOrder.erase(ei);
+			evictionOrder.push_front(i->second);
+		}
+	}
+
 	// Try to evict the item at index from cache
 	// Returns true if item is evicted or was not present in cache
 	bool tryEvict(const IndexType& index) {
@@ -1534,7 +1556,7 @@ public:
 			++g_redwoodMetrics.pagerEvictUnhit;
 		}
 		evictionOrder.erase(evictionOrder.iterator_to(toEvict));
-		cache.erase(toEvict.index);
+		cache.erase(i);
 		return true;
 	}
@@ -1556,6 +1578,7 @@ public:
 			evictionOrder.push_back(entry);
 		}
 	} else {
+		// Otherwise it was a cache miss
 		if (!noMiss) {
 			++g_redwoodMetrics.pagerCacheMiss;
 		}
@@ -1583,8 +1606,8 @@ public:
 			                toString(index).c_str());
 
 			if (!toEvict.item.evictable()) {
-				evictionOrder.erase(evictionOrder.iterator_to(toEvict));
-				evictionOrder.push_back(toEvict);
+				// shift the front to the back
+				evictionOrder.shift_forward(1);
 				++g_redwoodMetrics.pagerEvictFail;
 				break;
 			} else {
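Eviction candidates are taken from the front of evictionOrder, so the new prioritizeEviction() makes a page the next candidate by moving it to the front, and the eviction-loop change above handles an unevictable front entry with a single list rotation instead of an erase/push_back pair. In sketch form (per the comments in the diff, assuming boost-style intrusive list semantics used by evictionOrder):

```cpp
// Before: two list operations to move the unevictable front entry behind the others.
evictionOrder.erase(evictionOrder.iterator_to(toEvict));
evictionOrder.push_back(toEvict);

// After: a single rotation of the intrusive list achieves the same reordering.
evictionOrder.shift_forward(1);
```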
@@ -1602,8 +1625,7 @@ public:
 		return entry.item;
 	}
 
-	// Clears the cache, saving the entries, and then waits for eachWaits for each item to be evictable and evicts it.
-	// The cache should not be Evicts all evictable entries
+	// Clears the cache, saving the entries to a second cache, then waits for each item to be evictable and evicts it.
 	ACTOR static Future<Void> clear_impl(ObjectCache* self) {
 		state ObjectCache::CacheT cache;
 		state EvictionOrderT evictionOrder;
@@ -1748,10 +1770,11 @@ public:
 	          int64_t pageCacheSizeBytes,
 	          Version remapCleanupWindow,
 	          int concurrentExtentReads,
-	          bool memoryOnly = false)
+	          bool memoryOnly = false,
+	          Promise<Void> errorPromise = {})
 	  : desiredPageSize(desiredPageSize), desiredExtentSize(desiredExtentSize), filename(filename), pHeader(nullptr),
 	    pageCacheBytes(pageCacheSizeBytes), memoryOnly(memoryOnly), remapCleanupWindow(remapCleanupWindow),
-	    concurrentExtentReads(new FlowLock(concurrentExtentReads)) {
+	    concurrentExtentReads(new FlowLock(concurrentExtentReads)), errorPromise(errorPromise) {
 
 		if (!g_redwoodMetricsActor.isValid()) {
 			g_redwoodMetricsActor = redwoodMetricsLogger();
@@ -2247,7 +2270,14 @@ public:
 		// TODO: Possibly limit size of remap queue since it must be recovered on cold start
 		RemappedPage r{ v, pageID, newPageID };
 		remapQueue.pushBack(r);
-		remappedPages[pageID][v] = newPageID;
+		auto& versionedMap = remappedPages[pageID];
+
+		// An updated page is unlikely to have its old version read again soon, so prioritize its cache eviction.
+		// If the versioned map is empty for this page then the prior version of the page is stored at the
+		// PhysicalPageID pageID, otherwise it is the last mapped value in the version-ordered map.
+		pageCache.prioritizeEviction(versionedMap.empty() ? pageID : versionedMap.rbegin()->second);
+		versionedMap[v] = newPageID;
+
 		debug_printf("DWALPager(%s) pushed %s\n", filename.c_str(), RemappedPage(r).toString().c_str());
 		return pageID;
 	});
@@ -2256,7 +2286,7 @@ public:
 		return f;
 	}
 
-	void freeUnmappedPage(LogicalPageID pageID, Version v) {
+	void freeUnmappedPage(PhysicalPageID pageID, Version v) {
 		// If v is older than the oldest version still readable then mark pageID as free as of the next commit
 		if (v < effectiveOldestVersion()) {
 			debug_printf("DWALPager(%s) op=freeNow %s @%" PRId64 " oldestVersion=%" PRId64 "\n",
@@ -2274,6 +2304,9 @@ public:
 			             pLastCommittedHeader->oldestVersion);
 			delayedFreeList.pushBack({ v, pageID });
 		}
+
+		// A freed page is unlikely to be read again soon so prioritize its cache eviction
+		pageCache.prioritizeEviction(pageID);
 	}
 
 	LogicalPageID detachRemappedPage(LogicalPageID pageID, Version v) override {
@@ -2325,6 +2358,11 @@ public:
 			             v,
 			             pLastCommittedHeader->oldestVersion);
 			remapQueue.pushBack(RemappedPage{ v, pageID, invalidLogicalPageID });
+
+			// A freed page is unlikely to be read again soon so prioritize its cache eviction
+			PhysicalPageID previousPhysicalPage = i->second.rbegin()->second;
+			pageCache.prioritizeEviction(previousPhysicalPage);
+
 			i->second[v] = invalidLogicalPageID;
 			return;
 		}
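The remap-related changes above hinge on which physical page a new write supersedes. In sketch form (mirroring updatePage above, with remappedPages mapping each original page ID to a version-ordered map of physical locations):

```cpp
// remappedPages : LogicalPageID -> std::map<Version, PhysicalPageID>
// The page being superseded is the one least likely to be read again soon.
PhysicalPageID prior = versionedMap.empty()
                           ? pageID                         // first remap: supersedes the original location
                           : versionedMap.rbegin()->second; // later remaps: supersedes the newest copy
pageCache.prioritizeEviction(prior); // make it the next eviction candidate
versionedMap[v] = newPageID;
```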
@@ -2406,21 +2444,12 @@ public:
 	// Reads the most recent version of pageID, either previously committed or written using updatePage()
 	// in the current commit
-	// If cacheable is false then if fromCache is valid it will be set to true if the page is from cache, otherwise
-	// false. If cacheable is true, fromCache is ignored as the result is automatically from cache by virtue of being
-	// cacheable.
-	Future<Reference<ArenaPage>> readPage(LogicalPageID pageID,
-	                                      bool cacheable,
-	                                      bool noHit = false,
-	                                      bool* fromCache = nullptr) override {
+	Future<Reference<ArenaPage>> readPage(LogicalPageID pageID, bool cacheable, bool noHit = false) override {
 		// Use cached page if present, without triggering a cache hit.
 		// Otherwise, read the page and return it but don't add it to the cache
 		if (!cacheable) {
 			debug_printf("DWALPager(%s) op=readUncached %s\n", filename.c_str(), toString(pageID).c_str());
 			PageCacheEntry* pCacheEntry = pageCache.getIfExists(pageID);
-			if (fromCache != nullptr) {
-				*fromCache = pCacheEntry != nullptr;
-			}
 
 			if (pCacheEntry != nullptr) {
 				debug_printf("DWALPager(%s) op=readUncachedHit %s\n", filename.c_str(), toString(pageID).c_str());
@@ -2478,13 +2507,9 @@ public:
 		return (PhysicalPageID)pageID;
 	}
 
-	Future<Reference<ArenaPage>> readPageAtVersion(LogicalPageID logicalID,
-	                                               Version v,
-	                                               bool cacheable,
-	                                               bool noHit,
-	                                               bool* fromCache) {
+	Future<Reference<ArenaPage>> readPageAtVersion(LogicalPageID logicalID, Version v, bool cacheable, bool noHit) {
 		PhysicalPageID physicalID = getPhysicalPageID(logicalID, v);
-		return readPage(physicalID, cacheable, noHit, fromCache);
+		return readPage(physicalID, cacheable, noHit);
 	}
 
 	void releaseExtentReadLock() override { concurrentExtentReads->release(); }
@@ -2805,6 +2830,7 @@ public:
 			break;
 		}
 
+		// Yield to prevent slow task in case no IO waits are encountered
 		if (++sinceYield >= 100) {
 			sinceYield = 0;
 			wait(yield());
@@ -3184,14 +3210,11 @@ public:
 	  : pager(pager), metaKey(meta), version(version), expired(expiredFuture) {}
 	~DWALPagerSnapshot() override {}
 
-	Future<Reference<const ArenaPage>> getPhysicalPage(LogicalPageID pageID,
-	                                                   bool cacheable,
-	                                                   bool noHit,
-	                                                   bool* fromCache) override {
+	Future<Reference<const ArenaPage>> getPhysicalPage(LogicalPageID pageID, bool cacheable, bool noHit) override {
 		if (expired.isError()) {
 			throw expired.getError();
 		}
-		return map(pager->readPageAtVersion(pageID, version, cacheable, noHit, fromCache),
+		return map(pager->readPageAtVersion(pageID, version, cacheable, noHit),
 		           [=](Reference<ArenaPage> p) { return Reference<const ArenaPage>(std::move(p)); });
 	}
 
@@ -3354,15 +3377,18 @@ std::string toString(BTreePageIDRef id) {
 struct RedwoodRecordRef {
 	typedef uint8_t byte;
 
-	RedwoodRecordRef(KeyRef key = KeyRef(), Version ver = 0, Optional<ValueRef> value = {})
-	  : key(key), version(ver), value(value) {}
+	RedwoodRecordRef(KeyRef key = KeyRef(), Optional<ValueRef> value = {}) : key(key), value(value) {}
 
-	RedwoodRecordRef(Arena& arena, const RedwoodRecordRef& toCopy) : key(arena, toCopy.key), version(toCopy.version) {
+	RedwoodRecordRef(Arena& arena, const RedwoodRecordRef& toCopy) : key(arena, toCopy.key) {
 		if (toCopy.value.present()) {
 			value = ValueRef(arena, toCopy.value.get());
 		}
 	}
 
+	typedef KeyRef Partial;
+
+	void updateCache(Optional<Partial> cache, Arena& arena) const { cache = KeyRef(arena, key); }
+
 	KeyValueRef toKeyValueRef() const { return KeyValueRef(key, value.get()); }
 
 	// RedwoodRecordRefs are used for both internal and leaf pages of the BTree.
@@ -3382,20 +3408,19 @@ struct RedwoodRecordRef {
 	}
 
 	inline RedwoodRecordRef withPageID(BTreePageIDRef id) const {
-		return RedwoodRecordRef(key, version, ValueRef((const uint8_t*)id.begin(), id.size() * sizeof(LogicalPageID)));
+		return RedwoodRecordRef(key, ValueRef((const uint8_t*)id.begin(), id.size() * sizeof(LogicalPageID)));
 	}
 
-	inline RedwoodRecordRef withoutValue() const { return RedwoodRecordRef(key, version); }
+	inline RedwoodRecordRef withoutValue() const { return RedwoodRecordRef(key); }
 
 	inline RedwoodRecordRef withMaxPageID() const {
-		return RedwoodRecordRef(key, version, StringRef((uint8_t*)&maxPageID, sizeof(maxPageID)));
+		return RedwoodRecordRef(key, StringRef((uint8_t*)&maxPageID, sizeof(maxPageID)));
 	}
 
 	// Truncate (key, version, part) tuple to len bytes.
void truncate(int len) { ASSERT(len <= key.size()); key = key.substr(0, len); - version = 0; } // Find the common key prefix between two records, assuming that the first skipLen bytes are the same @@ -3410,10 +3435,7 @@ struct RedwoodRecordRef { int cmp = key.compareSuffix(rhs.key, keySkip); if (cmp == 0) { - cmp = version - rhs.version; - if (cmp == 0) { - cmp = value.compare(rhs.value); - } + cmp = value.compare(rhs.value); } return cmp; } @@ -3424,31 +3446,15 @@ struct RedwoodRecordRef { return (key.size() == k.size()) && (key.substr(skipLen) == k.substr(skipLen)); } - bool sameExceptValue(const RedwoodRecordRef& rhs, int skipLen = 0) const { - return sameUserKey(rhs.key, skipLen) && version == rhs.version; - } + bool sameExceptValue(const RedwoodRecordRef& rhs, int skipLen = 0) const { return sameUserKey(rhs.key, skipLen); } // TODO: Use SplitStringRef (unless it ends up being slower) KeyRef key; Optional value; - Version version; int expectedSize() const { return key.expectedSize() + value.expectedSize(); } int kvBytes() const { return expectedSize(); } - class Reader { - public: - Reader(const void* ptr) : rptr((const byte*)ptr) {} - - const byte* rptr; - - StringRef readString(int len) { - StringRef s(rptr, len); - rptr += len; - return s; - } - }; - #pragma pack(push, 1) struct Delta { @@ -3481,45 +3487,35 @@ struct RedwoodRecordRef { } LengthFormat3; }; - struct int48_t { - static constexpr int64_t MASK = 0xFFFFFFFFFFFFLL; - int32_t high; - int16_t low; - }; - static constexpr int LengthFormatSizes[] = { sizeof(LengthFormat0), sizeof(LengthFormat1), sizeof(LengthFormat2), sizeof(LengthFormat3) }; - static constexpr int VersionDeltaSizes[] = { 0, sizeof(int32_t), sizeof(int48_t), sizeof(int64_t) }; // Serialized Format // // Flags - 1 byte // 1 bit - borrow source is prev ancestor (otherwise next ancestor) // 1 bit - item is deleted - // 1 bit - has value (different from zero-length value, if 0 value len will be 0) - // 1 bits - has nonzero version - // 2 bits - version delta integer size code, maps to 0, 4, 6, 8 + // 1 bit - has value (different from a zero-length value, which is still a value) + // 3 unused bits // 2 bits - length fields format // // Length fields using 3 to 8 bytes total depending on length fields format // // Byte strings - // Key suffix bytes // Value bytes - // Version delta bytes - // + // Key suffix bytes enum EFlags { PREFIX_SOURCE_PREV = 0x80, IS_DELETED = 0x40, HAS_VALUE = 0x20, - HAS_VERSION = 0x10, - VERSION_DELTA_SIZE = 0xC, + // 3 unused bits LENGTHS_FORMAT = 0x03 }; + // Figure out which length format must be used for the given lengths static inline int determineLengthFormat(int prefixLength, int suffixLength, int valueLength) { // Large prefix or suffix length, which should be rare, is format 3 if (prefixLength > 0xFF || suffixLength > 0xFF) { @@ -3590,64 +3586,9 @@ struct RedwoodRecordRef { } } - StringRef getKeySuffix() const { return StringRef(data(), getKeySuffixLength()); } + StringRef getKeySuffix() const { return StringRef(data() + getValueLength(), getKeySuffixLength()); } - StringRef getValue() const { return StringRef(data() + getKeySuffixLength(), getValueLength()); } - - bool hasVersion() const { return flags & HAS_VERSION; } - - int getVersionDeltaSizeBytes() const { - int code = (flags & VERSION_DELTA_SIZE) >> 2; - return VersionDeltaSizes[code]; - } - - static int getVersionDeltaSizeBytes(Version d) { - if (d == 0) { - return 0; - } else if (d == (int32_t)d) { - return sizeof(int32_t); - } else if (d == (d & int48_t::MASK)) { - 
return sizeof(int48_t); - } - return sizeof(int64_t); - } - - int getVersionDelta(const uint8_t* r) const { - int code = (flags & VERSION_DELTA_SIZE) >> 2; - switch (code) { - case 0: - return 0; - case 1: - return *(int32_t*)r; - case 2: - return ((int64_t) static_cast(reinterpret_cast(r)->high) << 16) | - (((int48_t*)r)->low & 0xFFFF); - case 3: - default: - return *(int64_t*)r; - } - } - - // Version delta size should be 0 before calling - int setVersionDelta(Version d, uint8_t* w) { - flags |= HAS_VERSION; - if (d == 0) { - return 0; - } else if (d == (int32_t)d) { - flags |= 1 << 2; - *(uint32_t*)w = d; - return sizeof(uint32_t); - } else if (d == (d & int48_t::MASK)) { - flags |= 2 << 2; - ((int48_t*)w)->high = d >> 16; - ((int48_t*)w)->low = d; - return sizeof(int48_t); - } else { - flags |= 3 << 2; - *(int64_t*)w = d; - return sizeof(int64_t); - } - } + StringRef getValue() const { return StringRef(data(), getValueLength()); } bool hasValue() const { return flags & HAS_VALUE; } @@ -3671,40 +3612,59 @@ struct RedwoodRecordRef { bool getDeleted() const { return flags & IS_DELETED; } + // DeltaTree interface RedwoodRecordRef apply(const RedwoodRecordRef& base, Arena& arena) const { int keyPrefixLen = getKeyPrefixLength(); int keySuffixLen = getKeySuffixLength(); int valueLen = hasValue() ? getValueLength() : 0; + byte* pData = data(); StringRef k; - - Reader r(data()); // If there is a key suffix, reconstitute the complete key into a contiguous string if (keySuffixLen > 0) { - StringRef keySuffix = r.readString(keySuffixLen); k = makeString(keyPrefixLen + keySuffixLen, arena); memcpy(mutateString(k), base.key.begin(), keyPrefixLen); - memcpy(mutateString(k) + keyPrefixLen, keySuffix.begin(), keySuffixLen); + memcpy(mutateString(k) + keyPrefixLen, pData + valueLen, keySuffixLen); } else { // Otherwise just reference the base key's memory k = base.key.substr(0, keyPrefixLen); } - Optional value; - if (hasValue()) { - value = r.readString(valueLen); - } + return RedwoodRecordRef(k, hasValue() ? ValueRef(pData, valueLen) : Optional()); + } - Version v = 0; - if (hasVersion()) { - v = base.version + getVersionDelta(r.rptr); - } + // DeltaTree interface + RedwoodRecordRef apply(const Partial& cache) { + return RedwoodRecordRef(cache, hasValue() ? Optional(getValue()) : Optional()); + } - return RedwoodRecordRef(k, v, value); + RedwoodRecordRef apply(Arena& arena, const Partial& baseKey, Optional& cache) { + int keyPrefixLen = getKeyPrefixLength(); + int keySuffixLen = getKeySuffixLength(); + int valueLen = hasValue() ? getValueLength() : 0; + byte* pData = data(); + + StringRef k; + // If there is a key suffix, reconstitute the complete key into a contiguous string + if (keySuffixLen > 0) { + k = makeString(keyPrefixLen + keySuffixLen, arena); + memcpy(mutateString(k), baseKey.begin(), keyPrefixLen); + memcpy(mutateString(k) + keyPrefixLen, pData + valueLen, keySuffixLen); + } else { + // Otherwise just reference the base key's memory + k = baseKey.substr(0, keyPrefixLen); + } + cache = k; + + return RedwoodRecordRef(k, hasValue() ? 
ValueRef(pData, valueLen) : Optional()); + } + + RedwoodRecordRef apply(Arena& arena, const RedwoodRecordRef& base, Optional& cache) { + return apply(arena, base.key, cache); } int size() const { - int size = 1 + getVersionDeltaSizeBytes(); + int size = 1; switch (flags & LENGTHS_FORMAT) { case 0: return size + sizeof(LengthFormat0) + LengthFormat0.suffixLength + LengthFormat0.valueLength; @@ -3729,40 +3689,38 @@ struct RedwoodRecordRef { if (hasValue()) { flagString += "HasValue|"; } - if (hasVersion()) { - flagString += "HasVersion|"; - } int lengthFormat = flags & LENGTHS_FORMAT; - Reader r(data()); int prefixLen = getKeyPrefixLength(); int keySuffixLen = getKeySuffixLength(); int valueLen = getValueLength(); return format("lengthFormat: %d totalDeltaSize: %d flags: %s prefixLen: %d keySuffixLen: %d " - "versionDeltaSizeBytes: %d valueLen %d raw: %s", + "valueLen %d raw: %s", lengthFormat, size(), flagString.c_str(), prefixLen, keySuffixLen, - getVersionDeltaSizeBytes(), valueLen, StringRef((const uint8_t*)this, size()).toHexString().c_str()); } }; - // Using this class as an alternative for Delta enables reading a DeltaTree while only decoding + // Using this class as an alternative for Delta enables reading a DeltaTree2 while only decoding // its values, so the Reader does not require the original prev/next ancestors. struct DeltaValueOnly : Delta { RedwoodRecordRef apply(const RedwoodRecordRef& base, Arena& arena) const { - Optional value; + return RedwoodRecordRef(KeyRef(), hasValue() ? Optional(getValue()) : Optional()); + } - if (hasValue()) { - value = getValue(); - } + RedwoodRecordRef apply(const Partial& cache) { + return RedwoodRecordRef(KeyRef(), hasValue() ? Optional(getValue()) : Optional()); + } - return RedwoodRecordRef(StringRef(), 0, value); + RedwoodRecordRef apply(Arena& arena, const RedwoodRecordRef& base, Optional& cache) { + cache = KeyRef(); + return RedwoodRecordRef(KeyRef(), hasValue() ? Optional(getValue()) : Optional()); } }; #pragma pack(pop) @@ -3787,16 +3745,13 @@ struct RedwoodRecordRef { int valueLen = value.present() ? value.get().size() : 0; int formatType; - int versionBytes; if (worstCaseOverhead) { formatType = Delta::determineLengthFormat(key.size(), key.size(), valueLen); - versionBytes = version == 0 ? 0 : Delta::getVersionDeltaSizeBytes(version << 1); } else { formatType = Delta::determineLengthFormat(prefixLen, keySuffixLen, valueLen); - versionBytes = version == 0 ? 
0 : Delta::getVersionDeltaSizeBytes(version - base.version); } - return 1 + Delta::LengthFormatSizes[formatType] + keySuffixLen + valueLen + versionBytes; + return 1 + Delta::LengthFormatSizes[formatType] + keySuffixLen + valueLen; } // commonPrefix between *this and base can be passed if known @@ -3838,17 +3793,14 @@ struct RedwoodRecordRef { } uint8_t* wptr = d.data(); - // Write key suffix string - wptr = keySuffix.copyTo(wptr); // Write value bytes - if (value.present()) { + if (valueLen > 0) { wptr = value.get().copyTo(wptr); } - if (version != 0) { - wptr += d.setVersionDelta(version - base.version, wptr); - } + // Write key suffix string + wptr = keySuffix.copyTo(wptr); return wptr - (uint8_t*)&d; } @@ -3868,7 +3820,7 @@ struct RedwoodRecordRef { std::string toString(bool leaf = true) const { std::string r; - r += format("'%s'@%" PRId64 " => ", key.printable().c_str(), version); + r += format("'%s' => ", key.printable().c_str()); if (value.present()) { if (leaf) { r += format("'%s'", kvformat(value.get()).c_str()); @@ -3883,8 +3835,8 @@ struct RedwoodRecordRef { }; struct BTreePage { - typedef DeltaTree BinaryTree; - typedef DeltaTree ValueTree; + typedef DeltaTree2 BinaryTree; + typedef DeltaTree2 ValueTree; #pragma pack(push, 1) struct { @@ -3894,24 +3846,23 @@ struct BTreePage { #pragma pack(pop) int size() const { - auto& t = tree(); - return (uint8_t*)&t - (uint8_t*)this + t.size(); + const BinaryTree* t = tree(); + return (uint8_t*)t - (uint8_t*)this + t->size(); } bool isLeaf() const { return height == 1; } - BinaryTree& tree() { return *(BinaryTree*)(this + 1); } + BinaryTree* tree() { return (BinaryTree*)(this + 1); } - const BinaryTree& tree() const { return *(const BinaryTree*)(this + 1); } + BinaryTree* tree() const { return (BinaryTree*)(this + 1); } - const ValueTree& valueTree() const { return *(const ValueTree*)(this + 1); } + ValueTree* valueTree() const { return (ValueTree*)(this + 1); } - // TODO: boundaries are for decoding, but upper std::string toString(bool write, BTreePageIDRef id, Version ver, - const RedwoodRecordRef* lowerBound, - const RedwoodRecordRef* upperBound) const { + const RedwoodRecordRef& lowerBound, + const RedwoodRecordRef& upperBound) const { std::string r; r += format("BTreePage op=%s %s @%" PRId64 " ptr=%p height=%d count=%d kvBytes=%d\n lowerBound: %s\n upperBound: %s\n", @@ -3920,16 +3871,16 @@ struct BTreePage { ver, this, height, - (int)tree().numItems, + (int)tree()->numItems, (int)kvBytes, - lowerBound->toString(false).c_str(), - upperBound->toString(false).c_str()); + lowerBound.toString(false).c_str(), + upperBound.toString(false).c_str()); try { - if (tree().numItems > 0) { + if (tree()->numItems > 0) { // This doesn't use the cached reader for the page because it is only for debugging purposes, // a cached reader may not exist - BinaryTree::Mirror reader(&tree(), lowerBound, upperBound); - BinaryTree::Cursor c = reader.getCursor(); + BinaryTree::DecodeCache cache(lowerBound, upperBound); + BinaryTree::Cursor c(&cache, tree()); c.moveFirst(); ASSERT(c.valid()); @@ -3939,8 +3890,8 @@ struct BTreePage { r += " "; r += c.get().toString(height == 1); - bool tooLow = c.get().withoutValue() < lowerBound->withoutValue(); - bool tooHigh = c.get().withoutValue() >= upperBound->withoutValue(); + bool tooLow = c.get().withoutValue() < lowerBound.withoutValue(); + bool tooHigh = c.get().withoutValue() >= upperBound.withoutValue(); if (tooLow || tooHigh) { anyOutOfRange = true; if (tooLow) { @@ -3957,7 +3908,7 @@ struct BTreePage { // Out of 
range entries are actually okay now and the result of subtree deletion followed by // incremental insertions of records in the deleted range being added to an adjacent subtree // which is logically expanded encompass the deleted range but still is using the original - // subtree boundaries as DeltaTree boundaries. + // subtree boundaries as DeltaTree2 boundaries. // ASSERT(!anyOutOfRange); } } catch (Error& e) { @@ -3976,11 +3927,7 @@ static void makeEmptyRoot(Reference page) { BTreePage* btpage = (BTreePage*)page->begin(); btpage->height = 1; btpage->kvBytes = 0; - btpage->tree().build(page->size(), nullptr, nullptr, nullptr, nullptr); -} - -BTreePage::BinaryTree::Cursor getCursor(const Reference& page) { - return ((BTreePage::BinaryTree::Mirror*)page->userData)->getCursor(); + btpage->tree()->build(page->size(), nullptr, nullptr, nullptr, nullptr); } struct BoundaryRefAndPage { @@ -3993,8 +3940,6 @@ struct BoundaryRefAndPage { } }; -#define NOT_IMPLEMENTED UNSTOPPABLE_ASSERT(false) - #pragma pack(push, 1) template struct InPlaceArray { @@ -4016,11 +3961,12 @@ struct InPlaceArray { memcpy(begin(), v.begin(), sizeof(T) * v.size()); } - int extraSize() const { return count * sizeof(T); } + int size() const { return count; } + int sizeBytes() const { return count * sizeof(T); } }; #pragma pack(pop) -class VersionedBTree final : public IVersionedStore { +class VersionedBTree { public: // The first possible internal record possible in the tree static RedwoodRecordRef dbBegin; @@ -4086,14 +4032,14 @@ public: #pragma pack(push, 1) struct MetaKey { - static constexpr int FORMAT_VERSION = 8; + static constexpr int FORMAT_VERSION = 11; // This serves as the format version for the entire tree, individual pages will not be versioned uint16_t formatVersion; uint8_t height; LazyClearQueueT::QueueState lazyDeleteQueue; InPlaceArray root; - KeyRef asKeyRef() const { return KeyRef((uint8_t*)this, sizeof(MetaKey) + root.extraSize()); } + KeyRef asKeyRef() const { return KeyRef((uint8_t*)this, sizeof(MetaKey) + root.sizeBytes()); } void fromKeyRef(KeyRef k) { memcpy(this, k.begin(), k.size()); @@ -4101,9 +4047,9 @@ public: } std::string toString() { - return format("{height=%d formatVersion=%d root=%s lazyDeleteQueue=%s}", - (int)height, + return format("{formatVersion=%d height=%d root=%s lazyDeleteQueue=%s}", (int)formatVersion, + (int)height, ::toString(root.get()).c_str(), lazyDeleteQueue.toString().c_str()); } @@ -4112,9 +4058,9 @@ public: // All async opts on the btree are based on pager reads, writes, and commits, so // we can mostly forward these next few functions to the pager - Future getError() override { return m_pager->getError(); } + Future getError() { return m_pager->getError(); } - Future onClosed() override { return m_pager->onClosed(); } + Future onClosed() { return m_pager->onClosed(); } void close_impl(bool dispose) { auto* pager = m_pager; @@ -4125,26 +4071,24 @@ public: pager->close(); } - void dispose() override { return close_impl(true); } + void dispose() { return close_impl(true); } - void close() override { return close_impl(false); } + void close() { return close_impl(false); } - KeyValueStoreType getType() const override { NOT_IMPLEMENTED; } - bool supportsMutation(int op) const override { NOT_IMPLEMENTED; } - StorageBytes getStorageBytes() const override { return m_pager->getStorageBytes(); } + StorageBytes getStorageBytes() const { return m_pager->getStorageBytes(); } // Writes are provided in an ordered stream. 
// A write is considered part of (a change leading to) the version determined by the previous call to // setWriteVersion() A write shall not become durable until the following call to commit() begins, and shall be // durable once the following call to commit() returns - void set(KeyValueRef keyValue) override { + void set(KeyValueRef keyValue) { ++g_redwoodMetrics.opSet; g_redwoodMetrics.opSetKeyBytes += keyValue.key.size(); g_redwoodMetrics.opSetValueBytes += keyValue.value.size(); m_pBuffer->insert(keyValue.key).mutation().setBoundaryValue(m_pBuffer->copyToArena(keyValue.value)); } - void clear(KeyRangeRef clearedRange) override { + void clear(KeyRangeRef clearedRange) { // Optimization for single key clears to create just one mutation boundary instead of two if (clearedRange.begin.size() == clearedRange.end.size() - 1 && clearedRange.end[clearedRange.end.size() - 1] == 0 && clearedRange.end.startsWith(clearedRange.begin)) { @@ -4163,13 +4107,11 @@ public: m_pBuffer->erase(iBegin, iEnd); } - void mutate(int op, StringRef param1, StringRef param2) override { NOT_IMPLEMENTED; } + void setOldestVersion(Version v) { m_newOldestVersion = v; } - void setOldestVersion(Version v) override { m_newOldestVersion = v; } + Version getOldestVersion() const { return m_pager->getOldestVersion(); } - Version getOldestVersion() const override { return m_pager->getOldestVersion(); } - - Version getLatestVersion() const override { + Version getLatestVersion() const { if (m_writeVersion != invalidVersion) return m_writeVersion; return m_pager->getLatestVersion(); @@ -4181,7 +4123,7 @@ public: VersionedBTree(IPager2* pager, std::string name) : m_pager(pager), m_writeVersion(invalidVersion), m_lastCommittedVersion(invalidVersion), m_pBuffer(nullptr), - m_commitReadLock(new FlowLock(SERVER_KNOBS->REDWOOD_COMMIT_CONCURRENT_READS)), m_name(name) { + m_name(name), m_pHeader(nullptr), m_headerSpace(0) { m_lazyClearActor = 0; m_init = init_impl(this); @@ -4209,8 +4151,7 @@ public: break; } // Start reading the page, without caching - entries.push_back( - std::make_pair(q.get(), self->readPage(snapshot, q.get().pageID, nullptr, nullptr, true, false))); + entries.push_back(std::make_pair(q.get(), self->readPage(snapshot, q.get().pageID, true, false))); --toPop; } @@ -4229,8 +4170,8 @@ public: // Iterate over page entries, skipping key decoding using BTreePage::ValueTree which uses // RedwoodRecordRef::DeltaValueOnly as the delta type type to skip key decoding - BTreePage::ValueTree::Mirror reader(&btPage.valueTree(), &dbBegin, &dbEnd); - auto c = reader.getCursor(); + BTreePage::ValueTree::DecodeCache cache(dbBegin, dbEnd); + BTreePage::ValueTree::Cursor c(&cache, btPage.valueTree()); ASSERT(c.moveFirst()); Version v = entry.version; while (1) { @@ -4284,6 +4225,10 @@ public: ACTOR static Future init_impl(VersionedBTree* self) { wait(self->m_pager->init()); + // TODO: Get actual max MetaKey size limit from Pager + self->m_headerSpace = self->m_pager->getUsablePageSize(); + self->m_pHeader = (MetaKey*)new uint8_t[self->m_headerSpace]; + self->m_blockSize = self->m_pager->getUsablePageSize(); state Version latest = self->m_pager->getLatestVersion(); self->m_newOldestVersion = self->m_pager->getOldestVersion(); @@ -4293,12 +4238,12 @@ public: state Key meta = self->m_pager->getMetaKey(); if (meta.size() == 0) { - self->m_header.formatVersion = MetaKey::FORMAT_VERSION; + self->m_pHeader->formatVersion = MetaKey::FORMAT_VERSION; LogicalPageID id = wait(self->m_pager->newPageID()); BTreePageIDRef 
newRoot((LogicalPageID*)&id, 1); debug_printf("new root %s\n", toString(newRoot).c_str()); - self->m_header.root.set(newRoot, sizeof(headerSpace) - sizeof(m_header)); - self->m_header.height = 1; + self->m_pHeader->root.set(newRoot, self->m_headerSpace - sizeof(MetaKey)); + self->m_pHeader->height = 1; ++latest; Reference page = self->m_pager->newPageBuffer(); makeEmptyRoot(page); @@ -4308,23 +4253,23 @@ public: LogicalPageID newQueuePage = wait(self->m_pager->newPageID()); self->m_lazyClearQueue.create( self->m_pager, newQueuePage, "LazyClearQueue", self->m_pager->newLastQueueID(), false); - self->m_header.lazyDeleteQueue = self->m_lazyClearQueue.getState(); - self->m_pager->setMetaKey(self->m_header.asKeyRef()); + self->m_pHeader->lazyDeleteQueue = self->m_lazyClearQueue.getState(); + self->m_pager->setMetaKey(self->m_pHeader->asKeyRef()); wait(self->m_pager->commit()); debug_printf("Committed initial commit.\n"); } else { - self->m_header.fromKeyRef(meta); - self->m_lazyClearQueue.recover(self->m_pager, self->m_header.lazyDeleteQueue, "LazyClearQueueRecovered"); + self->m_pHeader->fromKeyRef(meta); + self->m_lazyClearQueue.recover(self->m_pager, self->m_pHeader->lazyDeleteQueue, "LazyClearQueueRecovered"); } - debug_printf("Recovered btree at version %" PRId64 ": %s\n", latest, self->m_header.toString().c_str()); + debug_printf("Recovered btree at version %" PRId64 ": %s\n", latest, self->m_pHeader->toString().c_str()); self->m_lastCommittedVersion = latest; self->m_lazyClearActor = incrementalLazyClear(self); return Void(); } - Future init() override { return m_init; } + Future init() { return m_init; } virtual ~VersionedBTree() { // This probably shouldn't be called directly (meaning deleting an instance directly) but it should be safe, @@ -4332,22 +4277,14 @@ public: // uncommitted writes so it should not be committed. m_init.cancel(); m_latestCommit.cancel(); - } - Reference readAtVersion(Version v) override { - // Only committed versions can be read. - ASSERT(v <= m_lastCommittedVersion); - Reference snapshot = m_pager->getReadSnapshot(v); - - // This is a ref because snapshot will continue to hold the metakey value memory - KeyRef m = snapshot->getMetaKey(); - - // Currently all internal records generated in the write path are at version 0 - return Reference(new Cursor(snapshot, ((MetaKey*)m.begin())->root.get(), (Version)0)); + if (m_pHeader != nullptr) { + delete[](uint8_t*) m_pHeader; + } } // Must be nondecreasing - void setWriteVersion(Version v) override { + void setWriteVersion(Version v) { ASSERT(v > m_lastCommittedVersion); // If there was no current mutation buffer, create one in the buffer map and update m_pBuffer if (m_pBuffer == nullptr) { @@ -4361,7 +4298,7 @@ public: m_writeVersion = v; } - Future commit() override { + Future commit() { if (m_pBuffer == nullptr) return m_latestCommit; return commit_impl(this); @@ -4401,8 +4338,8 @@ public: ASSERT(s.numPages == 1); // The btree should now be a single non-oversized root page. - ASSERT(self->m_header.height == 1); - ASSERT(self->m_header.root.count == 1); + ASSERT(self->m_pHeader->height == 1); + ASSERT(self->m_pHeader->root.count == 1); // From the pager's perspective the only pages that should be in use are the btree root and // the previously mentioned lazy delete queue page. 
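
The initialization above replaces the fixed-size MetaKey union with a single header buffer of m_headerSpace bytes, sized from the pager's usable page size, into which the trailing InPlaceArray root grows. A minimal self-contained sketch of this variable-size trailing-array header pattern follows; all names are hypothetical stand-ins, not the actual Redwood types:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

#pragma pack(push, 1)
// Hypothetical stand-in for InPlaceArray<T>: a count followed by count T's
// stored directly after the enclosing struct in the same allocation.
template <typename T>
struct InPlaceArraySketch {
    uint16_t count;
    T* begin() { return (T*)(this + 1); }
    // set() must be told how many bytes remain after the enclosing struct
    void set(const std::vector<T>& v, int availableBytes) {
        assert(int(v.size() * sizeof(T)) <= availableBytes);
        count = v.size();
        memcpy(begin(), v.data(), v.size() * sizeof(T));
    }
    int sizeBytes() const { return count * sizeof(T); }
};

// Hypothetical stand-in for MetaKey: fixed fields plus a trailing array,
// which therefore must be the last member.
struct HeaderSketch {
    uint16_t formatVersion;
    uint8_t height;
    InPlaceArraySketch<uint32_t> root;
};
#pragma pack(pop)

int main() {
    // Allocate one oversized block for the header, as init_impl() now does
    int headerSpace = 4096;
    uint8_t* buf = new uint8_t[headerSpace];
    auto* h = (HeaderSketch*)buf;
    h->formatVersion = 11;
    h->height = 1;
    h->root.set({ 7, 8, 9 }, headerSpace - sizeof(HeaderSketch));
    // Serialized length is the fixed struct plus the trailing array bytes,
    // mirroring MetaKey::asKeyRef()
    int len = sizeof(HeaderSketch) + h->root.sizeBytes();
    assert(len == int(sizeof(HeaderSketch) + 3 * sizeof(uint32_t)));
    delete[] buf;
    return 0;
}
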
@@ -4434,14 +4371,14 @@ private: inline bool equalToSet(ValueRef val) { return isSet() && value == val; } - inline RedwoodRecordRef toRecord(KeyRef userKey, Version version) const { + inline RedwoodRecordRef toRecord(KeyRef userKey) const { // No point in serializing an atomic op, it needs to be coalesced to a real value. ASSERT(!isAtomicOp()); if (isClear()) - return RedwoodRecordRef(userKey, version); + return RedwoodRecordRef(userKey); - return RedwoodRecordRef(userKey, version, value); + return RedwoodRecordRef(userKey, value); } std::string toString() const { return format("op=%d val='%s'", op, printable(value).c_str()); } @@ -4632,20 +4569,15 @@ private: Version m_writeVersion; Version m_lastCommittedVersion; Version m_newOldestVersion; - Reference m_commitReadLock; Future m_latestCommit; Future m_init; std::string m_name; int m_blockSize; - std::unordered_map parents; ParentInfoMapT childUpdateTracker; - // MetaKey changes size so allocate space for it to expand into. FIXME: Steve is fixing this to be dynamically - // sized. - union { - uint8_t headerSpace[sizeof(MetaKey) + sizeof(LogicalPageID) * 200]; - MetaKey m_header; - }; + // MetaKey has a variable size, it can be as large as m_headerSpace + MetaKey* m_pHeader; + int m_headerSpace; LazyClearQueueT m_lazyClearQueue; Future m_lazyClearActor; @@ -4663,7 +4595,7 @@ private: int count; // Number of records added to the page int pageSize; // Page size required to hold a BTreePage of the added records, which is a multiple of blockSize int bytesLeft; // Bytes in pageSize that are unused by the BTreePage so far - bool largeDeltaTree; // Whether or not the DeltaTree in the generated page is in the 'large' size range + bool largeDeltaTree; // Whether or not the tree in the generated page is in the 'large' size range int blockSize; // Base block size by which pageSize can be incremented int blockCount; // The number of blocks in pageSize int kvBytes; // The amount of user key/value bytes added to the page @@ -4859,6 +4791,7 @@ private: // Lower bound of the page being added to state RedwoodRecordRef pageLowerBound = lowerBound->withoutValue(); state RedwoodRecordRef pageUpperBound; + state int sinceYield = 0; state int pageIndex; @@ -4925,7 +4858,7 @@ private: pageUpperBound.toString(false).c_str()); int deltaTreeSpace = p.pageSize - sizeof(BTreePage); - state int written = btPage->tree().build( + state int written = btPage->tree()->build( deltaTreeSpace, &entries[p.startIndex], &entries[endIndex], &pageLowerBound, &pageUpperBound); if (written > deltaTreeSpace) { @@ -4987,7 +4920,10 @@ private: } } - wait(yield()); + if (++sinceYield > 100) { + sinceYield = 0; + wait(yield()); + } if (REDWOOD_DEBUG) { auto& p = pagesToBuild[pageIndex]; @@ -5022,7 +4958,7 @@ private: // While there are multiple child pages for this version we must write new tree levels. 
while (records.size() > 1) { - self->m_header.height = ++height; + self->m_pHeader->height = ++height; Standalone> newRecords = wait(writePages(self, &dbBegin, &dbEnd, records, height, version, BTreePageIDRef())); debug_printf("Wrote a new root level at version %" PRId64 " height %d size %lu pages\n", @@ -5048,28 +4984,18 @@ private: ACTOR static Future> readPage(Reference snapshot, BTreePageIDRef id, - const RedwoodRecordRef* lowerBound, - const RedwoodRecordRef* upperBound, bool forLazyClear = false, - bool cacheable = true, - bool* fromCache = nullptr) { - if (!forLazyClear) { - debug_printf("readPage() op=read %s @%" PRId64 " lower=%s upper=%s\n", - toString(id).c_str(), - snapshot->getVersion(), - lowerBound->toString(false).c_str(), - upperBound->toString(false).c_str()); - } else { - debug_printf( - "readPage() op=readForDeferredClear %s @%" PRId64 " \n", toString(id).c_str(), snapshot->getVersion()); - } + bool cacheable = true) { - wait(yield()); + debug_printf("readPage() op=read%s %s @%" PRId64 "\n", + forLazyClear ? "ForDeferredClear" : "", + toString(id).c_str(), + snapshot->getVersion()); state Reference page; if (id.size() == 1) { - Reference p = wait(snapshot->getPhysicalPage(id.front(), cacheable, false, fromCache)); + Reference p = wait(snapshot->getPhysicalPage(id.front(), cacheable, false)); page = std::move(p); } else { ASSERT(!id.empty()); @@ -5080,11 +5006,6 @@ private: std::vector> pages = wait(getAll(reads)); // TODO: Cache reconstituted super pages somehow, perhaps with help from the Pager. page = ArenaPage::concatPages(pages); - - // In the current implementation, SuperPages are never present in the cache - if (fromCache != nullptr) { - *fromCache = false; - } } debug_printf("readPage() op=readComplete %s @%" PRId64 " \n", toString(id).c_str(), snapshot->getVersion()); @@ -5093,24 +5014,39 @@ private: metrics.pageRead += 1; metrics.pageReadExt += (id.size() - 1); - if (!forLazyClear && page->userData == nullptr) { - debug_printf("readPage() Creating Mirror for %s @%" PRId64 " lower=%s upper=%s\n", - toString(id).c_str(), - snapshot->getVersion(), - lowerBound->toString(false).c_str(), - upperBound->toString(false).c_str()); - page->userData = new BTreePage::BinaryTree::Mirror(&pTreePage->tree(), lowerBound, upperBound); - page->userDataDestructor = [](void* ptr) { delete (BTreePage::BinaryTree::Mirror*)ptr; }; - } - - if (!forLazyClear) { - debug_printf("readPage() %s\n", - pTreePage->toString(false, id, snapshot->getVersion(), lowerBound, upperBound).c_str()); - } - return std::move(page); } + // Get cursor into a BTree node, creating decode cache from boundaries if needed + static BTreePage::BinaryTree::Cursor getCursor(Reference page, + const RedwoodRecordRef& lowerBound, + const RedwoodRecordRef& upperBound) { + if (page->userData == nullptr) { + debug_printf("Creating DecodeCache for ptr=%p lower=%s upper=%s\n", + page->begin(), + lowerBound.toString().c_str(), + upperBound.toString().c_str()); + + BTreePage::BinaryTree::DecodeCache* cache = new BTreePage::BinaryTree::DecodeCache(lowerBound, upperBound); + page->userData = cache; + page->userDataDestructor = [](void* cache) { ((BTreePage::BinaryTree::DecodeCache*)cache)->delref(); }; + } + + return BTreePage::BinaryTree::Cursor((BTreePage::BinaryTree::DecodeCache*)page->userData, + ((BTreePage*)page->begin())->tree()); + } + + // Get cursor into a BTree node from a child link + static BTreePage::BinaryTree::Cursor getCursor(const Reference& page, + const BTreePage::BinaryTree::Cursor& link) { + if 
(page->userData == nullptr) { + return getCursor(page, link.get(), link.next().getOrUpperBound()); + } + + return BTreePage::BinaryTree::Cursor((BTreePage::BinaryTree::DecodeCache*)page->userData, + ((BTreePage*)page->begin())->tree()); + } + static void preLoadPage(IPagerSnapshot* snapshot, BTreePageIDRef id) { g_redwoodMetrics.btreeLeafPreload += 1; g_redwoodMetrics.btreeLeafPreloadExt += (id.size() - 1); @@ -5137,6 +5073,18 @@ private: state BTreePageIDRef newID; newID.resize(*arena, oldID.size()); + if (REDWOOD_DEBUG) { + BTreePage* btPage = (BTreePage*)page->begin(); + BTreePage::BinaryTree::DecodeCache* cache = (BTreePage::BinaryTree::DecodeCache*)page->userData; + debug_printf_always( + "updateBTreePage(%s, %s) %s\n", + ::toString(oldID).c_str(), + ::toString(writeVersion).c_str(), + cache == nullptr + ? "" + : btPage->toString(true, oldID, writeVersion, cache->lowerBound, cache->upperBound).c_str()); + } + if (oldID.size() == 1) { LogicalPageID id = wait(self->m_pager->atomicUpdatePage(oldID.front(), page, writeVersion)); newID.front() = id; @@ -5165,33 +5113,33 @@ private: return newID; } - // Copy page and initialize a Mirror for reading it. - Reference cloneForUpdate(Reference page) { + // Copy page to a new page which shares the same DecodeCache with the old page + static Reference clonePageForUpdate(Reference page) { Reference newPage = page->cloneContents(); - auto oldMirror = (const BTreePage::BinaryTree::Mirror*)page->userData; - auto newBTPage = (BTreePage*)newPage->mutate(); + BTreePage::BinaryTree::DecodeCache* cache = (BTreePage::BinaryTree::DecodeCache*)page->userData; + cache->addref(); + newPage->userData = cache; + newPage->userDataDestructor = [](void* cache) { ((BTreePage::BinaryTree::DecodeCache*)cache)->delref(); }; - newPage->userData = - new BTreePage::BinaryTree::Mirror(&newBTPage->tree(), oldMirror->lowerBound(), oldMirror->upperBound()); - newPage->userDataDestructor = [](void* ptr) { delete (BTreePage::BinaryTree::Mirror*)ptr; }; + debug_printf("cloneForUpdate(%p -> %p size=%d\n", page->begin(), newPage->begin(), page->size()); return newPage; } // Each call to commitSubtree() will pass most of its arguments via a this structure because the caller // will need access to these parameters after commitSubtree() is done. - struct InternalPageSliceUpdate { + struct InternalPageSliceUpdate : public FastAllocated { // The logical range for the subtree's contents. Due to subtree clears, these boundaries may not match // the lower/upper bounds needed to decode the page. // Subtree clears can cause the boundaries for decoding the page to be more restrictive than the subtree's // logical boundaries. When a subtree is fully cleared, the link to it is replaced with a null link, but // the key boundary remains in tact to support decoding of the previous subtree. - const RedwoodRecordRef* subtreeLowerBound; - const RedwoodRecordRef* subtreeUpperBound; + RedwoodRecordRef subtreeLowerBound; + RedwoodRecordRef subtreeUpperBound; // The lower/upper bound for decoding the root of the subtree - const RedwoodRecordRef* decodeLowerBound; - const RedwoodRecordRef* decodeUpperBound; + RedwoodRecordRef decodeLowerBound; + RedwoodRecordRef decodeUpperBound; bool boundariesNormal() const { // If the decode upper boundary is the subtree upper boundary the pointers will be the same @@ -5199,7 +5147,7 @@ private: // that the keys are the same. This happens for the first remaining subtree of an internal page // after the prior subtree(s) were cleared. 
return (decodeUpperBound == subtreeUpperBound) && - (decodeLowerBound == subtreeLowerBound || decodeLowerBound->sameExceptValue(*subtreeLowerBound)); + (decodeLowerBound == subtreeLowerBound || decodeLowerBound.sameExceptValue(subtreeLowerBound)); } // The record range of the subtree slice is cBegin to cEnd @@ -5228,7 +5176,7 @@ private: // The upper boundary expected, if any, by the last child in either [cBegin, cEnd) or newLinks // If the last record in the range has a null link then this will be null. - const RedwoodRecordRef* expectedUpperBound; + Optional expectedUpperBound; bool inPlaceUpdate; @@ -5238,7 +5186,7 @@ private: void cleared() { inPlaceUpdate = false; childrenChanged = true; - expectedUpperBound = nullptr; + expectedUpperBound.reset(); } // Page was updated in-place through edits and written to maybeNewID @@ -5249,12 +5197,12 @@ private: metrics.pageModifyExt += (maybeNewID.size() - 1); metrics.modifyFillPct += (double)btPage->size() / capacity; metrics.modifyStoredPct += (double)btPage->kvBytes / capacity; - metrics.modifyItemCount += btPage->tree().numItems; + metrics.modifyItemCount += btPage->tree()->numItems; // The boundaries can't have changed, but the child page link may have. - if (maybeNewID != decodeLowerBound->getChildPage()) { + if (maybeNewID != decodeLowerBound.getChildPage()) { // Add page's decode lower bound to newLinks set without its child page, intially - newLinks.push_back_deep(newLinks.arena(), decodeLowerBound->withoutValue()); + newLinks.push_back_deep(newLinks.arena(), decodeLowerBound.withoutValue()); // Set the child page ID, which has already been allocated in result.arena() newLinks.back().setChildPage(maybeNewID); @@ -5275,7 +5223,7 @@ private: // If the replacement records ended on a non-null child page, then the expect upper bound is // the subtree upper bound since that is what would have been used for the page(s) rebuild, // otherwise it is null. - expectedUpperBound = newLinks.back().value.present() ? subtreeUpperBound : nullptr; + expectedUpperBound = newLinks.back().value.present() ? subtreeUpperBound : Optional(); } // Get the first record for this range AFTER applying whatever changes were made @@ -5286,7 +5234,7 @@ private: } return &newLinks.front(); } - return decodeLowerBound; + return &decodeLowerBound; } std::string toString() const { @@ -5297,12 +5245,12 @@ private: childrenChanged && newLinks.empty(), childrenChanged, inPlaceUpdate); - s += format("SubtreeLower: %s\n", subtreeLowerBound->toString(false).c_str()); - s += format(" DecodeLower: %s\n", decodeLowerBound->toString(false).c_str()); - s += format(" DecodeUpper: %s\n", decodeUpperBound->toString(false).c_str()); - s += format("SubtreeUpper: %s\n", subtreeUpperBound->toString(false).c_str()); + s += format("SubtreeLower: %s\n", subtreeLowerBound.toString(false).c_str()); + s += format(" DecodeLower: %s\n", decodeLowerBound.toString(false).c_str()); + s += format(" DecodeUpper: %s\n", decodeUpperBound.toString(false).c_str()); + s += format("SubtreeUpper: %s\n", subtreeUpperBound.toString(false).c_str()); s += format("expectedUpperBound: %s\n", - expectedUpperBound ? expectedUpperBound->toString(false).c_str() : "(null)"); + expectedUpperBound.present() ? 
expectedUpperBound.get().toString(false).c_str() : "(null)"); for (int i = 0; i < newLinks.size(); ++i) { s += format(" %i: %s\n", i, newLinks[i].toString(false).c_str()); } @@ -5313,42 +5261,61 @@ private: struct InternalPageModifier { InternalPageModifier() {} - InternalPageModifier(BTreePage* p, BTreePage::BinaryTree::Mirror* m, bool updating, ParentInfo* parentInfo) - : btPage(p), m(m), updating(updating), changesMade(false), parentInfo(parentInfo) {} + InternalPageModifier(Reference p, bool alreadyCloned, bool updating, ParentInfo* parentInfo) + : page(p), clonedPage(alreadyCloned), updating(updating), changesMade(false), parentInfo(parentInfo) {} + // Whether updating the existing page is allowed bool updating; - BTreePage* btPage; - BTreePage::BinaryTree::Mirror* m; + Reference page; + + // Whether or not page has been cloned for update + bool clonedPage; + Standalone> rebuild; + + // Whether there are any changes to the page, either made in place or staged in rebuild bool changesMade; ParentInfo* parentInfo; + BTreePage* btPage() const { return (BTreePage*)page->begin(); } + bool empty() const { if (updating) { - return m->tree->numItems == 0; + return btPage()->tree()->numItems == 0; } else { return rebuild.empty(); } } + void cloneForUpdate() { + if (!clonedPage) { + page = clonePageForUpdate(page); + clonedPage = true; + } + } + // end is the cursor position of the first record of the unvisited child link range, which // is needed if the insert requires switching from update to rebuild mode. void insert(BTreePage::BinaryTree::Cursor end, const VectorRef& recs) { int i = 0; if (updating) { + // Update must be done in the new tree, not the original tree where the end cursor will be from + end.tree = btPage()->tree(); + // TODO: insert recs in a random order to avoid new subtree being entirely right child links while (i != recs.size()) { const RedwoodRecordRef& rec = recs[i]; debug_printf("internal page (updating) insert: %s\n", rec.toString(false).c_str()); - if (!m->insert(rec)) { + if (!end.insert(rec)) { debug_printf("internal page: failed to insert %s, switching to rebuild\n", rec.toString(false).c_str()); + // Update failed, so populate rebuild vector with everything up to but not including end, which // may include items from recs that were already added. auto c = end; if (c.moveFirst()) { - rebuild.reserve(rebuild.arena(), c.mirror->tree->numItems); + rebuild.reserve(rebuild.arena(), c.tree->numItems); while (c != end) { debug_printf(" internal page rebuild: add %s\n", c.get().toString(false).c_str()); rebuild.push_back(rebuild.arena(), c.get()); @@ -5358,7 +5325,7 @@ private: updating = false; break; } - btPage->kvBytes += rec.kvBytes(); + btPage()->kvBytes += rec.kvBytes(); ++i; } } @@ -5401,11 +5368,19 @@ private: if (u.childrenChanged) { if (updating) { auto c = u.cBegin; + + if (c != u.cEnd) { + cloneForUpdate(); + // must point c to the tree to erase from + c.tree = btPage()->tree(); + } + while (c != u.cEnd) { debug_printf("internal page (updating) erasing: %s\n", c.get().toString(false).c_str()); - btPage->kvBytes -= c.get().kvBytes(); + btPage()->kvBytes -= c.get().kvBytes(); c.erase(); } + // [cBegin, cEnd) is now erased, and cBegin is invalid, so cEnd represents the end // of the range that comes before any part of newLinks that can't be added if there // is not enough space. 
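
The InternalPageSliceUpdate and InternalPageModifier changes above arrange for a cached page to be copied at most once, immediately before its first in-place mutation, with the clone sharing the original page's ref-counted DecodeCache. A rough sketch of that copy-on-write discipline, using std::shared_ptr in place of Reference/addref/delref (all names hypothetical):

#include <cstdint>
#include <memory>
#include <vector>

struct DecodeCacheSketch {}; // stands in for the shared, ref-counted DecodeCache

struct PageSketch {
    std::vector<uint8_t> bytes;               // page contents, deep-copied on clone
    std::shared_ptr<DecodeCacheSketch> cache; // shared between original and clone
};

struct ModifierSketch {
    std::shared_ptr<PageSketch> page;
    bool clonedPage = false;

    // Clone the page contents once; further calls are no-ops
    void cloneForUpdate() {
        if (!clonedPage) {
            page = std::make_shared<PageSketch>(*page); // copies bytes, shares cache
            clonedPage = true;
        }
    }

    // Every mutating operation copies first, so the cached original stays intact
    void eraseByte(size_t i) {
        cloneForUpdate();
        page->bytes.erase(page->bytes.begin() + i);
    }
};

int main() {
    auto original = std::make_shared<PageSketch>();
    original->bytes = { 1, 2, 3 };
    original->cache = std::make_shared<DecodeCacheSketch>();

    ModifierSketch m{ original };
    m.eraseByte(0); // first mutation clones the page
    m.eraseByte(0); // second mutation reuses the clone

    // The original is unchanged and both pages share one decode cache
    return (original->bytes.size() == 3 && original->cache == m.page->cache) ? 0 : 1;
}
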
@@ -5420,8 +5395,11 @@ private: changesMade = true; } else { + // If this was an in-place update, where the child page IDs do not change, notify the + // parentInfo that those pages have been updated so it can possibly eliminate their + // second writes later. if (u.inPlaceUpdate) { - for (auto id : u.decodeLowerBound->getChildPage()) { + for (auto id : u.decodeLowerBound.getChildPage()) { parentInfo->pageUpdated(id); } } @@ -5430,12 +5408,14 @@ private: } // If there is an expected upper boundary for the next range after u - if (u.expectedUpperBound != nullptr) { + if (u.expectedUpperBound.present()) { // Then if it does not match the next boundary then insert a dummy record - if (nextBoundary == nullptr || - (nextBoundary != u.expectedUpperBound && !nextBoundary->sameExceptValue(*u.expectedUpperBound))) { - RedwoodRecordRef rec = u.expectedUpperBound->withoutValue(); + if (nextBoundary == nullptr || (nextBoundary != &u.expectedUpperBound.get() && + !nextBoundary->sameExceptValue(u.expectedUpperBound.get()))) { + RedwoodRecordRef rec = u.expectedUpperBound.get().withoutValue(); debug_printf("applyUpdate adding dummy record %s\n", rec.toString(false).c_str()); + + cloneForUpdate(); insert(u.cEnd, { &rec, 1 }); changesMade = true; } @@ -5474,15 +5454,14 @@ private: debug_printf("%s -------------------------------------\n", context.c_str()); } + state Reference page = wait(readPage(snapshot, rootID, false, false)); state Version writeVersion = self->getLastCommittedVersion() + 1; - state Reference commitReadLock = self->m_commitReadLock; - wait(commitReadLock->take()); - state FlowLock::Releaser readLock(*commitReadLock); - state bool fromCache = false; - state Reference page = wait( - readPage(snapshot, rootID, update->decodeLowerBound, update->decodeUpperBound, false, false, &fromCache)); - readLock.release(); + // If the page exists in the cache, it must be copied before modification. + // That copy will be referenced by pageCopy, as page must stay in scope in case anything references its + // memory and it gets evicted from the cache. + // If the page is not in the cache, then no copy is needed so we will initialize pageCopy to page + state Reference pageCopy; state BTreePage* btPage = (BTreePage*)page->begin(); ASSERT(isLeaf == btPage->isLeaf()); @@ -5491,16 +5470,8 @@ private: // TODO: Decide if it is okay to update if the subtree boundaries are expanded. It can result in // records in a DeltaTree being outside its decode boundary range, which isn't actually invalid // though it is awkward to reason about. - state bool tryToUpdate = btPage->tree().numItems > 0 && update->boundariesNormal(); - - // If trying to update the page and the page reference points into the cache, - // we need to clone it so we don't modify the original version of the page. - // TODO: Refactor DeltaTree::Mirror so it can be shared between different versions of pages - if (tryToUpdate && fromCache) { - page = self->cloneForUpdate(page); - btPage = (BTreePage*)page->begin(); - fromCache = false; - } + // TryToUpdate indicates insert and erase operations should be tried on the existing page first + state bool tryToUpdate = btPage->tree()->numItems > 0 && update->boundariesNormal(); debug_printf( "%s commitSubtree(): %s\n", @@ -5508,7 +5479,8 @@ private: btPage->toString(false, rootID, snapshot->getVersion(), update->decodeLowerBound, update->decodeUpperBound) .c_str()); - state BTreePage::BinaryTree::Cursor cursor = getCursor(page); + state BTreePage::BinaryTree::Cursor cursor = + update->cBegin.valid() ? 
getCursor(page, update->cBegin) : getCursor(page, dbBegin, dbEnd); if (REDWOOD_DEBUG) { debug_printf("%s ---------MUTATION BUFFER SLICE ---------------------\n", context.c_str()); @@ -5531,10 +5503,9 @@ private: bool updating = tryToUpdate; bool changesMade = false; - // Couldn't make changes in place, so now do a linear merge and build new pages. state Standalone> merged; - auto switchToLinearMerge = [&]() { + // Couldn't make changes in place, so now do a linear merge and build new pages. updating = false; auto c = cursor; c.moveFirst(); @@ -5562,7 +5533,7 @@ private: // - there actually is a change (whether a set or a clear, old records are to be removed) // - either this is not the first boundary or it is but its key matches our lower bound key bool applyBoundaryChange = mBegin.mutation().boundaryChanged && - (!firstMutationBoundary || mBegin.key() == update->subtreeLowerBound->key); + (!firstMutationBoundary || mBegin.key() == update->subtreeLowerBound.key); firstMutationBoundary = false; // Iterate over records for the mutation boundary key, keep them unless the boundary key was changed or @@ -5586,6 +5557,14 @@ private: debug_printf("%s Erasing %s [existing, boundary start]\n", context.c_str(), cursor.get().toString().c_str()); + + // Copy page for modification if not already copied + if (!pageCopy.isValid()) { + pageCopy = clonePageForUpdate(page); + btPage = (BTreePage*)pageCopy->begin(); + cursor.tree = btPage->tree(); + } + btPage->kvBytes -= cursor.get().kvBytes(); cursor.erase(); } else { @@ -5603,12 +5582,19 @@ private: // Clears of this key will have been processed above by not being erased from the updated page or // excluded from the merge output if (applyBoundaryChange && mBegin.mutation().boundarySet()) { - RedwoodRecordRef rec(mBegin.key(), 0, mBegin.mutation().boundaryValue.get()); + RedwoodRecordRef rec(mBegin.key(), mBegin.mutation().boundaryValue.get()); changesMade = true; // If updating, add to the page, else add to the output set if (updating) { - if (cursor.mirror->insert(rec, update->skipLen, maxHeightAllowed)) { + // Copy page for modification if not already copied + if (!pageCopy.isValid()) { + pageCopy = clonePageForUpdate(page); + btPage = (BTreePage*)pageCopy->begin(); + cursor.tree = btPage->tree(); + } + + if (cursor.insert(rec, update->skipLen, maxHeightAllowed)) { btPage->kvBytes += rec.kvBytes(); debug_printf( "%s Inserted %s [mutation, boundary start]\n", context.c_str(), rec.toString().c_str()); @@ -5663,6 +5649,14 @@ private: debug_printf("%s Erasing %s [existing, boundary start]\n", context.c_str(), cursor.get().toString().c_str()); + + // Copy page for modification if not already copied + if (!pageCopy.isValid()) { + pageCopy = clonePageForUpdate(page); + btPage = (BTreePage*)pageCopy->begin(); + cursor.tree = btPage->tree(); + } + btPage->kvBytes -= cursor.get().kvBytes(); cursor.erase(); changesMade = true; @@ -5699,6 +5693,14 @@ private: "%s Erasing %s and beyond [existing, matches changed upper mutation boundary]\n", context.c_str(), cursor.get().toString().c_str()); + + // Copy page for modification if not already copied + if (!pageCopy.isValid()) { + pageCopy = clonePageForUpdate(page); + btPage = (BTreePage*)pageCopy->begin(); + cursor.tree = btPage->tree(); + } + btPage->kvBytes -= cursor.get().kvBytes(); cursor.erase(); } else { @@ -5729,9 +5731,8 @@ private: writeVersion = self->getLastCommittedVersion() + 1; if (updating) { - const BTreePage::BinaryTree& deltaTree = btPage->tree(); // If the tree is now empty, delete the page - if 
(deltaTree.numItems == 0) { + if (cursor.tree->numItems == 0) { update->cleared(); self->freeBTreePage(rootID, writeVersion); debug_printf("%s Page updates cleared all entries, returning %s\n", @@ -5740,7 +5741,7 @@ private: } else { // Otherwise update it. BTreePageIDRef newID = wait(self->updateBTreePage( - self, rootID, &update->newLinks.arena(), page.castTo(), writeVersion)); + self, rootID, &update->newLinks.arena(), pageCopy.castTo(), writeVersion)); update->updatedInPlace(newID, btPage, newID.size() * self->m_blockSize); debug_printf( @@ -5762,8 +5763,8 @@ private: // Rebuild new page(s). state Standalone> entries = wait(writePages(self, - update->subtreeLowerBound, - update->subtreeUpperBound, + &update->subtreeLowerBound, + &update->subtreeUpperBound, merged, btPage->height, writeVersion, @@ -5777,16 +5778,15 @@ private: } else { // Internal Page std::vector> recursions; - state std::vector slices; - state Arena arena; + state std::vector> slices; cursor.moveFirst(); bool first = true; while (cursor.valid()) { - InternalPageSliceUpdate& u = *new (arena) InternalPageSliceUpdate(); - slices.push_back(&u); + slices.emplace_back(new InternalPageSliceUpdate()); + InternalPageSliceUpdate& u = *slices.back(); // At this point we should never be at a null child page entry because the first entry of a page // can't be null and this loop will skip over null entries that come after non-null entries. @@ -5794,7 +5794,7 @@ private: // Subtree lower boundary is this page's subtree lower bound or cursor u.cBegin = cursor; - u.decodeLowerBound = &cursor.get(); + u.decodeLowerBound = cursor.get(); if (first) { u.subtreeLowerBound = update->subtreeLowerBound; first = false; @@ -5805,7 +5805,7 @@ private: // mBegin is either at or greater than subtreeLowerBound->key, which was the subtreeUpperBound->key // for the previous subtree slice. But we need it to be at or *before* subtreeLowerBound->key // so if mBegin.key() is not exactly the subtree lower bound key then decrement it. - if (mBegin.key() != u.subtreeLowerBound->key) { + if (mBegin.key() != u.subtreeLowerBound.key) { --mBegin; } } @@ -5816,14 +5816,14 @@ private: // The decode upper bound is always the next key after the child link, or the decode upper bound for // this page if (cursor.moveNext()) { - u.decodeUpperBound = &cursor.get(); + u.decodeUpperBound = cursor.get(); // If cursor record has a null child page then it exists only to preserve a previous // subtree boundary that is now needed for reading the subtree at cBegin. if (!cursor.get().value.present()) { // If the upper bound is provided by a dummy record in [cBegin, cEnd) then there is no // requirement on the next subtree range or the parent page to have a specific upper boundary // for decoding the subtree. - u.expectedUpperBound = nullptr; + u.expectedUpperBound.reset(); cursor.moveNext(); // If there is another record after the null child record, it must have a child page value ASSERT(!cursor.valid() || cursor.get().value.present()); @@ -5834,12 +5834,12 @@ private: u.decodeUpperBound = update->decodeUpperBound; u.expectedUpperBound = update->decodeUpperBound; } - u.subtreeUpperBound = cursor.valid() ? &cursor.get() : update->subtreeUpperBound; + u.subtreeUpperBound = cursor.valid() ? 
cursor.get() : update->subtreeUpperBound; u.cEnd = cursor; u.skipLen = 0; // TODO: set this // Find the mutation buffer range that includes all changes to the range described by u - mEnd = mutationBuffer->lower_bound(u.subtreeUpperBound->key); + mEnd = mutationBuffer->lower_bound(u.subtreeUpperBound.key); // If the mutation range described by mBegin extends to mEnd, then see if the part of that range // that overlaps with u's subtree range is being fully cleared or fully unchanged. @@ -5854,12 +5854,12 @@ private: if (range.clearAfterBoundary) { // If the mutation range after the boundary key is cleared, then the mutation boundary key must // be cleared or must be different than the subtree lower bound key so that it doesn't matter - uniform = range.boundaryCleared() || mutationBoundaryKey != u.subtreeLowerBound->key; + uniform = range.boundaryCleared() || mutationBoundaryKey != u.subtreeLowerBound.key; } else { // If the mutation range after the boundary key is unchanged, then the mutation boundary key // must be also unchanged or must be different than the subtree lower bound key so that it // doesn't matter - uniform = !range.boundaryChanged || mutationBoundaryKey != u.subtreeLowerBound->key; + uniform = !range.boundaryChanged || mutationBoundaryKey != u.subtreeLowerBound.key; } // If u's subtree is either all cleared or all unchanged @@ -5868,8 +5868,9 @@ private: // include sibling subtrees also covered by (mBegin, mEnd) so we can not recurse to those, too. // If the cursor is valid, u.subtreeUpperBound is the cursor's position, which is >= mEnd.key(). // If equal, no range expansion is possible. - if (cursor.valid() && mEnd.key() != u.subtreeUpperBound->key) { - cursor.seekLessThanOrEqual(mEnd.key(), update->skipLen, &cursor, 1); + if (cursor.valid() && mEnd.key() != u.subtreeUpperBound.key) { + // TODO: If cursor hints are available, use (cursor, 1) + cursor.seekLessThanOrEqual(mEnd.key(), update->skipLen); // If this seek moved us ahead, to something other than cEnd, then update subtree range // boundaries @@ -5882,7 +5883,7 @@ private: } u.cEnd = cursor; - u.subtreeUpperBound = &cursor.get(); + u.subtreeUpperBound = cursor.get(); u.skipLen = 0; // TODO: set this // The new decode upper bound is either cEnd or the record before it if it has no child @@ -5891,8 +5892,8 @@ private: c.movePrev(); ASSERT(c.valid()); if (!c.get().value.present()) { - u.decodeUpperBound = &c.get(); - u.expectedUpperBound = nullptr; + u.decodeUpperBound = c.get(); + u.expectedUpperBound.reset(); } else { u.decodeUpperBound = u.subtreeUpperBound; u.expectedUpperBound = u.subtreeUpperBound; @@ -5906,7 +5907,7 @@ private: u.cleared(); auto c = u.cBegin; while (c != u.cEnd) { - const RedwoodRecordRef& rec = c.get(); + RedwoodRecordRef rec = c.get(); if (rec.value.present()) { if (btPage->height == 2) { debug_printf("%s: freeing child page in cleared subtree range: %s\n", @@ -5947,23 +5948,29 @@ private: context.c_str(), btPage->size(), btPage->height, - btPage->tree().numItems, + btPage->tree()->numItems, slices.size(), recursions.size()); wait(waitForAll(recursions)); debug_printf("%s Recursions done, processing slice updates.\n", context.c_str()); - // Note: parentInfo could be invalid after a wait and must be re-initialized. + // ParentInfo could be invalid after a wait and must be re-initialized. // All uses below occur before waits so no reinitialization is done. 
state ParentInfo* parentInfo = &self->childUpdateTracker[rootID.front()]; - state InternalPageModifier m(btPage, cursor.mirror, tryToUpdate, parentInfo); + + // InternalPageModifier takes the results of the recursive commitSubtree() calls in order + // and makes changes to page as needed, copying as needed, and generating an array from + // which to build new page(s) if modification is not possible or not allowed. + // If pageCopy is already set it was initialized to page above so the modifier doesn't need + // to copy it + state InternalPageModifier modifier(page, pageCopy.isValid(), tryToUpdate, parentInfo); // Apply the possible changes for each subtree range recursed to, except the last one. // For each range, the expected next record, if any, is checked against the first boundary // of the next range, if any. for (int i = 0, iEnd = slices.size() - 1; i < iEnd; ++i) { - m.applyUpdate(*slices[i], slices[i + 1]->getFirstBoundary()); + modifier.applyUpdate(*slices[i], slices[i + 1]->getFirstBoundary()); } // The expected next record for the final range is checked against one of the upper boundaries passed to @@ -5973,39 +5980,40 @@ private: // sole purpose of adding a dummy upper bound record. debug_printf("%s Applying final child range update. changesMade=%d Parent update is: %s\n", context.c_str(), - m.changesMade, + modifier.changesMade, update->toString().c_str()); - m.applyUpdate(*slices.back(), m.changesMade ? update->subtreeUpperBound : update->decodeUpperBound); + modifier.applyUpdate(*slices.back(), + modifier.changesMade ? &update->subtreeUpperBound : &update->decodeUpperBound); state bool detachChildren = (parentInfo->count > 2); state bool forceUpdate = false; // If no changes were made, but we should rewrite it to point directly to remapped child pages - if (!m.changesMade && detachChildren) { + if (!modifier.changesMade && detachChildren) { debug_printf( "%s Internal page forced rewrite because at least %d children have been updated in-place.\n", context.c_str(), parentInfo->count); - forceUpdate = true; - if (!m.updating) { - m.updating = true; - // Copy the page before modification if the page references the cache - if (fromCache) { - page = self->cloneForUpdate(page); - cursor = getCursor(page); - btPage = (BTreePage*)page->begin(); - m.btPage = btPage; - m.m = cursor.mirror; - fromCache = false; - } - } + forceUpdate = true; + modifier.updating = true; + + // Make sure the modifier cloned the page so we can update the child links in-place below. 
+ modifier.cloneForUpdate(); + ++g_redwoodMetrics.level(btPage->height).forceUpdate; } + // If the modifier cloned the page for updating, then update our local pageCopy, btPage, and cursor + if (modifier.clonedPage) { + pageCopy = modifier.page; + btPage = modifier.btPage(); + cursor.tree = btPage->tree(); + } + // If page contents have changed - if (m.changesMade || forceUpdate) { - if (m.empty()) { + if (modifier.changesMade || forceUpdate) { + if (modifier.empty()) { update->cleared(); debug_printf("%s All internal page children were deleted so deleting this page too, returning %s\n", context.c_str(), @@ -6013,7 +6021,7 @@ private: self->freeBTreePage(rootID, writeVersion); self->childUpdateTracker.erase(rootID.front()); } else { - if (m.updating) { + if (modifier.updating) { // Page was updated in place (or being forced to be updated in place to update child page ids) debug_printf( "%s Internal page modified in-place tryToUpdate=%d forceUpdate=%d detachChildren=%d\n", @@ -6052,7 +6060,7 @@ private: } BTreePageIDRef newID = wait(self->updateBTreePage( - self, rootID, &update->newLinks.arena(), page.castTo(), writeVersion)); + self, rootID, &update->newLinks.arena(), pageCopy.castTo(), writeVersion)); debug_printf( "%s commitSubtree(): Internal page updated in-place at version %s, new contents: %s\n", context.c_str(), @@ -6076,7 +6084,7 @@ private: if (detachChildren) { auto& stats = g_redwoodMetrics.level(btPage->height); - for (auto& rec : m.rebuild) { + for (auto& rec : modifier.rebuild) { if (rec.value.present()) { BTreePageIDRef oldPages = rec.getChildPage(); BTreePageIDRef newPages; @@ -6087,7 +6095,7 @@ private: if (newID != invalidLogicalPageID) { // Rebuild record values reference original page memory so make a copy if (newPages.empty()) { - newPages = BTreePageIDRef(m.rebuild.arena(), oldPages); + newPages = BTreePageIDRef(modifier.rebuild.arena(), oldPages); rec.setChildPage(newPages); } debug_printf("%s Detach updated %u -> %u\n", context.c_str(), p, newID); @@ -6103,9 +6111,9 @@ private: Standalone> newChildEntries = wait(writePages(self, - update->subtreeLowerBound, - update->subtreeUpperBound, - m.rebuild, + &update->subtreeLowerBound, + &update->subtreeUpperBound, + modifier.rebuild, btPage->height, writeVersion, rootID)); @@ -6151,24 +6159,24 @@ private: state Version latestVersion = self->m_pager->getLatestVersion(); debug_printf("%s: pager latestVersion %" PRId64 "\n", self->m_name.c_str(), latestVersion); - state Standalone rootPageID = self->m_header.root.get(); + state Standalone rootPageID = self->m_pHeader->root.get(); state InternalPageSliceUpdate all; state RedwoodRecordRef rootLink = dbBegin.withPageID(rootPageID); - all.subtreeLowerBound = &rootLink; - all.decodeLowerBound = &rootLink; - all.subtreeUpperBound = &dbEnd; - all.decodeUpperBound = &dbEnd; + all.subtreeLowerBound = rootLink; + all.decodeLowerBound = rootLink; + all.subtreeUpperBound = dbEnd; + all.decodeUpperBound = dbEnd; all.skipLen = 0; - MutationBuffer::const_iterator mBegin = mutations->upper_bound(all.subtreeLowerBound->key); + MutationBuffer::const_iterator mBegin = mutations->upper_bound(all.subtreeLowerBound.key); --mBegin; - MutationBuffer::const_iterator mEnd = mutations->lower_bound(all.subtreeUpperBound->key); + MutationBuffer::const_iterator mEnd = mutations->lower_bound(all.subtreeUpperBound.key); wait(commitSubtree(self, self->m_pager->getReadSnapshot(latestVersion), mutations, rootPageID, - self->m_header.height == 1, + self->m_pHeader->height == 1, mBegin, mEnd, &all)); @@ -6180,7 
+6188,7 @@ private: LogicalPageID newRootID = wait(self->m_pager->newPageID()); Reference page = self->m_pager->newPageBuffer(); makeEmptyRoot(page); - self->m_header.height = 1; + self->m_pHeader->height = 1; self->m_pager->updatePage(newRootID, page); rootPageID = BTreePageIDRef((LogicalPageID*)&newRootID, 1); } else { @@ -6190,13 +6198,14 @@ private: } else { // If the new root level's size is not 1 then build new root level(s) Standalone> newRootPage = - wait(buildNewRoot(self, latestVersion, newRootLevel, self->m_header.height)); + wait(buildNewRoot(self, latestVersion, newRootLevel, self->m_pHeader->height)); rootPageID = newRootPage.front().getChildPage(); } } } - self->m_header.root.set(rootPageID, sizeof(headerSpace) - sizeof(m_header)); + debug_printf("new root %s\n", toString(rootPageID).c_str()); + self->m_pHeader->root.set(rootPageID, self->m_headerSpace - sizeof(MetaKey)); self->m_lazyClearStop = true; wait(success(self->m_lazyClearActor)); @@ -6205,10 +6214,10 @@ private: self->m_pager->setCommitVersion(writeVersion); wait(self->m_lazyClearQueue.flush()); - self->m_header.lazyDeleteQueue = self->m_lazyClearQueue.getState(); + self->m_pHeader->lazyDeleteQueue = self->m_lazyClearQueue.getState(); debug_printf("Setting metakey\n"); - self->m_pager->setMetaKey(self->m_header.asKeyRef()); + self->m_pager->setMetaKey(self->m_pHeader->asKeyRef()); debug_printf("%s: Committing pager %" PRId64 "\n", self->m_name.c_str(), writeVersion); wait(self->m_pager->commit()); @@ -6228,280 +6237,17 @@ private: } public: - // InternalCursor is for seeking to and iterating over the leaf-level RedwoodRecordRef records in the tree. - // The records could represent multiple values for the same key at different versions, including a non-present value - // representing a clear. Currently, however, all records are at version 0 and no clears are present in the tree. - struct InternalCursor { - private: - // Each InternalCursor's position is represented by a reference counted PageCursor, which links - // to its parent PageCursor, up to a PageCursor representing a cursor on the root page. 
- // PageCursors can be shared by many InternalCursors, making InternalCursor copying low overhead - struct PageCursor : ReferenceCounted<PageCursor>, FastAllocated<PageCursor> { - Reference<PageCursor> parent; - BTreePageIDRef pageID; // Only needed for debugging purposes - Reference<const ArenaPage> page; - BTreePage::BinaryTree::Cursor cursor; - - // id will normally reference memory owned by the parent, which is okay because a reference to the parent - // will be held in the cursor - PageCursor(BTreePageIDRef id, Reference<const ArenaPage> page, Reference<PageCursor> parent = {}) - : pageID(id), page(page), parent(parent), cursor(getCursor(page)) {} - - PageCursor(const PageCursor& toCopy) - : parent(toCopy.parent), pageID(toCopy.pageID), page(toCopy.page), cursor(toCopy.cursor) {} - - // Convenience method for copying a PageCursor - Reference<PageCursor> copy() const { return makeReference<PageCursor>(*this); } - - const BTreePage* btPage() const { return (const BTreePage*)page->begin(); } - - bool isLeaf() const { return btPage()->isLeaf(); } - - Future<Reference<PageCursor>> getChild(Reference<IPagerSnapshot> pager, int readAheadBytes = 0) { - ASSERT(!isLeaf()); - BTreePage::BinaryTree::Cursor next = cursor; - next.moveNext(); - const RedwoodRecordRef& rec = cursor.get(); - BTreePageIDRef id = rec.getChildPage(); - Future<Reference<const ArenaPage>> child = readPage(pager, id, &rec, &next.getOrUpperBound()); - - // Read ahead siblings at level 2 - // TODO: Application of readAheadBytes is not taking into account the size of the current page or any - // of the adjacent pages it is preloading. - if (readAheadBytes > 0 && btPage()->height == 2 && next.valid()) { - do { - debug_printf("preloading %s %d bytes left\n", - ::toString(next.get().getChildPage()).c_str(), - readAheadBytes); - // If any part of the page was already loaded then stop - if (next.get().value.present()) { - preLoadPage(pager.getPtr(), next.get().getChildPage()); - readAheadBytes -= page->size(); - } - } while (readAheadBytes > 0 && next.moveNext()); - } - - return map(child, [=](Reference<const ArenaPage> page) { - return makeReference<PageCursor>(id, page, Reference<PageCursor>::addRef(this)); - }); - } - - std::string toString() const { - return format("%s, %s", - ::toString(pageID).c_str(), - cursor.valid() ? cursor.get().toString(isLeaf()).c_str() : "<invalid>"); - } - }; - - Standalone<BTreePageIDRef> rootPageID; - Reference<IPagerSnapshot> pager; - Reference<PageCursor> pageCursor; - - public: - InternalCursor() {} - - InternalCursor(Reference<IPagerSnapshot> pager, BTreePageIDRef root) : pager(pager), rootPageID(root) {} - - std::string toString() const { - std::string r; - - Reference<PageCursor> c = pageCursor; - int maxDepth = 0; - while (c) { - c = c->parent; - ++maxDepth; - } - - c = pageCursor; - int depth = maxDepth; - while (c) { - r = format("[%d/%d: %s] ", depth--, maxDepth, c->toString().c_str()) + r; - c = c->parent; - } - return r; - } - - // Returns true if cursor position is a valid leaf page record - bool valid() const { return pageCursor && pageCursor->isLeaf() && pageCursor->cursor.valid(); } - - // Returns true if cursor position is valid() and has a present record value - bool present() const { return valid() && pageCursor->cursor.get().value.present(); } - - // Returns true if cursor position is present() and has an effective version <= v - bool presentAtVersion(Version v) { return present() && pageCursor->cursor.get().version <= v; } - - // This is to enable an optimization for the case where all internal records are at the - // same version and there are no implicit clears - // *this MUST be valid() - bool presentAtExactVersion(Version v) const { return present() && pageCursor->cursor.get().version == v; } - - // Returns true if cursor position is present() and has an effective version <= v - bool validAtVersion(Version v) { return valid() && pageCursor->cursor.get().version <= v; } - - const RedwoodRecordRef& get() const { return pageCursor->cursor.get(); } - - // Ensure that pageCursor is not shared with other cursors so we can modify it - void ensureUnshared() { - if (!pageCursor->isSoleOwner()) { - pageCursor = pageCursor->copy(); - } - } - - Future<Void> moveToRoot() { - // If pageCursor exists follow parent links to the root - if (pageCursor) { - while (pageCursor->parent) { - pageCursor = pageCursor->parent; - } - return Void(); - } - - // Otherwise read the root page - Future<Reference<const ArenaPage>> root = readPage(pager, rootPageID, &dbBegin, &dbEnd); - return map(root, [=](Reference<const ArenaPage> p) { - pageCursor = makeReference<PageCursor>(rootPageID, p); - return Void(); - }); - } - - ACTOR Future<bool> seekLessThan_impl(InternalCursor* self, RedwoodRecordRef query, int prefetchBytes) { - Future<Void> f = self->moveToRoot(); - // f will almost always be ready - if (!f.isReady()) { - wait(f); - } - - self->ensureUnshared(); - loop { - bool isLeaf = self->pageCursor->isLeaf(); - bool success = self->pageCursor->cursor.seekLessThan(query); - - // Skip backwards over internal page entries that do not link to child pages - if (!isLeaf) { - // While record has no value, move again - while (success && !self->pageCursor->cursor.get().value.present()) { - success = self->pageCursor->cursor.movePrev(); - } - } - - if (success) { - // If we found a record < query at a leaf page then return success - if (isLeaf) { - return true; - } - - Reference<PageCursor> child = wait(self->pageCursor->getChild(self->pager, prefetchBytes)); - self->pageCursor = child; - } else { - // No records < query on this page, so move to immediate previous record at leaf level - bool success = wait(self->move(false)); - return success; - } - } - } - - Future<bool> seekLessThan(RedwoodRecordRef query, int prefetchBytes) { - return seekLessThan_impl(this, query, prefetchBytes); - } - - ACTOR Future<bool> move_impl(InternalCursor* self, bool forward) { - // Try to move pageCursor, if it fails to go parent, repeat until it works or root cursor can't be moved - while (1) { - self->ensureUnshared(); - bool success = self->pageCursor->cursor.valid() && - (forward ? self->pageCursor->cursor.moveNext() : self->pageCursor->cursor.movePrev()); - - // Skip over internal page entries that do not link to child pages - if (!self->pageCursor->isLeaf()) { - // While record has no value, move again - while (success && !self->pageCursor->cursor.get().value.present()) { - success = forward ? self->pageCursor->cursor.moveNext() : self->pageCursor->cursor.movePrev(); - } - } - - // Stop if successful or there's no parent to move to - if (success || !self->pageCursor->parent) { - break; - } - - // Move to parent - self->pageCursor = self->pageCursor->parent; - } - - // If pageCursor not valid we've reached an end of the tree - if (!self->pageCursor->cursor.valid()) { - return false; - } - - // While not on a leaf page, move down to get to one. - while (!self->pageCursor->isLeaf()) { - // Skip over internal page entries that do not link to child pages - while (!self->pageCursor->cursor.get().value.present()) { - bool success = forward ? self->pageCursor->cursor.moveNext() : self->pageCursor->cursor.movePrev(); - if (!success) { - return false; - } - } - - Reference<PageCursor> child = wait(self->pageCursor->getChild(self->pager)); - forward ? child->cursor.moveFirst() : child->cursor.moveLast(); - self->pageCursor = child; - } - - return true; - } - - Future<bool> move(bool forward) { return move_impl(this, forward); } - - // Move to the first or last record of the database. - ACTOR Future<bool> move_end(InternalCursor* self, bool begin) { - Future<Void> f = self->moveToRoot(); - - // f will almost always be ready - if (!f.isReady()) { - wait(f); - } - - self->ensureUnshared(); - - loop { - // Move to first or last record in the page - bool success = begin ? self->pageCursor->cursor.moveFirst() : self->pageCursor->cursor.moveLast(); - - // Skip over internal page entries that do not link to child pages - if (!self->pageCursor->isLeaf()) { - // While record has no value, move past it - while (success && !self->pageCursor->cursor.get().value.present()) { - success = begin ? self->pageCursor->cursor.moveNext() : self->pageCursor->cursor.movePrev(); - } - } - - // If it worked, return true if we've reached a leaf page otherwise go to the next child - if (success) { - if (self->pageCursor->isLeaf()) { - return true; - } - - Reference<PageCursor> child = wait(self->pageCursor->getChild(self->pager)); - self->pageCursor = child; - } else { - return false; - } - } - } - - Future<bool> moveFirst() { return move_end(this, true); } - Future<bool> moveLast() { return move_end(this, false); } - }; - - // Cursor designed for short lifespans. - // Holds references to all pages touched. - // All record references returned from it are valid until the cursor is destroyed. + // Cursor into BTree which enables seeking and iteration in the BTree as a whole, or + // iteration within a specific page and movement across levels for more efficient access. + // Cursor record's memory is only guaranteed to be valid until cursor moves to a different page.
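+ // A minimal usage sketch of the seek-then-scan pattern this cursor supports; `someKey`
+ // and `v` are placeholder names for a key and a committed version (the calls themselves
+ // appear in the tests and KV-store code later in this patch):
+ //
+ //   state VersionedBTree::BTreeCursor cur;
+ //   wait(btree->initBTreeCursor(&cur, v));
+ //   wait(cur.seekGTE(someKey, 0)); // second argument is prefetchBytes
+ //   while (cur.isValid()) {
+ //       KeyValueRef kv = cur.get().toKeyValueRef(); // invalidated once the cursor leaves this page
+ //       wait(cur.moveNext());
+ //   }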
class BTreeCursor { public: struct PathEntry { Reference<const ArenaPage> page; BTreePage::BinaryTree::Cursor cursor; +#if REDWOOD_DEBUG + Standalone<BTreePageIDRef> id; +#endif const BTreePage* btPage() const { return (BTreePage*)page->begin(); }; }; @@ -6515,14 +6261,20 @@ public: public: BTreeCursor() {} + bool initialized() const { return pager.isValid(); } bool isValid() const { return valid; } std::string toString() const { std::string r = format("{ptr=%p %s ", this, ::toString(pager->getVersion()).c_str()); for (int i = 0; i < path.size(); ++i) { - r += format("[%d/%d: %s] ", - i + 1, - path.size(), + std::string id = ""; +#if REDWOOD_DEBUG + id = ::toString(path[i].id); +#endif + r += format("[Level=%d ID=%s ptr=%p Cursor=%s] ", + path[i].btPage()->height, + id.c_str(), + path[i].page->begin(), path[i].cursor.valid() ? path[i].cursor.get().toString(path[i].btPage()->isLeaf()).c_str() : "<invalid>"); } @@ -6533,7 +6285,7 @@ public: return r; } - const RedwoodRecordRef& get() { return path.back().cursor.get(); } + const RedwoodRecordRef get() { return path.back().cursor.get(); } bool inRoot() const { return path.size() == 1; } @@ -6542,30 +6294,38 @@ public: PathEntry& back() { return path.back(); } void popPath() { path.pop_back(); } - Future<Void> pushPage(BTreePageIDRef id, - const RedwoodRecordRef& lowerBound, - const RedwoodRecordRef& upperBound) { - - return map(readPage(pager, id, &lowerBound, &upperBound), [this, id](Reference<const ArenaPage> p) { - path.push_back({ p, getCursor(p) }); + Future<Void> pushPage(const BTreePage::BinaryTree::Cursor& link) { + debug_printf("pushPage(link=%s)\n", link.get().toString(false).c_str()); + return map(readPage(pager, link.get().getChildPage()), [=](Reference<const ArenaPage> p) { +#if REDWOOD_DEBUG + path.push_back({ p, getCursor(p, link), link.get().getChildPage() }); +#else + path.push_back({ p, getCursor(p, link) }); +#endif return Void(); }); } - Future<Void> pushPage(BTreePage::BinaryTree::Cursor c) { - const RedwoodRecordRef& rec = c.get(); - auto next = c; - next.moveNext(); - BTreePageIDRef id = rec.getChildPage(); - return pushPage(id, rec, next.getOrUpperBound()); + Future<Void> pushPage(BTreePageIDRef id) { + debug_printf("pushPage(root=%s)\n", ::toString(id).c_str()); + return map(readPage(pager, id), [=](Reference<const ArenaPage> p) { +#if REDWOOD_DEBUG + path.push_back({ p, getCursor(p, dbBegin, dbEnd), id }); +#else + path.push_back({ p, getCursor(p, dbBegin, dbEnd) }); +#endif + return Void(); + }); } + // Initialize or reinitialize cursor Future<Void> init(VersionedBTree* btree_in, Reference<IPagerSnapshot> pager_in, BTreePageIDRef root) { btree = btree_in; pager = pager_in; + path.clear(); path.reserve(6); valid = false; - return pushPage(root, dbBegin, dbEnd); + return pushPage(root); } // Seeks cursor to query if it exists, the record before or after it, or an undefined and invalid @@ -6587,7 +6347,7 @@ public: auto& entry = self->path.back(); if (entry.btPage()->isLeaf()) { int cmp = entry.cursor.seek(query); - self->valid = entry.cursor.valid() && !entry.cursor.node->isDeleted(); + self->valid = entry.cursor.valid() && !entry.cursor.isErased(); debug_printf("seek(%s, %d) loop exit cmp=%d cursor=%s\n", query.toString().c_str(), prefetchBytes, @@ -6690,6 +6450,7 @@ public: if (self->path.size() == 1) { self->valid = false; + debug_printf("move%s() exit cursor=%s\n", forward ?
"Next" : "Prev", self->toString().c_str()); return Void(); } @@ -6736,206 +6497,6 @@ public: return cursor->init(this, snapshot, ((MetaKey*)m.begin())->root.get()); } - - // Cursor is for reading and interating over user visible KV pairs at a specific version - // KeyValueRefs returned become invalid once the cursor is moved - class Cursor : public IStoreCursor, public ReferenceCounted, public FastAllocated, NonCopyable { - public: - Cursor(Reference pageSource, BTreePageIDRef root, Version internalRecordVersion) - : m_version(internalRecordVersion), m_cur1(pageSource, root), m_cur2(m_cur1) {} - - void addref() override { ReferenceCounted::addref(); } - void delref() override { ReferenceCounted::delref(); } - - private: - Version m_version; - // If kv is valid - // - kv.key references memory held by cur1 - // - If cur1 points to a non split KV pair - // - kv.value references memory held by cur1 - // - cur2 points to the next internal record after cur1 - // Else - // - kv.value references memory in arena - // - cur2 points to the first internal record of the split KV pair - InternalCursor m_cur1; - InternalCursor m_cur2; - Arena m_arena; - Optional m_kv; - - public: - Future findEqual(KeyRef key) override { return find_impl(this, key, 0); } - Future findFirstEqualOrGreater(KeyRef key, int prefetchBytes) override { - return find_impl(this, key, 1, prefetchBytes); - } - Future findLastLessOrEqual(KeyRef key, int prefetchBytes) override { - return find_impl(this, key, -1, prefetchBytes); - } - - Future next() override { return move(this, true); } - Future prev() override { return move(this, false); } - - bool isValid() override { return m_kv.present(); } - - KeyRef getKey() override { return m_kv.get().key; } - - ValueRef getValue() override { return m_kv.get().value; } - - std::string toString(bool includePaths = true) const { - std::string r; - r += format("Cursor(%p) ver: %" PRId64 " ", this, m_version); - if (m_kv.present()) { - r += format( - " KV: '%s' -> '%s'", m_kv.get().key.printable().c_str(), m_kv.get().value.printable().c_str()); - } else { - r += " KV: "; - } - if (includePaths) { - r += format("\n Cur1: %s", m_cur1.toString().c_str()); - r += format("\n Cur2: %s", m_cur2.toString().c_str()); - } else { - if (m_cur1.valid()) { - r += format("\n Cur1: %s", m_cur1.get().toString().c_str()); - } - if (m_cur2.valid()) { - r += format("\n Cur2: %s", m_cur2.get().toString().c_str()); - } - } - - return r; - } - - private: - // find key in tree closest to or equal to key (at this cursor's version) - // for less than or equal use cmp < 0 - // for greater than or equal use cmp > 0 - // for equal use cmp == 0 - ACTOR static Future find_impl(Cursor* self, KeyRef key, int cmp, int prefetchBytes = 0) { - state RedwoodRecordRef query(key, self->m_version + 1); - self->m_kv.reset(); - - wait(success(self->m_cur1.seekLessThan(query, prefetchBytes))); - debug_printf("find%sE(%s): %s\n", - cmp > 0 ? "GT" : (cmp == 0 ? "" : "LT"), - query.toString().c_str(), - self->toString().c_str()); - - // If we found the target key with a present value then return it as it is valid for any cmp type - if (self->m_cur1.present() && self->m_cur1.get().key == key) { - debug_printf("Target key found. 
Cursor: %s\n", self->toString().c_str()); - self->m_kv = self->m_cur1.get().toKeyValueRef(); - return Void(); - } - - // If cmp type is Equal and we reached here, we didn't find it - if (cmp == 0) { - return Void(); - } - - // cmp mode is GreaterThanOrEqual, so if we've reached here an equal key was not found and cur1 either - // points to a lesser key or is invalid. - if (cmp > 0) { - // If cursor is invalid, query was less than the first key in database so go to the first record - if (!self->m_cur1.valid()) { - bool valid = wait(self->m_cur1.moveFirst()); - if (!valid) { - self->m_kv.reset(); - return Void(); - } - } else { - // Otherwise, move forward until we find a key greater than the target key. - // If multiversion data is present, the next record could have the same key as the initial - // record found but be at a newer version. - loop { - bool valid = wait(self->m_cur1.move(true)); - if (!valid) { - self->m_kv.reset(); - return Void(); - } - - if (self->m_cur1.get().key > key) { - break; - } - } - } - - // Get the next present key at the target version. Handles invalid cursor too. - wait(self->next()); - } else if (cmp < 0) { - // cmp mode is LessThanOrEqual. An equal key to the target key was already checked above, and the - // search was for LessThan query, so cur1 is already in the right place. - if (!self->m_cur1.valid()) { - self->m_kv.reset(); - return Void(); - } - - // Move to previous present kv pair at the target version - wait(self->prev()); - } - - return Void(); - } - - ACTOR static Future move(Cursor* self, bool fwd) { - debug_printf("Cursor::move(%d): Start %s\n", fwd, self->toString().c_str()); - ASSERT(self->m_cur1.valid()); - - // If kv is present then the key/version at cur1 was already returned so move to a new key - // Move cur1 until failure or a new key is found, keeping prior record visited in cur2 - if (self->m_kv.present()) { - ASSERT(self->m_cur1.valid()); - loop { - self->m_cur2 = self->m_cur1; - debug_printf("Cursor::move(%d): Advancing cur1 %s\n", fwd, self->toString().c_str()); - bool valid = wait(self->m_cur1.move(fwd)); - if (!valid || self->m_cur1.get().key != self->m_cur2.get().key) { - break; - } - } - } - - // Given two consecutive cursors c1 and c2, c1 represents a returnable record if - // c1 is present at exactly version v - // OR - // c1 is.presentAtVersion(v) && (!c2.validAtVersion() || c2.get().key != c1.get().key()) - // Note the distinction between 'present' and 'valid'. 
Present means the value for the key - // exists at the version (but could be the empty string) while valid just means the internal - // record is in effect at that version but it could indicate that the key was cleared and - // no longer exists from the user's perspective at that version - if (self->m_cur1.valid()) { - self->m_cur2 = self->m_cur1; - debug_printf("Cursor::move(%d): Advancing cur2 %s\n", fwd, self->toString().c_str()); - wait(success(self->m_cur2.move(true))); - } - - while (self->m_cur1.valid()) { - - if (self->m_cur1.get().version == self->m_version || - (self->m_cur1.presentAtVersion(self->m_version) && - (!self->m_cur2.validAtVersion(self->m_version) || - self->m_cur2.get().key != self->m_cur1.get().key))) { - self->m_kv = self->m_cur1.get().toKeyValueRef(); - return Void(); - } - - if (fwd) { - // Moving forward, move cur2 forward and keep cur1 pointing to the prior (predecessor) record - debug_printf("Cursor::move(%d): Moving forward %s\n", fwd, self->toString().c_str()); - self->m_cur1 = self->m_cur2; - wait(success(self->m_cur2.move(true))); - } else { - // Moving backward, move cur1 backward and keep cur2 pointing to the prior (successor) record - debug_printf("Cursor::move(%d): Moving backward %s\n", fwd, self->toString().c_str()); - self->m_cur2 = self->m_cur1; - wait(success(self->m_cur1.move(false))); - } - } - - debug_printf("Cursor::move(%d): Exit, end of db reached. Cursor = %s\n", fwd, self->toString().c_str()); - self->m_kv.reset(); - - return Void(); - } - }; }; #include "fdbserver/art_impl.h" @@ -6946,8 +6507,7 @@ RedwoodRecordRef VersionedBTree::dbEnd(LiteralStringRef("\xff\xff\xff\xff\xff")) class KeyValueStoreRedwoodUnversioned : public IKeyValueStore { public: KeyValueStoreRedwoodUnversioned(std::string filePrefix, UID logID) - : m_filePrefix(filePrefix), m_concurrentReads(new FlowLock(SERVER_KNOBS->REDWOOD_KVSTORE_CONCURRENT_READS)) { - // TODO: This constructor should really just take an IVersionedStore + : m_filePrefix(filePrefix), m_concurrentReads(SERVER_KNOBS->REDWOOD_KVSTORE_CONCURRENT_READS) { int pageSize = BUGGIFY ? 
deterministicRandom()->randomInt(1000, 4096 * 4) : SERVER_KNOBS->REDWOOD_DEFAULT_PAGE_SIZE; @@ -6965,7 +6525,9 @@ public: filePrefix, pageCacheBytes, remapCleanupWindow, - SERVER_KNOBS->REDWOOD_EXTENT_CONCURRENT_READS); + SERVER_KNOBS->REDWOOD_EXTENT_CONCURRENT_READS, + false, + m_error); m_tree = new VersionedBTree(pager, filePrefix); m_init = catchError(init_impl(this)); } @@ -7041,9 +6603,8 @@ public: state VersionedBTree::BTreeCursor cur; wait(self->m_tree->initBTreeCursor(&cur, self->m_tree->getLastCommittedVersion())); - state Reference readLock = self->m_concurrentReads; - wait(readLock->take()); - state FlowLock::Releaser releaser(*readLock); + wait(self->m_concurrentReads.take()); + state FlowLock::Releaser releaser(self->m_concurrentReads); ++g_redwoodMetrics.opGetRange; state RangeResult result; @@ -7066,13 +6627,13 @@ public: // we can bypass the bounds check for each key in the leaf if the entire leaf is in range // > because both query end and page upper bound are exclusive of the query results and page contents, // respectively - bool boundsCheck = leafCursor.upperBound() > keys.end; + bool checkBounds = leafCursor.cache->upperBound > keys.end; // Whether or not any results from this page were added to results bool usedPage = false; while (leafCursor.valid()) { KeyValueRef kv = leafCursor.get().toKeyValueRef(); - if (boundsCheck && kv.key.compare(keys.end) >= 0) { + if (checkBounds && kv.key.compare(keys.end) >= 0) { break; } accumulatedBytes += kv.expectedSize(); @@ -7087,7 +6648,7 @@ public: // If the page was used, results must depend on the ArenaPage arena and the Mirror arena. // This must be done after visiting all the results in case the Mirror arena changes. if (usedPage) { - result.arena().dependsOn(leafCursor.mirror->arena); + result.arena().dependsOn(leafCursor.cache->arena); result.arena().dependsOn(cur.back().page->getArena()); } @@ -7108,13 +6669,13 @@ public: // we can bypass the bounds check for each key in the leaf if the entire leaf is in range // < because both query begin and page lower bound are inclusive of the query results and page contents, // respectively - bool boundsCheck = leafCursor.lowerBound() < keys.begin; + bool checkBounds = leafCursor.cache->lowerBound < keys.begin; // Whether or not any results from this page were added to results bool usedPage = false; while (leafCursor.valid()) { KeyValueRef kv = leafCursor.get().toKeyValueRef(); - if (boundsCheck && kv.key.compare(keys.begin) < 0) { + if (checkBounds && kv.key.compare(keys.begin) < 0) { break; } accumulatedBytes += kv.expectedSize(); @@ -7129,7 +6690,7 @@ public: // If the page was used, results must depend on the ArenaPage arena and the Mirror arena. // This must be done after visiting all the results in case the Mirror arena changes. 
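// A sketch of the lifetime issue this guards against, assuming the loop above pushed each
// KeyValueRef into result with a shallow copy (the key and value point into page memory):
//
//   result.push_back(result.arena(), kv); // borrows page memory, does not copy it
//   ...
//   result.arena().dependsOn(cur.back().page->getArena()); // keeps that memory alive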
if (usedPage) { - result.arena().dependsOn(leafCursor.mirror->arena); + result.arena().dependsOn(leafCursor.cache->arena); result.arena().dependsOn(cur.back().page->getArena()); } @@ -7157,9 +6718,8 @@ public: state VersionedBTree::BTreeCursor cur; wait(self->m_tree->initBTreeCursor(&cur, self->m_tree->getLastCommittedVersion())); - state Reference readLock = self->m_concurrentReads; - wait(readLock->take()); - state FlowLock::Releaser releaser(*readLock); + wait(self->m_concurrentReads.take()); + state FlowLock::Releaser releaser(self->m_concurrentReads); ++g_redwoodMetrics.opGet; wait(cur.seekGTE(key, 0)); @@ -7197,7 +6757,7 @@ private: Future m_init; Promise m_closed; Promise m_error; - Reference m_concurrentReads; + FlowLock m_concurrentReads; template inline Future catchError(Future f) { @@ -7344,7 +6904,7 @@ ACTOR Future verifyRangeBTreeCursor(VersionedBTree* btree, ASSERT(errors == 0); results.push_back(results.arena(), cur.get().toKeyValueRef()); - results.arena().dependsOn(cur.back().cursor.mirror->arena); + results.arena().dependsOn(cur.back().cursor.cache->arena); results.arena().dependsOn(cur.back().page->getArena()); wait(cur.moveNext()); @@ -7441,255 +7001,6 @@ ACTOR Future verifyRangeBTreeCursor(VersionedBTree* btree, return errors; } -ACTOR Future verifyRange(VersionedBTree* btree, - Key start, - Key end, - Version v, - std::map, Optional>* written, - int* pErrorCount) { - state int errors = 0; - if (end <= start) - end = keyAfter(start); - - state std::map, Optional>::const_iterator i = - written->lower_bound(std::make_pair(start.toString(), 0)); - state std::map, Optional>::const_iterator iEnd = - written->upper_bound(std::make_pair(end.toString(), 0)); - state std::map, Optional>::const_iterator iLast; - - state Reference cur = btree->readAtVersion(v); - debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Start cur=%p\n", - v, - start.printable().c_str(), - end.printable().c_str(), - cur.getPtr()); - - // Randomly use the cursor for something else first. - if (deterministicRandom()->coinflip()) { - state Key randomKey = randomKV().key; - debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Dummy seek to '%s'\n", - v, - start.printable().c_str(), - end.printable().c_str(), - randomKey.toString().c_str()); - wait(deterministicRandom()->coinflip() ? 
cur->findFirstEqualOrGreater(randomKey) - : cur->findLastLessOrEqual(randomKey)); - } - - debug_printf( - "VerifyRange(@%" PRId64 ", %s, %s): Actual seek\n", v, start.printable().c_str(), end.printable().c_str()); - wait(cur->findFirstEqualOrGreater(start)); - - state std::vector results; - - while (cur->isValid() && cur->getKey() < end) { - // Find the next written kv pair that would be present at this version - while (1) { - iLast = i; - if (i == iEnd) - break; - ++i; - - if (iLast->first.second <= v && iLast->second.present() && - (i == iEnd || i->first.first != iLast->first.first || i->first.second > v)) { - debug_printf("VerifyRange(@%" PRId64 ", %s, %s) Found key in written map: %s\n", - v, - start.printable().c_str(), - end.printable().c_str(), - iLast->first.first.c_str()); - break; - } - } - - if (iLast == iEnd) { - ++errors; - ++*pErrorCount; - printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs nothing in written map.\n", - v, - start.printable().c_str(), - end.printable().c_str(), - cur->getKey().toString().c_str()); - break; - } - - if (cur->getKey() != iLast->first.first) { - ++errors; - ++*pErrorCount; - printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' but expected '%s'\n", - v, - start.printable().c_str(), - end.printable().c_str(), - cur->getKey().toString().c_str(), - iLast->first.first.c_str()); - break; - } - if (cur->getValue() != iLast->second.get()) { - ++errors; - ++*pErrorCount; - printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' has tree value '%s' but expected '%s'\n", - v, - start.printable().c_str(), - end.printable().c_str(), - cur->getKey().toString().c_str(), - cur->getValue().toString().c_str(), - iLast->second.get().c_str()); - break; - } - - ASSERT(errors == 0); - - results.push_back(KeyValue(KeyValueRef(cur->getKey(), cur->getValue()))); - wait(cur->next()); - } - - // Make sure there are no further written kv pairs that would be present at this version. 
- while (1) { - iLast = i; - if (i == iEnd) - break; - ++i; - if (iLast->first.second <= v && iLast->second.present() && - (i == iEnd || i->first.first != iLast->first.first || i->first.second > v)) - break; - } - - if (iLast != iEnd) { - ++errors; - ++*pErrorCount; - printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree range ended but written has @%" PRId64 " '%s'\n", - v, - start.printable().c_str(), - end.printable().c_str(), - iLast->first.second, - iLast->first.first.c_str()); - } - - debug_printf( - "VerifyRangeReverse(@%" PRId64 ", %s, %s): start\n", v, start.printable().c_str(), end.printable().c_str()); - - // Randomly use a new cursor at the same version for the reverse range read, if the version is still available for - // opening new cursors - if (v >= btree->getOldestVersion() && deterministicRandom()->coinflip()) { - cur = btree->readAtVersion(v); - } - - // Now read the range from the tree in reverse order and compare to the saved results - wait(cur->findLastLessOrEqual(end)); - if (cur->isValid() && cur->getKey() == end) - wait(cur->prev()); - - state std::vector::const_reverse_iterator r = results.rbegin(); - - while (cur->isValid() && cur->getKey() >= start) { - if (r == results.rend()) { - ++errors; - ++*pErrorCount; - printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs nothing in written map.\n", - v, - start.printable().c_str(), - end.printable().c_str(), - cur->getKey().toString().c_str()); - break; - } - - if (cur->getKey() != r->key) { - ++errors; - ++*pErrorCount; - printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' but expected '%s'\n", - v, - start.printable().c_str(), - end.printable().c_str(), - cur->getKey().toString().c_str(), - r->key.toString().c_str()); - break; - } - if (cur->getValue() != r->value) { - ++errors; - ++*pErrorCount; - printf("VerifyRangeReverse(@%" PRId64 - ", %s, %s) ERROR: Tree key '%s' has tree value '%s' but expected '%s'\n", - v, - start.printable().c_str(), - end.printable().c_str(), - cur->getKey().toString().c_str(), - cur->getValue().toString().c_str(), - r->value.toString().c_str()); - break; - } - - ++r; - wait(cur->prev()); - } - - if (r != results.rend()) { - ++errors; - ++*pErrorCount; - printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree range ended but written has '%s'\n", - v, - start.printable().c_str(), - end.printable().c_str(), - r->key.toString().c_str()); - } - - return errors; -} - -// Verify the result of point reads for every set or cleared key at the given version -ACTOR Future seekAll(VersionedBTree* btree, - Version v, - std::map, Optional>* written, - int* pErrorCount) { - state std::map, Optional>::const_iterator i = written->cbegin(); - state std::map, Optional>::const_iterator iEnd = written->cend(); - state int errors = 0; - state Reference cur = btree->readAtVersion(v); - - while (i != iEnd) { - state std::string key = i->first.first; - state Version ver = i->first.second; - if (ver == v) { - state Optional val = i->second; - debug_printf("Verifying @%" PRId64 " '%s'\n", ver, key.c_str()); - state Arena arena; - wait(cur->findEqual(KeyRef(arena, key))); - - if (val.present()) { - if (!(cur->isValid() && cur->getKey() == key && cur->getValue() == val.get())) { - ++errors; - ++*pErrorCount; - if (!cur->isValid()) - printf("Verify ERROR: key_not_found: '%s' -> '%s' @%" PRId64 "\n", - key.c_str(), - val.get().c_str(), - ver); - else if (cur->getKey() != key) - printf("Verify ERROR: key_incorrect: found '%s' expected '%s' @%" PRId64 "\n", - 
cur->getKey().toString().c_str(), - key.c_str(), - ver); - else if (cur->getValue() != val.get()) - printf("Verify ERROR: value_incorrect: for '%s' found '%s' expected '%s' @%" PRId64 "\n", - cur->getKey().toString().c_str(), - cur->getValue().toString().c_str(), - val.get().c_str(), - ver); - } - } else { - if (cur->isValid() && cur->getKey() == key) { - ++errors; - ++*pErrorCount; - printf("Verify ERROR: cleared_key_found: '%s' -> '%s' @%" PRId64 "\n", - key.c_str(), - cur->getValue().toString().c_str(), - ver); - } - } - } - ++i; - } - return errors; -} - // Verify the result of point reads for every set or cleared key at the given version ACTOR Future seekAllBTreeCursor(VersionedBTree* btree, Version v, @@ -7709,7 +7020,7 @@ ACTOR Future seekAllBTreeCursor(VersionedBTree* btree, state Optional val = i->second; debug_printf("Verifying @%" PRId64 " '%s'\n", ver, key.c_str()); state Arena arena; - wait(cur.seekGTE(RedwoodRecordRef(KeyRef(arena, key), 0), 0)); + wait(cur.seekGTE(RedwoodRecordRef(KeyRef(arena, key)), 0)); bool foundKey = cur.isValid() && cur.get().key == key; bool hasValue = foundKey && cur.get().value.present(); @@ -7755,9 +7066,6 @@ ACTOR Future verify(VersionedBTree* btree, std::map, Optional>* written, int* pErrorCount, bool serial) { - state Future fRangeAll; - state Future fRangeRandom; - state Future fSeekAll; // Queue of committed versions still readable from btree state std::deque committedVersions; @@ -7782,40 +7090,30 @@ ACTOR Future verify(VersionedBTree* btree, v = committedVersions[deterministicRandom()->randomInt(0, committedVersions.size())]; debug_printf("Using committed version %" PRId64 "\n", v); + // Get a cursor at v so that v doesn't get expired between the possibly serial steps below. - state Reference cur = btree->readAtVersion(v); + state VersionedBTree::BTreeCursor cur; + wait(btree->initBTreeCursor(&cur, v)); debug_printf("Verifying entire key range at version %" PRId64 "\n", v); - if (deterministicRandom()->coinflip()) { - fRangeAll = - verifyRange(btree, LiteralStringRef(""), LiteralStringRef("\xff\xff"), v, written, pErrorCount); - } else { - fRangeAll = verifyRangeBTreeCursor( - btree, LiteralStringRef(""), LiteralStringRef("\xff\xff"), v, written, pErrorCount); - } + state Future fRangeAll = verifyRangeBTreeCursor( + btree, LiteralStringRef(""), LiteralStringRef("\xff\xff"), v, written, pErrorCount); if (serial) { wait(success(fRangeAll)); } Key begin = randomKV().key; Key end = randomKV().key; + debug_printf( "Verifying range (%s, %s) at version %" PRId64 "\n", toString(begin).c_str(), toString(end).c_str(), v); - if (deterministicRandom()->coinflip()) { - fRangeRandom = verifyRange(btree, begin, end, v, written, pErrorCount); - } else { - fRangeRandom = verifyRangeBTreeCursor(btree, begin, end, v, written, pErrorCount); - } + state Future fRangeRandom = verifyRangeBTreeCursor(btree, begin, end, v, written, pErrorCount); if (serial) { wait(success(fRangeRandom)); } debug_printf("Verifying seeks to each changed key at version %" PRId64 "\n", v); - if (deterministicRandom()->coinflip()) { - fSeekAll = seekAll(btree, v, written, pErrorCount); - } else { - fSeekAll = seekAllBTreeCursor(btree, v, written, pErrorCount); - } + state Future fSeekAll = seekAllBTreeCursor(btree, v, written, pErrorCount); if (serial) { wait(success(fSeekAll)); } @@ -7838,19 +7136,20 @@ ACTOR Future verify(VersionedBTree* btree, // Does a random range read, doesn't trap/report errors ACTOR Future randomReader(VersionedBTree* btree) { try { - state Reference cur; + state 
VersionedBTree::BTreeCursor cur; + loop { wait(yield()); - if (!cur || deterministicRandom()->random01() > .01) { - Version v = btree->getLastCommittedVersion(); - cur = btree->readAtVersion(v); + if (!cur.initialized() || deterministicRandom()->random01() > .01) { + wait(btree->initBTreeCursor(&cur, btree->getLastCommittedVersion())); } state KeyValue kv = randomKV(10, 0); - wait(cur->findFirstEqualOrGreater(kv.key)); + wait(cur.seekGTE(kv.key, 0)); state int c = deterministicRandom()->randomInt(0, 100); - while (cur->isValid() && c-- > 0) { - wait(success(cur->next())); + state bool direction = deterministicRandom()->coinflip(); + while (cur.isValid() && c-- > 0) { + wait(success(direction ? cur.moveNext() : cur.movePrev())); wait(yield()); } } @@ -7866,9 +7165,11 @@ ACTOR Future<Void> randomReader(VersionedBTree* btree) { struct IntIntPair { IntIntPair() {} IntIntPair(int k, int v) : k(k), v(v) {} - IntIntPair(Arena& arena, const IntIntPair& toCopy) { *this = toCopy; } + typedef IntIntPair Partial; + + void updateCache(Optional<Partial> cache, Arena& arena) const {} struct Delta { bool prefixSource; bool deleted; @@ -7877,6 +7178,13 @@ struct IntIntPair { IntIntPair apply(const IntIntPair& base, Arena& arena) { return { base.k + dk, base.v + dv }; } + IntIntPair apply(const Partial& cache) { return cache; } + + IntIntPair apply(Arena& arena, const IntIntPair& base, Optional<Partial>& cache) { + cache = IntIntPair(base.k + dk, base.v + dv); + return cache.get(); + } + void setPrefixSource(bool val) { prefixSource = val; } bool getPrefixSource() const { return prefixSource; } @@ -7972,13 +7280,6 @@ RedwoodRecordRef randomRedwoodRecordRef(const std::string& keyBuffer, const std: rec.value = StringRef((uint8_t*)valueBuffer.data(), deterministicRandom()->randomInt(0, valueBuffer.size())); } - int versionIntSize = deterministicRandom()->randomInt(0, 8) * 8; - if (versionIntSize > 0) { - --versionIntSize; - int64_t max = ((int64_t)1 << versionIntSize) - 1; - rec.version = deterministicRandom()->randomInt64(0, max); - } - return rec; } @@ -7988,10 +7289,7 @@ TEST_CASE("/redwood/correctness/unit/RedwoodRecordRef") { ASSERT(RedwoodRecordRef::Delta::LengthFormatSizes[2] == 6); ASSERT(RedwoodRecordRef::Delta::LengthFormatSizes[3] == 8); - ASSERT(RedwoodRecordRef::Delta::VersionDeltaSizes[0] == 0); - ASSERT(RedwoodRecordRef::Delta::VersionDeltaSizes[1] == 4); - ASSERT(RedwoodRecordRef::Delta::VersionDeltaSizes[2] == 6); - ASSERT(RedwoodRecordRef::Delta::VersionDeltaSizes[3] == 8); + printf("sizeof(RedwoodRecordRef) = %d\n", sizeof(RedwoodRecordRef)); // Test pageID stuff.
{ @@ -8007,35 +7305,35 @@ TEST_CASE("/redwood/correctness/unit/RedwoodRecordRef") { ASSERT(r2.getChildPage().begin() != id.begin()); } - deltaTest(RedwoodRecordRef(LiteralStringRef(""), 0, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef(""), 0, LiteralStringRef(""))); + deltaTest(RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef("abc"), 0, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef("abc"), 0, LiteralStringRef(""))); + deltaTest(RedwoodRecordRef(LiteralStringRef("abc"), LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef("abc"), LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef("abc"), 0, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef("abcd"), 0, LiteralStringRef(""))); + deltaTest(RedwoodRecordRef(LiteralStringRef("abc"), LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef("abcd"), LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef("abcd"), 2, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef("abc"), 2, LiteralStringRef(""))); + deltaTest(RedwoodRecordRef(LiteralStringRef("abcd"), LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef("abc"), LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(std::string(300, 'k'), 2, std::string(1e6, 'v')), - RedwoodRecordRef(std::string(300, 'k'), 2, LiteralStringRef(""))); + deltaTest(RedwoodRecordRef(std::string(300, 'k'), std::string(1e6, 'v')), + RedwoodRecordRef(std::string(300, 'k'), LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef(""), 2, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef(""))); + deltaTest(RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef(""), 0xffff, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef(""))); + deltaTest(RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef(""), 0xffff, LiteralStringRef(""))); + deltaTest(RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef(""), 0xffffff, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef(""))); + deltaTest(RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef(""), 0xffffff, LiteralStringRef(""))); + deltaTest(RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef(""), LiteralStringRef(""))); Arena mem; double start; @@ -8075,9 +7373,6 @@ TEST_CASE("/redwood/correctness/unit/RedwoodRecordRef") { rec1.key = LiteralStringRef("alksdfjaklsdfjlkasdjflkasdjfklajsdflk;ajsdflkajdsflkjadsf1"); rec2.key = LiteralStringRef("alksdfjaklsdfjlkasdjflkasdjfklajsdflk;ajsdflkajdsflkjadsf234"); - rec1.version = deterministicRandom()->randomInt64(0, std::numeric_limits::max()); - rec2.version = deterministicRandom()->randomInt64(0, std::numeric_limits::max()); - start = timer(); total = 0; count = 100e6; @@ 
-8136,8 +7431,8 @@ TEST_CASE("/redwood/correctness/unit/RedwoodRecordRef") { TEST_CASE("/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { // Sanity check on delta tree node format - ASSERT(DeltaTree::Node::headerSize(false) == 4); - ASSERT(DeltaTree::Node::headerSize(true) == 8); + ASSERT(DeltaTree2::Node::headerSize(false) == 4); + ASSERT(DeltaTree2::Node::headerSize(true) == 8); const int N = deterministicRandom()->randomInt(200, 1000); @@ -8153,9 +7448,6 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { std::string v = deterministicRandom()->randomAlphaNumeric(30); RedwoodRecordRef rec; rec.key = StringRef(arena, k); - rec.version = deterministicRandom()->coinflip() - ? deterministicRandom()->randomInt64(0, std::numeric_limits::max()) - : invalidVersion; if (deterministicRandom()->coinflip()) { rec.value = StringRef(arena, v); } @@ -8313,17 +7605,204 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { return Void(); } +TEST_CASE("/redwood/correctness/unit/deltaTree/RedwoodRecordRef2") { + // Sanity check on delta tree node format + ASSERT(DeltaTree2::Node::headerSize(false) == 4); + ASSERT(DeltaTree2::Node::headerSize(true) == 8); + ASSERT(sizeof(DeltaTree2::DecodedNode) == 28); + + const int N = deterministicRandom()->randomInt(200, 1000); + + RedwoodRecordRef prev; + RedwoodRecordRef next(LiteralStringRef("\xff\xff\xff\xff")); + + Arena arena; + std::set uniqueItems; + + // Add random items to uniqueItems until its size is N + while (uniqueItems.size() < N) { + std::string k = deterministicRandom()->randomAlphaNumeric(30); + std::string v = deterministicRandom()->randomAlphaNumeric(30); + RedwoodRecordRef rec; + rec.key = StringRef(arena, k); + if (deterministicRandom()->coinflip()) { + rec.value = StringRef(arena, v); + } + if (uniqueItems.count(rec) == 0) { + uniqueItems.insert(rec); + } + } + std::vector items(uniqueItems.begin(), uniqueItems.end()); + + int bufferSize = N * 100; + bool largeTree = bufferSize > DeltaTree2::SmallSizeLimit; + DeltaTree2* tree = (DeltaTree2*)new uint8_t[bufferSize]; + + tree->build(bufferSize, &items[0], &items[items.size()], &prev, &next); + + printf("Count=%d Size=%d InitialHeight=%d largeTree=%d\n", + (int)items.size(), + (int)tree->size(), + (int)tree->initialHeight, + largeTree); + debug_printf("Data(%p): %s\n", tree, StringRef((uint8_t*)tree, tree->size()).toHexString().c_str()); + + DeltaTree2::DecodeCache cache(prev, next); + DeltaTree2::Cursor c(&cache, tree); + + // Test delete/insert behavior for each item, making no net changes + printf("Testing seek/delete/insert for existing keys with random values\n"); + ASSERT(tree->numItems == items.size()); + for (auto rec : items) { + // Insert existing should fail + ASSERT(!c.insert(rec)); + ASSERT(tree->numItems == items.size()); + + // Erase existing should succeed + ASSERT(c.erase(rec)); + ASSERT(tree->numItems == items.size() - 1); + + // Erase deleted should fail + ASSERT(!c.erase(rec)); + ASSERT(tree->numItems == items.size() - 1); + + // Insert deleted should succeed + ASSERT(c.insert(rec)); + ASSERT(tree->numItems == items.size()); + + // Insert existing should fail + ASSERT(!c.insert(rec)); + ASSERT(tree->numItems == items.size()); + } + + DeltaTree2::Cursor fwd = c; + DeltaTree2::Cursor rev = c; + + DeltaTree2::DecodeCache cacheValuesOnly(prev, next); + DeltaTree2::Cursor fwdValueOnly( + &cacheValuesOnly, (DeltaTree2*)tree); + + printf("Verifying tree contents using forward, reverse, and value-only iterators\n"); + ASSERT(fwd.moveFirst()); + 
ASSERT(fwdValueOnly.moveFirst()); + ASSERT(rev.moveLast()); + + int i = 0; + while (1) { + if (fwd.get() != items[i]) { + printf("forward iterator i=%d\n %s found\n %s expected\n", + i, + fwd.get().toString().c_str(), + items[i].toString().c_str()); + printf("Cursor: %s\n", fwd.toString().c_str()); + ASSERT(false); + } + if (rev.get() != items[items.size() - 1 - i]) { + printf("reverse iterator i=%d\n %s found\n %s expected\n", + i, + rev.get().toString().c_str(), + items[items.size() - 1 - i].toString().c_str()); + printf("Cursor: %s\n", rev.toString().c_str()); + ASSERT(false); + } + if (fwdValueOnly.get().value != items[i].value) { + printf("forward values-only iterator i=%d\n %s found\n %s expected\n", + i, + fwdValueOnly.get().toString().c_str(), + items[i].toString().c_str()); + printf("Cursor: %s\n", fwdValueOnly.toString().c_str()); + ASSERT(false); + } + ++i; + + bool more = fwd.moveNext(); + ASSERT(fwdValueOnly.moveNext() == more); + ASSERT(rev.movePrev() == more); + + ASSERT(fwd.valid() == more); + ASSERT(fwdValueOnly.valid() == more); + ASSERT(rev.valid() == more); + + if (!fwd.valid()) { + break; + } + } + ASSERT(i == items.size()); + + { + DeltaTree2::DecodeCache cache(prev, next); + DeltaTree2::Cursor c(&cache, tree); + + printf("Doing 20M random seeks using the same cursor from the same mirror.\n"); + double start = timer(); + + for (int i = 0; i < 20000000; ++i) { + const RedwoodRecordRef& query = items[deterministicRandom()->randomInt(0, items.size())]; + if (!c.seekLessThanOrEqual(query)) { + printf("Not found! query=%s\n", query.toString().c_str()); + ASSERT(false); + } + if (c.get() != query) { + printf("Found incorrect node! query=%s found=%s\n", + query.toString().c_str(), + c.get().toString().c_str()); + ASSERT(false); + } + } + double elapsed = timer() - start; + printf("Elapsed %f\n", elapsed); + } + + // { + // printf("Doing 5M random seeks using 10k random cursors, each from a different mirror.\n"); + // double start = timer(); + // std::vector::Mirror*> mirrors; + // std::vector::Cursor> cursors; + // for (int i = 0; i < 10000; ++i) { + // mirrors.push_back(new DeltaTree2::Mirror(tree, &prev, &next)); + // cursors.push_back(mirrors.back()->getCursor()); + // } + + // for (int i = 0; i < 5000000; ++i) { + // const RedwoodRecordRef& query = items[deterministicRandom()->randomInt(0, items.size())]; + // DeltaTree2::Cursor& c = cursors[deterministicRandom()->randomInt(0, cursors.size())]; + // if (!c.seekLessThanOrEqual(query)) { + // printf("Not found! query=%s\n", query.toString().c_str()); + // ASSERT(false); + // } + // if (c.get() != query) { + // printf("Found incorrect node! 
query=%s found=%s\n", + // query.toString().c_str(), + // c.get().toString().c_str()); + // ASSERT(false); + // } + // } + // double elapsed = timer() - start; + // printf("Elapsed %f\n", elapsed); + // } + + return Void(); +} + TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { const int N = 200; - IntIntPair prev = { 1, 0 }; - IntIntPair next = { 10000, 10000 }; + IntIntPair lowerBound = { 0, 0 }; + IntIntPair upperBound = { 1000, 1000 }; state std::function randomPair = [&]() { - return IntIntPair( - { deterministicRandom()->randomInt(prev.k, next.k), deterministicRandom()->randomInt(prev.v, next.v) }); + // Generate a pair >= lowerBound and < upperBound + int k = deterministicRandom()->randomInt(lowerBound.k, upperBound.k + 1); + int v = deterministicRandom()->randomInt(lowerBound.v, upperBound.v); + + // Only generate even values so the tests below can approach and find each + // key with a directional seek of the adjacent absent value on either side. + v -= v % 2; + + return IntIntPair(k, v); }; - // Build a set of N unique items, where no consecutive items are in the set, a requirement of the seek behavior tests. + // Build a set of N unique items, where no consecutive items are in the set, a requirement of the seek behavior + // tests. std::set uniqueItems; while (uniqueItems.size() < N) { IntIntPair p = randomPair(); @@ -8338,55 +7817,44 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { // Build tree of items std::vector items(uniqueItems.begin(), uniqueItems.end()); - int bufferSize = N * 2 * 20; + int bufferSize = N * 2 * 30; + DeltaTree* tree = (DeltaTree*)new uint8_t[bufferSize]; - int builtSize = tree->build(bufferSize, &items[0], &items[items.size()], &prev, &next); + int builtSize = tree->build(bufferSize, &items[0], &items[items.size()], &lowerBound, &upperBound); ASSERT(builtSize <= bufferSize); + DeltaTree::Mirror r(tree, &lowerBound, &upperBound); - DeltaTree::Mirror r(tree, &prev, &next); - - // Grow uniqueItems until tree is full, adding half of new items to toDelete - std::vector toDelete; - while (1) { - IntIntPair p = randomPair(); - auto nextP = p; // also check if next highest/lowest key is not in the set - nextP.v++; - auto prevP = p; - prevP.v--; - if (uniqueItems.count(p) == 0 && uniqueItems.count(nextP) == 0 && uniqueItems.count(prevP) == 0) { - if (!r.insert(p)) { - break; - }; - uniqueItems.insert(p); - if (deterministicRandom()->coinflip()) { - toDelete.push_back(p); - } - // printf("Inserted %s size=%d\n", items.back().toString().c_str(), tree->size()); - } - } - - ASSERT(tree->numItems > 2 * N); - ASSERT(tree->size() <= bufferSize); - - // Update items vector - items = std::vector(uniqueItems.begin(), uniqueItems.end()); + DeltaTree2* tree2 = (DeltaTree2*)new uint8_t[bufferSize]; + int builtSize2 = tree2->build(bufferSize, &items[0], &items[items.size()], &lowerBound, &upperBound); + ASSERT(builtSize2 <= bufferSize); + DeltaTree2::DecodeCache cache(lowerBound, upperBound); + DeltaTree2::Cursor cur2(&cache, tree2); auto printItems = [&] { for (int k = 0; k < items.size(); ++k) { - printf("%d %s\n", k, items[k].toString().c_str()); + debug_printf("%d/%d %s\n", k + 1, items.size(), items[k].toString().c_str()); } }; - printf("Count=%d Size=%d InitialHeight=%d MaxHeight=%d\n", - (int)items.size(), - (int)tree->size(), - (int)tree->initialHeight, - (int)tree->maxHeight); - debug_printf("Data(%p): %s\n", tree, StringRef((uint8_t*)tree, tree->size()).toHexString().c_str()); + auto printTrees = [&] { + printf("DeltaTree: Count=%d 
Size=%d InitialHeight=%d MaxHeight=%d\n", + (int)tree->numItems, + (int)tree->size(), + (int)tree->initialHeight, + (int)tree->maxHeight); + debug_printf("Data(%p): %s\n", tree, StringRef((uint8_t*)tree, tree->size()).toHexString().c_str()); + + printf("DeltaTree2: Count=%d Size=%d InitialHeight=%d MaxHeight=%d\n", + (int)tree2->numItems, + (int)tree2->size(), + (int)tree2->initialHeight, + (int)tree2->maxHeight); + debug_printf("Data(%p): %s\n", tree2, StringRef((uint8_t*)tree2, tree2->size()).toHexString().c_str()); + }; // Iterate through items and tree forward and backward, verifying tree contents. auto scanAndVerify = [&]() { - printf("Verify tree contents.\n"); + printf("Verify DeltaTree contents.\n"); DeltaTree::Cursor fwd = r.getCursor(); DeltaTree::Cursor rev = r.getCursor(); @@ -8426,56 +7894,162 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { } }; + // Iterate through items and tree forward and backward, verifying tree contents. + auto scanAndVerify2 = [&]() { + printf("Verify DeltaTree2 contents.\n"); + + DeltaTree2::Cursor fwd(&cache, tree2); + DeltaTree2::Cursor rev(&cache, tree2); + + ASSERT(fwd.moveFirst()); + ASSERT(rev.moveLast()); + + for (int i = 0; i < items.size(); ++i) { + if (fwd.get() != items[i]) { + printItems(); + printf("forward iterator i=%d\n %s found\n %s expected\n", + i, + fwd.get().toString().c_str(), + items[i].toString().c_str()); + ASSERT(false); + } + if (rev.get() != items[items.size() - 1 - i]) { + printItems(); + printf("reverse iterator i=%d\n %s found\n %s expected\n", + i, + rev.get().toString().c_str(), + items[items.size() - 1 - i].toString().c_str()); + ASSERT(false); + } + + // Advance iterator, check scanning cursors for correct validity state + int j = i + 1; + bool end = j == items.size(); + + ASSERT(fwd.moveNext() == !end); + ASSERT(rev.movePrev() == !end); + ASSERT(fwd.valid() == !end); + ASSERT(rev.valid() == !end); + + if (end) { + break; + } + } + }; + + printItems(); + printTrees(); + // Verify tree contents scanAndVerify(); + scanAndVerify2(); + + // Grow uniqueItems until tree is full, adding half of new items to toDelete + std::vector toDelete; + int maxInsert = 9999999; + bool shouldBeFull = false; + while (maxInsert-- > 0) { + IntIntPair p = randomPair(); + // Insert record if it, its predecessor, and its successor are not present. + // Test data is intentionally sparse to test finding each record with a directional + // seek from each adjacent possible but not present record. 
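+ // Worked example of the sparsity invariant: once (5,10) is inserted, (5,9) and (5,11)
+ // can never be inserted, so the seek tests below can probe q.v + 1 with
+ // seekLessThanOrEqual() and q.v - 1 with seekGreaterThanOrEqual() and must land back
+ // on (5,10) from either side.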
+ if (uniqueItems.count(p) == 0 && uniqueItems.count(IntIntPair(p.k, p.v - 1)) == 0 && + uniqueItems.count(IntIntPair(p.k, p.v + 1)) == 0) { + if (!cur2.insert(p)) { + shouldBeFull = true; + break; + }; + ASSERT(r.insert(p)); + uniqueItems.insert(p); + if (deterministicRandom()->coinflip()) { + toDelete.push_back(p); + } + // printf("Inserted %s size=%d\n", items.back().toString().c_str(), tree->size()); + } + } + + // If the tree refused to insert an item, the count should be at least 2*N + ASSERT(!shouldBeFull || tree->numItems > 2 * N); + ASSERT(tree->size() <= bufferSize); + + // Update items vector + items = std::vector(uniqueItems.begin(), uniqueItems.end()); + + printItems(); + printTrees(); + + // Verify tree contents + scanAndVerify(); + scanAndVerify2(); // Create a new mirror, decoding the tree from scratch since insert() modified both the tree and the mirror - r = DeltaTree::Mirror(tree, &prev, &next); + r = DeltaTree::Mirror(tree, &lowerBound, &upperBound); + cache.clear(); scanAndVerify(); + scanAndVerify2(); - // For each randomly selected new item to be deleted, delete it from the DeltaTree and from uniqueItems + // For each randomly selected new item to be deleted, delete it from the DeltaTree2 and from uniqueItems printf("Deleting some items\n"); for (auto p : toDelete) { uniqueItems.erase(p); + DeltaTree::Cursor c = r.getCursor(); ASSERT(c.seekLessThanOrEqual(p)); c.erase(); + + ASSERT(cur2.seekLessThanOrEqual(p)); + cur2.erase(); } // Update items vector items = std::vector(uniqueItems.begin(), uniqueItems.end()); + printItems(); + printTrees(); + // Verify tree contents after deletions scanAndVerify(); + scanAndVerify2(); printf("Verifying insert/erase behavior for existing items\n"); // Test delete/insert behavior for each item, making no net changes for (auto p : items) { // Insert existing should fail ASSERT(!r.insert(p)); + ASSERT(!cur2.insert(p)); // Erase existing should succeed ASSERT(r.erase(p)); + ASSERT(cur2.erase(p)); // Erase deleted should fail ASSERT(!r.erase(p)); + ASSERT(!cur2.erase(p)); // Insert deleted should succeed ASSERT(r.insert(p)); + ASSERT(cur2.insert(p)); // Insert existing should fail ASSERT(!r.insert(p)); + ASSERT(!cur2.insert(p)); } + printItems(); + printTrees(); + // Tree contents should still match items vector scanAndVerify(); + scanAndVerify2(); printf("Verifying seek behaviors\n"); DeltaTree::Cursor s = r.getCursor(); + DeltaTree2::Cursor s2(&cache, tree2); // SeekLTE to each element for (int i = 0; i < items.size(); ++i) { IntIntPair p = items[i]; IntIntPair q = p; + ASSERT(s.seekLessThanOrEqual(q)); if (s.get() != p) { printItems(); @@ -8485,12 +8059,23 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { p.toString().c_str()); ASSERT(false); } + + ASSERT(s2.seekLessThanOrEqual(q)); + if (s2.get() != p) { + printItems(); + printf("seekLessThanOrEqual(%s) found %s expected %s\n", + q.toString().c_str(), + s2.get().toString().c_str(), + p.toString().c_str()); + ASSERT(false); + } } // SeekGTE to each element for (int i = 0; i < items.size(); ++i) { IntIntPair p = items[i]; IntIntPair q = p; + ASSERT(s.seekGreaterThanOrEqual(q)); if (s.get() != p) { printItems(); @@ -8500,6 +8085,16 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { p.toString().c_str()); ASSERT(false); } + + ASSERT(s2.seekGreaterThanOrEqual(q)); + if (s2.get() != p) { + printItems(); + printf("seekGreaterThanOrEqual(%s) found %s expected %s\n", + q.toString().c_str(), + s2.get().toString().c_str(), + p.toString().c_str()); + ASSERT(false); + 
} } // SeekLTE to the next possible int pair value after each element to make sure the base element is found @@ -8508,6 +8103,7 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { IntIntPair p = items[i]; IntIntPair q = p; q.v++; + ASSERT(s.seekLessThanOrEqual(q)); if (s.get() != p) { printItems(); @@ -8517,6 +8113,16 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { p.toString().c_str()); ASSERT(false); } + + ASSERT(s2.seekLessThanOrEqual(q)); + if (s2.get() != p) { + printItems(); + printf("seekLessThanOrEqual(%s) found %s expected %s\n", + q.toString().c_str(), + s2.get().toString().c_str(), + p.toString().c_str()); + ASSERT(false); + } } // SeekGTE to the previous possible int pair value after each element to make sure the base element is found @@ -8525,6 +8131,7 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { IntIntPair p = items[i]; IntIntPair q = p; q.v--; + ASSERT(s.seekGreaterThanOrEqual(q)); if (s.get() != p) { printItems(); @@ -8534,6 +8141,16 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { p.toString().c_str()); ASSERT(false); } + + ASSERT(s2.seekGreaterThanOrEqual(q)); + if (s2.get() != p) { + printItems(); + printf("seekGreaterThanOrEqual(%s) found %s expected %s\n", + q.toString().c_str(), + s2.get().toString().c_str(), + p.toString().c_str()); + ASSERT(false); + } } // SeekLTE to each element N times, using every element as a hint @@ -8615,11 +8232,56 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { double(count) / elapsed / 1e6); }; + auto skipSeekPerformance2 = [&](int jumpMax, bool old, bool useHint, int count) { + // Skip to a series of increasing items, jump by up to jumpMax units forward in the + // items, wrapping around to 0. + double start = timer(); + s2.moveFirst(); + auto first = s2; + int pos = 0; + for (int c = 0; c < count; ++c) { + int jump = deterministicRandom()->randomInt(0, jumpMax); + int newPos = pos + jump; + if (newPos >= items.size()) { + pos = 0; + newPos = jump; + s2 = first; + } + IntIntPair q = items[newPos]; + ++q.v; + if (old) { + if (useHint) { + // s.seekLessThanOrEqualOld(q, 0, &s, newPos - pos); + } else { + // s.seekLessThanOrEqualOld(q, 0, nullptr, 0); + } + } else { + if (useHint) { + // s.seekLessThanOrEqual(q, 0, &s, newPos - pos); + } else { + s2.seekLessThanOrEqual(q); + } + } + pos = newPos; + } + double elapsed = timer() - start; + printf("DeltaTree2 Seek/skip test, count=%d jumpMax=%d, items=%d, oldSeek=%d useHint=%d: Elapsed %f seconds " + "%.2f M/s\n", + count, + jumpMax, + items.size(), + old, + useHint, + elapsed, + double(count) / elapsed / 1e6); + }; + // Compare seeking to nearby elements with and without hints, using the old and new SeekLessThanOrEqual methods. // TODO: Once seekLessThanOrEqual() with a hint is as fast as seekLessThanOrEqualOld, remove it. + skipSeekPerformance(8, false, false, 80e6); + skipSeekPerformance2(8, false, false, 80e6); skipSeekPerformance(8, true, false, 80e6); skipSeekPerformance(8, true, true, 80e6); - skipSeekPerformance(8, false, false, 80e6); skipSeekPerformance(8, false, true, 80e6); // Repeatedly seek for one of a set of pregenerated random pairs and time it. 
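// That benchmark, like the skip/seek runs above, uses the same simple timing harness;
// a minimal sketch of the pattern, reusing cursor s2 and items from this test:
//
//   double start = timer();
//   for (int i = 0; i < count; ++i) {
//       s2.seekLessThanOrEqual(items[deterministicRandom()->randomInt(0, items.size())]);
//   }
//   double elapsed = timer() - start;
//   printf("%.2f M seeks/s\n", double(count) / elapsed / 1e6);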
@@ -9013,7 +8675,9 @@ TEST_CASE("/redwood/correctness/btree") {
 			// Create new promise stream and start the verifier again
 			committedVersions = PromiseStream<Version>();
 			verifyTask = verify(btree, committedVersions.getFuture(), &written, &errorCount, serialTest);
-			randomTask = randomReader(btree) || btree->getError();
+			if (!serialTest) {
+				randomTask = randomReader(btree) || btree->getError();
+			}
 			committedVersions.send(v);
 		}
 
@@ -9057,11 +8721,11 @@ ACTOR Future<Void> randomSeeks(VersionedBTree* btree, int count, char firstChar,
 	state Version readVer = btree->getLatestVersion();
 	state int c = 0;
 	state double readStart = timer();
-	printf("Executing %d random seeks\n", count);
-	state Reference<IStoreCursor> cur = btree->readAtVersion(readVer);
+	state VersionedBTree::BTreeCursor cur;
+	wait(btree->initBTreeCursor(&cur, readVer));
 	while (c < count) {
 		state Key k = randomString(20, firstChar, lastChar);
-		wait(success(cur->findFirstEqualOrGreater(k)));
+		wait(cur.seekGTE(k, 0));
 		++c;
 	}
 	double elapsed = timer() - readStart;
@@ -9078,21 +8742,22 @@ ACTOR Future<Void> randomScans(VersionedBTree* btree,
 	state Version readVer = btree->getLatestVersion();
 	state int c = 0;
 	state double readStart = timer();
-	printf("Executing %d random scans\n", count);
-	state Reference<IStoreCursor> cur = btree->readAtVersion(readVer);
+	state VersionedBTree::BTreeCursor cur;
+	wait(btree->initBTreeCursor(&cur, readVer));
+	state bool adaptive = readAhead < 0;
 	state int totalScanBytes = 0;
 	while (c++ < count) {
 		state Key k = randomString(20, firstChar, lastChar);
-		wait(success(cur->findFirstEqualOrGreater(k, readAhead)));
+		wait(cur.seekGTE(k, readAhead));
 		if (adaptive) {
 			readAhead = totalScanBytes / c;
 		}
 		state int w = width;
-		while (w > 0 && cur->isValid()) {
-			totalScanBytes += cur->getKey().size();
-			totalScanBytes += cur->getValue().size();
-			wait(cur->next());
+		state bool direction = deterministicRandom()->coinflip();
+		while (w > 0 && cur.isValid()) {
+			totalScanBytes += cur.get().expectedSize();
+			wait(success(direction ? cur.moveNext() : cur.movePrev()));
 			--w;
 		}
 	}
@@ -9304,9 +8969,6 @@ TEST_CASE(":/redwood/performance/extentQueue") {
 
 TEST_CASE(":/redwood/performance/set") {
 	state SignalableActorCollection actors;
 
-	g_redwoodMetricsActor = Void(); // Prevent trace event metrics from starting
-	g_redwoodMetrics.clear();
-
 	state std::string fileName = params.get("fileName").orDefault("unittest.redwood");
 	state int pageSize = params.getInt("pageSize").orDefault(SERVER_KNOBS->REDWOOD_DEFAULT_PAGE_SIZE);
 	state int extentSize = params.getInt("extentSize").orDefault(SERVER_KNOBS->REDWOOD_DEFAULT_EXTENT_SIZE);
@@ -9333,7 +8995,10 @@ TEST_CASE(":/redwood/performance/set") {
 	state int concurrentScans = params.getInt("concurrentScans").orDefault(64);
 	state int seeks = params.getInt("seeks").orDefault(1000000);
 	state int scans = params.getInt("scans").orDefault(20000);
+	state bool pagerMemoryOnly = params.getInt("pagerMemoryOnly").orDefault(0);
+	state bool traceMetrics = params.getInt("traceMetrics").orDefault(0);
 
+	printf("pagerMemoryOnly: %d\n", pagerMemoryOnly);
 	printf("pageSize: %d\n", pageSize);
 	printf("extentSize: %d\n", extentSize);
 	printf("pageCacheBytes: %" PRId64 "\n", pageCacheBytes);
@@ -9357,13 +9022,19 @@ TEST_CASE(":/redwood/performance/set") {
 	printf("openExisting: %d\n", openExisting);
 	printf("insertRecords: %d\n", insertRecords);
 
+	// If using stdout for metrics, prevent the trace event metrics logger from starting
+	if (!traceMetrics) {
+		g_redwoodMetricsActor = Void();
+		g_redwoodMetrics.clear();
+	}
+
 	if (!openExisting) {
 		printf("Deleting old test data\n");
 		deleteFile(fileName);
 	}
 
-	DWALPager* pager =
-	    new DWALPager(pageSize, extentSize, fileName, pageCacheBytes, remapCleanupWindow, concurrentExtentReads);
+	DWALPager* pager = new DWALPager(
+	    pageSize, extentSize, fileName, pageCacheBytes, remapCleanupWindow, concurrentExtentReads, pagerMemoryOnly);
 	state VersionedBTree* btree = new VersionedBTree(pager, fileName);
 	wait(btree->init());
 	printf("Initialized. StorageBytes=%s\n", btree->getStorageBytes().toString().c_str());
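Aside on the adaptive read-ahead mode of randomScans above: when a negative readAhead is passed, each subsequent seek requests read-ahead equal to the mean number of bytes consumed per completed scan so far. A minimal sketch of that rule follows; the function name is illustrative and not part of the patch.

	// Sketch of the adaptive rule in randomScans: once enabled, each seek requests
	// read-ahead equal to the running average of bytes consumed per completed scan.
	// e.g. 80000 total bytes over 10 completed scans -> the next seek requests 8000.
	int nextReadAhead(int totalScanBytes, int scansCompleted) {
		return scansCompleted > 0 ? totalScanBytes / scansCompleted : 0;
	}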
StorageBytes=%s\n", btree->getStorageBytes().toString().c_str()); @@ -9377,11 +9048,10 @@ TEST_CASE(":/redwood/performance/set") { printf("Starting.\n"); state double intervalStart = timer(); state double start = intervalStart; + state int sinceYield = 0; if (insertRecords) { while (kvBytesTotal < kvBytesTarget) { - wait(yield()); - Version lastVer = btree->getLatestVersion(); state Version version = lastVer + 1; btree->setWriteVersion(version); @@ -9411,7 +9081,10 @@ TEST_CASE(":/redwood/performance/set") { ++recordsThisCommit; } - wait(yield()); + if (++sinceYield >= 100) { + sinceYield = 0; + wait(yield()); + } } if (kvBytesThisCommit >= maxKVBytesPerCommit || recordsThisCommit >= maxRecordsPerCommit) { @@ -9430,7 +9103,9 @@ TEST_CASE(":/redwood/performance/set") { double* pIntervalStart = &intervalStart; commit = map(btree->commit(), [=](Void result) { - printf("Committed:\n%s\n", g_redwoodMetrics.toString(true).c_str()); + if (!traceMetrics) { + printf("%s\n", g_redwoodMetrics.toString(true).c_str()); + } double elapsed = timer() - *pIntervalStart; printf("Committed %d keyValueBytes in %d records in %f seconds, %.2f MB/s\n", kvb, @@ -9454,56 +9129,27 @@ TEST_CASE(":/redwood/performance/set") { printf("StorageBytes=%s\n", btree->getStorageBytes().toString().c_str()); } - printf("Warming cache with seeks\n"); - for (int x = 0; x < concurrentSeeks; ++x) { - actors.add(randomSeeks(btree, seeks / concurrentSeeks, firstKeyChar, lastKeyChar)); + if (scans > 0) { + printf("Parallel scans, count=%d, concurrency=%d, no readAhead ...\n", scans, concurrentScans); + for (int x = 0; x < concurrentScans; ++x) { + actors.add(randomScans(btree, scans / concurrentScans, 50, 0, firstKeyChar, lastKeyChar)); + } + wait(actors.signalAndReset()); + if (!traceMetrics) { + printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); + } } - wait(actors.signalAndReset()); - printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); - printf("Serial scans with adaptive readAhead...\n"); - actors.add(randomScans(btree, scans, 50, -1, firstKeyChar, lastKeyChar)); - wait(actors.signalAndReset()); - printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); - - printf("Serial scans with readAhead 3 pages...\n"); - actors.add(randomScans(btree, scans, 50, 12000, firstKeyChar, lastKeyChar)); - wait(actors.signalAndReset()); - printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); - - printf("Serial scans with readAhead 2 pages...\n"); - actors.add(randomScans(btree, scans, 50, 8000, firstKeyChar, lastKeyChar)); - wait(actors.signalAndReset()); - printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); - - printf("Serial scans with readAhead 1 page...\n"); - actors.add(randomScans(btree, scans, 50, 4000, firstKeyChar, lastKeyChar)); - wait(actors.signalAndReset()); - printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); - - printf("Serial scans...\n"); - actors.add(randomScans(btree, scans, 50, 0, firstKeyChar, lastKeyChar)); - wait(actors.signalAndReset()); - printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); - - printf("Parallel scans, concurrency=%d, no readAhead ...\n", concurrentScans); - for (int x = 0; x < concurrentScans; ++x) { - actors.add(randomScans(btree, scans / concurrentScans, 50, 0, firstKeyChar, lastKeyChar)); + if (seeks > 0) { + printf("Parallel seeks, count=%d, concurrency=%d ...\n", seeks, concurrentSeeks); + for (int x = 0; x < concurrentSeeks; ++x) { + actors.add(randomSeeks(btree, seeks / concurrentSeeks, firstKeyChar, lastKeyChar)); + 
@@ -9454,56 +9129,27 @@ TEST_CASE(":/redwood/performance/set") {
 		printf("StorageBytes=%s\n", btree->getStorageBytes().toString().c_str());
 	}
 
-	printf("Warming cache with seeks\n");
-	for (int x = 0; x < concurrentSeeks; ++x) {
-		actors.add(randomSeeks(btree, seeks / concurrentSeeks, firstKeyChar, lastKeyChar));
-	}
-	wait(actors.signalAndReset());
-	printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str());
-
-	printf("Serial scans with adaptive readAhead...\n");
-	actors.add(randomScans(btree, scans, 50, -1, firstKeyChar, lastKeyChar));
-	wait(actors.signalAndReset());
-	printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str());
-
-	printf("Serial scans with readAhead 3 pages...\n");
-	actors.add(randomScans(btree, scans, 50, 12000, firstKeyChar, lastKeyChar));
-	wait(actors.signalAndReset());
-	printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str());
-
-	printf("Serial scans with readAhead 2 pages...\n");
-	actors.add(randomScans(btree, scans, 50, 8000, firstKeyChar, lastKeyChar));
-	wait(actors.signalAndReset());
-	printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str());
-
-	printf("Serial scans with readAhead 1 page...\n");
-	actors.add(randomScans(btree, scans, 50, 4000, firstKeyChar, lastKeyChar));
-	wait(actors.signalAndReset());
-	printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str());
-
-	printf("Serial scans...\n");
-	actors.add(randomScans(btree, scans, 50, 0, firstKeyChar, lastKeyChar));
-	wait(actors.signalAndReset());
-	printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str());
-
-	printf("Parallel scans, concurrency=%d, no readAhead ...\n", concurrentScans);
-	for (int x = 0; x < concurrentScans; ++x) {
-		actors.add(randomScans(btree, scans / concurrentScans, 50, 0, firstKeyChar, lastKeyChar));
-	}
-	wait(actors.signalAndReset());
-	printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str());
-
-	printf("Serial seeks...\n");
-	actors.add(randomSeeks(btree, seeks, firstKeyChar, lastKeyChar));
-	wait(actors.signalAndReset());
-	printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str());
-
-	printf("Parallel seeks, concurrency=%d ...\n", concurrentSeeks);
-	for (int x = 0; x < concurrentSeeks; ++x) {
-		actors.add(randomSeeks(btree, seeks / concurrentSeeks, firstKeyChar, lastKeyChar));
-	}
-	wait(actors.signalAndReset());
-	printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str());
+	if (scans > 0) {
+		printf("Parallel scans, count=%d, concurrency=%d, no readAhead ...\n", scans, concurrentScans);
+		for (int x = 0; x < concurrentScans; ++x) {
+			actors.add(randomScans(btree, scans / concurrentScans, 50, 0, firstKeyChar, lastKeyChar));
+		}
+		wait(actors.signalAndReset());
+		if (!traceMetrics) {
+			printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str());
+		}
+	}
+
+	if (seeks > 0) {
+		printf("Parallel seeks, count=%d, concurrency=%d ...\n", seeks, concurrentSeeks);
+		for (int x = 0; x < concurrentSeeks; ++x) {
+			actors.add(randomSeeks(btree, seeks / concurrentSeeks, firstKeyChar, lastKeyChar));
+		}
+		wait(actors.signalAndReset());
+		if (!traceMetrics) {
+			printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str());
+		}
+	}
 
 	Future<Void> closedFuture = btree->onClosed();
 	btree->close();
diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h
index 2963f63bc8..4f97762f69 100644
--- a/flow/genericactors.actor.h
+++ b/flow/genericactors.actor.h
@@ -1271,6 +1271,40 @@ Future<T> waitOrError(Future<T> f, Future<Void> errorSignal) {
 	}
 }
 
+// A low-overhead FIFO mutex built with no internal queue structure (no list, deque, vector, etc).
+// The lock is implemented as a Promise<Void>, which is returned to callers in a convenient wrapper
+// called Lock.
+//
+// Usage:
+//   Lock lock = wait(mutex.take());
+//   lock.release();  // Next waiter will get the lock, OR
+//   lock.error(e);   // Next waiter will get e, future waiters will see broken_promise, OR
+//   lock = Lock();   // Let the Lock and any copies go out of scope; all waiters will see broken_promise.
+struct FlowMutex {
+	FlowMutex() { lastPromise.send(Void()); }
+
+	bool available() { return lastPromise.isSet(); }
+
+	struct Lock {
+		void release() { promise.send(Void()); }
+
+		void error(Error e = broken_promise()) { promise.sendError(e); }
+
+		// This is exposed in case the caller wants to use/copy it directly
+		Promise<Void> promise;
+	};
+
+	Future<Lock> take() {
+		Lock newLock;
+		Future<Lock> f = lastPromise.isSet() ? newLock : tag(lastPromise.getFuture(), newLock);
+		lastPromise = newLock.promise;
+		return f;
+	}
+
+private:
+	Promise<Void> lastPromise;
+};
+
 struct FlowLock : NonCopyable, public ReferenceCounted<FlowLock> {
 	// FlowLock implements a nonblocking critical section: there can be only a limited number of clients executing code
 	// between wait(take()) and release(). Not thread safe. take() returns only when the number of holders of the lock
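A usage sketch for FlowMutex (illustrative only, not part of the patch; the actor name and shared counter are hypothetical), mirroring the pattern exercised by the FlowMutex test case:

	// Illustrative only: serialize a critical section across actors with FlowMutex.
	ACTOR Future<Void> criticalSection(FlowMutex* mutex, int* counter) {
		// take() resolves immediately if the mutex is free, otherwise when the
		// previous holder releases; waiters are served in FIFO order.
		state FlowMutex::Lock lock = wait(mutex->take());

		// At most one actor executes here at a time.
		*counter += 1;
		wait(delay(0.01)); // simulated work while holding the lock

		// Hand the lock to the next waiter. Sending lock.error(e) instead would
		// deliver e to the next waiter and broken_promise to later ones; simply
		// dropping the Lock (and all copies) also produces broken_promise.
		lock.release();
		return Void();
	}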