diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index 639daa98fe..6e821cb2b4 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -32,10 +32,10 @@ static inline int commonPrefixLength(uint8_t const* ap, uint8_t const* bp, int c int i = 0; const int wordEnd = cl - sizeof(Word) + 1; - for(; i < wordEnd; i += sizeof(Word)) { - Word a = *(Word *)ap; - Word b = *(Word *)bp; - if(a != b) { + for (; i < wordEnd; i += sizeof(Word)) { + Word a = *(Word*)ap; + Word b = *(Word*)bp; + if (a != b) { return i + ctzll(a ^ b) / 8; } ap += sizeof(Word); @@ -59,31 +59,32 @@ static int commonPrefixLength(StringRef a, StringRef b) { // This appears to be the fastest version static int lessOrEqualPowerOfTwo(int n) { int p; - for (p = 1; p+p <= n; p+=p); + for (p = 1; p + p <= n; p += p) + ; return p; } /* static int _lessOrEqualPowerOfTwo(uint32_t n) { - if(n == 0) - return n; - int trailing = __builtin_ctz(n); - int leading = __builtin_clz(n); - if(trailing + leading == ((sizeof(n) * 8) - 1)) - return n; - return 1 << ( (sizeof(n) * 8) - leading - 1); + if(n == 0) + return n; + int trailing = __builtin_ctz(n); + int leading = __builtin_clz(n); + if(trailing + leading == ((sizeof(n) * 8) - 1)) + return n; + return 1 << ( (sizeof(n) * 8) - leading - 1); } static int __lessOrEqualPowerOfTwo(unsigned int n) { - int p = 1; - for(; p <= n; p <<= 1); - return p >> 1; + int p = 1; + for(; p <= n; p <<= 1); + return p >> 1; } */ static int perfectSubtreeSplitPoint(int subtree_size) { // return the inorder index of the root node in a subtree of the given size - // consistent with the resulting binary search tree being "perfect" (having minimal height + // consistent with the resulting binary search tree being "perfect" (having minimal height // and all missing nodes as far right as possible). // There has to be a simpler way to do this. 
int s = lessOrEqualPowerOfTwo((subtree_size - 1) / 2 + 1) - 1; @@ -91,16 +92,14 @@ static int perfectSubtreeSplitPoint(int subtree_size) { } static int perfectSubtreeSplitPointCached(int subtree_size) { - static uint16_t *points = nullptr; + static uint16_t* points = nullptr; static const int max = 500; - if(points == nullptr) { + if (points == nullptr) { points = new uint16_t[max]; - for(int i = 0; i < max; ++i) - points[i] = perfectSubtreeSplitPoint(i); + for (int i = 0; i < max; ++i) points[i] = perfectSubtreeSplitPoint(i); } - if(subtree_size < max) - return points[subtree_size]; + if (subtree_size < max) return points[subtree_size]; return perfectSubtreeSplitPoint(subtree_size); } @@ -129,7 +128,7 @@ static int perfectSubtreeSplitPointCached(int subtree_size) { // int getCommonPrefixLen(const T &base, int skip) const; // // // Returns the size of the delta object needed to make *this from base -// // TODO: Explain contract required for deltaSize to be used to predict final +// // TODO: Explain contract required for deltaSize to be used to predict final // // balanced tree size incrementally while adding sorted items to a build set // int deltaSize(const T &base) const; // @@ -147,7 +146,7 @@ static int perfectSubtreeSplitPointCached(int subtree_size) { // // Retrieves the previously stored boolean // bool getPrefixSource() const; // -#pragma pack(push,1) +#pragma pack(push, 1) template struct DeltaTree { struct Node { @@ -161,45 +160,35 @@ struct DeltaTree { uint16_t right; } smallOffsets; }; - - static int headerSize(bool large) { - return large ? sizeof(largeOffsets) : sizeof(smallOffsets); - } - - inline DeltaT & delta(bool large) { - return large ? *(DeltaT *)(&largeOffsets + 1) : *(DeltaT *)(&smallOffsets + 1); + + static int headerSize(bool large) { return large ? sizeof(largeOffsets) : sizeof(smallOffsets); } + + inline DeltaT& delta(bool large) { + return large ? 
*(DeltaT*)(&largeOffsets + 1) : *(DeltaT*)(&smallOffsets + 1); }; - inline const DeltaT & delta(bool large) const { - return large ? *(const DeltaT *)(&largeOffsets + 1) : *(const DeltaT *)(&smallOffsets + 1); + inline const DeltaT& delta(bool large) const { + return large ? *(const DeltaT*)(&largeOffsets + 1) : *(const DeltaT*)(&smallOffsets + 1); }; - Node * resolvePointer(int offset) const { - return offset == 0 ? nullptr : (Node *)((uint8_t *)this + offset); - } + Node* resolvePointer(int offset) const { return offset == 0 ? nullptr : (Node*)((uint8_t*)this + offset); } - Node * rightChild(bool large) const { - return resolvePointer(large ? largeOffsets.right : smallOffsets.right); - } + Node* rightChild(bool large) const { return resolvePointer(large ? largeOffsets.right : smallOffsets.right); } - Node * leftChild(bool large) const { - return resolvePointer(large ? largeOffsets.left : smallOffsets.left); - } + Node* leftChild(bool large) const { return resolvePointer(large ? largeOffsets.left : smallOffsets.left); } void setRightChildOffset(bool large, int offset) { - if(large) { + if (large) { largeOffsets.right = offset; - } - else { + } else { smallOffsets.right = offset; } } void setLeftChildOffset(bool large, int offset) { - if(large) { + if (large) { largeOffsets.left = offset; - } - else { + } else { smallOffsets.left = offset; } } @@ -213,90 +202,69 @@ struct DeltaTree { static constexpr int LargeTreePerNodeExtraOverhead = sizeof(Node::largeOffsets) - sizeof(Node::smallOffsets); struct { - uint16_t numItems; // Number of items in the tree. - uint32_t nodeBytesUsed; // Bytes used by nodes (everything after the tree header) - uint32_t nodeBytesFree; // Bytes left at end of tree to expand into - uint32_t nodeBytesDeleted; // Delta bytes deleted from tree. Note that some of these bytes could be borrowed by descendents. - uint8_t initialHeight; // Height of tree as originally built - uint8_t maxHeight; // Maximum height of tree after any insertion. 
Value of 0 means no insertions done. - bool largeNodes; // Node size, can be calculated as capacity > SmallSizeLimit but it will be used a lot + uint16_t numItems; // Number of items in the tree. + uint32_t nodeBytesUsed; // Bytes used by nodes (everything after the tree header) + uint32_t nodeBytesFree; // Bytes left at end of tree to expand into + uint32_t nodeBytesDeleted; // Delta bytes deleted from tree. Note that some of these bytes could be borrowed by + // descendents. + uint8_t initialHeight; // Height of tree as originally built + uint8_t maxHeight; // Maximum height of tree after any insertion. Value of 0 means no insertions done. + bool largeNodes; // Node size, can be calculated as capacity > SmallSizeLimit but it will be used a lot }; #pragma pack(pop) - inline Node & root() { - return *(Node *)(this + 1); - } + inline Node& root() { return *(Node*)(this + 1); } - inline const Node & root() const { - return *(const Node *)(this + 1); - } + inline const Node& root() const { return *(const Node*)(this + 1); } - int size() const { - return sizeof(DeltaTree) + nodeBytesUsed; - } + int size() const { return sizeof(DeltaTree) + nodeBytesUsed; } - int capacity() const { - return size() + nodeBytesFree; - } + int capacity() const { return size() + nodeBytesFree; } - inline Node & newNode() { - return *(Node *)((uint8_t *)this + size()); - } + inline Node& newNode() { return *(Node*)((uint8_t*)this + size()); } public: // Get count of total overhead bytes (everything but the user-formatted Delta) for a tree given size n - static int emptyTreeSize() { - return sizeof(DeltaTree); - } + static int emptyTreeSize() { return sizeof(DeltaTree); } struct DecodedNode { DecodedNode() {} // construct root node - DecodedNode(Node *raw, const T *prev, const T *next, Arena &arena, bool large) - : raw(raw), parent(nullptr), otherAncestor(nullptr), leftChild(nullptr), rightChild(nullptr), prev(prev), next(next), - 
item(raw->delta(large).apply(raw->delta(large).getPrefixSource() ? *prev : *next, arena)), - large(large) - { - //printf("DecodedNode1 raw=%p delta=%s\n", raw, raw->delta(large).toString().c_str()); + DecodedNode(Node* raw, const T* prev, const T* next, Arena& arena, bool large) + : raw(raw), parent(nullptr), otherAncestor(nullptr), leftChild(nullptr), rightChild(nullptr), prev(prev), + next(next), item(raw->delta(large).apply(raw->delta(large).getPrefixSource() ? *prev : *next, arena)), + large(large) { + // printf("DecodedNode1 raw=%p delta=%s\n", raw, raw->delta(large).toString().c_str()); } - + // Construct non-root node - // wentLeft indicates that we've gone left to get to the raw node. - DecodedNode(Node *raw, DecodedNode *parent, bool wentLeft, Arena &arena) - : parent(parent), large(parent->large), otherAncestor(wentLeft ? parent->getPrevAncestor() : parent->getNextAncestor()), - prev(wentLeft ? parent->prev : &parent->item), - next(wentLeft ? &parent->item : parent->next), - leftChild(nullptr), rightChild(nullptr), - raw(raw), item(raw->delta(large).apply(raw->delta(large).getPrefixSource() ? *prev : *next, arena)) - { - //printf("DecodedNode2 raw=%p delta=%s\n", raw, raw->delta(large).toString().c_str()); + // wentLeft indicates that we've gone left to get to the raw node. + DecodedNode(Node* raw, DecodedNode* parent, bool wentLeft, Arena& arena) + : parent(parent), large(parent->large), + otherAncestor(wentLeft ? parent->getPrevAncestor() : parent->getNextAncestor()), + prev(wentLeft ? parent->prev : &parent->item), next(wentLeft ? &parent->item : parent->next), + leftChild(nullptr), rightChild(nullptr), raw(raw), + item(raw->delta(large).apply(raw->delta(large).getPrefixSource() ? 
*prev : *next, arena)) { + // printf("DecodedNode2 raw=%p delta=%s\n", raw, raw->delta(large).toString().c_str()); } // Returns true if otherAncestor is the previous ("greatest lesser") ancestor - bool otherAncestorPrev() const { - return parent && parent->leftChild == this; - } + bool otherAncestorPrev() const { return parent && parent->leftChild == this; } // Returns true if otherAncestor is the next ("least greator") ancestor - bool otherAncestorNext() const { - return parent && parent->rightChild == this; - } + bool otherAncestorNext() const { return parent && parent->rightChild == this; } - DecodedNode * getPrevAncestor() const { - return otherAncestorPrev() ? otherAncestor : parent; - } + DecodedNode* getPrevAncestor() const { return otherAncestorPrev() ? otherAncestor : parent; } - DecodedNode * getNextAncestor() const { - return otherAncestorNext() ? otherAncestor : parent; - } + DecodedNode* getNextAncestor() const { return otherAncestorNext() ? otherAncestor : parent; } - DecodedNode * jumpUpNext(DecodedNode *root, bool &othersChild) const { - if(parent != nullptr) { - if(parent->rightChild == this) { + DecodedNode* jumpUpNext(DecodedNode* root, bool& othersChild) const { + if (parent != nullptr) { + if (parent->rightChild == this) { return otherAncestor; } - if(otherAncestor != nullptr) { + if (otherAncestor != nullptr) { othersChild = true; return otherAncestor->rightChild; } @@ -304,12 +272,12 @@ public: return parent; } - DecodedNode * jumpUpPrev(DecodedNode *root, bool &othersChild) const { - if(parent != nullptr) { - if(parent->leftChild == this) { + DecodedNode* jumpUpPrev(DecodedNode* root, bool& othersChild) const { + if (parent != nullptr) { + if (parent->leftChild == this) { return otherAncestor; } - if(otherAncestor != nullptr) { + if (otherAncestor != nullptr) { othersChild = true; return otherAncestor->leftChild; } @@ -317,62 +285,56 @@ public: return parent; } - DecodedNode * jumpNext(DecodedNode *root) const { - if(otherAncestorNext()) { + 
DecodedNode* jumpNext(DecodedNode* root) const { + if (otherAncestorNext()) { return (otherAncestor != nullptr) ? otherAncestor : rightChild; - } - else { - if(this == root) { + } else { + if (this == root) { return rightChild; } return (otherAncestor != nullptr) ? otherAncestor->rightChild : root; } } - DecodedNode * jumpPrev(DecodedNode *root) const { - if(otherAncestorPrev()) { + DecodedNode* jumpPrev(DecodedNode* root) const { + if (otherAncestorPrev()) { return (otherAncestor != nullptr) ? otherAncestor : leftChild; - } - else { - if(this == root) { + } else { + if (this == root) { return leftChild; } return (otherAncestor != nullptr) ? otherAncestor->leftChild : root; } } - void setDeleted(bool deleted) { - raw->delta(large).setDeleted(deleted); - } + void setDeleted(bool deleted) { raw->delta(large).setDeleted(deleted); } - bool isDeleted() const { - return raw->delta(large).getDeleted(); - } + bool isDeleted() const { return raw->delta(large).getDeleted(); } - bool large; // Node size - Node *raw; - DecodedNode *parent; - DecodedNode *otherAncestor; - DecodedNode *leftChild; - DecodedNode *rightChild; - const T *prev; // greatest ancestor to the left, or tree lower bound - const T *next; // least ancestor to the right, or tree upper bound + bool large; // Node size + Node* raw; + DecodedNode* parent; + DecodedNode* otherAncestor; + DecodedNode* leftChild; + DecodedNode* rightChild; + const T* prev; // greatest ancestor to the left, or tree lower bound + const T* next; // least ancestor to the right, or tree upper bound T item; - DecodedNode *getRightChild(Arena &arena) { - if(rightChild == nullptr) { - Node *n = raw->rightChild(large); - if(n != nullptr) { + DecodedNode* getRightChild(Arena& arena) { + if (rightChild == nullptr) { + Node* n = raw->rightChild(large); + if (n != nullptr) { rightChild = new (arena) DecodedNode(n, this, false, arena); } } return rightChild; } - DecodedNode *getLeftChild(Arena &arena) { - if(leftChild == nullptr) { - Node *n = 
raw->leftChild(large); - if(n != nullptr) { + DecodedNode* getLeftChild(Arena& arena) { + if (leftChild == nullptr) { + Node* n = raw->leftChild(large); + if (n != nullptr) { leftChild = new (arena) DecodedNode(n, this, true, arena); } } @@ -389,75 +351,69 @@ public: struct Mirror : FastAllocated { friend class Cursor; - Mirror(const void *treePtr = nullptr, const T *lowerBound = nullptr, const T *upperBound = nullptr) - : tree((DeltaTree *)treePtr), lower(lowerBound), upper(upperBound) - { - // TODO: Remove these copies into arena and require users of Mirror to keep prev and next alive during its lifetime - lower = new(arena) T(arena, *lower); - upper = new(arena) T(arena, *upper); + Mirror(const void* treePtr = nullptr, const T* lowerBound = nullptr, const T* upperBound = nullptr) + : tree((DeltaTree*)treePtr), lower(lowerBound), upper(upperBound) { + // TODO: Remove these copies into arena and require users of Mirror to keep prev and next alive during its + // lifetime + lower = new (arena) T(arena, *lower); + upper = new (arena) T(arena, *upper); - root = (tree->nodeBytesUsed == 0) ? nullptr : new (arena) DecodedNode(&tree->root(), lower, upper, arena, tree->largeNodes); + root = (tree->nodeBytesUsed == 0) ? nullptr + : new (arena) + DecodedNode(&tree->root(), lower, upper, arena, tree->largeNodes); } - const T *lowerBound() const { - return lower; - } + const T* lowerBound() const { return lower; } - const T *upperBound() const { - return upper; - } + const T* upperBound() const { return upper; } -private: + private: Arena arena; - DeltaTree *tree; - DecodedNode *root; - const T *lower; - const T *upper; -public: + DeltaTree* tree; + DecodedNode* root; + const T* lower; + const T* upper; - Cursor getCursor() { - return Cursor(this); - } + public: + Cursor getCursor() { return Cursor(this); } // Try to insert k into the DeltaTree, updating byte counts and initialHeight if they // have changed (they won't if k already exists in the tree but was deleted). 
// Returns true if successful, false if k does not fit in the space available // or if k is already in the tree (and was not already deleted). - bool insert(const T &k, int skipLen = 0, int maxHeightAllowed = std::numeric_limits::max()) { + bool insert(const T& k, int skipLen = 0, int maxHeightAllowed = std::numeric_limits::max()) { int height = 1; - DecodedNode *n = root; + DecodedNode* n = root; bool addLeftChild = false; - while(n != nullptr) { + while (n != nullptr) { int cmp = k.compare(n->item, skipLen); - if(cmp >= 0) { + if (cmp >= 0) { // If we found an item identical to k then if it is deleted, undeleted it, // otherwise fail - if(cmp == 0) { - auto &d = n->raw->delta(tree->largeNodes); - if(d.getDeleted()) { + if (cmp == 0) { + auto& d = n->raw->delta(tree->largeNodes); + if (d.getDeleted()) { d.setDeleted(false); ++tree->numItems; return true; - } - else { + } else { return false; } } - DecodedNode *right = n->getRightChild(arena); + DecodedNode* right = n->getRightChild(arena); - if(right == nullptr) { + if (right == nullptr) { break; } n = right; - } - else { - DecodedNode *left = n->getLeftChild(arena); + } else { + DecodedNode* left = n->getLeftChild(arena); - if(left == nullptr) { + if (left == nullptr) { addLeftChild = true; break; } @@ -467,14 +423,14 @@ public: ++height; } - if(height > maxHeightAllowed) { + if (height > maxHeightAllowed) { return false; } // Insert k as the left or right child of n, depending on the value of addLeftChild // First, see if it will fit. - const T *prev = addLeftChild ? n->prev : &n->item; - const T *next = addLeftChild ? &n->item : n->next; + const T* prev = addLeftChild ? n->prev : &n->item; + const T* next = addLeftChild ? &n->item : n->next; int common = prev->getCommonPrefixLen(*next, skipLen); int commonWithPrev = k.getCommonPrefixLen(*prev, common); @@ -482,26 +438,25 @@ public: bool basePrev = commonWithPrev >= commonWithNext; int commonPrefix = basePrev ? 
commonWithPrev : commonWithNext; - const T *base = basePrev ? prev : next; + const T* base = basePrev ? prev : next; int deltaSize = k.deltaSize(*base, commonPrefix, false); int nodeSpace = deltaSize + Node::headerSize(tree->largeNodes); - if(nodeSpace > tree->nodeBytesFree) { + if (nodeSpace > tree->nodeBytesFree) { return false; } - DecodedNode *newNode = new (arena) DecodedNode(); - Node *raw = &tree->newNode(); + DecodedNode* newNode = new (arena) DecodedNode(); + Node* raw = &tree->newNode(); raw->setLeftChildOffset(tree->largeNodes, 0); raw->setRightChildOffset(tree->largeNodes, 0); - int newOffset = (uint8_t *)raw - (uint8_t *)n->raw; - //printf("Inserting %s at offset %d\n", k.toString().c_str(), newOffset); + int newOffset = (uint8_t*)raw - (uint8_t*)n->raw; + // printf("Inserting %s at offset %d\n", k.toString().c_str(), newOffset); - if(addLeftChild) { + if (addLeftChild) { n->leftChild = newNode; n->raw->setLeftChildOffset(tree->largeNodes, newOffset); - } - else { + } else { n->rightChild = newNode; n->raw->setRightChildOffset(tree->largeNodes, newOffset); } @@ -518,7 +473,8 @@ public: ASSERT(deltaSize == k.writeDelta(raw->delta(tree->largeNodes), *base, commonPrefix)); raw->delta(tree->largeNodes).setPrefixSource(basePrev); - // Initialize node's item from the delta (instead of copying into arena) to avoid unnecessary arena space usage + // Initialize node's item from the delta (instead of copying into arena) to avoid unnecessary arena space + // usage newNode->item = raw->delta(tree->largeNodes).apply(*base, arena); tree->nodeBytesUsed += nodeSpace; @@ -526,7 +482,7 @@ public: ++tree->numItems; // Update max height of the tree if necessary - if(height > tree->maxHeight) { + if (height > tree->maxHeight) { tree->maxHeight = height; } @@ -534,11 +490,11 @@ public: } // Erase k by setting its deleted flag to true. 
Returns true only if k existed - bool erase(const T &k, int skipLen = 0) { + bool erase(const T& k, int skipLen = 0) { Cursor c = getCursor(); int cmp = c.seek(k); // If exactly k is found - if(cmp == 0 && !c.node->isDeleted()) { + if (cmp == 0 && !c.node->isDeleted()) { c.erase(); return true; } @@ -549,34 +505,22 @@ public: // Cursor provides a way to seek into a DeltaTree and iterate over its contents // All Cursors from a Mirror share the same decoded node 'cache' (tree of DecodedNodes) struct Cursor { - Cursor() : mirror(nullptr), node(nullptr) { - } + Cursor() : mirror(nullptr), node(nullptr) {} - Cursor(Mirror *r) : mirror(r), node(mirror->root) { - } + Cursor(Mirror* r) : mirror(r), node(mirror->root) {} - Mirror *mirror; - DecodedNode *node; + Mirror* mirror; + DecodedNode* node; - bool valid() const { - return node != nullptr; - } + bool valid() const { return node != nullptr; } - const T & get() const { - return node->item; - } + const T& get() const { return node->item; } - const T & getOrUpperBound() const { - return valid() ? node->item : *mirror->upperBound(); - } + const T& getOrUpperBound() const { return valid() ? node->item : *mirror->upperBound(); } - bool operator==(const Cursor &rhs) const { - return node == rhs.node; - } + bool operator==(const Cursor& rhs) const { return node == rhs.node; } - bool operator!=(const Cursor &rhs) const { - return node != rhs.node; - } + bool operator!=(const Cursor& rhs) const { return node != rhs.node; } void erase() { node->setDeleted(true); @@ -584,72 +528,69 @@ public: moveNext(); } - // TODO: Make hint-based seek() use the hint logic in this, which is better and actually improves seek times, then remove this function. - bool seekLessThanOrEqualOld(const T &s, int skipLen, const Cursor *pHint, int initialCmp) { - DecodedNode *n; + // TODO: Make hint-based seek() use the hint logic in this, which is better and actually improves seek times, + // then remove this function. 
+ bool seekLessThanOrEqualOld(const T& s, int skipLen, const Cursor* pHint, int initialCmp) { + DecodedNode* n; // If there's a hint position, use it // At the end of using the hint, if n is valid it should point to a node which has not yet been compared to. - if(pHint != nullptr && pHint->node != nullptr) { + if (pHint != nullptr && pHint->node != nullptr) { n = pHint->node; - if(initialCmp == 0) { + if (initialCmp == 0) { node = n; return _hideDeletedBackward(); } - if(initialCmp > 0) { + if (initialCmp > 0) { node = n; - while(n != nullptr) { + while (n != nullptr) { n = n->jumpNext(mirror->root); - if(n == nullptr) { + if (n == nullptr) { break; } int cmp = s.compare(n->item, skipLen); - if(cmp > 0) { + if (cmp > 0) { node = n; continue; } - if(cmp == 0) { + if (cmp == 0) { node = n; n = nullptr; - } - else { + } else { n = n->leftChild; } break; } - } - else { - while(n != nullptr) { + } else { + while (n != nullptr) { n = n->jumpPrev(mirror->root); - if(n == nullptr) { + if (n == nullptr) { break; } int cmp = s.compare(n->item, skipLen); - if(cmp >= 0) { + if (cmp >= 0) { node = n; n = (cmp == 0) ? nullptr : n->rightChild; break; } } } - } - else { + } else { // Start at root, clear current position n = mirror->root; node = nullptr; } - while(n != nullptr) { + while (n != nullptr) { int cmp = s.compare(n->item, skipLen); - if(cmp < 0) { + if (cmp < 0) { n = n->getLeftChild(mirror->arena); - } - else { + } else { // n <= s so store it in node as a potential result node = n; - if(cmp == 0) { + if (cmp == 0) { break; } @@ -665,54 +606,54 @@ public: // Then will not "see" erased records. // If successful, they return true, and if not then false a while making the cursor invalid. // These methods forward arguments to the seek() overloads, see those for argument descriptions. - template + template bool seekLessThan(Args... 
args) { int cmp = seek(args...); - if(cmp < 0 || (cmp == 0 && node != nullptr)) { + if (cmp < 0 || (cmp == 0 && node != nullptr)) { movePrev(); } return _hideDeletedBackward(); } - template + template bool seekLessThanOrEqual(Args... args) { int cmp = seek(args...); - if(cmp < 0) { + if (cmp < 0) { movePrev(); } return _hideDeletedBackward(); } - template + template bool seekGreaterThan(Args... args) { int cmp = seek(args...); - if(cmp > 0 || (cmp == 0 && node != nullptr)) { + if (cmp > 0 || (cmp == 0 && node != nullptr)) { moveNext(); } return _hideDeletedForward(); } - template + template bool seekGreaterThanOrEqual(Args... args) { int cmp = seek(args...); - if(cmp > 0) { + if (cmp > 0) { moveNext(); } return _hideDeletedForward(); } - // seek() moves the cursor to a node containing s or the node that would be the parent of s if s were to be added to the tree. - // If the tree was empty, the cursor will be invalid and the return value will be 0. + // seek() moves the cursor to a node containing s or the node that would be the parent of s if s were to be + // added to the tree. If the tree was empty, the cursor will be invalid and the return value will be 0. // Otherwise, returns the result of s.compare(item at cursor position) // Does not skip/avoid deleted nodes. - int seek(const T &s, int skipLen = 0) { - DecodedNode *n = mirror->root; + int seek(const T& s, int skipLen = 0) { + DecodedNode* n = mirror->root; node = nullptr; int cmp = 0; - while(n != nullptr) { + while (n != nullptr) { node = n; cmp = s.compare(n->item, skipLen); - if(cmp == 0) { + if (cmp == 0) { break; } @@ -724,34 +665,36 @@ public: // Same usage as seek() but with a hint of a cursor, which can't be null, whose starting position // should be close to s in the tree to improve seek time. 
- // initialCmp should be logically equivalent to s.compare(pHint->get()) or 0, in which + // initialCmp should be logically equivalent to s.compare(pHint->get()) or 0, in which // case the comparison will be done in this method. - // TODO: This is broken, it's not faster than not using a hint. See Make thisUnfortunately in a microbenchmark attempting to approximate a common use case, this version - // of using a cursor hint is actually slower than not using a hint. - int seek(const T &s, int skipLen, const Cursor *pHint, int initialCmp = 0) { - DecodedNode *n = mirror->root; + // TODO: This is broken, it's not faster than not using a hint. See Make thisUnfortunately in a microbenchmark + // attempting to approximate a common use case, this version of using a cursor hint is actually slower than not + // using a hint. + int seek(const T& s, int skipLen, const Cursor* pHint, int initialCmp = 0) { + DecodedNode* n = mirror->root; node = nullptr; int cmp; // If there's a hint position, use it // At the end of using the hint, if n is valid it should point to a node which has not yet been compared to. - if(pHint->node != nullptr) { + if (pHint->node != nullptr) { n = pHint->node; - if(initialCmp == 0) { + if (initialCmp == 0) { initialCmp = s.compare(pHint->get()); } cmp = initialCmp; - while(true) { + while (true) { node = n; - if(cmp == 0) { + if (cmp == 0) { return cmp; } // Attempt to jump up and past s bool othersChild = false; - n = (initialCmp > 0) ? n->jumpUpNext(mirror->root, othersChild) : n->jumpUpPrev(mirror->root, othersChild); - if(n == nullptr) { + n = (initialCmp > 0) ? n->jumpUpNext(mirror->root, othersChild) + : n->jumpUpPrev(mirror->root, othersChild); + if (n == nullptr) { n = (cmp > 0) ? node->rightChild : node->leftChild; break; } @@ -760,15 +703,14 @@ public: cmp = s.compare(n->item, skipLen); // n is on the oposite side of s than node is, then n is too far. 
- if(cmp != 0 && ((initialCmp ^ cmp) < 0)) { - if(!othersChild) { + if (cmp != 0 && ((initialCmp ^ cmp) < 0)) { + if (!othersChild) { n = (cmp < 0) ? node->rightChild : node->leftChild; } break; } } - } - else { + } else { // Start at root, clear current position n = mirror->root; node = nullptr; @@ -776,10 +718,10 @@ public: } // Search starting from n, which is either the root or the result of applying the hint - while(n != nullptr) { + while (n != nullptr) { node = n; cmp = s.compare(n->item, skipLen); - if(cmp == 0) { + if (cmp == 0) { break; } @@ -790,23 +732,21 @@ public: } bool moveFirst() { - DecodedNode *n = mirror->root; + DecodedNode* n = mirror->root; node = n; - while(n != nullptr) { + while (n != nullptr) { n = n->getLeftChild(mirror->arena); - if(n != nullptr) - node = n; + if (n != nullptr) node = n; } return _hideDeletedForward(); } bool moveLast() { - DecodedNode *n = mirror->root; + DecodedNode* n = mirror->root; node = n; - while(n != nullptr) { + while (n != nullptr) { n = n->getRightChild(mirror->arena); - if(n != nullptr) - node = n; + if (n != nullptr) node = n; } return _hideDeletedBackward(); } @@ -814,15 +754,14 @@ public: // Try to move to next node, sees deleted nodes. void _moveNext() { // Try to go right - DecodedNode *n = node->getRightChild(mirror->arena); + DecodedNode* n = node->getRightChild(mirror->arena); // If we couldn't go right, then the answer is our next ancestor - if(n == nullptr) { + if (n == nullptr) { node = node->getNextAncestor(); - } - else { + } else { // Go left as far as possible - while(n != nullptr) { + while (n != nullptr) { node = n; n = n->getLeftChild(mirror->arena); } @@ -832,15 +771,14 @@ public: // Try to move to previous node, sees deleted nodes. 
void _movePrev() { // Try to go left - DecodedNode *n = node->getLeftChild(mirror->arena); + DecodedNode* n = node->getLeftChild(mirror->arena); // If we couldn't go left, then the answer is our prev ancestor - if(n == nullptr) { + if (n == nullptr) { node = node->getPrevAncestor(); - } - else { + } else { // Go right as far as possible - while(n != nullptr) { + while (n != nullptr) { node = n; n = n->getRightChild(mirror->arena); } @@ -859,14 +797,14 @@ public: private: bool _hideDeletedBackward() { - while(node != nullptr && node->isDeleted()) { + while (node != nullptr && node->isDeleted()) { _movePrev(); } return node != nullptr; } bool _hideDeletedForward() { - while(node != nullptr && node->isDeleted()) { + while (node != nullptr && node->isDeleted()) { _moveNext(); } return node != nullptr; @@ -874,7 +812,7 @@ public: }; // Returns number of bytes written - int build(int spaceAvailable, const T *begin, const T *end, const T *prev, const T *next) { + int build(int spaceAvailable, const T* begin, const T* end, const T* prev, const T* next) { largeNodes = spaceAvailable > SmallSizeLimit; int count = end - begin; numItems = count; @@ -883,10 +821,9 @@ public: maxHeight = 0; // The boundary leading to the new page acts as the last time we branched right - if(begin != end) { + if (begin != end) { nodeBytesUsed = buildSubtree(root(), begin, end, prev, next, prev->getCommonPrefixLen(*next, 0)); - } - else { + } else { nodeBytesUsed = 0; } nodeBytesFree = spaceAvailable - size(); @@ -894,28 +831,28 @@ public: } private: - int buildSubtree(Node &node, const T *begin, const T *end, const T *prev, const T *next, int subtreeCommon) { - //printf("build: %s to %s\n", begin->toString().c_str(), (end - 1)->toString().c_str()); - //printf("build: root at %p Node::headerSize %d delta at %p \n", &root, Node::headerSize(largeNodes), &node.delta(largeNodes)); + int buildSubtree(Node& node, const T* begin, const T* end, const T* prev, const T* next, int subtreeCommon) { + // 
printf("build: %s to %s\n", begin->toString().c_str(), (end - 1)->toString().c_str()); + // printf("build: root at %p Node::headerSize %d delta at %p \n", &root, Node::headerSize(largeNodes), + // &node.delta(largeNodes)); ASSERT(end != begin); int count = end - begin; // Find key to be stored in root int mid = perfectSubtreeSplitPointCached(count); - const T &item = begin[mid]; + const T& item = begin[mid]; int commonWithPrev = item.getCommonPrefixLen(*prev, subtreeCommon); int commonWithNext = item.getCommonPrefixLen(*next, subtreeCommon); bool prefixSourcePrev; int commonPrefix; - const T *base; - if(commonWithPrev >= commonWithNext) { + const T* base; + if (commonWithPrev >= commonWithNext) { prefixSourcePrev = true; commonPrefix = commonWithPrev; base = prev; - } - else { + } else { prefixSourcePrev = false; commonPrefix = commonWithNext; base = next; @@ -923,29 +860,27 @@ private: int deltaSize = item.writeDelta(node.delta(largeNodes), *base, commonPrefix); node.delta(largeNodes).setPrefixSource(prefixSourcePrev); - //printf("Serialized %s to %p\n", item.toString().c_str(), &root.delta(largeNodes)); + // printf("Serialized %s to %p\n", item.toString().c_str(), &root.delta(largeNodes)); // Continue writing after the serialized Delta. 
- uint8_t *wptr = (uint8_t *)&node.delta(largeNodes) + deltaSize; + uint8_t* wptr = (uint8_t*)&node.delta(largeNodes) + deltaSize; // Serialize left child - if(count > 1) { - wptr += buildSubtree(*(Node *)wptr, begin, begin + mid, prev, &item, commonWithPrev); + if (count > 1) { + wptr += buildSubtree(*(Node*)wptr, begin, begin + mid, prev, &item, commonWithPrev); node.setLeftChildOffset(largeNodes, Node::headerSize(largeNodes) + deltaSize); - } - else { + } else { node.setLeftChildOffset(largeNodes, 0); } // Serialize right child - if(count > 2) { - node.setRightChildOffset(largeNodes, wptr - (uint8_t *)&node); - wptr += buildSubtree(*(Node *)wptr, begin + mid + 1, end, &item, next, commonWithNext); - } - else { + if (count > 2) { + node.setRightChildOffset(largeNodes, wptr - (uint8_t*)&node); + wptr += buildSubtree(*(Node*)wptr, begin + mid + 1, end, &item, next, commonWithNext); + } else { node.setRightChildOffset(largeNodes, 0); } - return wptr - (uint8_t *)&node; + return wptr - (uint8_t*)&node; } }; diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index 5043d315fa..b3991a025c 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -4,13 +4,13 @@ * This source file is part of the FoundationDB open source project * * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * + * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -30,23 +30,29 @@ #define REDWOOD_DEBUG 0 #define debug_printf_stream stdout -#define debug_printf_always(...) 
{ fprintf(debug_printf_stream, "%s %f %04d ", g_network->getLocalAddress().toString().c_str(), now(), __LINE__); fprintf(debug_printf_stream, __VA_ARGS__); fflush(debug_printf_stream); } +#define debug_printf_always(...) \ + { \ + fprintf(debug_printf_stream, "%s %f %04d ", g_network->getLocalAddress().toString().c_str(), now(), __LINE__); \ + fprintf(debug_printf_stream, __VA_ARGS__); \ + fflush(debug_printf_stream); \ + } #define debug_printf_noop(...) #if defined(NO_INTELLISENSE) - #if REDWOOD_DEBUG - #define debug_printf debug_printf_always - #else - #define debug_printf debug_printf_noop - #endif +#if REDWOOD_DEBUG +#define debug_printf debug_printf_always #else - // To get error-checking on debug_printf statements in IDE - #define debug_printf printf +#define debug_printf debug_printf_noop +#endif +#else +// To get error-checking on debug_printf statements in IDE +#define debug_printf printf #endif #define BEACON debug_printf_always("HERE\n") -#define TRACE debug_printf_always("%s: %s line %d %s\n", __FUNCTION__, __FILE__, __LINE__, platform::get_backtrace().c_str()); +#define TRACE \ + debug_printf_always("%s: %s line %d %s\n", __FUNCTION__, __FILE__, __LINE__, platform::get_backtrace().c_str()); #ifndef VALGRIND #define VALGRIND_MAKE_MEM_UNDEFINED(x, y) @@ -67,12 +73,10 @@ public: // Must return the same size for all pages created by the same pager instance virtual int size() const = 0; - StringRef asStringRef() const { - return StringRef(begin(), size()); - } + StringRef asStringRef() const { return StringRef(begin(), size()); } virtual ~IPage() { - if(userData != nullptr && userDataDestructor != nullptr) { + if (userData != nullptr && userDataDestructor != nullptr) { userDataDestructor(userData); } } @@ -82,8 +86,8 @@ public: virtual void addref() const = 0; virtual void delref() const = 0; - mutable void *userData; - mutable void (*userDataDestructor)(void *); + mutable void* userData; + mutable void (*userDataDestructor)(void*); }; class IPagerSnapshot 
{ diff --git a/fdbserver/IVersionedStore.h b/fdbserver/IVersionedStore.h index 9baf5c4469..b1feb8063c 100644 --- a/fdbserver/IVersionedStore.h +++ b/fdbserver/IVersionedStore.h @@ -4,13 +4,13 @@ * This source file is part of the FoundationDB open source project * * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * + * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -46,28 +46,33 @@ public: class IVersionedStore : public IClosable { public: virtual KeyValueStoreType getType() = 0; - virtual bool supportsMutation(int op) = 0; // If this returns true, then mutate(op, ...) may be called + virtual bool supportsMutation(int op) = 0; // If this returns true, then mutate(op, ...) may be called virtual StorageBytes getStorageBytes() = 0; // Writes are provided in an ordered stream. 
- // A write is considered part of (a change leading to) the version determined by the previous call to setWriteVersion() - // A write shall not become durable until the following call to commit() begins, and shall be durable once the following call to commit() returns + // A write is considered part of (a change leading to) the version determined by the previous call to + // setWriteVersion() A write shall not become durable until the following call to commit() begins, and shall be + // durable once the following call to commit() returns virtual void set(KeyValueRef keyValue) = 0; virtual void clear(KeyRangeRef range) = 0; virtual void mutate(int op, StringRef param1, StringRef param2) = 0; - virtual void setWriteVersion(Version) = 0; // The write version must be nondecreasing - virtual void setOldestVersion(Version v) = 0; // Set oldest readable version to be used in next commit - virtual Version getOldestVersion() = 0; // Get oldest readable version + virtual void setWriteVersion(Version) = 0; // The write version must be nondecreasing + virtual void setOldestVersion(Version v) = 0; // Set oldest readable version to be used in next commit + virtual Version getOldestVersion() = 0; // Get oldest readable version virtual Future commit() = 0; virtual Future init() = 0; virtual Version getLatestVersion() = 0; - // readAtVersion() may only be called on a version which has previously been passed to setWriteVersion() and never previously passed - // to forgetVersion. The returned results when violating this precondition are unspecified; the store is not required to be able to detect violations. - // The returned read cursor provides a consistent snapshot of the versioned store, corresponding to all the writes done with write versions less + // readAtVersion() may only be called on a version which has previously been passed to setWriteVersion() and never + // previously passed + // to forgetVersion. 
The returned results when violating this precondition are unspecified; the store is not + // required to be able to detect violations. + // The returned read cursor provides a consistent snapshot of the versioned store, corresponding to all the writes + // done with write versions less // than or equal to the given version. - // If readAtVersion() is called on the *current* write version, the given read cursor MAY reflect subsequent writes at the same + // If readAtVersion() is called on the *current* write version, the given read cursor MAY reflect subsequent writes + // at the same // write version, OR it may represent a snapshot as of the call to readAtVersion(). virtual Reference readAtVersion(Version) = 0; }; diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 99c0bf30ed..084fead508 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -4,13 +4,13 @@ * This source file is part of the FoundationDB open source project * * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * + * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -42,8 +42,8 @@ // Some convenience functions for debugging to stringify various structures // Classes can add compatibility by either specializing toString or implementing // std::string toString() const; -template -std::string toString(const T &o) { +template +std::string toString(const T& o) { return o.toString(); } @@ -52,27 +52,26 @@ std::string toString(StringRef s) { } std::string toString(LogicalPageID id) { - if(id == invalidLogicalPageID) { + if (id == invalidLogicalPageID) { return "LogicalPageID{invalid}"; } return format("LogicalPageID{%" PRId64 "}", id); } -template -std::string toString(const Standalone &s) { +template +std::string toString(const Standalone& s) { return toString((T)s); } -template -std::string toString(const T *begin, const T *end) { +template +std::string toString(const T* begin, const T* end) { std::string r = "{"; bool comma = false; - while(begin != end) { - if(comma) { + while (begin != end) { + if (comma) { r += ", "; - } - else { + } else { comma = true; } r += toString(*begin++); @@ -82,25 +81,25 @@ std::string toString(const T *begin, const T *end) { return r; } -template -std::string toString(const std::vector &v) { +template +std::string toString(const std::vector& v) { return toString(&v.front(), &v.back() + 1); } -template -std::string toString(const VectorRef &v) { +template +std::string toString(const VectorRef& v) { return toString(v.begin(), v.end()); } -template -std::string toString(const Optional &o) { - if(o.present()) { +template +std::string toString(const Optional& o) { + if (o.present()) { return toString(o.get()); } return ""; } -// A FIFO queue of T stored as a linked list of pages. +// A FIFO queue of T stored as a linked list of pages. // Main operations are pop(), pushBack(), pushFront(), and flush(). 
// // flush() will ensure all queue pages are written to the pager and move the unflushed @@ -133,64 +132,54 @@ std::string toString(const Optional &o) { // // Serialize *this to dst, return number of bytes written to dst // int writeToBytes(uint8_t *dst) const; // - must be supported by toString(object) (see above) -template +template struct FIFOQueueCodec { - static T readFromBytes(const uint8_t *src, int &bytesRead) { + static T readFromBytes(const uint8_t* src, int& bytesRead) { T x; bytesRead = x.readFromBytes(src); return x; } - static int bytesNeeded(const T &x) { - return x.bytesNeeded(); - } - static int writeToBytes(uint8_t *dst, const T &x) { - return x.writeToBytes(dst); - } + static int bytesNeeded(const T& x) { return x.bytesNeeded(); } + static int writeToBytes(uint8_t* dst, const T& x) { return x.writeToBytes(dst); } }; -template +template struct FIFOQueueCodec::value>::type> { static_assert(std::is_trivially_copyable::value); - static T readFromBytes(const uint8_t *src, int &bytesRead) { + static T readFromBytes(const uint8_t* src, int& bytesRead) { bytesRead = sizeof(T); - return *(T *)src; + return *(T*)src; } - static int bytesNeeded(const T &x) { - return sizeof(T); - } - static int writeToBytes(uint8_t *dst, const T &x) { - *(T *)dst = x; + static int bytesNeeded(const T& x) { return sizeof(T); } + static int writeToBytes(uint8_t* dst, const T& x) { + *(T*)dst = x; return sizeof(T); } }; -template> +template > class FIFOQueue { public: #pragma pack(push, 1) struct QueueState { - bool operator==(const QueueState &rhs) const { - return memcmp(this, &rhs, sizeof(QueueState)) == 0; - } + bool operator==(const QueueState& rhs) const { return memcmp(this, &rhs, sizeof(QueueState)) == 0; } LogicalPageID headPageID = invalidLogicalPageID; LogicalPageID tailPageID = invalidLogicalPageID; uint16_t headOffset; - // Note that there is no tail index because the tail page is always never-before-written and its index will start at 0 + // Note that there is 
no tail index because the tail page is always never-before-written and its index will + // start at 0 int64_t numPages; int64_t numEntries; std::string toString() const { - return format("{head: %s:%d tail: %s numPages: %" PRId64 " numEntries: %" PRId64 "}", ::toString(headPageID).c_str(), (int)headOffset, ::toString(tailPageID).c_str(), numPages, numEntries); + return format("{head: %s:%d tail: %s numPages: %" PRId64 " numEntries: %" PRId64 "}", + ::toString(headPageID).c_str(), (int)headOffset, ::toString(tailPageID).c_str(), numPages, + numEntries); } }; #pragma pack(pop) struct Cursor { - enum Mode { - NONE, - POP, - READONLY, - WRITE - }; + enum Mode { NONE, POP, READONLY, WRITE }; // The current page being read or written to LogicalPageID pageID; @@ -198,23 +187,23 @@ public: // The first page ID to be written to the pager, if this cursor has written anything LogicalPageID firstPageIDWritten; - // Offset after RawPage header to next read from or write to + // Offset after RawPage header to next read from or write to int offset; // A read cursor will not read this page (or beyond) LogicalPageID endPageID; Reference page; - FIFOQueue *queue; + FIFOQueue* queue; Future operation; Mode mode; - Cursor() : mode(NONE) { - } + Cursor() : mode(NONE) {} - // Initialize a cursor. - void init(FIFOQueue *q = nullptr, Mode m = NONE, LogicalPageID initialPageID = invalidLogicalPageID, int readOffset = 0, LogicalPageID endPage = invalidLogicalPageID) { - if(operation.isValid()) { + // Initialize a cursor. + void init(FIFOQueue* q = nullptr, Mode m = NONE, LogicalPageID initialPageID = invalidLogicalPageID, + int readOffset = 0, LogicalPageID endPage = invalidLogicalPageID) { + if (operation.isValid()) { operation.cancel(); } queue = q; @@ -224,44 +213,45 @@ public: endPageID = endPage; page.clear(); - if(mode == POP || mode == READONLY) { + if (mode == POP || mode == READONLY) { // If cursor is not pointed at the end page then start loading it. 
// The end page will not have been written to disk yet. pageID = initialPageID; operation = (pageID == endPageID) ? Void() : loadPage(); - } - else { + } else { pageID = invalidLogicalPageID; - ASSERT(mode == WRITE || (initialPageID == invalidLogicalPageID && readOffset == 0 && endPage == invalidLogicalPageID)); + ASSERT(mode == WRITE || + (initialPageID == invalidLogicalPageID && readOffset == 0 && endPage == invalidLogicalPageID)); operation = Void(); } debug_printf("FIFOQueue::Cursor(%s) initialized\n", toString().c_str()); - if(mode == WRITE && initialPageID != invalidLogicalPageID) { + if (mode == WRITE && initialPageID != invalidLogicalPageID) { addNewPage(initialPageID, 0, true); } } // Since cursors can have async operations pending which modify their state they can't be copied cleanly - Cursor(const Cursor &other) = delete; + Cursor(const Cursor& other) = delete; // A read cursor can be initialized from a pop cursor - void initReadOnly(const Cursor &c) { + void initReadOnly(const Cursor& c) { ASSERT(c.mode == READONLY || c.mode == POP); init(c.queue, READONLY, c.pageID, c.offset, c.endPageID); } - ~Cursor() { - operation.cancel(); - } + ~Cursor() { operation.cancel(); } std::string toString() const { - if(mode == WRITE) { - return format("{WriteCursor %s:%p pos=%s:%d endOffset=%d}", queue->name.c_str(), this, ::toString(pageID).c_str(), offset, page ? raw()->endOffset : -1); + if (mode == WRITE) { + return format("{WriteCursor %s:%p pos=%s:%d endOffset=%d}", queue->name.c_str(), this, + ::toString(pageID).c_str(), offset, page ? raw()->endOffset : -1); } - if(mode == POP || mode == READONLY) { - return format("{ReadCursor %s:%p pos=%s:%d endOffset=%d endPage=%s}", queue->name.c_str(), this, ::toString(pageID).c_str(), offset, page ? 
raw()->endOffset : -1, ::toString(endPageID).c_str()); + if (mode == POP || mode == READONLY) { + return format("{ReadCursor %s:%p pos=%s:%d endOffset=%d endPage=%s}", queue->name.c_str(), this, + ::toString(pageID).c_str(), offset, page ? raw()->endOffset : -1, + ::toString(endPageID).c_str()); } ASSERT(mode == NONE); return format("{NullCursor=%p}", this); @@ -272,28 +262,20 @@ public: LogicalPageID nextPageID; uint16_t nextOffset; uint16_t endOffset; - uint8_t * begin() { - return (uint8_t *)(this + 1); - } + uint8_t* begin() { return (uint8_t*)(this + 1); } }; #pragma pack(pop) - Future notBusy() { - return operation; - } + Future notBusy() { return operation; } // Returns true if any items have been written to the last page - bool pendingWrites() const { - return mode == WRITE && offset != 0; - } + bool pendingWrites() const { return mode == WRITE && offset != 0; } - RawPage * raw() const { - return ((RawPage *)(page->begin())); - } + RawPage* raw() const { return ((RawPage*)(page->begin())); } void setNext(LogicalPageID pageID, int offset) { ASSERT(mode == WRITE); - RawPage *p = raw(); + RawPage* p = raw(); p->nextPageID = pageID; p->nextOffset = offset; } @@ -314,21 +296,22 @@ public: VALGRIND_MAKE_MEM_DEFINED(raw()->begin(), offset); VALGRIND_MAKE_MEM_DEFINED(raw()->begin() + offset, queue->dataBytesPerPage - raw()->endOffset); queue->pager->updatePage(pageID, page); - if(firstPageIDWritten == invalidLogicalPageID) { + if (firstPageIDWritten == invalidLogicalPageID) { firstPageIDWritten = pageID; } } // Link the current page to newPageID:newOffset and then write it to the pager. - // If initializeNewPage is true a page buffer will be allocated for the new page and it will be initialized + // If initializeNewPage is true a page buffer will be allocated for the new page and it will be initialized // as a new tail page. 
void addNewPage(LogicalPageID newPageID, int newOffset, bool initializeNewPage) { ASSERT(mode == WRITE); ASSERT(newPageID != invalidLogicalPageID); - debug_printf("FIFOQueue::Cursor(%s) Adding page %s init=%d\n", toString().c_str(), ::toString(newPageID).c_str(), initializeNewPage); + debug_printf("FIFOQueue::Cursor(%s) Adding page %s init=%d\n", toString().c_str(), + ::toString(newPageID).c_str(), initializeNewPage); // Update existing page and write, if it exists - if(page) { + if (page) { setNext(newPageID, newOffset); debug_printf("FIFOQueue::Cursor(%s) Linked new page\n", toString().c_str()); writePage(); @@ -337,21 +320,20 @@ public: pageID = newPageID; offset = newOffset; - if(initializeNewPage) { + if (initializeNewPage) { debug_printf("FIFOQueue::Cursor(%s) Initializing new page\n", toString().c_str()); page = queue->pager->newPageBuffer(); setNext(0, 0); auto p = raw(); ASSERT(newOffset == 0); p->endOffset = 0; - } - else { + } else { page.clear(); } } // Write item to the next position in the current page or, if it won't fit, add a new page and write it there. 
- ACTOR static Future write_impl(Cursor *self, T item, Future start) { + ACTOR static Future write_impl(Cursor* self, T item, Future start) { ASSERT(self->mode == WRITE); // Wait for the previous operation to finish @@ -360,14 +342,16 @@ public: wait(previous); state int bytesNeeded = Codec::bytesNeeded(item); - if(self->pageID == invalidLogicalPageID || self->offset + bytesNeeded > self->queue->dataBytesPerPage) { - debug_printf("FIFOQueue::Cursor(%s) write(%s) page is full, adding new page\n", self->toString().c_str(), ::toString(item).c_str()); + if (self->pageID == invalidLogicalPageID || self->offset + bytesNeeded > self->queue->dataBytesPerPage) { + debug_printf("FIFOQueue::Cursor(%s) write(%s) page is full, adding new page\n", + self->toString().c_str(), ::toString(item).c_str()); LogicalPageID newPageID = wait(self->queue->pager->newPageID()); self->addNewPage(newPageID, 0, true); ++self->queue->numPages; wait(yield()); } - debug_printf("FIFOQueue::Cursor(%s) before write(%s)\n", self->toString().c_str(), ::toString(item).c_str()); + debug_printf("FIFOQueue::Cursor(%s) before write(%s)\n", self->toString().c_str(), + ::toString(item).c_str()); auto p = self->raw(); Codec::writeToBytes(p->begin() + self->offset, item); self->offset += bytesNeeded; @@ -376,14 +360,15 @@ public: return Void(); } - void write(const T &item) { + void write(const T& item) { Promise p; operation = write_impl(this, item, p.getFuture()); p.send(Void()); } - // Read the next item at the cursor (if <= upperBound), moving to a new page first if the current page is exhausted - ACTOR static Future> readNext_impl(Cursor *self, Optional upperBound, Future start) { + // Read the next item at the cursor (if <= upperBound), moving to a new page first if the current page is + // exhausted + ACTOR static Future> readNext_impl(Cursor* self, Optional upperBound, Future start) { ASSERT(self->mode == POP || self->mode == READONLY); // Wait for the previous operation to finish @@ -392,13 +377,13 @@ 
public: wait(previous); debug_printf("FIFOQueue::Cursor(%s) readNext begin\n", self->toString().c_str()); - if(self->pageID == invalidLogicalPageID || self->pageID == self->endPageID) { + if (self->pageID == invalidLogicalPageID || self->pageID == self->endPageID) { debug_printf("FIFOQueue::Cursor(%s) readNext returning nothing\n", self->toString().c_str()); return Optional(); } // We now know we are pointing to PageID and it should be read and used, but it may not be loaded yet. - if(!self->page) { + if (!self->page) { wait(self->loadPage()); wait(yield()); } @@ -409,46 +394,50 @@ public: int bytesRead; T result = Codec::readFromBytes(p->begin() + self->offset, bytesRead); - if(upperBound.present() && upperBound.get() < result) { - debug_printf("FIFOQueue::Cursor(%s) not popping %s, exceeds upper bound %s\n", - self->toString().c_str(), ::toString(result).c_str(), ::toString(upperBound.get()).c_str()); + if (upperBound.present() && upperBound.get() < result) { + debug_printf("FIFOQueue::Cursor(%s) not popping %s, exceeds upper bound %s\n", self->toString().c_str(), + ::toString(result).c_str(), ::toString(upperBound.get()).c_str()); return Optional(); } self->offset += bytesRead; - if(self->mode == POP) { + if (self->mode == POP) { --self->queue->numEntries; } - debug_printf("FIFOQueue::Cursor(%s) after read of %s\n", self->toString().c_str(), ::toString(result).c_str()); + debug_printf("FIFOQueue::Cursor(%s) after read of %s\n", self->toString().c_str(), + ::toString(result).c_str()); ASSERT(self->offset <= p->endOffset); - if(self->offset == p->endOffset) { + if (self->offset == p->endOffset) { debug_printf("FIFOQueue::Cursor(%s) Page exhausted\n", self->toString().c_str()); LogicalPageID oldPageID = self->pageID; self->pageID = p->nextPageID; self->offset = p->nextOffset; - if(self->mode == POP) { + if (self->mode == POP) { --self->queue->numPages; } self->page.clear(); - debug_printf("FIFOQueue::Cursor(%s) readNext page exhausted, moved to new page\n", 
self->toString().c_str()); + debug_printf("FIFOQueue::Cursor(%s) readNext page exhausted, moved to new page\n", + self->toString().c_str()); - if(self->mode == POP) { - // Freeing the old page must happen after advancing the cursor and clearing the page reference because - // freePage() could cause a push onto a queue that causes a newPageID() call which could pop() from this - // very same queue. - // Queue pages are freed at page 0 because they can be reused after the next commit. + if (self->mode == POP) { + // Freeing the old page must happen after advancing the cursor and clearing the page reference + // because freePage() could cause a push onto a queue that causes a newPageID() call which could + // pop() from this very same queue. Queue pages are freed at page 0 because they can be reused after + // the next commit. self->queue->pager->freePage(oldPageID, 0); } } - debug_printf("FIFOQueue(%s) %s(upperBound=%s) -> %s\n", self->queue->name.c_str(), (self->mode == POP ? "pop" : "peek"), ::toString(upperBound).c_str(), ::toString(result).c_str()); + debug_printf("FIFOQueue(%s) %s(upperBound=%s) -> %s\n", self->queue->name.c_str(), + (self->mode == POP ? 
"pop" : "peek"), ::toString(upperBound).c_str(), + ::toString(result).c_str()); return result; } // Read and move past the next item if is <= upperBound or if upperBound is not present - Future> readNext(const Optional &upperBound = {}) { - if(mode == NONE) { + Future> readNext(const Optional& upperBound = {}) { + if (mode == NONE) { return Optional(); } Promise p; @@ -460,18 +449,15 @@ public: }; public: - FIFOQueue() : pager(nullptr) { - } + FIFOQueue() : pager(nullptr) {} - ~FIFOQueue() { - newTailPage.cancel(); - } + ~FIFOQueue() { newTailPage.cancel(); } - FIFOQueue(const FIFOQueue &other) = delete; - void operator=(const FIFOQueue &rhs) = delete; + FIFOQueue(const FIFOQueue& other) = delete; + void operator=(const FIFOQueue& rhs) = delete; // Create a new queue at newPageID - void create(IPager2 *p, LogicalPageID newPageID, std::string queueName) { + void create(IPager2* p, LogicalPageID newPageID, std::string queueName) { debug_printf("FIFOQueue(%s) create from page %s\n", queueName.c_str(), toString(newPageID).c_str()); pager = p; name = queueName; @@ -486,7 +472,7 @@ public: } // Load an existing queue from its queue state - void recover(IPager2 *p, const QueueState &qs, std::string queueName) { + void recover(IPager2* p, const QueueState& qs, std::string queueName) { debug_printf("FIFOQueue(%s) recover from queue state %s\n", queueName.c_str(), qs.toString().c_str()); pager = p; name = queueName; @@ -500,7 +486,7 @@ public: debug_printf("FIFOQueue(%s) recovered\n", queueName.c_str()); } - ACTOR static Future>> peekAll_impl(FIFOQueue *self) { + ACTOR static Future>> peekAll_impl(FIFOQueue* self) { state Standalone> results; state Cursor c; c.initReadOnly(self->headReader); @@ -508,7 +494,7 @@ public: loop { Optional x = wait(c.readNext()); - if(!x.present()) { + if (!x.present()) { break; } results.push_back(results.arena(), x.get()); @@ -517,14 +503,10 @@ public: return results; } - Future>> peekAll() { - return peekAll_impl(this); - } + Future>> 
peekAll() { return peekAll_impl(this); } // Pop the next item on front of queue if it is <= upperBound or if upperBound is not present - Future> pop(Optional upperBound = {}) { - return headReader.readNext(upperBound); - } + Future> pop(Optional upperBound = {}) { return headReader.readNext(upperBound); } QueueState getState() const { QueueState s; @@ -538,12 +520,12 @@ public: return s; } - void pushBack(const T &item) { + void pushBack(const T& item) { debug_printf("FIFOQueue(%s) pushBack(%s)\n", name.c_str(), toString(item).c_str()); tailWriter.write(item); } - void pushFront(const T &item) { + void pushFront(const T& item) { debug_printf("FIFOQueue(%s) pushFront(%s)\n", name.c_str(), toString(item).c_str()); headWriter.write(item); } @@ -555,7 +537,8 @@ public: // Returns true if any most recently started operations on any cursors are not ready bool busy() { - return !headWriter.notBusy().isReady() || !headReader.notBusy().isReady() || !tailWriter.notBusy().isReady() || !newTailPage.isReady(); + return !headWriter.notBusy().isReady() || !headReader.notBusy().isReady() || !tailWriter.notBusy().isReady() || + !newTailPage.isReady(); } // preFlush() prepares this queue to be flushed to disk, but doesn't actually do it so the queue can still @@ -571,7 +554,7 @@ public: // - queue push() can call pager->newPageID() which can call pop() on the same or another queue // This creates a circular dependency with 1 or more queues when those queues are used by the pager // to manage free page IDs. - ACTOR static Future preFlush_impl(FIFOQueue *self) { + ACTOR static Future preFlush_impl(FIFOQueue* self) { debug_printf("FIFOQueue(%s) preFlush begin\n", self->name.c_str()); wait(self->notBusy()); @@ -579,14 +562,15 @@ public: // so see if any work is pending now. bool workPending = self->busy(); - if(!workPending) { + if (!workPending) { // A newly created or flushed queue starts out in a state where its tail page to be written to is empty. 
- // After pushBack() is called, this is no longer the case and never will be again until the queue is flushed. - // Before the non-empty tail page is written it must be linked to a new empty page for use after the next - // flush. (This is explained more at the top of FIFOQueue but it is because queue pages can only be written - // once because once they contain durable data a second write to link to a new page could corrupt the existing - // data if the subsequent commit never succeeds.) - if(self->newTailPage.isReady() && self->newTailPage.get() == invalidLogicalPageID && self->tailWriter.pendingWrites()) { + // After pushBack() is called, this is no longer the case and never will be again until the queue is + // flushed. Before the non-empty tail page is written it must be linked to a new empty page for use after + // the next flush. (This is explained more at the top of FIFOQueue but it is because queue pages can only + // be written once because once they contain durable data a second write to link to a new page could corrupt + // the existing data if the subsequent commit never succeeds.) + if (self->newTailPage.isReady() && self->newTailPage.get() == invalidLogicalPageID && + self->tailWriter.pendingWrites()) { self->newTailPage = self->pager->newPageID(); workPending = true; } @@ -596,16 +580,14 @@ public: return workPending; } - Future preFlush() { - return preFlush_impl(this); - } + Future preFlush() { return preFlush_impl(this); } void finishFlush() { debug_printf("FIFOQueue(%s) finishFlush start\n", name.c_str()); ASSERT(!busy()); // If a new tail page was allocated, link the last page of the tail writer to it. 
- if(newTailPage.get() != invalidLogicalPageID) { + if (newTailPage.get() != invalidLogicalPageID) { tailWriter.addNewPage(newTailPage.get(), 0, false); // The flush sequence allocated a page and added it to the queue so increment numPages ++numPages; @@ -618,7 +600,7 @@ public: // If the headWriter wrote anything, link its tail page to the headReader position and point the headReader // to the start of the headWriter - if(headWriter.pendingWrites()) { + if (headWriter.pendingWrites()) { headWriter.addNewPage(headReader.pageID, headReader.offset, false); headReader.pageID = headWriter.firstPageIDWritten; headReader.offset = 0; @@ -635,10 +617,10 @@ public: debug_printf("FIFOQueue(%s) finishFlush end\n", name.c_str()); } - ACTOR static Future flush_impl(FIFOQueue *self) { + ACTOR static Future flush_impl(FIFOQueue* self) { loop { bool notDone = wait(self->preFlush()); - if(!notDone) { + if (!notDone) { break; } } @@ -646,15 +628,13 @@ public: return Void(); } - Future flush() { - return flush_impl(this); - } + Future flush() { return flush_impl(this); } - IPager2 *pager; + IPager2* pager; int64_t numPages; int64_t numEntries; int dataBytesPerPage; - + Cursor headReader; Cursor tailWriter; Cursor headWriter; @@ -673,63 +653,44 @@ class FastAllocatedPage : public IPage, public FastAllocated, public: // Create a fast-allocated page with size total bytes INCLUDING checksum FastAllocatedPage(int size, int bufferSize) : logicalSize(size), bufferSize(bufferSize) { - buffer = (uint8_t *)allocateFast(bufferSize); + buffer = (uint8_t*)allocateFast(bufferSize); // Mark any unused page portion defined VALGRIND_MAKE_MEM_DEFINED(buffer + logicalSize, bufferSize - logicalSize); }; - virtual ~FastAllocatedPage() { - freeFast(bufferSize, buffer); - } + virtual ~FastAllocatedPage() { freeFast(bufferSize, buffer); } virtual Reference clone() const { - FastAllocatedPage *p = new FastAllocatedPage(logicalSize, bufferSize); + FastAllocatedPage* p = new FastAllocatedPage(logicalSize, 
bufferSize); memcpy(p->buffer, buffer, logicalSize); return Reference(p); } // Usable size, without checksum - int size() const { - return logicalSize - sizeof(Checksum); - } + int size() const { return logicalSize - sizeof(Checksum); } - uint8_t const* begin() const { - return buffer; - } + uint8_t const* begin() const { return buffer; } - uint8_t* mutate() { - return buffer; - } + uint8_t* mutate() { return buffer; } - void addref() const { - ReferenceCounted::addref(); - } + void addref() const { ReferenceCounted::addref(); } + + void delref() const { ReferenceCounted::delref(); } - void delref() const { - ReferenceCounted::delref(); - } - typedef uint32_t Checksum; - Checksum & getChecksum() { - return *(Checksum *)(buffer + size()); - } + Checksum& getChecksum() { return *(Checksum*)(buffer + size()); } - Checksum calculateChecksum(LogicalPageID pageID) { - return crc32c_append(pageID, buffer, size()); - } + Checksum calculateChecksum(LogicalPageID pageID) { return crc32c_append(pageID, buffer, size()); } - void updateChecksum(LogicalPageID pageID) { - getChecksum() = calculateChecksum(pageID); - } + void updateChecksum(LogicalPageID pageID) { getChecksum() = calculateChecksum(pageID); } + + bool verifyChecksum(LogicalPageID pageID) { return getChecksum() == calculateChecksum(pageID); } - bool verifyChecksum(LogicalPageID pageID) { - return getChecksum() == calculateChecksum(pageID); - } private: int logicalSize; int bufferSize; - uint8_t *buffer; + uint8_t* buffer; }; // Holds an index of recently used objects. @@ -737,12 +698,11 @@ private: // bool evictable() const; // return true if the entry can be evicted // Future onEvictable() const; // ready when entry can be evicted // indicating if it is safe to evict. 
-template +template class ObjectCache : NonCopyable { struct Entry : public boost::intrusive::list_base_hook<> { - Entry() : hits(0) { - } + Entry() : hits(0) {} IndexType index; ObjectType item; int hits; @@ -752,8 +712,8 @@ class ObjectCache : NonCopyable { typedef boost::intrusive::list EvictionOrderT; public: - ObjectCache(int sizeLimit = 1) : sizeLimit(sizeLimit), cacheHits(0), cacheMisses(0), noHitEvictions(0), failedEvictions(0) { - } + ObjectCache(int sizeLimit = 1) + : sizeLimit(sizeLimit), cacheHits(0), cacheMisses(0), noHitEvictions(0), failedEvictions(0) {} void setSizeLimit(int n) { ASSERT(n > 0); @@ -762,9 +722,9 @@ public: // Get the object for i if it exists, else return nullptr. // If the object exists, its eviction order will NOT change as this is not a cache hit. - ObjectType * getIfExists(const IndexType &index) { + ObjectType* getIfExists(const IndexType& index) { auto i = cache.find(index); - if(i != cache.end()) { + if (i != cache.end()) { ++i->second.hits; return &i->second.item; } @@ -773,20 +733,19 @@ public: // Get the object for i or create a new one. // After a get(), the object for i is the last in evictionOrder. - ObjectType & get(const IndexType &index, bool noHit = false) { - Entry &entry = cache[index]; + ObjectType& get(const IndexType& index, bool noHit = false) { + Entry& entry = cache[index]; // If entry is linked into evictionOrder then move it to the back of the order - if(entry.is_linked()) { - if(!noHit) { + if (entry.is_linked()) { + if (!noHit) { ++entry.hits; ++cacheHits; } // Move the entry to the back of the eviction order evictionOrder.erase(evictionOrder.iterator_to(entry)); evictionOrder.push_back(entry); - } - else { + } else { ++cacheMisses; // Finish initializing entry entry.index = index; @@ -795,25 +754,27 @@ public: evictionOrder.push_back(entry); // While the cache is too big, evict the oldest entry until the oldest entry can't be evicted. 
- while(cache.size() > sizeLimit) { - Entry &toEvict = evictionOrder.front(); - debug_printf("Trying to evict %s to make room for %s\n", toString(toEvict.index).c_str(), toString(index).c_str()); + while (cache.size() > sizeLimit) { + Entry& toEvict = evictionOrder.front(); + debug_printf("Trying to evict %s to make room for %s\n", toString(toEvict.index).c_str(), + toString(index).c_str()); - // It's critical that we do not evict the item we just added (or the reference we return would be invalid) but - // since sizeLimit must be > 0, entry was just added to the end of the evictionOrder, and this loop will end - // if we move anything to the end of the eviction order, we can be guaraunted that entry != toEvict, so we - // do not need to check. - // If the item is not evictable then move it to the back of the eviction order and stop. - if(!toEvict.item.evictable()) { + // It's critical that we do not evict the item we just added (or the reference we return would be + // invalid) but since sizeLimit must be > 0, entry was just added to the end of the evictionOrder, and + // this loop will end if we move anything to the end of the eviction order, we can be guaraunted that + // entry != toEvict, so we do not need to check. If the item is not evictable then move it to the back + // of the eviction order and stop. + if (!toEvict.item.evictable()) { evictionOrder.erase(evictionOrder.iterator_to(toEvict)); evictionOrder.push_back(toEvict); ++failedEvictions; break; } else { - if(toEvict.hits == 0) { + if (toEvict.hits == 0) { ++noHitEvictions; } - debug_printf("Evicting %s to make room for %s\n", toString(toEvict.index).c_str(), toString(index).c_str()); + debug_printf("Evicting %s to make room for %s\n", toString(toEvict.index).c_str(), + toString(index).c_str()); evictionOrder.pop_front(); cache.erase(toEvict.index); } @@ -825,12 +786,12 @@ public: // Clears the cache, saving the entries, and then waits for eachWaits for each item to be evictable and evicts it. 
// The cache should not be Evicts all evictable entries - ACTOR static Future clear_impl(ObjectCache *self) { + ACTOR static Future clear_impl(ObjectCache* self) { state ObjectCache::CacheT cache; state EvictionOrderT evictionOrder; // Swap cache contents to local state vars - // After this, no more entries will be added to or read from these + // After this, no more entries will be added to or read from these // structures so we know for sure that no page will become unevictable // after it is either evictable or onEvictable() is ready. cache.swap(self->cache); @@ -839,8 +800,8 @@ public: state typename EvictionOrderT::iterator i = evictionOrder.begin(); state typename EvictionOrderT::iterator iEnd = evictionOrder.begin(); - while(i != iEnd) { - if(!i->item.evictable()) { + while (i != iEnd) { + if (!i->item.evictable()) { wait(i->item.onEvictable()); } ++i; @@ -852,9 +813,7 @@ public: return Void(); } - Future clear() { - return clear_impl(this); - } + Future clear() { return clear_impl(this); } int count() const { ASSERT(evictionOrder.size() == cache.size()); @@ -872,13 +831,13 @@ private: EvictionOrderT evictionOrder; }; -ACTOR template Future forwardError(Future f, Promise target) { +ACTOR template +Future forwardError(Future f, Promise target) { try { T x = wait(f); return x; - } - catch(Error &e) { - if(e.code() != error_code_actor_cancelled && target.canBeSet()) { + } catch (Error& e) { + if (e.code() != error_code_actor_cancelled && target.canBeSet()) { target.sendError(e); } @@ -892,7 +851,7 @@ class DWALPagerSnapshot; // It does this internally mapping the original page ID to alternate page IDs by write version. // The page id remaps are kept in memory and also logged to a "remap queue" which must be reloaded on cold start. 
// To prevent the set of remaps from growing unboundedly, once a remap is old enough to be at or before the -// oldest pager version being maintained the remap can be "undone" by popping it from the remap queue, +// oldest pager version being maintained the remap can be "undone" by popping it from the remap queue, // copying the alternate page ID's data over top of the original page ID's data, and deleting the remap from memory. // This process basically describes a "Delayed" Write-Ahead-Log (DWAL) because the remap queue and the newly allocated // alternate pages it references basically serve as a write ahead log for pages that will eventially be copied @@ -907,9 +866,7 @@ public: Version version; LogicalPageID pageID; - bool operator<(const DelayedFreePage &rhs) const { - return version < rhs.version; - } + bool operator<(const DelayedFreePage& rhs) const { return version < rhs.version; } std::string toString() const { return format("DelayedFreePage{%s @%" PRId64 "}", ::toString(pageID).c_str(), version); @@ -921,12 +878,11 @@ public: LogicalPageID originalPageID; LogicalPageID newPageID; - bool operator<(const RemappedPage &rhs) { - return version < rhs.version; - } + bool operator<(const RemappedPage& rhs) { return version < rhs.version; } std::string toString() const { - return format("RemappedPage(%s -> %s @%" PRId64 "}", ::toString(originalPageID).c_str(), ::toString(newPageID).c_str(), version); + return format("RemappedPage(%s -> %s @%" PRId64 "}", ::toString(originalPageID).c_str(), + ::toString(newPageID).c_str(), version); } }; @@ -938,10 +894,11 @@ public: // If the file already exists, pageSize might be different than desiredPageSize // Use pageCacheSizeBytes == 0 for default DWALPager(int desiredPageSize, std::string filename, int64_t pageCacheSizeBytes) - : desiredPageSize(desiredPageSize), filename(filename), pHeader(nullptr), pageCacheBytes(pageCacheSizeBytes) - { - if(pageCacheBytes == 0) { - pageCacheBytes = g_network->isSimulated() ? (BUGGIFY ? 
FLOW_KNOBS->BUGGIFY_SIM_PAGE_CACHE_4K : FLOW_KNOBS->SIM_PAGE_CACHE_4K) : FLOW_KNOBS->PAGE_CACHE_4K; + : desiredPageSize(desiredPageSize), filename(filename), pHeader(nullptr), pageCacheBytes(pageCacheSizeBytes) { + if (pageCacheBytes == 0) { + pageCacheBytes = g_network->isSimulated() + ? (BUGGIFY ? FLOW_KNOBS->BUGGIFY_SIM_PAGE_CACHE_4K : FLOW_KNOBS->SIM_PAGE_CACHE_4K) + : FLOW_KNOBS->PAGE_CACHE_4K; } commitFuture = Void(); recoverFuture = forwardError(recover(this), errorPromise); @@ -950,10 +907,10 @@ public: void setPageSize(int size) { logicalPageSize = size; physicalPageSize = smallestPhysicalBlock; - while(logicalPageSize > physicalPageSize) { + while (logicalPageSize > physicalPageSize) { physicalPageSize += smallestPhysicalBlock; } - if(pHeader != nullptr) { + if (pHeader != nullptr) { pHeader->pageSize = logicalPageSize; } pageCache.setSizeLimit(pageCacheBytes / physicalPageSize); @@ -963,14 +920,15 @@ public: memcpy(lastCommittedHeaderPage->mutate(), headerPage->begin(), smallestPhysicalBlock); } - ACTOR static Future recover(DWALPager *self) { + ACTOR static Future recover(DWALPager* self) { ASSERT(!self->recoverFuture.isValid()); self->remapUndoFuture = Void(); - int64_t flags = IAsyncFile::OPEN_UNCACHED | IAsyncFile::OPEN_UNBUFFERED | IAsyncFile::OPEN_READWRITE | IAsyncFile::OPEN_LOCK; + int64_t flags = IAsyncFile::OPEN_UNCACHED | IAsyncFile::OPEN_UNBUFFERED | IAsyncFile::OPEN_READWRITE | + IAsyncFile::OPEN_LOCK; state bool exists = fileExists(self->filename); - if(!exists) { + if (!exists) { flags |= IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE | IAsyncFile::OPEN_CREATE; } @@ -979,19 +937,20 @@ public: // Header page is always treated as having a page size of smallestPhysicalBlock self->setPageSize(smallestPhysicalBlock); self->lastCommittedHeaderPage = self->newPageBuffer(); - self->pLastCommittedHeader = (Header *)self->lastCommittedHeaderPage->begin(); + self->pLastCommittedHeader = (Header*)self->lastCommittedHeaderPage->begin(); state int64_t 
fileSize = 0; - if(exists) { + if (exists) { wait(store(fileSize, self->pageFile->size())); } - debug_printf("DWALPager(%s) recover exists=%d fileSize=%" PRId64 "\n", self->filename.c_str(), exists, fileSize); + debug_printf("DWALPager(%s) recover exists=%d fileSize=%" PRId64 "\n", self->filename.c_str(), exists, + fileSize); // TODO: If the file exists but appears to never have been successfully committed is this an error or // should recovery proceed with a new pager instance? // If there are at least 2 pages then try to recover the existing file - if(exists && fileSize >= (self->smallestPhysicalBlock * 2)) { + if (exists && fileSize >= (self->smallestPhysicalBlock * 2)) { debug_printf("DWALPager(%s) recovering using existing file\n"); state bool recoveredHeader = false; @@ -1000,44 +959,42 @@ public: wait(store(self->headerPage, self->readHeaderPage(self, 0))); // If the checksum fails for the header page, try to recover committed header backup from page 1 - if(!self->headerPage.castTo()->verifyChecksum(0)) { + if (!self->headerPage.castTo()->verifyChecksum(0)) { TraceEvent(SevWarn, "DWALPagerRecoveringHeader").detail("Filename", self->filename); - + wait(store(self->headerPage, self->readHeaderPage(self, 1))); - if(!self->headerPage.castTo()->verifyChecksum(1)) { - if(g_network->isSimulated()) { + if (!self->headerPage.castTo()->verifyChecksum(1)) { + if (g_network->isSimulated()) { // TODO: Detect if process is being restarted and only throw injected if so? 
throw io_error().asInjectedFault(); } Error e = checksum_failed(); - TraceEvent(SevError, "DWALPagerRecoveryFailed") - .detail("Filename", self->filename) - .error(e); + TraceEvent(SevError, "DWALPagerRecoveryFailed").detail("Filename", self->filename).error(e); throw e; } recoveredHeader = true; } - self->pHeader = (Header *)self->headerPage->begin(); + self->pHeader = (Header*)self->headerPage->begin(); - if(self->pHeader->formatVersion != Header::FORMAT_VERSION) { - Error e = internal_error(); // TODO: Something better? + if (self->pHeader->formatVersion != Header::FORMAT_VERSION) { + Error e = internal_error(); // TODO: Something better? TraceEvent(SevError, "DWALPagerRecoveryFailedWrongVersion") - .detail("Filename", self->filename) - .detail("Version", self->pHeader->formatVersion) - .detail("ExpectedVersion", Header::FORMAT_VERSION) - .error(e); + .detail("Filename", self->filename) + .detail("Version", self->pHeader->formatVersion) + .detail("ExpectedVersion", Header::FORMAT_VERSION) + .error(e); throw e; } self->setPageSize(self->pHeader->pageSize); - if(self->logicalPageSize != self->desiredPageSize) { + if (self->logicalPageSize != self->desiredPageSize) { TraceEvent(SevWarn, "DWALPagerPageSizeNotDesired") - .detail("Filename", self->filename) - .detail("ExistingPageSize", self->logicalPageSize) - .detail("DesiredPageSize", self->desiredPageSize); + .detail("Filename", self->filename) + .detail("ExistingPageSize", self->logicalPageSize) + .detail("DesiredPageSize", self->desiredPageSize); } self->freeList.recover(self, self->pHeader->freeList, "FreeListRecovered"); @@ -1045,15 +1002,15 @@ public: self->remapQueue.recover(self, self->pHeader->remapQueue, "RemapQueueRecovered"); Standalone> remaps = wait(self->remapQueue.peekAll()); - for(auto &r : remaps) { - if(r.newPageID != invalidLogicalPageID) { + for (auto& r : remaps) { + if (r.newPageID != invalidLogicalPageID) { self->remappedPages[r.originalPageID][r.version] = r.newPageID; } } // If the header 
was recovered from the backup at Page 1 then write and sync it to Page 0 before continuing. // If this fails, the backup header is still in tact for the next recovery attempt. - if(recoveredHeader) { + if (recoveredHeader) { // Write the header to page 0 wait(self->writeHeaderPage(0, self->headerPage)); @@ -1065,19 +1022,19 @@ public: debug_printf("DWALPager(%s) Header recovery complete.\n", self->filename.c_str()); } - // Update the last committed header with the one that was recovered (which is the last known committed header) + // Update the last committed header with the one that was recovered (which is the last known committed + // header) self->updateCommittedHeader(); self->addLatestSnapshot(); - } - else { - // Note: If the file contains less than 2 pages but more than 0 bytes then the pager was never successfully committed. - // A new pager will be created in its place. + } else { + // Note: If the file contains less than 2 pages but more than 0 bytes then the pager was never successfully + // committed. A new pager will be created in its place. // TODO: Is the right behavior? debug_printf("DWALPager(%s) creating new pager\n"); self->headerPage = self->newPageBuffer(); - self->pHeader = (Header *)self->headerPage->begin(); + self->pHeader = (Header*)self->headerPage->begin(); // Now that the header page has been allocated, set page size to desired self->setPageSize(self->desiredPageSize); @@ -1107,7 +1064,8 @@ public: self->pHeader->remapQueue = self->remapQueue.getState(); // Set remaining header bytes to \xff - memset(self->headerPage->mutate() + self->pHeader->size(), 0xff, self->headerPage->size() - self->pHeader->size()); + memset(self->headerPage->mutate() + self->pHeader->size(), 0xff, + self->headerPage->size() - self->pHeader->size()); // Since there is no previously committed header use the initial header for the initial commit. 
self->updateCommittedHeader(); @@ -1115,7 +1073,9 @@ public: wait(self->commit()); } - debug_printf("DWALPager(%s) recovered. committedVersion=%" PRId64 " logicalPageSize=%d physicalPageSize=%d\n", self->filename.c_str(), self->pHeader->committedVersion, self->logicalPageSize, self->physicalPageSize); + debug_printf("DWALPager(%s) recovered. committedVersion=%" PRId64 " logicalPageSize=%d physicalPageSize=%d\n", + self->filename.c_str(), self->pHeader->committedVersion, self->logicalPageSize, + self->physicalPageSize); return Void(); } @@ -1125,31 +1085,34 @@ public: // Returns the usable size of pages returned by the pager (i.e. the size of the page that isn't pager overhead). // For a given pager instance, separate calls to this function must return the same value. - int getUsablePageSize() override { - return logicalPageSize - sizeof(FastAllocatedPage::Checksum); - } + int getUsablePageSize() override { return logicalPageSize - sizeof(FastAllocatedPage::Checksum); } // Get a new, previously available page ID. 
The page will be considered in-use after the next commit // regardless of whether or not it was written to, until it is returned to the pager via freePage() - ACTOR static Future newPageID_impl(DWALPager *self) { + ACTOR static Future newPageID_impl(DWALPager* self) { // First try the free list Optional freePageID = wait(self->freeList.pop()); - if(freePageID.present()) { - debug_printf("DWALPager(%s) newPageID() returning %s from free list\n", self->filename.c_str(), toString(freePageID.get()).c_str()); + if (freePageID.present()) { + debug_printf("DWALPager(%s) newPageID() returning %s from free list\n", self->filename.c_str(), + toString(freePageID.get()).c_str()); return freePageID.get(); } - // Try to reuse pages up to the earlier of the oldest version set by the user or the oldest snapshot still in the snapshots list + // Try to reuse pages up to the earlier of the oldest version set by the user or the oldest snapshot still in + // the snapshots list ASSERT(!self->snapshots.empty()); - Optional delayedFreePageID = wait(self->delayedFreeList.pop(DelayedFreePage{self->effectiveOldestVersion(), 0})); - if(delayedFreePageID.present()) { - debug_printf("DWALPager(%s) newPageID() returning %s from delayed free list\n", self->filename.c_str(), toString(delayedFreePageID.get()).c_str()); + Optional delayedFreePageID = + wait(self->delayedFreeList.pop(DelayedFreePage{ self->effectiveOldestVersion(), 0 })); + if (delayedFreePageID.present()) { + debug_printf("DWALPager(%s) newPageID() returning %s from delayed free list\n", self->filename.c_str(), + toString(delayedFreePageID.get()).c_str()); return delayedFreePageID.get().pageID; } // Lastly, add a new page to the pager LogicalPageID id = self->newLastPageID(); - debug_printf("DWALPager(%s) newPageID() returning %s at end of file\n", self->filename.c_str(), toString(id).c_str()); + debug_printf("DWALPager(%s) newPageID() returning %s at end of file\n", self->filename.c_str(), + toString(id).c_str()); return id; }; @@ 
-1160,22 +1123,24 @@ public: return id; } - Future newPageID() override { - return newPageID_impl(this); - } + Future newPageID() override { return newPageID_impl(this); } Future writePhysicalPage(PhysicalPageID pageID, Reference page, bool header = false) { - debug_printf("DWALPager(%s) op=%s %s ptr=%p\n", filename.c_str(), (header ? "writePhysicalHeader" : "writePhysical"), toString(pageID).c_str(), page->begin()); + debug_printf("DWALPager(%s) op=%s %s ptr=%p\n", filename.c_str(), + (header ? "writePhysicalHeader" : "writePhysical"), toString(pageID).c_str(), page->begin()); VALGRIND_MAKE_MEM_DEFINED(page->begin(), page->size()); - ((Page *)page.getPtr())->updateChecksum(pageID); + ((Page*)page.getPtr())->updateChecksum(pageID); // Note: Not using forwardError here so a write error won't be discovered until commit time. int blockSize = header ? smallestPhysicalBlock : physicalPageSize; - Future f = holdWhile(page, map(pageFile->write(page->begin(), blockSize, (int64_t)pageID * blockSize), [=](Void) { - debug_printf("DWALPager(%s) op=%s %s ptr=%p\n", filename.c_str(), (header ? "writePhysicalHeaderComplete" : "writePhysicalComplete"), toString(pageID).c_str(), page->begin()); - return Void(); - })); + Future f = + holdWhile(page, map(pageFile->write(page->begin(), blockSize, (int64_t)pageID * blockSize), [=](Void) { + debug_printf("DWALPager(%s) op=%s %s ptr=%p\n", filename.c_str(), + (header ? 
"writePhysicalHeaderComplete" : "writePhysicalComplete"), + toString(pageID).c_str(), page->begin()); + return Void(); + })); operations.add(f); return f; } @@ -1186,8 +1151,11 @@ public: void updatePage(LogicalPageID pageID, Reference data) override { // Get the cache entry for this page, without counting it as a cache hit as we're replacing its contents now - PageCacheEntry &cacheEntry = pageCache.get(pageID, true); - debug_printf("DWALPager(%s) op=write %s cached=%d reading=%d writing=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.initialized(), cacheEntry.initialized() && cacheEntry.reading(), cacheEntry.initialized() && cacheEntry.writing()); + PageCacheEntry& cacheEntry = pageCache.get(pageID, true); + debug_printf("DWALPager(%s) op=write %s cached=%d reading=%d writing=%d\n", filename.c_str(), + toString(pageID).c_str(), cacheEntry.initialized(), + cacheEntry.initialized() && cacheEntry.reading(), + cacheEntry.initialized() && cacheEntry.writing()); // If the page is still being read then it's not also being written because a write places // the new content into readFuture when the write is launched, not when it is completed. @@ -1195,25 +1163,23 @@ public: // is necessary for remap erasure to work correctly since the oldest version of a page, located // at the original page ID, could have a pending read when that version is expired and the write // of the next newest version over top of the original page begins. - if(!cacheEntry.initialized()) { + if (!cacheEntry.initialized()) { cacheEntry.writeFuture = writePhysicalPage(pageID, data); - } - else if(cacheEntry.reading()) { + } else if (cacheEntry.reading()) { // Wait for the read to finish, then start the write. 
cacheEntry.writeFuture = map(success(cacheEntry.readFuture), [=](Void) { writePhysicalPage(pageID, data); return Void(); }); - } + } // If the page is being written, wait for this write before issuing the new write to ensure the // writes happen in the correct order - else if(cacheEntry.writing()) { + else if (cacheEntry.writing()) { cacheEntry.writeFuture = map(cacheEntry.writeFuture, [=](Void) { writePhysicalPage(pageID, data); return Void(); }); - } - else { + } else { cacheEntry.writeFuture = writePhysicalPage(pageID, data); } @@ -1227,7 +1193,7 @@ public: Future f = map(newPageID(), [=](LogicalPageID newPageID) { updatePage(newPageID, data); // TODO: Possibly limit size of remap queue since it must be recovered on cold start - RemappedPage r{v, pageID, newPageID}; + RemappedPage r{ v, pageID, newPageID }; remapQueue.pushBack(r); remappedPages[pageID][v] = newPageID; debug_printf("DWALPager(%s) pushed %s\n", filename.c_str(), RemappedPage(r).toString().c_str()); @@ -1239,62 +1205,71 @@ public: } void freePage(LogicalPageID pageID, Version v) override { - // If pageID has been remapped, then it can't be freed until all existing remaps for that page have been undone, so queue it for later deletion - if(remappedPages.find(pageID) != remappedPages.end()) { - debug_printf("DWALPager(%s) op=freeRemapped %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); - remapQueue.pushBack(RemappedPage{v, pageID, invalidLogicalPageID}); + // If pageID has been remapped, then it can't be freed until all existing remaps for that page have been undone, + // so queue it for later deletion + if (remappedPages.find(pageID) != remappedPages.end()) { + debug_printf("DWALPager(%s) op=freeRemapped %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), + toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); + remapQueue.pushBack(RemappedPage{ v, pageID, invalidLogicalPageID }); return; } // 
If v is older than the oldest version still readable then mark pageID as free as of the next commit - if(v < effectiveOldestVersion()) { - debug_printf("DWALPager(%s) op=freeNow %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); + if (v < effectiveOldestVersion()) { + debug_printf("DWALPager(%s) op=freeNow %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), + toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); freeList.pushBack(pageID); - } - else { + } else { // Otherwise add it to the delayed free list - debug_printf("DWALPager(%s) op=freeLater %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); - delayedFreeList.pushBack({v, pageID}); + debug_printf("DWALPager(%s) op=freeLater %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), + toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); + delayedFreeList.pushBack({ v, pageID }); } }; // Read a physical page from the page file. Note that header pages use a page size of smallestPhysicalBlock // If the user chosen physical page size is larger, then there will be a gap of unused space after the header pages // and before the user-chosen sized pages. - ACTOR static Future> readPhysicalPage(DWALPager *self, PhysicalPageID pageID, bool header = false) { - if(g_network->getCurrentTask() > TaskPriority::DiskRead) { + ACTOR static Future> readPhysicalPage(DWALPager* self, PhysicalPageID pageID, + bool header = false) { + if (g_network->getCurrentTask() > TaskPriority::DiskRead) { wait(delay(0, TaskPriority::DiskRead)); } - state Reference page = header ? 
Reference(new FastAllocatedPage(smallestPhysicalBlock, smallestPhysicalBlock)) : self->newPageBuffer(); - debug_printf("DWALPager(%s) op=readPhysicalStart %s ptr=%p\n", self->filename.c_str(), toString(pageID).c_str(), page->begin()); + state Reference page = + header ? Reference(new FastAllocatedPage(smallestPhysicalBlock, smallestPhysicalBlock)) + : self->newPageBuffer(); + debug_printf("DWALPager(%s) op=readPhysicalStart %s ptr=%p\n", self->filename.c_str(), toString(pageID).c_str(), + page->begin()); int blockSize = header ? smallestPhysicalBlock : self->physicalPageSize; // TODO: Could a dispatched read try to write to page after it has been destroyed if this actor is cancelled? int readBytes = wait(self->pageFile->read(page->mutate(), blockSize, (int64_t)pageID * blockSize)); - debug_printf("DWALPager(%s) op=readPhysicalComplete %s ptr=%p bytes=%d\n", self->filename.c_str(), toString(pageID).c_str(), page->begin(), readBytes); + debug_printf("DWALPager(%s) op=readPhysicalComplete %s ptr=%p bytes=%d\n", self->filename.c_str(), + toString(pageID).c_str(), page->begin(), readBytes); // Header reads are checked explicitly during recovery - if(!header) { - Page *p = (Page *)page.getPtr(); - if(!p->verifyChecksum(pageID)) { - debug_printf("DWALPager(%s) checksum failed for %s\n", self->filename.c_str(), toString(pageID).c_str()); + if (!header) { + Page* p = (Page*)page.getPtr(); + if (!p->verifyChecksum(pageID)) { + debug_printf("DWALPager(%s) checksum failed for %s\n", self->filename.c_str(), + toString(pageID).c_str()); Error e = checksum_failed(); TraceEvent(SevError, "DWALPagerChecksumFailed") - .detail("Filename", self->filename.c_str()) - .detail("PageID", pageID) - .detail("PageSize", self->physicalPageSize) - .detail("Offset", pageID * self->physicalPageSize) - .detail("CalculatedChecksum", p->calculateChecksum(pageID)) - .detail("ChecksumInPage", p->getChecksum()) - .error(e); + .detail("Filename", self->filename.c_str()) + .detail("PageID", pageID) + 
.detail("PageSize", self->physicalPageSize) + .detail("Offset", pageID * self->physicalPageSize) + .detail("CalculatedChecksum", p->calculateChecksum(pageID)) + .detail("ChecksumInPage", p->getChecksum()) + .error(e); throw e; } } return page; } - static Future> readHeaderPage(DWALPager *self, PhysicalPageID pageID) { + static Future> readHeaderPage(DWALPager* self, PhysicalPageID pageID) { return readPhysicalPage(self, pageID, true); } @@ -1302,10 +1277,10 @@ public: Future> readPage(LogicalPageID pageID, bool cacheable, bool noHit = false) override { // Use cached page if present, without triggering a cache hit. // Otherwise, read the page and return it but don't add it to the cache - if(!cacheable) { + if (!cacheable) { debug_printf("DWALPager(%s) op=readUncached %s\n", filename.c_str(), toString(pageID).c_str()); - PageCacheEntry *pCacheEntry = pageCache.getIfExists(pageID); - if(pCacheEntry != nullptr) { + PageCacheEntry* pCacheEntry = pageCache.getIfExists(pageID); + if (pCacheEntry != nullptr) { debug_printf("DWALPager(%s) op=readUncachedHit %s\n", filename.c_str(), toString(pageID).c_str()); return pCacheEntry->readFuture; } @@ -1314,10 +1289,13 @@ public: return forwardError(readPhysicalPage(this, (PhysicalPageID)pageID), errorPromise); } - PageCacheEntry &cacheEntry = pageCache.get(pageID, noHit); - debug_printf("DWALPager(%s) op=read %s cached=%d reading=%d writing=%d noHit=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.initialized(), cacheEntry.initialized() && cacheEntry.reading(), cacheEntry.initialized() && cacheEntry.writing(), noHit); + PageCacheEntry& cacheEntry = pageCache.get(pageID, noHit); + debug_printf("DWALPager(%s) op=read %s cached=%d reading=%d writing=%d noHit=%d\n", filename.c_str(), + toString(pageID).c_str(), cacheEntry.initialized(), + cacheEntry.initialized() && cacheEntry.reading(), cacheEntry.initialized() && cacheEntry.writing(), + noHit); - if(!cacheEntry.initialized()) { + if (!cacheEntry.initialized()) { 
debug_printf("DWALPager(%s) issuing actual read of %s\n", filename.c_str(), toString(pageID).c_str()); cacheEntry.readFuture = readPhysicalPage(this, (PhysicalPageID)pageID); cacheEntry.writeFuture = Void(); @@ -1330,16 +1308,17 @@ public: Future> readPageAtVersion(LogicalPageID pageID, Version v, bool cacheable, bool noHit) { auto i = remappedPages.find(pageID); - if(i != remappedPages.end()) { + if (i != remappedPages.end()) { auto j = i->second.upper_bound(v); - if(j != i->second.begin()) { + if (j != i->second.begin()) { --j; - debug_printf("DWALPager(%s) read %s @%" PRId64 " -> %s\n", filename.c_str(), toString(pageID).c_str(), v, toString(j->second).c_str()); + debug_printf("DWALPager(%s) read %s @%" PRId64 " -> %s\n", filename.c_str(), toString(pageID).c_str(), + v, toString(j->second).c_str()); pageID = j->second; } - } - else { - debug_printf("DWALPager(%s) read %s @%" PRId64 " (not remapped)\n", filename.c_str(), toString(pageID).c_str(), v); + } else { + debug_printf("DWALPager(%s) read %s @%" PRId64 " (not remapped)\n", filename.c_str(), + toString(pageID).c_str(), v); } return readPage(pageID, cacheable, noHit); @@ -1359,9 +1338,7 @@ public: // Get the oldest *readable* version, which is not the same as the oldest retained version as the version // returned could have been set as the oldest version in the pending commit - Version getOldestVersion() override { - return pHeader->oldestVersion; - }; + Version getOldestVersion() override { return pHeader->oldestVersion; }; // Calculate the *effective* oldest version, which can be older than the one set in the last commit since we // are allowing active snapshots to temporarily delay page reuse. 
@@ -1369,27 +1346,28 @@ public: return std::min(pLastCommittedHeader->oldestVersion, snapshots.front().version); } - ACTOR static Future undoRemaps(DWALPager *self) { + ACTOR static Future undoRemaps(DWALPager* self) { state RemappedPage cutoff; cutoff.version = self->effectiveOldestVersion(); // TODO: Use parallel reads - // TODO: One run of this actor might write to the same original page more than once, in which case just unmap the latest + // TODO: One run of this actor might write to the same original page more than once, in which case just unmap + // the latest loop { - if(self->remapUndoStop) { + if (self->remapUndoStop) { break; } state Optional p = wait(self->remapQueue.pop(cutoff)); - if(!p.present()) { + if (!p.present()) { break; } debug_printf("DWALPager(%s) undoRemaps popped %s\n", self->filename.c_str(), p.get().toString().c_str()); - if(p.get().newPageID == invalidLogicalPageID) { - debug_printf("DWALPager(%s) undoRemaps freeing %s\n", self->filename.c_str(), p.get().toString().c_str()); + if (p.get().newPageID == invalidLogicalPageID) { + debug_printf("DWALPager(%s) undoRemaps freeing %s\n", self->filename.c_str(), + p.get().toString().c_str()); self->freePage(p.get().originalPageID, p.get().version); - } - else { + } else { // Read the data from the page that the original was mapped to Reference data = wait(self->readPage(p.get().newPageID, false)); @@ -1398,24 +1376,25 @@ public: // Remove the remap from this page, deleting the entry for the pageID if its map becomes empty auto i = self->remappedPages.find(p.get().originalPageID); - if(i->second.size() == 1) { + if (i->second.size() == 1) { self->remappedPages.erase(i); - } - else { + } else { i->second.erase(p.get().version); } - // Now that the remap has been undone nothing will read this page so it can be freed as of the next commit. + // Now that the remap has been undone nothing will read this page so it can be freed as of the next + // commit. 
self->freePage(p.get().newPageID, 0); } } - debug_printf("DWALPager(%s) undoRemaps stopped, remapQueue size is %d\n", self->filename.c_str(), self->remapQueue.numEntries); + debug_printf("DWALPager(%s) undoRemaps stopped, remapQueue size is %d\n", self->filename.c_str(), + self->remapQueue.numEntries); return Void(); } // Flush all queues so they have no operations pending. - ACTOR static Future flushQueues(DWALPager *self) { + ACTOR static Future flushQueues(DWALPager* self) { ASSERT(self->remapUndoFuture.isReady()); // Flush remap queue separately, it's not involved in free page management @@ -1429,7 +1408,7 @@ public: // Once preFlush() returns false for both queues then there are no more operations pending // on either queue. If preFlush() returns true for either queue in one loop execution then // it could have generated new work for itself or the other queue. - if(!freeBusy && !delayedFreeBusy) { + if (!freeBusy && !delayedFreeBusy) { break; } } @@ -1439,7 +1418,7 @@ public: return Void(); } - ACTOR static Future commit_impl(DWALPager *self) { + ACTOR static Future commit_impl(DWALPager* self) { debug_printf("DWALPager(%s) commit begin\n", self->filename.c_str()); // Write old committed header to Page 1 @@ -1461,19 +1440,21 @@ public: debug_printf("DWALPager(%s) Syncing\n", self->filename.c_str()); // Sync everything except the header - if(g_network->getCurrentTask() > TaskPriority::DiskWrite) { + if (g_network->getCurrentTask() > TaskPriority::DiskWrite) { wait(delay(0, TaskPriority::DiskWrite)); } wait(self->pageFile->sync()); - debug_printf("DWALPager(%s) commit version %" PRId64 " sync 1\n", self->filename.c_str(), self->pHeader->committedVersion); + debug_printf("DWALPager(%s) commit version %" PRId64 " sync 1\n", self->filename.c_str(), + self->pHeader->committedVersion); // Update header on disk and sync again. 
wait(self->writeHeaderPage(0, self->headerPage)); - if(g_network->getCurrentTask() > TaskPriority::DiskWrite) { + if (g_network->getCurrentTask() > TaskPriority::DiskWrite) { wait(delay(0, TaskPriority::DiskWrite)); } wait(self->pageFile->sync()); - debug_printf("DWALPager(%s) commit version %" PRId64 " sync 2\n", self->filename.c_str(), self->pHeader->committedVersion); + debug_printf("DWALPager(%s) commit version %" PRId64 " sync 2\n", self->filename.c_str(), + self->pHeader->committedVersion); // Update the last committed header for use in the next commit. self->updateCommittedHeader(); @@ -1497,19 +1478,13 @@ public: return commitFuture; } - Key getMetaKey() const override { - return pHeader->getMetaKey(); - } + Key getMetaKey() const override { return pHeader->getMetaKey(); } - void setCommitVersion(Version v) override { - pHeader->committedVersion = v; - } + void setCommitVersion(Version v) override { pHeader->committedVersion = v; } - void setMetaKey(KeyRef metaKey) override { - pHeader->setMetaKey(metaKey); - } - - ACTOR void shutdown(DWALPager *self, bool dispose) { + void setMetaKey(KeyRef metaKey) override { pHeader->setMetaKey(metaKey); } + + ACTOR void shutdown(DWALPager* self, bool dispose) { debug_printf("DWALPager(%s) shutdown cancel recovery\n", self->filename.c_str()); self->recoverFuture.cancel(); debug_printf("DWALPager(%s) shutdown cancel commit\n", self->filename.c_str()); @@ -1517,9 +1492,9 @@ public: debug_printf("DWALPager(%s) shutdown cancel remap\n", self->filename.c_str()); self->remapUndoFuture.cancel(); - if(self->errorPromise.canBeSet()) { + if (self->errorPromise.canBeSet()) { debug_printf("DWALPager(%s) shutdown sending error\n", self->filename.c_str()); - self->errorPromise.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress + self->errorPromise.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress } // Must wait for pending operations to complete, canceling them can cause a crash 
because the underlying @@ -1532,7 +1507,7 @@ public: // Unreference the file and clear self->pageFile.clear(); - if(dispose) { + if (dispose) { debug_printf("DWALPager(%s) shutdown deleting file\n", self->filename.c_str()); wait(IAsyncFileSystem::filesystem()->incrementalDeleteFile(self->filename, true)); } @@ -1541,21 +1516,13 @@ public: delete self; } - void dispose() override { - shutdown(this, true); - } + void dispose() override { shutdown(this, true); } - void close() override { - shutdown(this, false); - } + void close() override { shutdown(this, false); } - Future getError() override { - return errorPromise.getFuture(); - } - - Future onClosed() override { - return closedPromise.getFuture(); - } + Future getError() override { return errorPromise.getFuture(); } + + Future onClosed() override { return closedPromise.getFuture(); } StorageBytes getStorageBytes() override { ASSERT(recoverFuture.isReady()); @@ -1564,41 +1531,42 @@ public: g_network->getDiskBytes(parentDirectory(filename), free, total); int64_t pagerSize = pHeader->pageCount * physicalPageSize; - // It is not exactly known how many pages on the delayed free list are usable as of right now. It could be known, - // if each commit delayed entries that were freeable were shuffled from the delayed free queue to the free queue, - // but this doesn't seem necessary. + // It is not exactly known how many pages on the delayed free list are usable as of right now. It could be + // known, if each commit delayed entries that were freeable were shuffled from the delayed free queue to the + // free queue, but this doesn't seem necessary. 
int64_t reusable = (freeList.numEntries + delayedFreeList.numEntries) * physicalPageSize; return StorageBytes(free, total, pagerSize - reusable, free + reusable); } - ACTOR static Future getUserPageCount_cleanup(DWALPager *self) { + ACTOR static Future getUserPageCount_cleanup(DWALPager* self) { // Wait for the remap eraser to finish all of its work (not triggering stop) wait(self->remapUndoFuture); // Flush queues so there are no pending freelist operations wait(flushQueues(self)); - + return Void(); } // Get the number of pages in use by the pager's user Future getUserPageCount() override { return map(getUserPageCount_cleanup(this), [=](Void) { - int64_t userPages = pHeader->pageCount - 2 - freeList.numPages - freeList.numEntries - delayedFreeList.numPages - delayedFreeList.numEntries - remapQueue.numPages; - debug_printf("DWALPager(%s) userPages=%" PRId64 " totalPageCount=%" PRId64 " freeQueuePages=%" PRId64 " freeQueueCount=%" PRId64 " delayedFreeQueuePages=%" PRId64 " delayedFreeQueueCount=%" PRId64 " remapQueuePages=%" PRId64 " remapQueueCount=%" PRId64 "\n", - filename.c_str(), userPages, pHeader->pageCount, freeList.numPages, freeList.numEntries, delayedFreeList.numPages, delayedFreeList.numEntries, remapQueue.numPages, remapQueue.numEntries); + int64_t userPages = pHeader->pageCount - 2 - freeList.numPages - freeList.numEntries - + delayedFreeList.numPages - delayedFreeList.numEntries - remapQueue.numPages; + debug_printf("DWALPager(%s) userPages=%" PRId64 " totalPageCount=%" PRId64 " freeQueuePages=%" PRId64 + " freeQueueCount=%" PRId64 " delayedFreeQueuePages=%" PRId64 " delayedFreeQueueCount=%" PRId64 + " remapQueuePages=%" PRId64 " remapQueueCount=%" PRId64 "\n", + filename.c_str(), userPages, pHeader->pageCount, freeList.numPages, freeList.numEntries, + delayedFreeList.numPages, delayedFreeList.numEntries, remapQueue.numPages, + remapQueue.numEntries); return userPages; }); } - Future init() override { - return recoverFuture; - } + Future init() 
override { return recoverFuture; } - Version getLatestVersion() override { - return pLastCommittedHeader->committedVersion; - } + Version getLatestVersion() override { return pLastCommittedHeader->committedVersion; } private: ~DWALPager() {} @@ -1617,12 +1585,10 @@ private: FIFOQueue::QueueState delayedFreeList; FIFOQueue::QueueState remapQueue; Version committedVersion; - Version oldestVersion; + Version oldestVersion; int32_t metaKeySize; - KeyRef getMetaKey() const { - return KeyRef((const uint8_t *)(this + 1), metaKeySize); - } + KeyRef getMetaKey() const { return KeyRef((const uint8_t*)(this + 1), metaKeySize); } void setMetaKey(StringRef key) { ASSERT(key.size() < (smallestPhysicalBlock - sizeof(Header))); @@ -1632,9 +1598,7 @@ private: } } - int size() const { - return sizeof(Header) + metaKeySize; - } + int size() const { return sizeof(Header) + metaKeySize; } private: Header(); @@ -1645,26 +1609,18 @@ private: Future> readFuture; Future writeFuture; - bool initialized() const { - return readFuture.isValid(); - } + bool initialized() const { return readFuture.isValid(); } - bool reading() const { - return !readFuture.isReady(); - } + bool reading() const { return !readFuture.isReady(); } - bool writing() const { - return !writeFuture.isReady(); - } + bool writing() const { return !writeFuture.isReady(); } bool evictable() const { // Don't evict if a page is still being read or written return !reading() && !writing(); } - Future onEvictable() const { - return ready(readFuture) && writeFuture; - } + Future onEvictable() const { return ready(readFuture) && writeFuture; } }; // Physical page sizes will always be a multiple of 4k because AsyncFileNonDurable requires @@ -1672,18 +1628,18 @@ private: // Allowing a smaller 'logical' page size is very useful for testing. 
static constexpr int smallestPhysicalBlock = 4096; int physicalPageSize; - int logicalPageSize; // In simulation testing it can be useful to use a small logical page size + int logicalPageSize; // In simulation testing it can be useful to use a small logical page size int64_t pageCacheBytes; // The header will be written to / read from disk as a smallestPhysicalBlock sized chunk. Reference headerPage; - Header *pHeader; + Header* pHeader; int desiredPageSize; Reference lastCommittedHeaderPage; - Header *pLastCommittedHeader; + Header* pLastCommittedHeader; std::string filename; @@ -1691,7 +1647,7 @@ private: PageCacheT pageCache; Promise closedPromise; - Promise errorPromise; + Promise errorPromise; Future commitFuture; SignalableActorCollection operations; Future recoverFuture; @@ -1715,13 +1671,9 @@ private: }; struct SnapshotEntryLessThanVersion { - bool operator() (Version v, const SnapshotEntry &snapshot) { - return v < snapshot.version; - } + bool operator()(Version v, const SnapshotEntry& snapshot) { return v < snapshot.version; } - bool operator() (const SnapshotEntry &snapshot, Version v) { - return snapshot.version < v; - } + bool operator()(const SnapshotEntry& snapshot, Version v) { return snapshot.version < v; } }; // TODO: Better data structure @@ -1733,46 +1685,38 @@ private: // Prevents pager from reusing freed pages from version until the snapshot is destroyed class DWALPagerSnapshot : public IPagerSnapshot, public ReferenceCounted { public: - DWALPagerSnapshot(DWALPager *pager, Key meta, Version version, Future expiredFuture) : pager(pager), metaKey(meta), version(version), expired(expiredFuture) { - } - virtual ~DWALPagerSnapshot() { - } + DWALPagerSnapshot(DWALPager* pager, Key meta, Version version, Future expiredFuture) + : pager(pager), metaKey(meta), version(version), expired(expiredFuture) {} + virtual ~DWALPagerSnapshot() {} Future> getPhysicalPage(LogicalPageID pageID, bool cacheable, bool noHit) override { - if(expired.isError()) { + if 
(expired.isError()) { throw expired.getError(); } - return map(pager->readPageAtVersion(pageID, version, cacheable, noHit), [=](Reference p) { - return Reference(p); - }); + return map(pager->readPageAtVersion(pageID, version, cacheable, noHit), + [=](Reference p) { return Reference(p); }); } - Key getMetaKey() const override { - return metaKey; - } + Key getMetaKey() const override { return metaKey; } - Version getVersion() const override { - return version; - } + Version getVersion() const override { return version; } - void addref() override { - ReferenceCounted::addref(); - } + void addref() override { ReferenceCounted::addref(); } - void delref() override { - ReferenceCounted::delref(); - } + void delref() override { ReferenceCounted::delref(); } - DWALPager *pager; + DWALPager* pager; Future expired; Version version; Key metaKey; }; void DWALPager::expireSnapshots(Version v) { - debug_printf("DWALPager(%s) expiring snapshots through %" PRId64 " snapshot count %d\n", filename.c_str(), v, (int)snapshots.size()); - while(snapshots.size() > 1 && snapshots.front().version < v && snapshots.front().snapshot->isSoleOwner()) { - debug_printf("DWALPager(%s) expiring snapshot for %" PRId64 " soleOwner=%d\n", filename.c_str(), snapshots.front().version, snapshots.front().snapshot->isSoleOwner()); + debug_printf("DWALPager(%s) expiring snapshots through %" PRId64 " snapshot count %d\n", filename.c_str(), v, + (int)snapshots.size()); + while (snapshots.size() > 1 && snapshots.front().version < v && snapshots.front().snapshot->isSoleOwner()) { + debug_printf("DWALPager(%s) expiring snapshot for %" PRId64 " soleOwner=%d\n", filename.c_str(), + snapshots.front().version, snapshots.front().snapshot->isSoleOwner()); // The snapshot contract could be made such that the expired promise isn't need anymore. 
In practice it // probably is already not needed but it will gracefully handle the case where a user begins a page read // with a snapshot reference, keeps the page read future, and drops the snapshot reference. @@ -1785,7 +1729,7 @@ Reference DWALPager::getReadSnapshot(Version v) { ASSERT(!snapshots.empty()); auto i = std::upper_bound(snapshots.begin(), snapshots.end(), v, SnapshotEntryLessThanVersion()); - if(i == snapshots.begin()) { + if (i == snapshots.begin()) { throw version_invalid(); } --i; @@ -1794,35 +1738,30 @@ Reference DWALPager::getReadSnapshot(Version v) { void DWALPager::addLatestSnapshot() { Promise expired; - snapshots.push_back({ - pLastCommittedHeader->committedVersion, - expired, - Reference(new DWALPagerSnapshot(this, pLastCommittedHeader->getMetaKey(), pLastCommittedHeader->committedVersion, expired.getFuture())) - }); + snapshots.push_back({ pLastCommittedHeader->committedVersion, expired, + Reference(new DWALPagerSnapshot(this, pLastCommittedHeader->getMetaKey(), + pLastCommittedHeader->committedVersion, + expired.getFuture())) }); } - // TODO: Move this to a flow header once it is mature. 
struct SplitStringRef { StringRef a; StringRef b; - SplitStringRef(StringRef a = StringRef(), StringRef b = StringRef()) : a(a), b(b) { - } + SplitStringRef(StringRef a = StringRef(), StringRef b = StringRef()) : a(a), b(b) {} - SplitStringRef(Arena &arena, const SplitStringRef &toCopy) - : a(toStringRef(arena)), b() { - } + SplitStringRef(Arena& arena, const SplitStringRef& toCopy) : a(toStringRef(arena)), b() {} SplitStringRef prefix(int len) const { - if(len <= a.size()) { + if (len <= a.size()) { return SplitStringRef(a.substr(0, len)); } len -= a.size(); return SplitStringRef(a, b.substr(0, len)); } - StringRef toStringRef(Arena &arena) const { + StringRef toStringRef(Arena& arena) const { StringRef c = makeString(size(), arena); memcpy(mutateString(c), a.begin(), a.size()); memcpy(mutateString(c) + a.size(), b.begin(), b.size()); @@ -1834,82 +1773,66 @@ struct SplitStringRef { return Standalone(toStringRef(a), a); } - int size() const { - return a.size() + b.size(); - } + int size() const { return a.size() + b.size(); } - int expectedSize() const { - return size(); - } + int expectedSize() const { return size(); } - std::string toString() const { - return format("%s%s", a.toString().c_str(), b.toString().c_str()); - } + std::string toString() const { return format("%s%s", a.toString().c_str(), b.toString().c_str()); } - std::string toHexString() const { - return format("%s%s", a.toHexString().c_str(), b.toHexString().c_str()); - } + std::string toHexString() const { return format("%s%s", a.toHexString().c_str(), b.toHexString().c_str()); } struct const_iterator { - const uint8_t *ptr; - const uint8_t *end; - const uint8_t *next; + const uint8_t* ptr; + const uint8_t* end; + const uint8_t* next; - inline bool operator==(const const_iterator &rhs) const { - return ptr == rhs.ptr; - } + inline bool operator==(const const_iterator& rhs) const { return ptr == rhs.ptr; } - inline const_iterator & operator++() { + inline const_iterator& operator++() { ++ptr; - 
if(ptr == end) { + if (ptr == end) { ptr = next; } return *this; } - inline const_iterator & operator+(int n) { + inline const_iterator& operator+(int n) { ptr += n; - if(ptr >= end) { + if (ptr >= end) { ptr = next + (ptr - end); } return *this; } - inline uint8_t operator *() const { - return *ptr; - } + inline uint8_t operator*() const { return *ptr; } }; - inline const_iterator begin() const { - return {a.begin(), a.end(), b.begin()}; - } + inline const_iterator begin() const { return { a.begin(), a.end(), b.begin() }; } - inline const_iterator end() const { - return {b.end()}; - } + inline const_iterator end() const { return { b.end() }; } - template - int compare(const StringT &rhs) const { + template + int compare(const StringT& rhs) const { auto j = begin(); auto k = rhs.begin(); auto jEnd = end(); auto kEnd = rhs.end(); - while(j != jEnd && k != kEnd) { + while (j != jEnd && k != kEnd) { int cmp = *j - *k; - if(cmp != 0) { + if (cmp != 0) { return cmp; } } - // If we've reached the end of *this, then values are equal if rhs is also exhausted, otherwise *this is less than rhs - if(j == jEnd) { + // If we've reached the end of *this, then values are equal if rhs is also exhausted, otherwise *this is less + // than rhs + if (j == jEnd) { return k == kEnd ? 0 : -1; } return 1; } - }; // A BTree "page id" is actually a list of LogicalPageID's whose contents should be concatenated together. 
@@ -1925,45 +1848,37 @@ struct RedwoodRecordRef { typedef uint8_t byte; RedwoodRecordRef(KeyRef key = KeyRef(), Version ver = 0, Optional value = {}) - : key(key), version(ver), value(value) - { - } + : key(key), version(ver), value(value) {} - RedwoodRecordRef(Arena &arena, const RedwoodRecordRef &toCopy) - : key(arena, toCopy.key), version(toCopy.version) - { - if(toCopy.value.present()) { + RedwoodRecordRef(Arena& arena, const RedwoodRecordRef& toCopy) : key(arena, toCopy.key), version(toCopy.version) { + if (toCopy.value.present()) { value = ValueRef(arena, toCopy.value.get()); } } - KeyValueRef toKeyValueRef() const { - return KeyValueRef(key, value.get()); - } + KeyValueRef toKeyValueRef() const { return KeyValueRef(key, value.get()); } // RedwoodRecordRefs are used for both internal and leaf pages of the BTree. // Boundary records in internal pages are made from leaf records. // These functions make creating and working with internal page records more convenient. inline BTreePageID getChildPage() const { ASSERT(value.present()); - return BTreePageID((LogicalPageID *)value.get().begin(), value.get().size() / sizeof(LogicalPageID)); + return BTreePageID((LogicalPageID*)value.get().begin(), value.get().size() / sizeof(LogicalPageID)); } inline void setChildPage(BTreePageID id) { - value = ValueRef((const uint8_t *)id.begin(), id.size() * sizeof(LogicalPageID)); + value = ValueRef((const uint8_t*)id.begin(), id.size() * sizeof(LogicalPageID)); } - inline void setChildPage(Arena &arena, BTreePageID id) { - value = ValueRef(arena, (const uint8_t *)id.begin(), id.size() * sizeof(LogicalPageID)); + inline void setChildPage(Arena& arena, BTreePageID id) { + value = ValueRef(arena, (const uint8_t*)id.begin(), id.size() * sizeof(LogicalPageID)); } inline RedwoodRecordRef withPageID(BTreePageID id) const { - return RedwoodRecordRef(key, version, ValueRef((const uint8_t *)id.begin(), id.size() * sizeof(LogicalPageID))); + return RedwoodRecordRef(key, version, 
ValueRef((const uint8_t*)id.begin(), id.size() * sizeof(LogicalPageID))); } - inline RedwoodRecordRef withoutValue() const { - return RedwoodRecordRef(key, version); - } + inline RedwoodRecordRef withoutValue() const { return RedwoodRecordRef(key, version); } // Truncate (key, version, part) tuple to len bytes. void truncate(int len) { @@ -1973,32 +1888,34 @@ struct RedwoodRecordRef { } // Find the common key prefix between two records, assuming that the first skipLen bytes are the same - inline int getCommonPrefixLen(const RedwoodRecordRef &other, int skipLen = 0) const { + inline int getCommonPrefixLen(const RedwoodRecordRef& other, int skipLen = 0) const { int skipStart = std::min(skipLen, key.size()); - return skipStart + commonPrefixLength(key.begin() + skipStart, other.key.begin() + skipStart, std::min(other.key.size(), key.size()) - skipStart); + return skipStart + commonPrefixLength(key.begin() + skipStart, other.key.begin() + skipStart, + std::min(other.key.size(), key.size()) - skipStart); } // Compares and orders by key, version, chunk.total, chunk.start, value // This is the same order that delta compression uses for prefix borrowing - int compare(const RedwoodRecordRef &rhs, int skip = 0) const { + int compare(const RedwoodRecordRef& rhs, int skip = 0) const { int keySkip = std::min(skip, key.size()); int cmp = key.substr(keySkip).compare(rhs.key.substr(keySkip)); - if(cmp == 0) { + if (cmp == 0) { cmp = version - rhs.version; - if(cmp == 0) { + if (cmp == 0) { cmp = value.compare(rhs.value); } } return cmp; } - bool sameUserKey(const StringRef &k, int skipLen) const { - // Keys are the same if the sizes are the same and either the skipLen is longer or the non-skipped suffixes are the same. + bool sameUserKey(const StringRef& k, int skipLen) const { + // Keys are the same if the sizes are the same and either the skipLen is longer or the non-skipped suffixes are + // the same. 
return (key.size() == k.size()) && (key.substr(skipLen) == k.substr(skipLen)); } - bool sameExceptValue(const RedwoodRecordRef &rhs, int skipLen = 0) const { + bool sameExceptValue(const RedwoodRecordRef& rhs, int skipLen = 0) const { return sameUserKey(rhs.key, skipLen) && version == rhs.version; } @@ -2007,15 +1924,13 @@ struct RedwoodRecordRef { Optional value; Version version; - int expectedSize() const { - return key.expectedSize() + value.expectedSize(); - } + int expectedSize() const { return key.expectedSize() + value.expectedSize(); } class Reader { public: - Reader(const void *ptr) : rptr((const byte *)ptr) {} + Reader(const void* ptr) : rptr((const byte*)ptr) {} - const byte *rptr; + const byte* rptr; StringRef readString(int len) { StringRef s(rptr, len); @@ -2024,7 +1939,7 @@ struct RedwoodRecordRef { } }; -#pragma pack(push,1) +#pragma pack(push, 1) struct Delta { uint8_t flags; @@ -2062,8 +1977,9 @@ struct RedwoodRecordRef { int16_t low; }; - static constexpr int LengthFormatSizes[] = {sizeof(LengthFormat0), sizeof(LengthFormat1), sizeof(LengthFormat2), sizeof(LengthFormat3)}; - static constexpr int VersionDeltaSizes[] = {0, sizeof(int32_t), sizeof(int48_t), sizeof(int64_t)}; + static constexpr int LengthFormatSizes[] = { sizeof(LengthFormat0), sizeof(LengthFormat1), + sizeof(LengthFormat2), sizeof(LengthFormat3) }; + static constexpr int VersionDeltaSizes[] = { 0, sizeof(int32_t), sizeof(int48_t), sizeof(int64_t) }; // Serialized Format // @@ -2077,7 +1993,7 @@ struct RedwoodRecordRef { // // Length fields using 3 to 8 bytes total depending on length fields format // - // Byte strings + // Byte strings // Key suffix bytes // Value bytes // Version delta bytes @@ -2094,72 +2010,79 @@ struct RedwoodRecordRef { static inline int determineLengthFormat(int prefixLength, int suffixLength, int valueLength) { // Large prefix or suffix length, which should be rare, is format 3 - if(prefixLength > 0xFF || suffixLength > 0xFF) { + if (prefixLength > 0xFF || 
suffixLength > 0xFF) { return 3; - } - else if(valueLength < 0x100) { + } else if (valueLength < 0x100) { return 0; - } - else if(valueLength < 0x10000) { + } else if (valueLength < 0x10000) { return 1; - } - else { + } else { return 2; } } // Large prefix or suffix length, which should be rare, is format 3 - byte * data() const { - switch(flags & LENGTHS_FORMAT) { - case 0: return (byte *)(&LengthFormat0 + 1); - case 1: return (byte *)(&LengthFormat1 + 1); - case 2: return (byte *)(&LengthFormat2 + 1); - case 3: - default: return (byte *)(&LengthFormat3 + 1); + byte* data() const { + switch (flags & LENGTHS_FORMAT) { + case 0: + return (byte*)(&LengthFormat0 + 1); + case 1: + return (byte*)(&LengthFormat1 + 1); + case 2: + return (byte*)(&LengthFormat2 + 1); + case 3: + default: + return (byte*)(&LengthFormat3 + 1); } } int getKeyPrefixLength() const { - switch(flags & LENGTHS_FORMAT) { - case 0: return LengthFormat0.prefixLength; - case 1: return LengthFormat1.prefixLength; - case 2: return LengthFormat2.prefixLength; - case 3: - default: return LengthFormat3.prefixLength; + switch (flags & LENGTHS_FORMAT) { + case 0: + return LengthFormat0.prefixLength; + case 1: + return LengthFormat1.prefixLength; + case 2: + return LengthFormat2.prefixLength; + case 3: + default: + return LengthFormat3.prefixLength; } } int getKeySuffixLength() const { - switch(flags & LENGTHS_FORMAT) { - case 0: return LengthFormat0.suffixLength; - case 1: return LengthFormat1.suffixLength; - case 2: return LengthFormat2.suffixLength; - case 3: - default: return LengthFormat3.suffixLength; + switch (flags & LENGTHS_FORMAT) { + case 0: + return LengthFormat0.suffixLength; + case 1: + return LengthFormat1.suffixLength; + case 2: + return LengthFormat2.suffixLength; + case 3: + default: + return LengthFormat3.suffixLength; } } int getValueLength() const { - switch(flags & LENGTHS_FORMAT) { - case 0: return LengthFormat0.valueLength; - case 1: return LengthFormat1.valueLength; - case 2: return 
LengthFormat2.valueLength; - case 3: - default: return LengthFormat3.valueLength; + switch (flags & LENGTHS_FORMAT) { + case 0: + return LengthFormat0.valueLength; + case 1: + return LengthFormat1.valueLength; + case 2: + return LengthFormat2.valueLength; + case 3: + default: + return LengthFormat3.valueLength; } } - StringRef getKeySuffix() const { - return StringRef(data(), getKeySuffixLength()); - } + StringRef getKeySuffix() const { return StringRef(data(), getKeySuffixLength()); } - StringRef getValue() const { - return StringRef(data() + getKeySuffixLength(), getValueLength()); - } + StringRef getValue() const { return StringRef(data() + getKeySuffixLength(), getValueLength()); } - bool hasVersion() const { - return flags & HAS_VERSION; - } + bool hasVersion() const { return flags & HAS_VERSION; } int getVersionDeltaSizeBytes() const { int code = (flags & VERSION_DELTA_SIZE) >> 2; @@ -2167,84 +2090,75 @@ struct RedwoodRecordRef { } static int getVersionDeltaSizeBytes(Version d) { - if(d == 0) { + if (d == 0) { return 0; - } - else if(d == (int32_t)d) { + } else if (d == (int32_t)d) { return sizeof(int32_t); - } - else if(d == (d & int48_t::MASK)) { + } else if (d == (d & int48_t::MASK)) { return sizeof(int48_t); } return sizeof(int64_t); } - int getVersionDelta(const uint8_t *r) const { + int getVersionDelta(const uint8_t* r) const { int code = (flags & VERSION_DELTA_SIZE) >> 2; - switch(code) { - case 0: return 0; - case 1: return *(int32_t *)r; - case 2: return (((int64_t)((int48_t *)r)->high) << 16) | (((int48_t *)r)->low & 0xFFFF); - case 3: - default: return *(int64_t *)r; + switch (code) { + case 0: + return 0; + case 1: + return *(int32_t*)r; + case 2: + return (((int64_t)((int48_t*)r)->high) << 16) | (((int48_t*)r)->low & 0xFFFF); + case 3: + default: + return *(int64_t*)r; } } // Version delta size should be 0 before calling - int setVersionDelta(Version d, uint8_t *w) { + int setVersionDelta(Version d, uint8_t* w) { flags |= HAS_VERSION; - if(d == 
0) { + if (d == 0) { return 0; - } - else if(d == (int32_t)d) { + } else if (d == (int32_t)d) { flags |= 1 << 2; - *(uint32_t *)w = d; + *(uint32_t*)w = d; return sizeof(uint32_t); - } - else if(d == (d & int48_t::MASK)) { + } else if (d == (d & int48_t::MASK)) { flags |= 2 << 2; - ((int48_t *)w)->high = d >> 16; - ((int48_t *)w)->low = d; + ((int48_t*)w)->high = d >> 16; + ((int48_t*)w)->low = d; return sizeof(int48_t); - } - else { + } else { flags |= 3 << 2; - *(int64_t *)w = d; + *(int64_t*)w = d; return sizeof(int64_t); } } - bool hasValue() const { - return flags & HAS_VALUE; - } + bool hasValue() const { return flags & HAS_VALUE; } void setPrefixSource(bool val) { - if(val) { + if (val) { flags |= PREFIX_SOURCE_PREV; - } - else { + } else { flags &= ~PREFIX_SOURCE_PREV; } } - bool getPrefixSource() const { - return flags & PREFIX_SOURCE_PREV; - } + bool getPrefixSource() const { return flags & PREFIX_SOURCE_PREV; } void setDeleted(bool val) { - if(val) { + if (val) { flags |= IS_DELETED; - } - else { + } else { flags &= ~IS_DELETED; } } - bool getDeleted() const { - return flags & IS_DELETED; - } + bool getDeleted() const { return flags & IS_DELETED; } - RedwoodRecordRef apply(const RedwoodRecordRef &base, Arena &arena) const { + RedwoodRecordRef apply(const RedwoodRecordRef& base, Arena& arena) const { int keyPrefixLen = getKeyPrefixLength(); int keySuffixLen = getKeySuffixLength(); int valueLen = hasValue() ? 
getValueLength() : 0; @@ -2253,24 +2167,23 @@ struct RedwoodRecordRef { Reader r(data()); // If there is a key suffix, reconstitute the complete key into a contiguous string - if(keySuffixLen > 0) { + if (keySuffixLen > 0) { StringRef keySuffix = r.readString(keySuffixLen); k = makeString(keyPrefixLen + keySuffixLen, arena); memcpy(mutateString(k), base.key.begin(), keyPrefixLen); memcpy(mutateString(k) + keyPrefixLen, keySuffix.begin(), keySuffixLen); - } - else { + } else { // Otherwise just reference the base key's memory k = base.key.substr(0, keyPrefixLen); } Optional value; - if(hasValue()) { + if (hasValue()) { value = r.readString(valueLen); } Version v = 0; - if(hasVersion()) { + if (hasVersion()) { v = base.version + getVersionDelta(r.rptr); } @@ -2279,27 +2192,31 @@ struct RedwoodRecordRef { int size() const { int size = 1 + getVersionDeltaSizeBytes(); - switch(flags & LENGTHS_FORMAT) { - case 0: return size + sizeof(LengthFormat0) + LengthFormat0.suffixLength + LengthFormat0.valueLength; - case 1: return size + sizeof(LengthFormat1) + LengthFormat1.suffixLength + LengthFormat1.valueLength; - case 2: return size + sizeof(LengthFormat2) + LengthFormat2.suffixLength + LengthFormat2.valueLength; - case 3: - default: return size + sizeof(LengthFormat3) + LengthFormat3.suffixLength + LengthFormat3.valueLength; + switch (flags & LENGTHS_FORMAT) { + case 0: + return size + sizeof(LengthFormat0) + LengthFormat0.suffixLength + LengthFormat0.valueLength; + case 1: + return size + sizeof(LengthFormat1) + LengthFormat1.suffixLength + LengthFormat1.valueLength; + case 2: + return size + sizeof(LengthFormat2) + LengthFormat2.suffixLength + LengthFormat2.valueLength; + case 3: + default: + return size + sizeof(LengthFormat3) + LengthFormat3.suffixLength + LengthFormat3.valueLength; } } std::string toString() const { std::string flagString = " "; - if(flags & PREFIX_SOURCE_PREV) { + if (flags & PREFIX_SOURCE_PREV) { flagString += "PrefixSource|"; } - if(flags & 
IS_DELETED) { + if (flags & IS_DELETED) { flagString += "IsDeleted|"; } - if(hasValue()) { + if (hasValue()) { flagString += "HasValue|"; } - if(hasVersion()) { + if (hasVersion()) { flagString += "HasVersion|"; } int lengthFormat = flags & LENGTHS_FORMAT; @@ -2309,18 +2226,20 @@ struct RedwoodRecordRef { int keySuffixLen = getKeySuffixLength(); int valueLen = getValueLength(); - return format("lengthFormat: %d totalDeltaSize: %d flags: %s prefixLen: %d keySuffixLen: %d versionDeltaSizeBytes: %d valueLen %d raw: %s", - lengthFormat, size(), flagString.c_str(), prefixLen, keySuffixLen, getVersionDeltaSizeBytes(), valueLen, StringRef((const uint8_t *)this, size()).toHexString().c_str()); + return format("lengthFormat: %d totalDeltaSize: %d flags: %s prefixLen: %d keySuffixLen: %d " + "versionDeltaSizeBytes: %d valueLen %d raw: %s", + lengthFormat, size(), flagString.c_str(), prefixLen, keySuffixLen, getVersionDeltaSizeBytes(), + valueLen, StringRef((const uint8_t*)this, size()).toHexString().c_str()); } }; // Using this class as an alternative for Delta enables reading a DeltaTree while only decoding // its values, so the Reader does not require the original prev/next ancestors. 
struct DeltaValueOnly : Delta { - RedwoodRecordRef apply(const RedwoodRecordRef &base, Arena &arena) const { + RedwoodRecordRef apply(const RedwoodRecordRef& base, Arena& arena) const { Optional value; - if(hasValue()) { + if (hasValue()) { value = getValue(); } @@ -2329,43 +2248,30 @@ struct RedwoodRecordRef { }; #pragma pack(pop) - bool operator==(const RedwoodRecordRef &rhs) const { - return compare(rhs) == 0; - } + bool operator==(const RedwoodRecordRef& rhs) const { return compare(rhs) == 0; } - bool operator!=(const RedwoodRecordRef &rhs) const { - return compare(rhs) != 0; - } + bool operator!=(const RedwoodRecordRef& rhs) const { return compare(rhs) != 0; } - bool operator<(const RedwoodRecordRef &rhs) const { - return compare(rhs) < 0; - } + bool operator<(const RedwoodRecordRef& rhs) const { return compare(rhs) < 0; } - bool operator>(const RedwoodRecordRef &rhs) const { - return compare(rhs) > 0; - } + bool operator>(const RedwoodRecordRef& rhs) const { return compare(rhs) > 0; } - bool operator<=(const RedwoodRecordRef &rhs) const { - return compare(rhs) <= 0; - } + bool operator<=(const RedwoodRecordRef& rhs) const { return compare(rhs) <= 0; } - bool operator>=(const RedwoodRecordRef &rhs) const { - return compare(rhs) >= 0; - } + bool operator>=(const RedwoodRecordRef& rhs) const { return compare(rhs) >= 0; } // Worst case overhead means to assu - int deltaSize(const RedwoodRecordRef &base, int skipLen, bool worstCaseOverhead) const { + int deltaSize(const RedwoodRecordRef& base, int skipLen, bool worstCaseOverhead) const { int prefixLen = getCommonPrefixLen(base, skipLen); int keySuffixLen = key.size() - prefixLen; int valueLen = value.present() ? value.get().size() : 0; int formatType; int versionBytes; - if(worstCaseOverhead) { + if (worstCaseOverhead) { formatType = Delta::determineLengthFormat(key.size(), key.size(), valueLen); versionBytes = version == 0 ? 
0 : Delta::getVersionDeltaSizeBytes(version << 1); - } - else { + } else { formatType = Delta::determineLengthFormat(prefixLen, keySuffixLen, valueLen); versionBytes = version == 0 ? 0 : Delta::getVersionDeltaSizeBytes(version - base.version); } @@ -2374,10 +2280,10 @@ struct RedwoodRecordRef { } // commonPrefix between *this and base can be passed if known - int writeDelta(Delta &d, const RedwoodRecordRef &base, int keyPrefixLen = -1) const { + int writeDelta(Delta& d, const RedwoodRecordRef& base, int keyPrefixLen = -1) const { d.flags = value.present() ? Delta::HAS_VALUE : 0; - if(keyPrefixLen < 0) { + if (keyPrefixLen < 0) { keyPrefixLen = getCommonPrefixLen(base, 0); } @@ -2387,35 +2293,51 @@ struct RedwoodRecordRef { int formatType = Delta::determineLengthFormat(keyPrefixLen, keySuffix.size(), valueLen); d.flags |= formatType; - switch(formatType) { - case 0: d.LengthFormat0.prefixLength = keyPrefixLen; d.LengthFormat0.suffixLength = keySuffix.size(); d.LengthFormat0.valueLength = valueLen; break; - case 1: d.LengthFormat1.prefixLength = keyPrefixLen; d.LengthFormat1.suffixLength = keySuffix.size(); d.LengthFormat1.valueLength = valueLen; break; - case 2: d.LengthFormat2.prefixLength = keyPrefixLen; d.LengthFormat2.suffixLength = keySuffix.size(); d.LengthFormat2.valueLength = valueLen; break; - case 3: - default: d.LengthFormat3.prefixLength = keyPrefixLen; d.LengthFormat3.suffixLength = keySuffix.size(); d.LengthFormat3.valueLength = valueLen; break; + switch (formatType) { + case 0: + d.LengthFormat0.prefixLength = keyPrefixLen; + d.LengthFormat0.suffixLength = keySuffix.size(); + d.LengthFormat0.valueLength = valueLen; + break; + case 1: + d.LengthFormat1.prefixLength = keyPrefixLen; + d.LengthFormat1.suffixLength = keySuffix.size(); + d.LengthFormat1.valueLength = valueLen; + break; + case 2: + d.LengthFormat2.prefixLength = keyPrefixLen; + d.LengthFormat2.suffixLength = keySuffix.size(); + d.LengthFormat2.valueLength = valueLen; + break; + case 3: + 
default: + d.LengthFormat3.prefixLength = keyPrefixLen; + d.LengthFormat3.suffixLength = keySuffix.size(); + d.LengthFormat3.valueLength = valueLen; + break; } - uint8_t *wptr = d.data(); + uint8_t* wptr = d.data(); // Write key suffix string wptr = keySuffix.copyTo(wptr); // Write value bytes - if(value.present()) { + if (value.present()) { wptr = value.get().copyTo(wptr); } - if(version != 0) { + if (version != 0) { wptr += d.setVersionDelta(version - base.version, wptr); } - return wptr - (uint8_t *)&d; + return wptr - (uint8_t*)&d; } static std::string kvformat(StringRef s, int hexLimit = -1) { bool hex = false; - for(auto c : s) { - if(!isprint(c)) { + for (auto c : s) { + if (!isprint(c)) { hex = true; break; } @@ -2427,15 +2349,13 @@ struct RedwoodRecordRef { std::string toString(bool leaf = true) const { std::string r; r += format("'%s'@%" PRId64 " => ", kvformat(key).c_str(), version); - if(value.present()) { - if(leaf) { + if (value.present()) { + if (leaf) { r += format("'%s'", kvformat(value.get()).c_str()); - } - else { + } else { r += format("[%s]", ::toString(getChildPage()).c_str()); } - } - else { + } else { r += "(absent)"; } return r; @@ -2446,7 +2366,7 @@ struct BTreePage { typedef DeltaTree BinaryTree; typedef DeltaTree ValueTree; -#pragma pack(push,1) +#pragma pack(push, 1) struct { uint8_t height; uint32_t kvBytes; @@ -2454,33 +2374,27 @@ struct BTreePage { #pragma pack(pop) int size() const { - const BinaryTree *t = &tree(); - return (uint8_t *)t - (uint8_t *)this + t->size(); + const BinaryTree* t = &tree(); + return (uint8_t*)t - (uint8_t*)this + t->size(); } - bool isLeaf() const { - return height == 1; - } + bool isLeaf() const { return height == 1; } - BinaryTree & tree() { - return *(BinaryTree *)(this + 1); - } + BinaryTree& tree() { return *(BinaryTree*)(this + 1); } - const BinaryTree & tree() const { - return *(const BinaryTree *)(this + 1); - } + const BinaryTree& tree() const { return *(const BinaryTree*)(this + 1); } - const 
ValueTree & valueTree() const { - return *(const ValueTree *)(this + 1); - } + const ValueTree& valueTree() const { return *(const ValueTree*)(this + 1); } - std::string toString(bool write, BTreePageID id, Version ver, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound) const { + std::string toString(bool write, BTreePageID id, Version ver, const RedwoodRecordRef* lowerBound, + const RedwoodRecordRef* upperBound) const { std::string r; - r += format("BTreePage op=%s %s @%" PRId64 " ptr=%p height=%d count=%d kvBytes=%d\n lowerBound: %s\n upperBound: %s\n", - write ? "write" : "read", ::toString(id).c_str(), ver, this, height, (int)tree().numItems, (int)kvBytes, - lowerBound->toString(false).c_str(), upperBound->toString(false).c_str()); + r += format("BTreePage op=%s %s @%" PRId64 + " ptr=%p height=%d count=%d kvBytes=%d\n lowerBound: %s\n upperBound: %s\n", + write ? "write" : "read", ::toString(id).c_str(), ver, this, height, (int)tree().numItems, + (int)kvBytes, lowerBound->toString(false).c_str(), upperBound->toString(false).c_str()); try { - if(tree().numItems > 0) { + if (tree().numItems > 0) { // This doesn't use the cached reader for the page but it is only for debugging purposes BinaryTree::Mirror reader(&tree(), lowerBound, upperBound); BinaryTree::Cursor c = reader.getCursor(); @@ -2495,18 +2409,18 @@ struct BTreePage { bool tooLow = c.get().withoutValue() < lowerBound->withoutValue(); bool tooHigh = c.get().withoutValue() >= upperBound->withoutValue(); - if(tooLow || tooHigh) { + if (tooLow || tooHigh) { anyOutOfRange = true; - if(tooLow) { + if (tooLow) { r += " (too low)"; } - if(tooHigh) { + if (tooHigh) { r += " (too high)"; } } r += "\n"; - } while(c.moveNext()); + } while (c.moveNext()); ASSERT(!anyOutOfRange); } } catch (Error& e) { @@ -2520,14 +2434,14 @@ struct BTreePage { }; static void makeEmptyRoot(Reference page) { - BTreePage *btpage = (BTreePage *)page->begin(); + BTreePage* btpage = (BTreePage*)page->begin(); 
btpage->height = 1; btpage->kvBytes = 0; btpage->tree().build(page->size(), nullptr, nullptr, nullptr, nullptr); } -BTreePage::BinaryTree::Cursor getCursor(const Reference &page) { - return ((BTreePage::BinaryTree::Mirror *)page->userData)->getCursor(); +BTreePage::BinaryTree::Cursor getCursor(const Reference& page) { + return ((BTreePage::BinaryTree::Mirror*)page->userData)->getCursor(); } struct BoundaryRefAndPage { @@ -2540,32 +2454,23 @@ struct BoundaryRefAndPage { } }; -#define NOT_IMPLEMENTED { UNSTOPPABLE_ASSERT(false); } +#define NOT_IMPLEMENTED \ + { UNSTOPPABLE_ASSERT(false); } #pragma pack(push, 1) -template +template struct InPlaceArray { SizeT count; - const T * begin() const { - return (T *)(this + 1); - } - - T * begin() { - return (T *)(this + 1); - } + const T* begin() const { return (T*)(this + 1); } - const T * end() const { - return begin() + count; - } - - T * end() { - return begin() + count; - } + T* begin() { return (T*)(this + 1); } - VectorRef get() { - return VectorRef(begin(), count); - } + const T* end() const { return begin() + count; } + + T* end() { return begin() + count; } + + VectorRef get() { return VectorRef(begin(), count); } void set(VectorRef v, int availableSpace) { ASSERT(sizeof(T) * v.size() <= availableSpace); @@ -2573,9 +2478,7 @@ struct InPlaceArray { memcpy(begin(), v.begin(), sizeof(T) * v.size()); } - int extraSize() const { - return count * sizeof(T); - } + int extraSize() const { return count * sizeof(T); } }; #pragma pack(pop) @@ -2590,33 +2493,27 @@ public: Version version; Standalone pageID; - bool operator< (const LazyDeleteQueueEntry &rhs) const { - return version < rhs.version; - } + bool operator<(const LazyDeleteQueueEntry& rhs) const { return version < rhs.version; } - int readFromBytes(const uint8_t *src) { - version = *(Version *)src; + int readFromBytes(const uint8_t* src) { + version = *(Version*)src; src += sizeof(Version); int count = *src++; - pageID = BTreePageID((LogicalPageID *)src, count); + 
pageID = BTreePageID((LogicalPageID*)src, count); return bytesNeeded(); } - int bytesNeeded() const { - return sizeof(Version) + 1 + (pageID.size() * sizeof(LogicalPageID)); - } + int bytesNeeded() const { return sizeof(Version) + 1 + (pageID.size() * sizeof(LogicalPageID)); } - int writeToBytes(uint8_t *dst) const { - *(Version *)dst = version; + int writeToBytes(uint8_t* dst) const { + *(Version*)dst = version; dst += sizeof(Version); *dst++ = pageID.size(); memcpy(dst, pageID.begin(), pageID.size() * sizeof(LogicalPageID)); return bytesNeeded(); } - std::string toString() const { - return format("{%s @%" PRId64 "}", ::toString(pageID).c_str(), version); - } + std::string toString() const { return format("{%s @%" PRId64 "}", ::toString(pageID).c_str(), version); } }; typedef FIFOQueue LazyDeleteQueueT; @@ -2630,9 +2527,7 @@ public: LazyDeleteQueueT::QueueState lazyDeleteQueue; InPlaceArray root; - KeyRef asKeyRef() const { - return KeyRef((uint8_t *)this, sizeof(MetaKey) + root.extraSize()); - } + KeyRef asKeyRef() const { return KeyRef((uint8_t*)this, sizeof(MetaKey) + root.extraSize()); } void fromKeyRef(KeyRef k) { memcpy(this, k.begin(), k.size()); @@ -2640,9 +2535,9 @@ public: } std::string toString() { - return format("{height=%d formatVersion=%d root=%s lazyDeleteQueue=%s}", (int)height, (int)formatVersion, ::toString(root.get()).c_str(), lazyDeleteQueue.toString().c_str()); + return format("{height=%d formatVersion=%d root=%s lazyDeleteQueue=%s}", (int)height, (int)formatVersion, + ::toString(root.get()).c_str(), lazyDeleteQueue.toString().c_str()); } - }; #pragma pack(pop) @@ -2652,9 +2547,7 @@ public: startTime = g_network ? 
now() : 0; } - void clear() { - *this = Counts(); - } + void clear() { *this = Counts(); } int64_t pageReads; int64_t extPageReads; @@ -2675,16 +2568,23 @@ public: double startTime; std::string toString(bool clearAfter = false) { - const char *labels[] = {"set", "clear", "clearSingleKey", "get", "getRange", "commit", "pageReads", "extPageRead", "pagePreloads", "extPagePreloads", "pageWrite", "extPageWrite", "commitPage", "commitPageStart", "pageUpdates"}; - const int64_t values[] = {sets, clears, clearSingleKey, gets, getRanges, commits, pageReads, extPageReads, pagePreloads, extPagePreloads, pageWrites, extPageWrites, commitToPage, commitToPageStart, pageUpdates}; + const char* labels[] = { "set", "clear", "clearSingleKey", "get", + "getRange", "commit", "pageReads", "extPageRead", + "pagePreloads", "extPagePreloads", "pageWrite", "extPageWrite", + "commitPage", "commitPageStart", "pageUpdates" }; + const int64_t values[] = { + sets, clears, clearSingleKey, gets, getRanges, commits, pageReads, + extPageReads, pagePreloads, extPagePreloads, pageWrites, extPageWrites, commitToPage, commitToPageStart, + pageUpdates + }; double elapsed = now() - startTime; std::string s; - for(int i = 0; i < sizeof(values) / sizeof(int64_t); ++i) { + for (int i = 0; i < sizeof(values) / sizeof(int64_t); ++i) { s += format("%s=%" PRId64 " (%d/s) ", labels[i], values[i], int(values[i] / elapsed)); } - if(clearAfter) { + if (clearAfter) { clear(); } @@ -2697,40 +2597,32 @@ public: // All async opts on the btree are based on pager reads, writes, and commits, so // we can mostly forward these next few functions to the pager - Future getError() { - return m_pager->getError(); - } + Future getError() { return m_pager->getError(); } - Future onClosed() { - return m_pager->onClosed(); - } + Future onClosed() { return m_pager->onClosed(); } void close_impl(bool dispose) { - auto *pager = m_pager; + auto* pager = m_pager; delete this; - if(dispose) + if (dispose) pager->dispose(); else 
pager->close(); } - void dispose() { - return close_impl(true); - } + void dispose() { return close_impl(true); } - void close() { - return close_impl(false); - } + void close() { return close_impl(false); } - KeyValueStoreType getType() NOT_IMPLEMENTED - bool supportsMutation(int op) NOT_IMPLEMENTED - StorageBytes getStorageBytes() { + KeyValueStoreType getType() NOT_IMPLEMENTED bool supportsMutation(int op) NOT_IMPLEMENTED StorageBytes + getStorageBytes() { return m_pager->getStorageBytes(); } // Writes are provided in an ordered stream. - // A write is considered part of (a change leading to) the version determined by the previous call to setWriteVersion() - // A write shall not become durable until the following call to commit() begins, and shall be durable once the following call to commit() returns + // A write is considered part of (a change leading to) the version determined by the previous call to + // setWriteVersion() A write shall not become durable until the following call to commit() begins, and shall be + // durable once the following call to commit() returns void set(KeyValueRef keyValue) { ++counts.sets; m_pBuffer->insert(keyValue.key).mutation().setBoundaryValue(m_pBuffer->copyToArena(keyValue.value)); @@ -2738,10 +2630,8 @@ public: void clear(KeyRangeRef clearedRange) { // Optimization for single key clears to create just one mutation boundary instead of two - if(clearedRange.begin.size() == clearedRange.end.size() - 1 - && clearedRange.end[clearedRange.end.size() - 1] == 0 - && clearedRange.end.startsWith(clearedRange.begin) - ) { + if (clearedRange.begin.size() == clearedRange.end.size() - 1 && + clearedRange.end[clearedRange.end.size() - 1] == 0 && clearedRange.end.startsWith(clearedRange.begin)) { ++counts.clears; ++counts.clearSingleKey; m_pBuffer->insert(clearedRange.begin).mutation().clearBoundary(); @@ -2759,40 +2649,31 @@ public: void mutate(int op, StringRef param1, StringRef param2) NOT_IMPLEMENTED - void setOldestVersion(Version v) { 
+ void setOldestVersion(Version v) { m_newOldestVersion = v; } - Version getOldestVersion() { - return m_pager->getOldestVersion(); - } + Version getOldestVersion() { return m_pager->getOldestVersion(); } Version getLatestVersion() { - if(m_writeVersion != invalidVersion) - return m_writeVersion; + if (m_writeVersion != invalidVersion) return m_writeVersion; return m_pager->getLatestVersion(); } - Version getWriteVersion() { - return m_writeVersion; - } + Version getWriteVersion() { return m_writeVersion; } - Version getLastCommittedVersion() { - return m_lastCommittedVersion; - } + Version getLastCommittedVersion() { return m_lastCommittedVersion; } - VersionedBTree(IPager2 *pager, std::string name) - : m_pager(pager), - m_writeVersion(invalidVersion), - m_lastCommittedVersion(invalidVersion), - m_pBuffer(nullptr), - m_name(name) - { + VersionedBTree(IPager2* pager, std::string name) + : m_pager(pager), m_writeVersion(invalidVersion), m_lastCommittedVersion(invalidVersion), m_pBuffer(nullptr), + m_name(name) { m_init = init_impl(this); m_latestCommit = m_init; } - ACTOR static Future incrementalSubtreeClear(VersionedBTree *self, bool *pStop = nullptr, int batchSize = 10, unsigned int minPages = 0, int maxPages = std::numeric_limits::max()) { + ACTOR static Future incrementalSubtreeClear(VersionedBTree* self, bool* pStop = nullptr, int batchSize = 10, + unsigned int minPages = 0, + int maxPages = std::numeric_limits::max()) { // TODO: Is it contractually okay to always to read at the latest version? 
state Reference snapshot = self->m_pager->getReadSnapshot(self->m_pager->getLatestVersion()); state int freedPages = 0; @@ -2801,52 +2682,52 @@ public: state std::vector>>> entries; // Take up to batchSize pages from front of queue - while(entries.size() < batchSize) { + while (entries.size() < batchSize) { Optional q = wait(self->m_lazyDeleteQueue.pop()); debug_printf("LazyDelete: popped %s\n", toString(q).c_str()); - if(!q.present()) { + if (!q.present()) { break; } // Start reading the page, without caching - entries.push_back(std::make_pair(q.get(), self->readPage(snapshot, q.get().pageID, nullptr, nullptr, true))); + entries.push_back( + std::make_pair(q.get(), self->readPage(snapshot, q.get().pageID, nullptr, nullptr, true))); } - if(entries.empty()) { + if (entries.empty()) { break; } state int i; - for(i = 0; i < entries.size(); ++i) { + for (i = 0; i < entries.size(); ++i) { Reference p = wait(entries[i].second); - const LazyDeleteQueueEntry &entry = entries[i].first; - const BTreePage &btPage = *(BTreePage *)p->begin(); + const LazyDeleteQueueEntry& entry = entries[i].first; + const BTreePage& btPage = *(BTreePage*)p->begin(); debug_printf("LazyDelete: processing %s\n", toString(entry).c_str()); // Level 1 (leaf) nodes should never be in the lazy delete queue ASSERT(btPage.height > 1); - + // Iterate over page entries, skipping key decoding using BTreePage::ValueTree which uses // RedwoodRecordRef::DeltaValueOnly as the delta type type to skip key decoding BTreePage::ValueTree::Mirror reader(&btPage.valueTree(), &dbBegin, &dbEnd); auto c = reader.getCursor(); ASSERT(c.moveFirst()); Version v = entry.version; - while(1) { - if(c.get().value.present()) { + while (1) { + if (c.get().value.present()) { BTreePageID btChildPageID = c.get().getChildPage(); // If this page is height 2, then the children are leaves so free - if(btPage.height == 2) { + if (btPage.height == 2) { debug_printf("LazyDelete: freeing child %s\n", toString(btChildPageID).c_str()); 
self->freeBtreePage(btChildPageID, v); freedPages += btChildPageID.size(); - } - else { + } else { // Otherwise, queue them for lazy delete. debug_printf("LazyDelete: queuing child %s\n", toString(btChildPageID).c_str()); - self->m_lazyDeleteQueue.pushFront(LazyDeleteQueueEntry{v, btChildPageID}); + self->m_lazyDeleteQueue.pushFront(LazyDeleteQueueEntry{ v, btChildPageID }); } } - if(!c.moveNext()) { + if (!c.moveNext()) { break; } } @@ -2858,28 +2739,30 @@ public: } // If stop is set and we've freed the minimum number of pages required, or the maximum is exceeded, return. - if((freedPages >= minPages && pStop != nullptr && *pStop) || freedPages >= maxPages) { + if ((freedPages >= minPages && pStop != nullptr && *pStop) || freedPages >= maxPages) { break; } } - debug_printf("LazyDelete: freed %d pages, %s has %" PRId64 " entries\n", freedPages, self->m_lazyDeleteQueue.name.c_str(), self->m_lazyDeleteQueue.numEntries); + debug_printf("LazyDelete: freed %d pages, %s has %" PRId64 " entries\n", freedPages, + self->m_lazyDeleteQueue.name.c_str(), self->m_lazyDeleteQueue.numEntries); return freedPages; } - ACTOR static Future init_impl(VersionedBTree *self) { + ACTOR static Future init_impl(VersionedBTree* self) { wait(self->m_pager->init()); state Version latest = self->m_pager->getLatestVersion(); self->m_newOldestVersion = self->m_pager->getOldestVersion(); - debug_printf("Recovered pager to version %" PRId64 ", oldest version is %" PRId64 "\n", self->m_newOldestVersion); + debug_printf("Recovered pager to version %" PRId64 ", oldest version is %" PRId64 "\n", + self->m_newOldestVersion); state Key meta = self->m_pager->getMetaKey(); - if(meta.size() == 0) { + if (meta.size() == 0) { self->m_header.formatVersion = MetaKey::FORMAT_VERSION; LogicalPageID id = wait(self->m_pager->newPageID()); - BTreePageID newRoot((LogicalPageID *)&id, 1); + BTreePageID newRoot((LogicalPageID*)&id, 1); debug_printf("new root %s\n", toString(newRoot).c_str()); 
self->m_header.root.set(newRoot, sizeof(headerSpace) - sizeof(m_header)); self->m_header.height = 1; @@ -2895,8 +2778,7 @@ public: self->m_pager->setMetaKey(self->m_header.asKeyRef()); wait(self->m_pager->commit()); debug_printf("Committed initial commit.\n"); - } - else { + } else { self->m_header.fromKeyRef(meta); self->m_lazyDeleteQueue.recover(self->m_pager, self->m_header.lazyDeleteQueue, "LazyDeleteQueueRecovered"); } @@ -2907,13 +2789,11 @@ public: return Void(); } - Future init() override { - return m_init; - } + Future init() override { return m_init; } virtual ~VersionedBTree() { // This probably shouldn't be called directly (meaning deleting an instance directly) but it should be safe, - // it will cancel init and commit and leave the pager alive but with potentially an incomplete set of + // it will cancel init and commit and leave the pager alive but with potentially an incomplete set of // uncommitted writes so it should not be committed. m_init.cancel(); m_latestCommit.cancel(); @@ -2928,19 +2808,18 @@ public: KeyRef m = snapshot->getMetaKey(); // Currently all internal records generated in the write path are at version 0 - return Reference(new Cursor(snapshot, ((MetaKey *)m.begin())->root.get(), (Version)0)); + return Reference(new Cursor(snapshot, ((MetaKey*)m.begin())->root.get(), (Version)0)); } // Must be nondecreasing void setWriteVersion(Version v) { ASSERT(v > m_lastCommittedVersion); // If there was no current mutation buffer, create one in the buffer map and update m_pBuffer - if(m_pBuffer == nullptr) { + if (m_pBuffer == nullptr) { // When starting a new mutation buffer its start version must be greater than the last write version ASSERT(v > m_writeVersion); m_pBuffer = &m_mutationBuffers[v]; - } - else { + } else { // It's OK to set the write version to the same version repeatedly so long as m_pBuffer is not null ASSERT(v >= m_writeVersion); } @@ -2948,12 +2827,11 @@ public: } Future commit() { - if(m_pBuffer == nullptr) - return 
m_latestCommit; + if (m_pBuffer == nullptr) return m_latestCommit; return commit_impl(this); } - ACTOR static Future destroyAndCheckSanity_impl(VersionedBTree *self) { + ACTOR static Future destroyAndCheckSanity_impl(VersionedBTree* self) { ASSERT(g_network->isSimulated()); debug_printf("Clearing tree.\n"); @@ -2964,7 +2842,7 @@ public: state int freedPages = wait(self->incrementalSubtreeClear(self)); wait(self->commit()); // Keep looping until the last commit doesn't do anything at all - if(self->m_lazyDeleteQueue.numEntries == 0 && freedPages == 0) { + if (self->m_lazyDeleteQueue.numEntries == 0 && freedPages == 0) { break; } self->setWriteVersion(self->getLatestVersion() + 1); @@ -2994,29 +2872,22 @@ public: return Void(); } - Future destroyAndCheckSanity() { - return destroyAndCheckSanity_impl(this); - } + Future destroyAndCheckSanity() { return destroyAndCheckSanity_impl(this); } private: struct ChildLinksRef { ChildLinksRef() = default; ChildLinksRef(VectorRef children, RedwoodRecordRef upperBound) - : children(children), upperBound(upperBound) { - } + : children(children), upperBound(upperBound) {} - ChildLinksRef(const RedwoodRecordRef *child, const RedwoodRecordRef *upperBound) - : children((RedwoodRecordRef *)child, 1), upperBound(*upperBound) { - } + ChildLinksRef(const RedwoodRecordRef* child, const RedwoodRecordRef* upperBound) + : children((RedwoodRecordRef*)child, 1), upperBound(*upperBound) {} - ChildLinksRef(Arena &arena, const ChildLinksRef &toCopy) - : children(arena, toCopy.children), upperBound(arena, toCopy.upperBound) { - } + ChildLinksRef(Arena& arena, const ChildLinksRef& toCopy) + : children(arena, toCopy.children), upperBound(arena, toCopy.upperBound) {} - int expectedSize() const { - return children.expectedSize() + upperBound.expectedSize(); - } + int expectedSize() const { return children.expectedSize() + upperBound.expectedSize(); } std::string toString() const { return format("{children=%s upperbound=%s}", 
::toString(children).c_str(), upperBound.toString().c_str()); @@ -3033,38 +2904,36 @@ private: // boundaries of consecutive entries. struct InternalPageBuilder { // Cursor must be at first entry in page - InternalPageBuilder(const BTreePage::BinaryTree::Cursor &c) - : cursor(c), modified(false), childPageCount(0) - { - } + InternalPageBuilder(const BTreePage::BinaryTree::Cursor& c) : cursor(c), modified(false), childPageCount(0) {} private: // This must be called internally, on records whose arena has already been added to the entries arena - inline void addEntry(const RedwoodRecordRef &rec) { - if(rec.value.present()) { + inline void addEntry(const RedwoodRecordRef& rec) { + if (rec.value.present()) { ++childPageCount; } // If no modification detected yet then check that this record is identical to the next // record from the original page which is at the current cursor position. - if(!modified) { - if(cursor.valid()) { - if(rec != cursor.get()) { - debug_printf("InternalPageBuilder: Found internal page difference. new: %s old: %s\n", rec.toString().c_str(), cursor.get().toString().c_str()); + if (!modified) { + if (cursor.valid()) { + if (rec != cursor.get()) { + debug_printf("InternalPageBuilder: Found internal page difference. new: %s old: %s\n", + rec.toString().c_str(), cursor.get().toString().c_str()); modified = true; - } - else { + } else { cursor.moveNext(); } - } - else { - debug_printf("InternalPageBuilder: Found internal page difference. new: %s old: \n", rec.toString().c_str()); + } else { + debug_printf("InternalPageBuilder: Found internal page difference. 
new: %s old: \n", + rec.toString().c_str()); modified = true; } } entries.push_back(entries.arena(), rec); } + public: // Add the child entries from newSet into entries void addEntries(ChildLinksRef newSet) { @@ -3072,14 +2941,14 @@ private: // as the first lowerBound in newSet (or newSet is empty, as the next newSet is necessarily greater) // then add the upper bound of the previous set as a value-less record so that on future reads // the previous child page can be decoded correctly. - if(!entries.empty() && entries.back().value.present() - && (newSet.children.empty() || !newSet.children.front().sameExceptValue(lastUpperBound))) - { - debug_printf("InternalPageBuilder: Added placeholder %s\n", lastUpperBound.withoutValue().toString().c_str()); + if (!entries.empty() && entries.back().value.present() && + (newSet.children.empty() || !newSet.children.front().sameExceptValue(lastUpperBound))) { + debug_printf("InternalPageBuilder: Added placeholder %s\n", + lastUpperBound.withoutValue().toString().c_str()); addEntry(lastUpperBound.withoutValue()); } - for(auto &child : newSet.children) { + for (auto& child : newSet.children) { debug_printf("InternalPageBuilder: Adding child entry %s\n", child.toString().c_str()); addEntry(child); } @@ -3096,32 +2965,40 @@ private: // This is only done if modified is set to avoid rewriting this page for this purpose only. // // After this call, lastUpperBound is internal page's upper bound. 
- void finalize(const RedwoodRecordRef &upperBound, const RedwoodRecordRef &decodeUpperBound) { - debug_printf("InternalPageBuilder::end modified=%d upperBound=%s decodeUpperBound=%s lastUpperBound=%s\n", modified, upperBound.toString().c_str(), decodeUpperBound.toString().c_str(), lastUpperBound.toString().c_str()); + void finalize(const RedwoodRecordRef& upperBound, const RedwoodRecordRef& decodeUpperBound) { + debug_printf( + "InternalPageBuilder::end modified=%d upperBound=%s decodeUpperBound=%s lastUpperBound=%s\n", + modified, upperBound.toString().c_str(), decodeUpperBound.toString().c_str(), + lastUpperBound.toString().c_str()); modified = modified || cursor.valid(); debug_printf("InternalPageBuilder::end modified=%d after cursor check\n", modified); - // If there are boundary key entries and the last one has a child page then the + // If there are boundary key entries and the last one has a child page then the // upper bound for this internal page must match the required upper bound for // the last child entry. - if(!entries.empty() && entries.back().value.present()) { + if (!entries.empty() && entries.back().value.present()) { debug_printf("InternalPageBuilder::end last entry is not null\n"); // If the page contents were not modified so far and the upper bound required // for the last child page (lastUpperBound) does not match what the page // was encoded with then the page must be modified. 
- if(!modified && !lastUpperBound.sameExceptValue(decodeUpperBound)) { - debug_printf("InternalPageBuilder::end modified set true because lastUpperBound does not match decodeUpperBound\n"); + if (!modified && !lastUpperBound.sameExceptValue(decodeUpperBound)) { + debug_printf("InternalPageBuilder::end modified set true because lastUpperBound does not match " + "decodeUpperBound\n"); modified = true; } - if(modified && !lastUpperBound.sameExceptValue(upperBound)) { - debug_printf("InternalPageBuilder::end Modified is true but lastUpperBound does not match upperBound so adding placeholder\n"); + if (modified && !lastUpperBound.sameExceptValue(upperBound)) { + debug_printf("InternalPageBuilder::end Modified is true but lastUpperBound does not match " + "upperBound so adding placeholder\n"); addEntry(lastUpperBound.withoutValue()); lastUpperBound = upperBound; } } - debug_printf("InternalPageBuilder::end exit. modified=%d upperBound=%s decodeUpperBound=%s lastUpperBound=%s\n", modified, upperBound.toString().c_str(), decodeUpperBound.toString().c_str(), lastUpperBound.toString().c_str()); + debug_printf( + "InternalPageBuilder::end exit. modified=%d upperBound=%s decodeUpperBound=%s lastUpperBound=%s\n", + modified, upperBound.toString().c_str(), decodeUpperBound.toString().c_str(), + lastUpperBound.toString().c_str()); } BTreePage::BinaryTree::Cursor cursor; @@ -3153,33 +3030,25 @@ private: // No point in serializing an atomic op, it needs to be coalesced to a real value. 
ASSERT(!isAtomicOp()); - if(isClear()) - return RedwoodRecordRef(userKey, version); + if (isClear()) return RedwoodRecordRef(userKey, version); return RedwoodRecordRef(userKey, version, value); } - std::string toString() const { - return format("op=%d val='%s'", op, printable(value).c_str()); - } + std::string toString() const { return format("op=%d val='%s'", op, printable(value).c_str()); } }; struct RangeMutation { - RangeMutation() : boundaryChanged(false), clearAfterBoundary(false) { - } + RangeMutation() : boundaryChanged(false), clearAfterBoundary(false) {} bool boundaryChanged; - Optional boundaryValue; // Not present means cleared + Optional boundaryValue; // Not present means cleared bool clearAfterBoundary; - bool boundaryCleared() const { - return boundaryChanged && !boundaryValue.present(); - } + bool boundaryCleared() const { return boundaryChanged && !boundaryValue.present(); } // Returns true if this RangeMutation doesn't actually mutate anything - bool noChanges() const { - return !boundaryChanged && !clearAfterBoundary; - } + bool noChanges() const { return !boundaryChanged && !clearAfterBoundary; } void clearBoundary() { boundaryChanged = true; @@ -3190,24 +3059,21 @@ private: clearBoundary(); clearAfterBoundary = true; } - + void setBoundaryValue(ValueRef v) { boundaryChanged = true; boundaryValue = v; } - bool boundarySet() const { - return boundaryChanged && boundaryValue.present(); - } + bool boundarySet() const { return boundaryChanged && boundaryValue.present(); } std::string toString() const { - return format("boundaryChanged=%d clearAfterBoundary=%d boundaryValue=%s", boundaryChanged, clearAfterBoundary, ::toString(boundaryValue).c_str()); + return format("boundaryChanged=%d clearAfterBoundary=%d boundaryValue=%s", boundaryChanged, + clearAfterBoundary, ::toString(boundaryValue).c_str()); } }; public: - - #include "ArtMutationBuffer.h" struct MutationBufferStdMap { MutationBufferStdMap() { @@ -3228,52 +3094,36 @@ public: struct iterator : 
public MutationsT::iterator { typedef MutationsT::iterator Base; iterator() = default; - iterator(const MutationsT::iterator &i) : Base(i) { - } + iterator(const MutationsT::iterator& i) : Base(i) {} - const KeyRef & key() { - return (*this)->first; - } + const KeyRef& key() { return (*this)->first; } - RangeMutation & mutation() { - return (*this)->second; - } + RangeMutation& mutation() { return (*this)->second; } }; struct const_iterator : public MutationsT::const_iterator { typedef MutationsT::const_iterator Base; const_iterator() = default; - const_iterator(const MutationsT::const_iterator &i) : Base(i) { - } - const_iterator(const MutationsT::iterator &i) : Base(i) { - } + const_iterator(const MutationsT::const_iterator& i) : Base(i) {} + const_iterator(const MutationsT::iterator& i) : Base(i) {} - const KeyRef & key() { - return (*this)->first; - } + const KeyRef& key() { return (*this)->first; } - const RangeMutation & mutation() { - return (*this)->second; - } + const RangeMutation& mutation() { return (*this)->second; } }; // Return a T constructed in arena - template T copyToArena(const T &object) { + template + T copyToArena(const T& object) { return T(arena, object); } - const_iterator upper_bound(const KeyRef &k) const { - return mutations.upper_bound(k); - } + const_iterator upper_bound(const KeyRef& k) const { return mutations.upper_bound(k); } - const_iterator lower_bound(const KeyRef &k) const { - return mutations.lower_bound(k); - } + const_iterator lower_bound(const KeyRef& k) const { return mutations.lower_bound(k); } // erase [begin, end) from the mutation map - void erase(const const_iterator &begin, const const_iterator &end) { - mutations.erase(begin, end); - } + void erase(const const_iterator& begin, const const_iterator& end) { mutations.erase(begin, end); } // Find or create a mutation buffer boundary for bound and return an iterator to it iterator insert(KeyRef boundary) { @@ -3284,34 +3134,34 @@ public: iterator ib = 
mutations.lower_bound(boundary); // If we found the boundary we are looking for, return its iterator - if(ib.key() == boundary) { + if (ib.key() == boundary) { return ib; } // ib is our insert hint. Copy boundary into arena and insert boundary into buffer boundary = KeyRef(arena, boundary); - ib = mutations.insert(ib, {boundary, RangeMutation()}); + ib = mutations.insert(ib, { boundary, RangeMutation() }); // ib is certainly > begin() because it is guaranteed that the empty string // boundary exists and the only way to have found that is to look explicitly // for it in which case we would have returned above. iterator iPrevious = ib; --iPrevious; - // If the range we just divided was being cleared, then the dividing boundary key and range after it must also be cleared - if(iPrevious.mutation().clearAfterBoundary) { + // If the range we just divided was being cleared, then the dividing boundary key and range after it must + // also be cleared + if (iPrevious.mutation().clearAfterBoundary) { ib.mutation().clearAll(); } return ib; } - }; #define USE_ART_MUTATION_BUFFER 1 #ifdef USE_ART_MUTATION_BUFFER - typedef struct MutationBufferART MutationBuffer; + typedef struct MutationBufferART MutationBuffer; #else - typedef struct MutationBufferStdMap MutationBuffer; + typedef struct MutationBufferStdMap MutationBuffer; #endif private: @@ -3320,10 +3170,10 @@ private: * This structure's organization is meant to put pending updates for the btree in an order * that makes it efficient to query all pending mutations across all pending versions which are * relevant to a particular subtree of the btree. - * + * * At the top level, it is a map of the start of a range being modified to a RangeMutation. * The end of the range is map key (which is the next range start in the map). - * + * * - The buffer starts out with keys '' and endKVV.key already populated. 
* * - When a new key is inserted into the buffer map, it is by definition @@ -3364,8 +3214,8 @@ private: * to be sorted later just before being merged into the existing leaf page. */ - IPager2 *m_pager; - MutationBuffer *m_pBuffer; + IPager2* m_pager; + MutationBuffer* m_pBuffer; std::map m_mutationBuffers; Version m_writeVersion; @@ -3384,14 +3234,16 @@ private: LazyDeleteQueueT m_lazyDeleteQueue; // Writes entries to 1 or more pages and return a vector of boundary keys with their IPage(s) - ACTOR static Future>> writePages(VersionedBTree *self, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound, VectorRef entries, int height, Version v, BTreePageID previousID) { + ACTOR static Future>> writePages( + VersionedBTree* self, const RedwoodRecordRef* lowerBound, const RedwoodRecordRef* upperBound, + VectorRef entries, int height, Version v, BTreePageID previousID) { ASSERT(entries.size() > 0); state Standalone> records; // This is how much space for the binary tree exists in the page, after the header state int blockSize = self->m_pager->getUsablePageSize(); state int pageSize = blockSize - sizeof(BTreePage); - state float fillFactor = 0.66; // TODO: Make this a knob + state float fillFactor = 0.66; // TODO: Make this a knob state int pageFillTarget = pageSize * fillFactor; state int blockCount = 1; @@ -3406,20 +3258,21 @@ private: // Leaves can have just one record if it's large, but internal pages should have at least 4 state int minimumEntries = (height == 1 ? 
1 : 4); - + // Lower bound of the page being added to state RedwoodRecordRef pageLowerBound = lowerBound->withoutValue(); state RedwoodRecordRef pageUpperBound; - while(1) { + while (1) { // While there are still entries to add and the page isn't full enough, add an entry - while(i < entries.size() && (i - start < minimumEntries || compressedBytes < pageFillTarget) ) { - const RedwoodRecordRef &entry = entries[i]; + while (i < entries.size() && (i - start < minimumEntries || compressedBytes < pageFillTarget)) { + const RedwoodRecordRef& entry = entries[i]; // Get delta from previous record or page lower boundary if this is the first item in a page - const RedwoodRecordRef &base = (i == start) ? pageLowerBound : entries[i - 1]; + const RedwoodRecordRef& base = (i == start) ? pageLowerBound : entries[i - 1]; - // All record pairs in entries have skipLen bytes in common with each other, but for i == 0 the base is lowerBound + // All record pairs in entries have skipLen bytes in common with each other, but for i == 0 the base is + // lowerBound int skip = i == 0 ? 0 : skipLen; // In a delta tree, all common prefix bytes that can be borrowed, will be, but not necessarily @@ -3432,27 +3285,29 @@ private: int valueSize = entry.value.present() ? 
entry.value.get().size() : 0; int nodeSize = BTreePage::BinaryTree::Node::headerSize(largeTree) + deltaSize; - debug_printf("Adding %3d of %3lu (i=%3d) klen %4d vlen %5d nodeSize %5d deltaSize %5d page usage: %d/%d (%.2f%%) record=%s\n", - i + 1, entries.size(), i, keySize, valueSize, nodeSize, deltaSize, compressedBytes, pageSize, (float)compressedBytes / pageSize * 100, entry.toString(height == 1).c_str()); + debug_printf("Adding %3d of %3lu (i=%3d) klen %4d vlen %5d nodeSize %5d deltaSize %5d page usage: " + "%d/%d (%.2f%%) record=%s\n", + i + 1, entries.size(), i, keySize, valueSize, nodeSize, deltaSize, compressedBytes, + pageSize, (float)compressedBytes / pageSize * 100, entry.toString(height == 1).c_str()); // While the node doesn't fit, expand the page. // This is a loop because if the page size moves into "large" range for DeltaTree // then the overhead will increase, which could require another page expansion. int spaceAvailable = pageSize - compressedBytes; - if(nodeSize > spaceAvailable) { + if (nodeSize > spaceAvailable) { // Figure out how many additional whole or partial blocks are needed // newBlocks = ceil ( additional space needed / block size) int newBlocks = 1 + (nodeSize - spaceAvailable - 1) / blockSize; int newPageSize = pageSize + (newBlocks * blockSize); // If we've moved into "large" page range for the delta tree then add additional overhead required - if(!largeTree && newPageSize > BTreePage::BinaryTree::SmallSizeLimit) { + if (!largeTree && newPageSize > BTreePage::BinaryTree::SmallSizeLimit) { largeTree = true; // Add increased overhead for the current node to nodeSize nodeSize += BTreePage::BinaryTree::LargeTreePerNodeExtraOverhead; // Add increased overhead for all previously added nodes compressedBytes += (i - start) * BTreePage::BinaryTree::LargeTreePerNodeExtraOverhead; - + // Update calculations above made with previous overhead sizes spaceAvailable = pageSize - compressedBytes; newBlocks = 1 + (nodeSize - spaceAvailable - 1) / 
blockSize; @@ -3471,10 +3326,11 @@ private: // Flush the accumulated records to a page state int nextStart = i; - // If we are building internal pages and there is a record after this page (index nextStart) but it has an empty childPage value then skip it. - // It only exists to serve as an upper boundary for a child page that has not been rewritten in the current commit, and that - // purpose will now be served by the upper bound of the page we are now building. - if(height != 1 && nextStart < entries.size() && !entries[nextStart].value.present()) { + // If we are building internal pages and there is a record after this page (index nextStart) but it has an + // empty childPage value then skip it. It only exists to serve as an upper boundary for a child page that + // has not been rewritten in the current commit, and that purpose will now be served by the upper bound of + // the page we are now building. + if (height != 1 && nextStart < entries.size() && !entries[nextStart].value.present()) { ++nextStart; } @@ -3483,51 +3339,56 @@ private: // If this is a leaf page, and not the last one to be written, shorten the upper boundary state bool isLastPage = (nextStart == entries.size()); - if(!isLastPage && height == 1) { + if (!isLastPage && height == 1) { int commonPrefix = pageUpperBound.getCommonPrefixLen(entries[i - 1], 0); pageUpperBound.truncate(commonPrefix + 1); } state std::vector> pages; - BTreePage *btPage; + BTreePage* btPage; - if(blockCount == 1) { + if (blockCount == 1) { Reference page = self->m_pager->newPageBuffer(); - btPage = (BTreePage *)page->mutate(); + btPage = (BTreePage*)page->mutate(); pages.push_back(std::move(page)); - } - else { + } else { ASSERT(blockCount > 1); int size = blockSize * blockCount; - btPage = (BTreePage *)new uint8_t[size]; + btPage = (BTreePage*)new uint8_t[size]; } btPage->height = height; btPage->kvBytes = kvBytes; - debug_printf("Building tree. 
start=%d i=%d count=%d page usage: %d/%d (%.2f%%) bytes\nlower: %s\nupper: %s\n", start, i, i - start, - compressedBytes, pageSize, (float)compressedBytes / pageSize * 100, pageLowerBound.toString(false).c_str(), pageUpperBound.toString(false).c_str()); + debug_printf( + "Building tree. start=%d i=%d count=%d page usage: %d/%d (%.2f%%) bytes\nlower: %s\nupper: %s\n", + start, i, i - start, compressedBytes, pageSize, (float)compressedBytes / pageSize * 100, + pageLowerBound.toString(false).c_str(), pageUpperBound.toString(false).c_str()); - int written = btPage->tree().build(pageSize, &entries[start], &entries[i], &pageLowerBound, &pageUpperBound); - if(written > pageSize) { - debug_printf("ERROR: Wrote %d bytes to %d byte page (%d blocks). recs %d kvBytes %d compressed %d\n", written, pageSize, blockCount, i - start, kvBytes, compressedBytes); - fprintf(stderr, "ERROR: Wrote %d bytes to %d byte page (%d blocks). recs %d kvBytes %d compressed %d\n", written, pageSize, blockCount, i - start, kvBytes, compressedBytes); + int written = + btPage->tree().build(pageSize, &entries[start], &entries[i], &pageLowerBound, &pageUpperBound); + if (written > pageSize) { + debug_printf("ERROR: Wrote %d bytes to %d byte page (%d blocks). recs %d kvBytes %d compressed %d\n", + written, pageSize, blockCount, i - start, kvBytes, compressedBytes); + fprintf(stderr, + "ERROR: Wrote %d bytes to %d byte page (%d blocks). recs %d kvBytes %d compressed %d\n", + written, pageSize, blockCount, i - start, kvBytes, compressedBytes); ASSERT(false); } // Create chunked pages // TODO: Avoid copying page bytes, but this is not trivial due to how pager checksums are currently handled. 
- if(blockCount != 1) { + if (blockCount != 1) { // Mark the slack in the page buffer as defined - VALGRIND_MAKE_MEM_DEFINED(((uint8_t *)btPage) + written, (blockCount * blockSize) - written); - const uint8_t *rptr = (const uint8_t *)btPage; - for(int b = 0; b < blockCount; ++b) { + VALGRIND_MAKE_MEM_DEFINED(((uint8_t*)btPage) + written, (blockCount * blockSize) - written); + const uint8_t* rptr = (const uint8_t*)btPage; + for (int b = 0; b < blockCount; ++b) { Reference page = self->m_pager->newPageBuffer(); memcpy(page->mutate(), rptr, blockSize); rptr += blockSize; pages.push_back(std::move(page)); } - delete [] (uint8_t *)btPage; + delete[](uint8_t*) btPage; } // Write this btree page, which is made of 1 or more pager pages. @@ -3537,21 +3398,20 @@ private: // If we are only writing 1 page and it has the same BTreePageID size as the original then try to reuse the // LogicalPageIDs in previousID and try to update them atomically. bool isOnlyPage = isLastPage && (start == 0); - if(isOnlyPage && previousID.size() == pages.size()) { - for(p = 0; p < pages.size(); ++p) { + if (isOnlyPage && previousID.size() == pages.size()) { + for (p = 0; p < pages.size(); ++p) { LogicalPageID id = wait(self->m_pager->atomicUpdatePage(previousID[p], pages[p], v)); childPageID.push_back(records.arena(), id); } - } - else { + } else { // Either the original page is being split, or it's not but it has changed BTreePageID size. // Either way, there is no point in reusing any of the original page IDs because the parent // must be rewritten anyway to count for the change in child count or child links. // Free the old IDs, but only once (before the first output record is added). 
- if(records.empty()) { + if (records.empty()) { self->freeBtreePage(previousID, v); } - for(p = 0; p < pages.size(); ++p) { + for (p = 0; p < pages.size(); ++p) { LogicalPageID id = wait(self->m_pager->newPageID()); self->m_pager->updatePage(id, pages[p]); childPageID.push_back(records.arena(), id); @@ -3562,15 +3422,18 @@ private: // Update activity counts ++counts.pageWrites; - if(pages.size() > 1) { + if (pages.size() > 1) { counts.extPageWrites += pages.size() - 1; } - debug_printf("Flushing %s lastPage=%d original=%s start=%d i=%d count=%d page usage: %d/%d (%.2f%%) bytes\nlower: %s\nupper: %s\n", toString(childPageID).c_str(), isLastPage, toString(previousID).c_str(), start, i, i - start, - compressedBytes, pageSize, (float)compressedBytes / pageSize * 100, pageLowerBound.toString(false).c_str(), pageUpperBound.toString(false).c_str()); + debug_printf("Flushing %s lastPage=%d original=%s start=%d i=%d count=%d page usage: %d/%d (%.2f%%) " + "bytes\nlower: %s\nupper: %s\n", + toString(childPageID).c_str(), isLastPage, toString(previousID).c_str(), start, i, i - start, + compressedBytes, pageSize, (float)compressedBytes / pageSize * 100, + pageLowerBound.toString(false).c_str(), pageUpperBound.toString(false).c_str()); - if(REDWOOD_DEBUG) { - for(int j = start; j < i; ++j) { + if (REDWOOD_DEBUG) { + for (int j = start; j < i; ++j) { debug_printf(" %3d: %s\n", j, entries[j].toString(height == 1).c_str()); } ASSERT(pageLowerBound.key <= pageUpperBound.key); @@ -3578,10 +3441,11 @@ private: // Push a new record onto the results set, without the child page, copying it into the records arena records.push_back_deep(records.arena(), pageLowerBound.withoutValue()); - // Set the child page value of the inserted record to childPageID, which has already been allocated in records.arena() above + // Set the child page value of the inserted record to childPageID, which has already been allocated in + // records.arena() above records.back().setChildPage(childPageID); - 
if(isLastPage) { + if (isLastPage) { break; } @@ -3591,85 +3455,82 @@ private: pageLowerBound = pageUpperBound; } - // If we're writing internal pages, if the last entry was the start of a new page and had an empty child link then it would not be written to a page. - // This means that the upper boundary for the the page set being built is not the upper bound of the final page in that set, so it must be added - // to the output set to preserve the decodability of the subtree to its left. - // Fortunately, this is easy to detect because the loop above would exit before i has reached the item count. - if(height != 1 && i != entries.size()) { - debug_printf("Adding dummy record to avoid writing useless page: %s\n", pageUpperBound.toString(false).c_str()); + // If we're writing internal pages, if the last entry was the start of a new page and had an empty child link + // then it would not be written to a page. This means that the upper boundary for the the page set being built + // is not the upper bound of the final page in that set, so it must be added to the output set to preserve the + // decodability of the subtree to its left. Fortunately, this is easy to detect because the loop above would + // exit before i has reached the item count. + if (height != 1 && i != entries.size()) { + debug_printf("Adding dummy record to avoid writing useless page: %s\n", + pageUpperBound.toString(false).c_str()); records.push_back_deep(records.arena(), pageUpperBound); } return records; } - ACTOR static Future>> buildNewRoot(VersionedBTree *self, Version version, Standalone> records, int height) { + ACTOR static Future>> buildNewRoot( + VersionedBTree* self, Version version, Standalone> records, int height) { debug_printf("buildNewRoot start version %" PRId64 ", %lu records\n", version, records.size()); // While there are multiple child pages for this version we must write new tree levels. 
- while(records.size() > 1) { + while (records.size() > 1) { self->m_header.height = ++height; - Standalone> newRecords = wait(writePages(self, &dbBegin, &dbEnd, records, height, version, BTreePageID())); - debug_printf("Wrote a new root level at version %" PRId64 " height %d size %lu pages\n", version, height, newRecords.size()); + Standalone> newRecords = + wait(writePages(self, &dbBegin, &dbEnd, records, height, version, BTreePageID())); + debug_printf("Wrote a new root level at version %" PRId64 " height %d size %lu pages\n", version, height, + newRecords.size()); records = newRecords; } return records; } - class SuperPage : public IPage, ReferenceCounted, public FastAllocated{ + class SuperPage : public IPage, ReferenceCounted, public FastAllocated { public: SuperPage(std::vector> pages) { int blockSize = pages.front()->size(); m_size = blockSize * pages.size(); m_data = new uint8_t[m_size]; - uint8_t *wptr = m_data; - for(auto &p : pages) { + uint8_t* wptr = m_data; + for (auto& p : pages) { ASSERT(p->size() == blockSize); memcpy(wptr, p->begin(), blockSize); wptr += blockSize; } } - virtual ~SuperPage() { - delete [] m_data; - } + virtual ~SuperPage() { delete[] m_data; } virtual Reference clone() const { - return Reference(new SuperPage({Reference::addRef(this)})); + return Reference(new SuperPage({ Reference::addRef(this) })); } - void addref() const { - ReferenceCounted::addref(); - } + void addref() const { ReferenceCounted::addref(); } - void delref() const { - ReferenceCounted::delref(); - } + void delref() const { ReferenceCounted::delref(); } - int size() const { - return m_size; - } + int size() const { return m_size; } - uint8_t const* begin() const { - return m_data; - } + uint8_t const* begin() const { return m_data; } - uint8_t* mutate() { - return m_data; - } + uint8_t* mutate() { return m_data; } private: - uint8_t *m_data; + uint8_t* m_data; int m_size; }; - ACTOR static Future> readPage(Reference snapshot, BTreePageID id, const 
RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound, bool forLazyDelete = false) { - if(!forLazyDelete) { - debug_printf("readPage() op=read %s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str()); - } - else { - debug_printf("readPage() op=readForDeferredClear %s @%" PRId64 " \n", toString(id).c_str(), snapshot->getVersion()); + ACTOR static Future> readPage(Reference snapshot, BTreePageID id, + const RedwoodRecordRef* lowerBound, + const RedwoodRecordRef* upperBound, + bool forLazyDelete = false) { + if (!forLazyDelete) { + debug_printf("readPage() op=read %s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), + snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str()); + } else { + debug_printf("readPage() op=readForDeferredClear %s @%" PRId64 " \n", toString(id).c_str(), + snapshot->getVersion()); } wait(yield()); @@ -3677,15 +3538,14 @@ private: state Reference page; ++counts.pageReads; - if(id.size() == 1) { + if (id.size() == 1) { Reference p = wait(snapshot->getPhysicalPage(id.front(), !forLazyDelete, false)); page = p; - } - else { + } else { ASSERT(!id.empty()); counts.extPageReads += (id.size() - 1); std::vector>> reads; - for(auto &pageID : id) { + for (auto& pageID : id) { reads.push_back(snapshot->getPhysicalPage(pageID, !forLazyDelete, false)); } std::vector> pages = wait(getAll(reads)); @@ -3694,52 +3554,54 @@ private: } debug_printf("readPage() op=readComplete %s @%" PRId64 " \n", toString(id).c_str(), snapshot->getVersion()); - const BTreePage *pTreePage = (const BTreePage *)page->begin(); + const BTreePage* pTreePage = (const BTreePage*)page->begin(); - if(!forLazyDelete && page->userData == nullptr) { - debug_printf("readPage() Creating Reader for %s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str()); + if 
(!forLazyDelete && page->userData == nullptr) { + debug_printf("readPage() Creating Reader for %s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), + snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str()); page->userData = new BTreePage::BinaryTree::Mirror(&pTreePage->tree(), lowerBound, upperBound); - page->userDataDestructor = [](void *ptr) { delete (BTreePage::BinaryTree::Mirror *)ptr; }; + page->userDataDestructor = [](void* ptr) { delete (BTreePage::BinaryTree::Mirror*)ptr; }; } - if(!forLazyDelete) { - debug_printf("readPage() %s\n", pTreePage->toString(false, id, snapshot->getVersion(), lowerBound, upperBound).c_str()); + if (!forLazyDelete) { + debug_printf("readPage() %s\n", + pTreePage->toString(false, id, snapshot->getVersion(), lowerBound, upperBound).c_str()); } return page; } - static void preLoadPage(IPagerSnapshot *snapshot, BTreePageID id) { + static void preLoadPage(IPagerSnapshot* snapshot, BTreePageID id) { ++counts.pagePreloads; counts.extPagePreloads += (id.size() - 1); - - for(auto pageID : id) { + + for (auto pageID : id) { snapshot->getPhysicalPage(pageID, true, true); } } void freeBtreePage(BTreePageID btPageID, Version v) { // Free individual pages at v - for(LogicalPageID id : btPageID) { + for (LogicalPageID id : btPageID) { m_pager->freePage(id, v); } } // Write new version of pageID at version v using page as its data. // Attempts to reuse original id(s) in btPageID, returns BTreePageID. 
- ACTOR static Future updateBtreePage(VersionedBTree *self, BTreePageID oldID, Arena *arena, Reference page, Version writeVersion) { + ACTOR static Future updateBtreePage(VersionedBTree* self, BTreePageID oldID, Arena* arena, + Reference page, Version writeVersion) { state BTreePageID newID; newID.resize(*arena, oldID.size()); - if(oldID.size() == 1) { + if (oldID.size() == 1) { LogicalPageID id = wait(self->m_pager->atomicUpdatePage(oldID.front(), page, writeVersion)); newID.front() = id; - } - else { + } else { state std::vector> pages; - const uint8_t *rptr = page->begin(); + const uint8_t* rptr = page->begin(); int bytesLeft = page->size(); - while(bytesLeft > 0) { + while (bytesLeft > 0) { Reference p = self->m_pager->newPageBuffer(); int blockSize = p->size(); memcpy(p->mutate(), rptr, blockSize); @@ -3751,7 +3613,7 @@ private: // Write pages, trying to reuse original page IDs state int i = 0; - for(; i < pages.size(); ++i) { + for (; i < pages.size(); ++i) { LogicalPageID id = wait(self->m_pager->atomicUpdatePage(oldID[i], pages[i], writeVersion)); newID[i] = id; } @@ -3759,7 +3621,7 @@ private: // Update activity counts ++counts.pageWrites; - if(newID.size() > 1) { + if (newID.size() > 1) { counts.extPageWrites += newID.size() - 1; } @@ -3770,11 +3632,12 @@ private: Reference cloneForUpdate(Reference page) { Reference newPage = page->clone(); - auto oldMirror = (const BTreePage::BinaryTree::Mirror *)page->userData; - auto newBTPage = (BTreePage *)newPage->mutate(); + auto oldMirror = (const BTreePage::BinaryTree::Mirror*)page->userData; + auto newBTPage = (BTreePage*)newPage->mutate(); - newPage->userData = new BTreePage::BinaryTree::Mirror(&newBTPage->tree(), oldMirror->lowerBound(), oldMirror->upperBound()); - newPage->userDataDestructor = [](void *ptr) { delete (BTreePage::BinaryTree::Mirror *)ptr; }; + newPage->userData = + new BTreePage::BinaryTree::Mirror(&newBTPage->tree(), oldMirror->lowerBound(), oldMirror->upperBound()); + 
newPage->userDataDestructor = [](void* ptr) { delete (BTreePage::BinaryTree::Mirror*)ptr; }; return newPage; } @@ -3782,30 +3645,26 @@ private: // iMutationBoundary is greatest boundary <= lowerBound->key // iMutationBoundaryEnd is least boundary >= upperBound->key ACTOR static Future> commitSubtree( - VersionedBTree *self, - MutationBuffer *mutationBuffer, - //MutationBuffer::const_iterator iMutationBoundary, // = mutationBuffer->upper_bound(lowerBound->key); --iMutationBoundary; - //MutationBuffer::const_iterator iMutationBoundaryEnd, // = mutationBuffer->lower_bound(upperBound->key); - Reference snapshot, - BTreePageID rootID, - bool isLeaf, - const RedwoodRecordRef *lowerBound, - const RedwoodRecordRef *upperBound, - const RedwoodRecordRef *decodeLowerBound, - const RedwoodRecordRef *decodeUpperBound, - int skipLen = 0 - ) { - //skipLen = lowerBound->getCommonPrefixLen(*upperBound, skipLen); + VersionedBTree* self, MutationBuffer* mutationBuffer, + // MutationBuffer::const_iterator iMutationBoundary, // = mutationBuffer->upper_bound(lowerBound->key); + // --iMutationBoundary; MutationBuffer::const_iterator iMutationBoundaryEnd, // = + // mutationBuffer->lower_bound(upperBound->key); + Reference snapshot, BTreePageID rootID, bool isLeaf, const RedwoodRecordRef* lowerBound, + const RedwoodRecordRef* upperBound, const RedwoodRecordRef* decodeLowerBound, + const RedwoodRecordRef* decodeUpperBound, int skipLen = 0) { + // skipLen = lowerBound->getCommonPrefixLen(*upperBound, skipLen); state std::string context; - if(REDWOOD_DEBUG) { + if (REDWOOD_DEBUG) { context = format("CommitSubtree(root=%s): ", toString(rootID).c_str()); } state Version writeVersion = self->getLastCommittedVersion() + 1; state Standalone result; - debug_printf("%s lower=%s upper=%s\n", context.c_str(), lowerBound->toString().c_str(), upperBound->toString().c_str()); - debug_printf("%s decodeLower=%s decodeUpper=%s\n", context.c_str(), decodeLowerBound->toString().c_str(), 
decodeUpperBound->toString().c_str()); + debug_printf("%s lower=%s upper=%s\n", context.c_str(), lowerBound->toString().c_str(), + upperBound->toString().c_str()); + debug_printf("%s decodeLower=%s decodeUpper=%s\n", context.c_str(), decodeLowerBound->toString().c_str(), + decodeUpperBound->toString().c_str()); self->counts.commitToPageStart++; // Find the slice of the mutation buffer that is relevant to this subtree @@ -3813,12 +3672,13 @@ private: --iMutationBoundary; state MutationBuffer::const_iterator iMutationBoundaryEnd = mutationBuffer->lower_bound(upperBound->key); - if(REDWOOD_DEBUG) { + if (REDWOOD_DEBUG) { debug_printf("%s ---------MUTATION BUFFER SLICE ---------------------\n", context.c_str()); auto begin = iMutationBoundary; - while(1) { - debug_printf("%s Mutation: '%s': %s\n", context.c_str(), printable(begin.key()).c_str(), begin.mutation().toString().c_str()); - if(begin == iMutationBoundaryEnd) { + while (1) { + debug_printf("%s Mutation: '%s': %s\n", context.c_str(), printable(begin.key()).c_str(), + begin.mutation().toString().c_str()); + if (begin == iMutationBoundaryEnd) { break; } ++begin; @@ -3833,7 +3693,7 @@ private: // unmodified, or possibly/partially modified. MutationBuffer::const_iterator iMutationBoundaryNext = iMutationBoundary; ++iMutationBoundaryNext; - if(iMutationBoundaryNext == iMutationBoundaryEnd) { + if (iMutationBoundaryNext == iMutationBoundaryEnd) { // Cleared means the entire range covering the subtree was cleared. It is assumed true // if the range starting after the lower mutation boundary was cleared, and then proven false // below if possible. @@ -3845,29 +3705,30 @@ private: // If the lower mutation boundary key is the same as the subtree lower bound then whether or not // that key is being changed or cleared affects this subtree. 
- if(iMutationBoundary.key() == lowerBound->key) { - // If subtree will be cleared (so far) but the lower boundary key is not cleared then the subtree is not cleared - if(cleared && !iMutationBoundary.mutation().boundaryCleared()) { + if (iMutationBoundary.key() == lowerBound->key) { + // If subtree will be cleared (so far) but the lower boundary key is not cleared then the subtree is not + // cleared + if (cleared && !iMutationBoundary.mutation().boundaryCleared()) { cleared = false; debug_printf("%s cleared=%d unchanged=%d\n", context.c_str(), cleared, unchanged); } - // If the subtree looked unchanged (so far) but the lower boundary is is changed then the subtree is changed - if(unchanged && iMutationBoundary.mutation().boundaryChanged) { + // If the subtree looked unchanged (so far) but the lower boundary is is changed then the subtree is + // changed + if (unchanged && iMutationBoundary.mutation().boundaryChanged) { unchanged = false; debug_printf("%s cleared=%d unchanged=%d\n", context.c_str(), cleared, unchanged); } } - // If the higher mutation boundary key is the same as the subtree upper bound key then whether + // If the higher mutation boundary key is the same as the subtree upper bound key then whether // or not it is being changed or cleared affects this subtree. - if((cleared || unchanged) && iMutationBoundaryEnd.key() == upperBound->key) { + if ((cleared || unchanged) && iMutationBoundaryEnd.key() == upperBound->key) { // If the key is being changed then the records in this subtree with the same key must be removed // so the subtree is definitely not unchanged, though it may be cleared to achieve the same effect. 
- if(iMutationBoundaryEnd.mutation().boundaryChanged) { + if (iMutationBoundaryEnd.mutation().boundaryChanged) { unchanged = false; debug_printf("%s cleared=%d unchanged=%d\n", context.c_str(), cleared, unchanged); - } - else { + } else { // If the key is not being changed then the records in this subtree can't be removed so the // subtree is not being cleared. cleared = false; @@ -3879,20 +3740,21 @@ private: ASSERT(!(cleared && unchanged)); // If no changes in subtree - if(unchanged) { + if (unchanged) { result.contents() = ChildLinksRef(decodeLowerBound, decodeUpperBound); - debug_printf("%s no changes on this subtree, returning %s\n", context.c_str(), toString(result).c_str()); + debug_printf("%s no changes on this subtree, returning %s\n", context.c_str(), + toString(result).c_str()); return result; } // If subtree is cleared - if(cleared) { - debug_printf("%s %s cleared, deleting it, returning %s\n", context.c_str(), isLeaf ? "Page" : "Subtree", toString(result).c_str()); - if(isLeaf) { + if (cleared) { + debug_printf("%s %s cleared, deleting it, returning %s\n", context.c_str(), isLeaf ? 
"Page" : "Subtree", + toString(result).c_str()); + if (isLeaf) { self->freeBtreePage(rootID, writeVersion); - } - else { - self->m_lazyDeleteQueue.pushBack(LazyDeleteQueueEntry{writeVersion, rootID}); + } else { + self->m_lazyDeleteQueue.pushBack(LazyDeleteQueueEntry{ writeVersion, rootID }); } return result; } @@ -3900,18 +3762,21 @@ private: self->counts.commitToPage++; state Reference page = wait(readPage(snapshot, rootID, decodeLowerBound, decodeUpperBound)); - state BTreePage *btPage = (BTreePage *)page->begin(); + state BTreePage* btPage = (BTreePage*)page->begin(); ASSERT(isLeaf == btPage->isLeaf()); - debug_printf("%s commitSubtree(): %s\n", context.c_str(), btPage->toString(false, rootID, snapshot->getVersion(), decodeLowerBound, decodeUpperBound).c_str()); + debug_printf( + "%s commitSubtree(): %s\n", context.c_str(), + btPage->toString(false, rootID, snapshot->getVersion(), decodeLowerBound, decodeUpperBound).c_str()); state BTreePage::BinaryTree::Cursor cursor; - if(REDWOOD_DEBUG) { + if (REDWOOD_DEBUG) { debug_printf("%s ---------MUTATION BUFFER SLICE ---------------------\n", context.c_str()); auto begin = iMutationBoundary; - while(1) { - debug_printf("%s Mutation: '%s': %s\n", context.c_str(), printable(begin.key()).c_str(), begin.mutation().toString().c_str()); - if(begin == iMutationBoundaryEnd) { + while (1) { + debug_printf("%s Mutation: '%s': %s\n", context.c_str(), printable(begin.key()).c_str(), + begin.mutation().toString().c_str()); + if (begin == iMutationBoundaryEnd) { break; } ++begin; @@ -3920,33 +3785,33 @@ private: } // Leaf Page - if(isLeaf) { + if (isLeaf) { // Try to update page unless it's an oversized page or empty or the boundaries have changed // TODO: Caller already knows if boundaries are the same. 
- bool updating = btPage->tree().numItems > 0 && !(*decodeLowerBound != *lowerBound || *decodeUpperBound != *upperBound); + bool updating = + btPage->tree().numItems > 0 && !(*decodeLowerBound != *lowerBound || *decodeUpperBound != *upperBound); - state Reference newPage; - // If replacement pages are written they will be at the minimum version seen in the mutations for this leaf + state Reference newPage; + // If replacement pages are written they will be at the minimum version seen in the mutations for this leaf bool changesMade = false; // If attempting an in-place page update, clone the page and read/modify the copy - if(updating) { + if (updating) { newPage = self->cloneForUpdate(page); - cursor = getCursor(newPage); - } - else { + cursor = getCursor(newPage); + } else { // Otherwise read the old page cursor = getCursor(page); } - // Couldn't make changes in place, so now do a linear merge and build new pages. - state Standalone> merged; + // Couldn't make changes in place, so now do a linear merge and build new pages. + state Standalone> merged; auto switchToLinearMerge = [&]() { updating = false; auto c = cursor; c.moveFirst(); - while(c != cursor) { + while (c != cursor) { debug_printf("%s catch-up adding %s\n", context.c_str(), c.get().toString().c_str()); merged.push_back(merged.arena(), c.get()); c.moveNext(); @@ -3955,40 +3820,46 @@ private: // The first mutation buffer boundary has a key <= the first key in the page. - cursor.moveFirst(); - debug_printf("%s Leaf page, applying changes.\n", context.c_str()); + cursor.moveFirst(); + debug_printf("%s Leaf page, applying changes.\n", context.c_str()); // Now, process each mutation range and merge changes with existing data. 
bool firstMutationBoundary = true; - while(iMutationBoundary != iMutationBoundaryEnd) { - debug_printf("%s New mutation boundary: '%s': %s\n", context.c_str(), printable(iMutationBoundary.key()).c_str(), iMutationBoundary.mutation().toString().c_str()); + while (iMutationBoundary != iMutationBoundaryEnd) { + debug_printf("%s New mutation boundary: '%s': %s\n", context.c_str(), + printable(iMutationBoundary.key()).c_str(), + iMutationBoundary.mutation().toString().c_str()); // Apply the change to the mutation buffer start boundary key only if // - there actually is a change (whether a set or a clear, old records are to be removed) // - either this is not the first boundary or it is but its key matches our lower bound key - bool applyBoundaryChange = iMutationBoundary.mutation().boundaryChanged && (!firstMutationBoundary || iMutationBoundary.key() >= lowerBound->key); + bool applyBoundaryChange = iMutationBoundary.mutation().boundaryChanged && + (!firstMutationBoundary || iMutationBoundary.key() >= lowerBound->key); firstMutationBoundary = false; - - // Iterate over records for the mutation boundary key, keep them unless the boundary key was changed or we are not applying it - while(cursor.valid() && cursor.get().key == iMutationBoundary.key()) { + + // Iterate over records for the mutation boundary key, keep them unless the boundary key was changed or + // we are not applying it + while (cursor.valid() && cursor.get().key == iMutationBoundary.key()) { // If there were no changes to the key or we're not applying it - if(!applyBoundaryChange) { - // If not updating, add to the output set, otherwise skip ahead past the records for the mutation boundary - if(!updating) { + if (!applyBoundaryChange) { + // If not updating, add to the output set, otherwise skip ahead past the records for the + // mutation boundary + if (!updating) { merged.push_back(merged.arena(), cursor.get()); - debug_printf("%s Added %s [existing, boundary start]\n", context.c_str(), 
cursor.get().toString().c_str()); + debug_printf("%s Added %s [existing, boundary start]\n", context.c_str(), + cursor.get().toString().c_str()); } cursor.moveNext(); - } - else { + } else { changesMade = true; // If updating, erase from the page, otherwise do not add to the output set - if(updating) { - debug_printf("%s Erasing %s [existing, boundary start]\n", context.c_str(), cursor.get().toString().c_str()); + if (updating) { + debug_printf("%s Erasing %s [existing, boundary start]\n", context.c_str(), + cursor.get().toString().c_str()); cursor.erase(); - } - else { - debug_printf("%s Skipped %s [existing, boundary start]\n", context.c_str(), cursor.get().toString().c_str()); + } else { + debug_printf("%s Skipped %s [existing, boundary start]\n", context.c_str(), + cursor.get().toString().c_str()); cursor.moveNext(); } } @@ -3997,25 +3868,28 @@ private: constexpr int maxHeightAllowed = 8; // Write the new record(s) for the mutation boundary start key if its value has been set - // Clears of this key will have been processed above by not being erased from the updated page or excluded from the merge output - if(applyBoundaryChange && iMutationBoundary.mutation().boundarySet()) { + // Clears of this key will have been processed above by not being erased from the updated page or + // excluded from the merge output + if (applyBoundaryChange && iMutationBoundary.mutation().boundarySet()) { RedwoodRecordRef rec(iMutationBoundary.key(), 0, iMutationBoundary.mutation().boundaryValue.get()); changesMade = true; // If updating, add to the page, else add to the output set - if(updating) { - if(cursor.mirror->insert(rec, skipLen, maxHeightAllowed)) { - debug_printf("%s Inserted %s [mutation, boundary start]\n", context.c_str(), rec.toString().c_str()); - } - else { - debug_printf("%s Inserted failed for %s [mutation, boundary start]\n", context.c_str(), rec.toString().c_str()); + if (updating) { + if (cursor.mirror->insert(rec, skipLen, maxHeightAllowed)) { + 
debug_printf("%s Inserted %s [mutation, boundary start]\n", context.c_str(), + rec.toString().c_str()); + } else { + debug_printf("%s Inserted failed for %s [mutation, boundary start]\n", context.c_str(), + rec.toString().c_str()); switchToLinearMerge(); } } - if(!updating) { + if (!updating) { merged.push_back(merged.arena(), rec); - debug_printf("%s Added %s [mutation, boundary start]\n", context.c_str(), rec.toString().c_str()); + debug_printf("%s Added %s [mutation, boundary start]\n", context.c_str(), + rec.toString().c_str()); } } @@ -4023,39 +3897,41 @@ private: bool remove = iMutationBoundary.mutation().clearAfterBoundary; // Advance to the next boundary because we need to know the end key for the current range. ++iMutationBoundary; - if(iMutationBoundary == iMutationBoundaryEnd) { + if (iMutationBoundary == iMutationBoundaryEnd) { skipLen = 0; } - debug_printf("%s Mutation range end: '%s'\n", context.c_str(), printable(iMutationBoundary.key()).c_str()); + debug_printf("%s Mutation range end: '%s'\n", context.c_str(), + printable(iMutationBoundary.key()).c_str()); // Now handle the records up through but not including the next mutation boundary key RedwoodRecordRef end(iMutationBoundary.key()); // If the records are being removed and we're not doing an in-place update // OR if we ARE doing an update but the records are NOT being removed, then just skip them. - if(remove != updating) { - // If not updating, then the records, if any exist, are being removed. We don't know if there actually are any - // but we must assume there are. - if(!updating) { + if (remove != updating) { + // If not updating, then the records, if any exist, are being removed. We don't know if there + // actually are any but we must assume there are. 
+ if (!updating) { changesMade = true; } - debug_printf("%s Seeking forward to next boundary (remove=%d updating=%d) %s\n", context.c_str(), remove, updating, iMutationBoundary.key().toString().c_str()); + debug_printf("%s Seeking forward to next boundary (remove=%d updating=%d) %s\n", context.c_str(), + remove, updating, iMutationBoundary.key().toString().c_str()); cursor.seekGreaterThanOrEqual(end, skipLen); - } - else { - // Otherwise we must visit the records. If updating, the visit is to erase them, and if doing a + } else { + // Otherwise we must visit the records. If updating, the visit is to erase them, and if doing a // linear merge than the visit is to add them to the output set. - while(cursor.valid() && cursor.get().compare(end, skipLen) < 0) { - if(updating) { - debug_printf("%s Erasing %s [existing, boundary start]\n", context.c_str(), cursor.get().toString().c_str()); + while (cursor.valid() && cursor.get().compare(end, skipLen) < 0) { + if (updating) { + debug_printf("%s Erasing %s [existing, boundary start]\n", context.c_str(), + cursor.get().toString().c_str()); cursor.erase(); changesMade = true; - } - else { - merged.push_back(merged.arena(), cursor.get()); - debug_printf("%s Added %s [existing, middle]\n", context.c_str(), merged.back().toString().c_str()); + } else { + merged.push_back(merged.arena(), cursor.get()); + debug_printf("%s Added %s [existing, middle]\n", context.c_str(), + merged.back().toString().c_str()); cursor.moveNext(); } } @@ -4063,87 +3939,92 @@ private: } // If there are still more records, they have the same key as the end boundary - if(cursor.valid()) { + if (cursor.valid()) { // If the end boundary is changing, we must remove the remaining records in this page bool remove = iMutationBoundaryEnd.mutation().boundaryChanged; - if(remove) { + if (remove) { changesMade = true; } // If we don't have to remove the records and we are updating, do nothing. 
// If we do have to remove the records and we are not updating, do nothing. - if(remove != updating) { - debug_printf("%s Ignoring remaining records, remove=%d updating=%d\n", context.c_str(), remove, updating); - } - else { + if (remove != updating) { + debug_printf("%s Ignoring remaining records, remove=%d updating=%d\n", context.c_str(), remove, + updating); + } else { // If updating and the key is changing, we must visit the records to erase them. - // If not updating and the key is not changing, we must visit the records to add them to the output set. - while(cursor.valid()) { - if(updating) { - debug_printf("%s Erasing %s and beyond [existing, matches changed upper mutation boundary]\n", context.c_str(), cursor.get().toString().c_str()); + // If not updating and the key is not changing, we must visit the records to add them to the output + // set. + while (cursor.valid()) { + if (updating) { + debug_printf( + "%s Erasing %s and beyond [existing, matches changed upper mutation boundary]\n", + context.c_str(), cursor.get().toString().c_str()); cursor.erase(); - } - else { + } else { merged.push_back(merged.arena(), cursor.get()); - debug_printf("%s Added %s [existing, tail]\n", context.c_str(), merged.back().toString().c_str()); + debug_printf("%s Added %s [existing, tail]\n", context.c_str(), + merged.back().toString().c_str()); cursor.moveNext(); } } } - } - else { + } else { debug_printf("%s No records matching mutation buffer end boundary key\n", context.c_str()); } - // No changes were actually made. This could happen if the only mutations are clear ranges which do not match any records. - if(!changesMade) { + // No changes were actually made. This could happen if the only mutations are clear ranges which do not + // match any records. 
+ if (!changesMade) { result.contents() = ChildLinksRef(decodeLowerBound, decodeUpperBound); - debug_printf("%s No changes were made during mutation merge, returning %s\n", context.c_str(), toString(result).c_str()); + debug_printf("%s No changes were made during mutation merge, returning %s\n", context.c_str(), + toString(result).c_str()); return result; - } - else { + } else { debug_printf("%s Changes were made, writing.\n", context.c_str()); } writeVersion = self->getLastCommittedVersion() + 1; - if(updating) { - const BTreePage::BinaryTree &deltaTree = ((const BTreePage *)newPage->begin())->tree(); - if(deltaTree.numItems == 0) { - debug_printf("%s Page updates cleared all entries, returning %s\n", context.c_str(), toString(result).c_str()); + if (updating) { + const BTreePage::BinaryTree& deltaTree = ((const BTreePage*)newPage->begin())->tree(); + if (deltaTree.numItems == 0) { + debug_printf("%s Page updates cleared all entries, returning %s\n", context.c_str(), + toString(result).c_str()); self->freeBtreePage(rootID, writeVersion); return result; - } - else { - // Otherwise update it. - BTreePageID newID = wait(self->updateBtreePage(self, rootID, &result.arena(), newPage, writeVersion)); + } else { + // Otherwise update it. 
+ BTreePageID newID = + wait(self->updateBtreePage(self, rootID, &result.arena(), newPage, writeVersion)); - // Set the child page ID, which has already been allocated in result.arena() - RedwoodRecordRef *rec = new (result.arena()) RedwoodRecordRef(decodeLowerBound->withoutValue()); - rec->setChildPage(newID); + // Set the child page ID, which has already been allocated in result.arena() + RedwoodRecordRef* rec = new (result.arena()) RedwoodRecordRef(decodeLowerBound->withoutValue()); + rec->setChildPage(newID); - result.contents() = ChildLinksRef(rec, decodeUpperBound); - debug_printf("%s Page updated in-place, returning %s\n", context.c_str(), toString(result).c_str()); + result.contents() = ChildLinksRef(rec, decodeUpperBound); + debug_printf("%s Page updated in-place, returning %s\n", context.c_str(), toString(result).c_str()); ++counts.pageUpdates; - return result; - } + return result; + } } // If everything in the page was deleted then this page should be deleted as of the new version // Note that if a single range clear covered the entire page then we should not get this far - if(merged.empty()) { - debug_printf("%s All leaf page contents were cleared, returning %s\n", context.c_str(), toString(result).c_str()); + if (merged.empty()) { + debug_printf("%s All leaf page contents were cleared, returning %s\n", context.c_str(), + toString(result).c_str()); self->freeBtreePage(rootID, writeVersion); return result; } - state Standalone> entries = wait(writePages(self, lowerBound, upperBound, merged, btPage->height, writeVersion, rootID)); + state Standalone> entries = + wait(writePages(self, lowerBound, upperBound, merged, btPage->height, writeVersion, rootID)); result.arena().dependsOn(entries.arena()); result.contents() = ChildLinksRef(entries, *upperBound); debug_printf("%s Merge complete, returning %s\n", context.c_str(), toString(result).c_str()); return result; - } - else { + } else { // Internal Page ASSERT(!isLeaf); state std::vector>> futureChildren; @@ 
-4152,9 +4033,9 @@ private: cursor.moveFirst(); bool first = true; - while(cursor.valid()) { + while (cursor.valid()) { // The lower bound for the first child is the lowerBound arg - const RedwoodRecordRef &childLowerBound = first ? *lowerBound : cursor.get(); + const RedwoodRecordRef& childLowerBound = first ? *lowerBound : cursor.get(); first = false; // At this point we should never be at a null child page entry because the first entry of a page @@ -4162,54 +4043,60 @@ private: ASSERT(cursor.get().value.present()); // The decode lower bound is always the key of the child link record - const RedwoodRecordRef &decodeChildLowerBound = cursor.get(); + const RedwoodRecordRef& decodeChildLowerBound = cursor.get(); BTreePageID pageID = cursor.get().getChildPage(); ASSERT(!pageID.empty()); - // The decode upper bound is always the next key after the child link, or the decode upper bound for this page - const RedwoodRecordRef &decodeChildUpperBound = cursor.moveNext() ? cursor.get() : *decodeUpperBound; + // The decode upper bound is always the next key after the child link, or the decode upper bound for + // this page + const RedwoodRecordRef& decodeChildUpperBound = cursor.moveNext() ? cursor.get() : *decodeUpperBound; // But the decode upper bound might be a placeholder record with a null child link because // the subtree was previously deleted but the key needed to exist to enable decoding of the // previous child page which has not since been rewritten. - if(cursor.valid() && !cursor.get().value.present()) { + if (cursor.valid() && !cursor.get().value.present()) { // There should only be one null child link entry, followed by a present link or the end of the page ASSERT(!cursor.moveNext() || cursor.get().value.present()); } - const RedwoodRecordRef &childUpperBound = cursor.valid() ? cursor.get() : *upperBound; + const RedwoodRecordRef& childUpperBound = cursor.valid() ? 
cursor.get() : *upperBound; - debug_printf("%s recursing to %s lower=%s upper=%s decodeLower=%s decodeUpper=%s\n", - context.c_str(), toString(pageID).c_str(), childLowerBound.toString().c_str(), childUpperBound.toString().c_str(), decodeChildLowerBound.toString().c_str(), decodeChildUpperBound.toString().c_str()); + debug_printf("%s recursing to %s lower=%s upper=%s decodeLower=%s decodeUpper=%s\n", context.c_str(), + toString(pageID).c_str(), childLowerBound.toString().c_str(), + childUpperBound.toString().c_str(), decodeChildLowerBound.toString().c_str(), + decodeChildUpperBound.toString().c_str()); // If this page has height of 2 then its children are leaf nodes - futureChildren.push_back(self->commitSubtree(self, mutationBuffer, snapshot, pageID, btPage->height == 2, &childLowerBound, &childUpperBound, &decodeChildLowerBound, &decodeChildUpperBound)); + futureChildren.push_back(self->commitSubtree(self, mutationBuffer, snapshot, pageID, + btPage->height == 2, &childLowerBound, &childUpperBound, + &decodeChildLowerBound, &decodeChildUpperBound)); } // Waiting one at a time makes debugging easier // TODO: Is it better to use waitForAll()? 
state int k; - for(k = 0; k < futureChildren.size(); ++k) { + for (k = 0; k < futureChildren.size(); ++k) { wait(success(futureChildren[k])); } - if(REDWOOD_DEBUG) { - debug_printf("%s Subtree update results\n", context.c_str()); - for(int i = 0; i < futureChildren.size(); ++i) { + if (REDWOOD_DEBUG) { + debug_printf("%s Subtree update results\n", context.c_str()); + for (int i = 0; i < futureChildren.size(); ++i) { debug_printf("%s subtree result %s\n", context.c_str(), toString(futureChildren[i].get()).c_str()); } } - // All of the things added to pageBuilder will exist in the arenas inside futureChildren or will be upperBound + // All of the things added to pageBuilder will exist in the arenas inside futureChildren or will be + // upperBound BTreePage::BinaryTree::Cursor c = getCursor(page); c.moveFirst(); InternalPageBuilder pageBuilder(c); - for(int i = 0; i < futureChildren.size(); ++i) { + for (int i = 0; i < futureChildren.size(); ++i) { ChildLinksRef c = futureChildren[i].get(); - if(!c.children.empty()) { + if (!c.children.empty()) { pageBuilder.addEntries(c); } } @@ -4217,29 +4104,33 @@ private: pageBuilder.finalize(*upperBound, *decodeUpperBound); // If page contents have changed - if(pageBuilder.modified) { + if (pageBuilder.modified) { // If the page now has no children - if(pageBuilder.childPageCount == 0) { - debug_printf("%s All internal page children were deleted so deleting this page too, returning %s\n", context.c_str(), toString(result).c_str()); + if (pageBuilder.childPageCount == 0) { + debug_printf("%s All internal page children were deleted so deleting this page too, returning %s\n", + context.c_str(), toString(result).c_str()); self->freeBtreePage(rootID, writeVersion); return result; - } - else { + } else { debug_printf("%s Internal page modified, creating replacements.\n", context.c_str()); - debug_printf("%s newChildren=%s lastUpperBound=%s upperBound=%s\n", context.c_str(), toString(pageBuilder.entries).c_str(), 
pageBuilder.lastUpperBound.toString().c_str(), upperBound->toString().c_str()); + debug_printf("%s newChildren=%s lastUpperBound=%s upperBound=%s\n", context.c_str(), + toString(pageBuilder.entries).c_str(), pageBuilder.lastUpperBound.toString().c_str(), + upperBound->toString().c_str()); debug_printf("pagebuilder entries: %s\n", ::toString(pageBuilder.entries).c_str()); - ASSERT(!pageBuilder.entries.back().value.present() || pageBuilder.lastUpperBound.sameExceptValue(*upperBound)); + ASSERT(!pageBuilder.entries.back().value.present() || + pageBuilder.lastUpperBound.sameExceptValue(*upperBound)); - Standalone> childEntries = wait(holdWhile(pageBuilder.entries, writePages(self, lowerBound, upperBound, pageBuilder.entries, btPage->height, writeVersion, rootID))); + Standalone> childEntries = wait( + holdWhile(pageBuilder.entries, writePages(self, lowerBound, upperBound, pageBuilder.entries, + btPage->height, writeVersion, rootID))); result.arena().dependsOn(childEntries.arena()); result.contents() = ChildLinksRef(childEntries, *upperBound); debug_printf("%s Internal modified, returning %s\n", context.c_str(), toString(result).c_str()); return result; } - } - else { + } else { result.contents() = ChildLinksRef(decodeLowerBound, decodeUpperBound); debug_printf("%s Page has no changes, returning %s\n", context.c_str(), toString(result).c_str()); return result; @@ -4247,8 +4138,8 @@ private: } } - ACTOR static Future commit_impl(VersionedBTree *self) { - state MutationBuffer *mutations = self->m_pBuffer; + ACTOR static Future commit_impl(VersionedBTree* self) { + state MutationBuffer* mutations = self->m_pBuffer; // No more mutations are allowed to be written to this mutation buffer we will commit // at m_writeVersion, which we must save locally because it could change during commit. 
@@ -4267,7 +4158,8 @@ private: wait(previousCommit); self->m_pager->setOldestVersion(self->m_newOldestVersion); - debug_printf("%s: Beginning commit of version %" PRId64 ", new oldest version set to %" PRId64 "\n", self->m_name.c_str(), writeVersion, self->m_newOldestVersion); + debug_printf("%s: Beginning commit of version %" PRId64 ", new oldest version set to %" PRId64 "\n", + self->m_name.c_str(), writeVersion, self->m_newOldestVersion); state bool lazyDeleteStop = false; state Future lazyDelete = incrementalSubtreeClear(self, &lazyDeleteStop); @@ -4278,27 +4170,29 @@ private: state Standalone rootPageID = self->m_header.root.get(); state RedwoodRecordRef lowerBound = dbBegin.withPageID(rootPageID); - Standalone newRootChildren = wait(commitSubtree(self, mutations, self->m_pager->getReadSnapshot(latestVersion), rootPageID, self->m_header.height == 1, &lowerBound, &dbEnd, &lowerBound, &dbEnd)); - debug_printf("CommitSubtree(root %s) returned %s\n", toString(rootPageID).c_str(), toString(newRootChildren).c_str()); + Standalone newRootChildren = + wait(commitSubtree(self, mutations, self->m_pager->getReadSnapshot(latestVersion), rootPageID, + self->m_header.height == 1, &lowerBound, &dbEnd, &lowerBound, &dbEnd)); + debug_printf("CommitSubtree(root %s) returned %s\n", toString(rootPageID).c_str(), + toString(newRootChildren).c_str()); // If the old root was deleted, write a new empty tree root node and free the old roots - if(newRootChildren.children.empty()) { + if (newRootChildren.children.empty()) { debug_printf("Writing new empty root.\n"); LogicalPageID newRootID = wait(self->m_pager->newPageID()); Reference page = self->m_pager->newPageBuffer(); makeEmptyRoot(page); self->m_header.height = 1; self->m_pager->updatePage(newRootID, page); - rootPageID = BTreePageID((LogicalPageID *)&newRootID, 1); - } - else { - Standalone> newRootLevel(newRootChildren.children, newRootChildren.arena()); - if(newRootLevel.size() == 1) { + rootPageID = 
BTreePageID((LogicalPageID*)&newRootID, 1); + } else { + Standalone> newRootLevel(newRootChildren.children, newRootChildren.arena()); + if (newRootLevel.size() == 1) { rootPageID = newRootLevel.front().getChildPage(); - } - else { + } else { // If the new root level's size is not 1 then build new root level(s) - Standalone> newRootPage = wait(buildNewRoot(self, latestVersion, newRootLevel, self->m_header.height)); + Standalone> newRootPage = + wait(buildNewRoot(self, latestVersion, newRootLevel, self->m_header.height)); rootPageID = newRootPage.front().getChildPage(); } } @@ -4333,11 +4227,10 @@ private: return Void(); } - public: - +public: // InternalCursor is for seeking to and iterating over the leaf-level RedwoodRecordRef records in the tree. - // The records could represent multiple values for the same key at different versions, including a non-present value representing a clear. - // Currently, however, all records are at version 0 and no clears are present in the tree. + // The records could represent multiple values for the same key at different versions, including a non-present value + // representing a clear. Currently, however, all records are at version 0 and no clears are present in the tree. 
struct InternalCursor { private: // Each InternalCursor's position is represented by a reference counted PageCursor, which links @@ -4345,52 +4238,46 @@ private: // PageCursors can be shared by many InternalCursors, making InternalCursor copying low overhead struct PageCursor : ReferenceCounted, FastAllocated { Reference parent; - BTreePageID pageID; // Only needed for debugging purposes + BTreePageID pageID; // Only needed for debugging purposes Reference page; BTreePage::BinaryTree::Cursor cursor; // id will normally reference memory owned by the parent, which is okay because a reference to the parent // will be held in the cursor PageCursor(BTreePageID id, Reference page, Reference parent = {}) - : pageID(id), page(page), parent(parent), cursor(getCursor(page)) - { - } + : pageID(id), page(page), parent(parent), cursor(getCursor(page)) {} - PageCursor(const PageCursor &toCopy) : parent(toCopy.parent), pageID(toCopy.pageID), page(toCopy.page), cursor(toCopy.cursor) { - } + PageCursor(const PageCursor& toCopy) + : parent(toCopy.parent), pageID(toCopy.pageID), page(toCopy.page), cursor(toCopy.cursor) {} // Convenience method for copying a PageCursor - Reference copy() const { - return Reference(new PageCursor(*this)); - } + Reference copy() const { return Reference(new PageCursor(*this)); } - const BTreePage * btPage() const { - return (const BTreePage *)page->begin(); - } + const BTreePage* btPage() const { return (const BTreePage*)page->begin(); } - bool isLeaf() const { - return btPage()->isLeaf(); - } + bool isLeaf() const { return btPage()->isLeaf(); } Future> getChild(Reference pager, int readAheadBytes = 0) { ASSERT(!isLeaf()); BTreePage::BinaryTree::Cursor next = cursor; next.moveNext(); - const RedwoodRecordRef &rec = cursor.get(); + const RedwoodRecordRef& rec = cursor.get(); BTreePageID id = rec.getChildPage(); Future> child = readPage(pager, id, &rec, &next.getOrUpperBound()); // Read ahead siblings at level 2 - // TODO: Application of readAheadBytes is 
not taking into account the size of the current page or any of the adjacent pages it is preloading. - if(readAheadBytes > 0 && btPage()->height == 2 && next.valid()) { + // TODO: Application of readAheadBytes is not taking into account the size of the current page or any + // of the adjacent pages it is preloading. + if (readAheadBytes > 0 && btPage()->height == 2 && next.valid()) { do { - debug_printf("preloading %s %d bytes left\n", ::toString(next.get().getChildPage()).c_str(), readAheadBytes); + debug_printf("preloading %s %d bytes left\n", ::toString(next.get().getChildPage()).c_str(), + readAheadBytes); // If any part of the page was already loaded then stop - if(next.get().value.present()) { + if (next.get().value.present()) { preLoadPage(pager.getPtr(), next.get().getChildPage()); readAheadBytes -= page->size(); } - } while(readAheadBytes > 0 && next.moveNext()); + } while (readAheadBytes > 0 && next.moveNext()); } return map(child, [=](Reference page) { @@ -4399,7 +4286,8 @@ private: } std::string toString() const { - return format("%s, %s", ::toString(pageID).c_str(), cursor.valid() ? cursor.get().toString().c_str() : ""); + return format("%s, %s", ::toString(pageID).c_str(), + cursor.valid() ? 
cursor.get().toString().c_str() : ""); } }; @@ -4408,26 +4296,23 @@ private: Reference pageCursor; public: - InternalCursor() { - } + InternalCursor() {} - InternalCursor(Reference pager, BTreePageID root) - : pager(pager), rootPageID(root) { - } + InternalCursor(Reference pager, BTreePageID root) : pager(pager), rootPageID(root) {} std::string toString() const { std::string r; Reference c = pageCursor; int maxDepth = 0; - while(c) { + while (c) { c = c->parent; ++maxDepth; } c = pageCursor; int depth = maxDepth; - while(c) { + while (c) { r = format("[%d/%d: %s] ", depth--, maxDepth, c->toString().c_str()) + r; c = c->parent; } @@ -4435,47 +4320,35 @@ private: } // Returns true if cursor position is a valid leaf page record - bool valid() const { - return pageCursor && pageCursor->isLeaf() && pageCursor->cursor.valid(); - } + bool valid() const { return pageCursor && pageCursor->isLeaf() && pageCursor->cursor.valid(); } -// Returns true if cursor position is valid() and has a present record value -bool present() const { - return valid() && pageCursor->cursor.get().value.present(); -} + // Returns true if cursor position is valid() and has a present record value + bool present() const { return valid() && pageCursor->cursor.get().value.present(); } -// Returns true if cursor position is present() and has an effective version <= v -bool presentAtVersion(Version v) { - return present() && pageCursor->cursor.get().version <= v; -} + // Returns true if cursor position is present() and has an effective version <= v + bool presentAtVersion(Version v) { return present() && pageCursor->cursor.get().version <= v; } -// This is to enable an optimization for the case where all internal records are at the -// same version and there are no implicit clears -// *this MUST be valid() -bool presentAtExactVersion(Version v) const { - return present() && pageCursor->cursor.get().version == v; -} + // This is to enable an optimization for the case where all internal records are at the 
+ // same version and there are no implicit clears + // *this MUST be valid() + bool presentAtExactVersion(Version v) const { return present() && pageCursor->cursor.get().version == v; } -// Returns true if cursor position is present() and has an effective version <= v -bool validAtVersion(Version v) { - return valid() && pageCursor->cursor.get().version <= v; -} + // Returns true if cursor position is present() and has an effective version <= v + bool validAtVersion(Version v) { return valid() && pageCursor->cursor.get().version <= v; } - const RedwoodRecordRef & get() const { - return pageCursor->cursor.get(); - } + const RedwoodRecordRef& get() const { return pageCursor->cursor.get(); } // Ensure that pageCursor is not shared with other cursors so we can modify it void ensureUnshared() { - if(!pageCursor->isSoleOwner()) { + if (!pageCursor->isSoleOwner()) { pageCursor = pageCursor->copy(); } } Future moveToRoot() { // If pageCursor exists follow parent links to the root - if(pageCursor) { - while(pageCursor->parent) { + if (pageCursor) { + while (pageCursor->parent) { pageCursor = pageCursor->parent; } return Void(); @@ -4489,10 +4362,10 @@ bool validAtVersion(Version v) { }); } - ACTOR Future seekLessThan_impl(InternalCursor *self, RedwoodRecordRef query, int prefetchBytes) { + ACTOR Future seekLessThan_impl(InternalCursor* self, RedwoodRecordRef query, int prefetchBytes) { Future f = self->moveToRoot(); // f will almost always be ready - if(!f.isReady()) { + if (!f.isReady()) { wait(f); } @@ -4502,23 +4375,22 @@ bool validAtVersion(Version v) { bool success = self->pageCursor->cursor.seekLessThan(query); // Skip backwards over internal page entries that do not link to child pages - if(!isLeaf) { + if (!isLeaf) { // While record has no value, move again - while(success && !self->pageCursor->cursor.get().value.present()) { + while (success && !self->pageCursor->cursor.get().value.present()) { success = self->pageCursor->cursor.movePrev(); } } - if(success) { + 
if (success) { // If we found a record < query at a leaf page then return success - if(isLeaf) { + if (isLeaf) { return true; } Reference child = wait(self->pageCursor->getChild(self->pager, prefetchBytes)); self->pageCursor = child; - } - else { + } else { // No records < query on this page, so move to immediate previous record at leaf level bool success = wait(self->move(false)); return success; @@ -4530,22 +4402,23 @@ bool validAtVersion(Version v) { return seekLessThan_impl(this, query, prefetchBytes); } - ACTOR Future move_impl(InternalCursor *self, bool forward) { + ACTOR Future move_impl(InternalCursor* self, bool forward) { // Try to move pageCursor, if it fails to go parent, repeat until it works or root cursor can't be moved - while(1) { + while (1) { self->ensureUnshared(); - bool success = self->pageCursor->cursor.valid() && (forward ? self->pageCursor->cursor.moveNext() : self->pageCursor->cursor.movePrev()); + bool success = self->pageCursor->cursor.valid() && + (forward ? self->pageCursor->cursor.moveNext() : self->pageCursor->cursor.movePrev()); // Skip over internal page entries that do not link to child pages - if(!self->pageCursor->isLeaf()) { + if (!self->pageCursor->isLeaf()) { // While record has no value, move again - while(success && !self->pageCursor->cursor.get().value.present()) { + while (success && !self->pageCursor->cursor.get().value.present()) { success = forward ? self->pageCursor->cursor.moveNext() : self->pageCursor->cursor.movePrev(); } } // Stop if successful or there's no parent to move to - if(success || !self->pageCursor->parent) { + if (success || !self->pageCursor->parent) { break; } @@ -4554,16 +4427,16 @@ bool validAtVersion(Version v) { } // If pageCursor not valid we've reached an end of the tree - if(!self->pageCursor->cursor.valid()) { + if (!self->pageCursor->cursor.valid()) { return false; } // While not on a leaf page, move down to get to one. 
- while(!self->pageCursor->isLeaf()) { + while (!self->pageCursor->isLeaf()) { // Skip over internal page entries that do not link to child pages - while(!self->pageCursor->cursor.get().value.present()) { + while (!self->pageCursor->cursor.get().value.present()) { bool success = forward ? self->pageCursor->cursor.moveNext() : self->pageCursor->cursor.movePrev(); - if(!success) { + if (!success) { return false; } } @@ -4576,16 +4449,14 @@ bool validAtVersion(Version v) { return true; } - Future move(bool forward) { - return move_impl(this, forward); - } + Future move(bool forward) { return move_impl(this, forward); } // Move to the first or last record of the database. - ACTOR Future move_end(InternalCursor *self, bool begin) { + ACTOR Future move_end(InternalCursor* self, bool begin) { Future f = self->moveToRoot(); // f will almost always be ready - if(!f.isReady()) { + if (!f.isReady()) { wait(f); } @@ -4596,47 +4467,37 @@ bool validAtVersion(Version v) { bool success = begin ? self->pageCursor->cursor.moveFirst() : self->pageCursor->cursor.moveLast(); // Skip over internal page entries that do not link to child pages - if(!self->pageCursor->isLeaf()) { + if (!self->pageCursor->isLeaf()) { // While record has no value, move past it - while(success && !self->pageCursor->cursor.get().value.present()) { + while (success && !self->pageCursor->cursor.get().value.present()) { success = begin ? 
self->pageCursor->cursor.moveNext() : self->pageCursor->cursor.movePrev(); } } // If it worked, return true if we've reached a leaf page otherwise go to the next child - if(success) { - if(self->pageCursor->isLeaf()) { + if (success) { + if (self->pageCursor->isLeaf()) { return true; } Reference child = wait(self->pageCursor->getChild(self->pager)); self->pageCursor = child; - } - else { + } else { return false; } } } - Future moveFirst() { - return move_end(this, true); - } - Future moveLast() { - return move_end(this, false); - } - + Future moveFirst() { return move_end(this, true); } + Future moveLast() { return move_end(this, false); } }; // Cursor is for reading and interating over user visible KV pairs at a specific version // KeyValueRefs returned become invalid once the cursor is moved - class Cursor : public IStoreCursor, public ReferenceCounted, public FastAllocated, NonCopyable { + class Cursor : public IStoreCursor, public ReferenceCounted, public FastAllocated, NonCopyable { public: Cursor(Reference pageSource, BTreePageID root, Version internalRecordVersion) - : m_version(internalRecordVersion), - m_cur1(pageSource, root), - m_cur2(m_cur1) - { - } + : m_version(internalRecordVersion), m_cur1(pageSource, root), m_cur2(m_cur1) {} void addref() { ReferenceCounted::addref(); } void delref() { ReferenceCounted::delref(); } @@ -4657,9 +4518,7 @@ bool validAtVersion(Version v) { Optional m_kv; public: - Future findEqual(KeyRef key) override { - return find_impl(this, key, 0); - } + Future findEqual(KeyRef key) override { return find_impl(this, key, 0); } Future findFirstEqualOrGreater(KeyRef key, int prefetchBytes) override { return find_impl(this, key, 1, prefetchBytes); } @@ -4667,43 +4526,32 @@ bool validAtVersion(Version v) { return find_impl(this, key, -1, prefetchBytes); } - Future next() override { - return move(this, true); - } - Future prev() override { - return move(this, false); - } + Future next() override { return move(this, true); } + Future 
prev() override { return move(this, false); } - bool isValid() override { - return m_kv.present(); - } + bool isValid() override { return m_kv.present(); } - KeyRef getKey() override { - return m_kv.get().key; - } + KeyRef getKey() override { return m_kv.get().key; } - ValueRef getValue() override { - return m_kv.get().value; - } + ValueRef getValue() override { return m_kv.get().value; } std::string toString(bool includePaths = false) const { std::string r; r += format("Cursor(%p) ver: %" PRId64 " ", this, m_version); - if(m_kv.present()) { - r += format(" KV: '%s' -> '%s'", m_kv.get().key.printable().c_str(), m_kv.get().value.printable().c_str()); - } - else { + if (m_kv.present()) { + r += format(" KV: '%s' -> '%s'", m_kv.get().key.printable().c_str(), + m_kv.get().value.printable().c_str()); + } else { r += " KV: "; } - if(includePaths) { + if (includePaths) { r += format("\n Cur1: %s", m_cur1.toString().c_str()); r += format("\n Cur2: %s", m_cur2.toString().c_str()); - } - else { - if(m_cur1.valid()) { + } else { + if (m_cur1.valid()) { r += format("\n Cur1: %s", m_cur1.get().toString().c_str()); } - if(m_cur2.valid()) { + if (m_cur2.valid()) { r += format("\n Cur2: %s", m_cur2.get().toString().c_str()); } } @@ -4716,48 +4564,48 @@ bool validAtVersion(Version v) { // for less than or equal use cmp < 0 // for greater than or equal use cmp > 0 // for equal use cmp == 0 - ACTOR static Future find_impl(Cursor *self, KeyRef key, int cmp, int prefetchBytes = 0) { + ACTOR static Future find_impl(Cursor* self, KeyRef key, int cmp, int prefetchBytes = 0) { state RedwoodRecordRef query(key, self->m_version + 1); self->m_kv.reset(); wait(success(self->m_cur1.seekLessThan(query, prefetchBytes))); - debug_printf("find%sE(%s): %s\n", cmp > 0 ? "GT" : (cmp == 0 ? "" : "LT"), query.toString().c_str(), self->toString().c_str()); + debug_printf("find%sE(%s): %s\n", cmp > 0 ? "GT" : (cmp == 0 ? 
"" : "LT"), query.toString().c_str(), + self->toString().c_str()); // If we found the target key with a present value then return it as it is valid for any cmp type - if(self->m_cur1.present() && self->m_cur1.get().key == key) { + if (self->m_cur1.present() && self->m_cur1.get().key == key) { debug_printf("Target key found. Cursor: %s\n", self->toString().c_str()); self->m_kv = self->m_cur1.get().toKeyValueRef(); return Void(); } // If cmp type is Equal and we reached here, we didn't find it - if(cmp == 0) { + if (cmp == 0) { return Void(); } // cmp mode is GreaterThanOrEqual, so if we've reached here an equal key was not found and cur1 either // points to a lesser key or is invalid. - if(cmp > 0) { + if (cmp > 0) { // If cursor is invalid, query was less than the first key in database so go to the first record - if(!self->m_cur1.valid()) { + if (!self->m_cur1.valid()) { bool valid = wait(self->m_cur1.moveFirst()); - if(!valid) { + if (!valid) { self->m_kv.reset(); return Void(); } - } - else { + } else { // Otherwise, move forward until we find a key greater than the target key. // If multiversion data is present, the next record could have the same key as the initial // record found but be at a newer version. loop { bool valid = wait(self->m_cur1.move(true)); - if(!valid) { + if (!valid) { self->m_kv.reset(); return Void(); } - if(self->m_cur1.get().key > key) { + if (self->m_cur1.get().key > key) { break; } } @@ -4765,10 +4613,10 @@ bool validAtVersion(Version v) { // Get the next present key at the target version. Handles invalid cursor too. wait(self->next()); - } - else if(cmp < 0) { - // cmp mode is LessThanOrEqual. An equal key to the target key was already checked above, and the search was for LessThan query, so cur1 is already in the right place. - if(!self->m_cur1.valid()) { + } else if (cmp < 0) { + // cmp mode is LessThanOrEqual. 
An equal key to the target key was already checked above, and the + // search was for LessThan query, so cur1 is already in the right place. + if (!self->m_cur1.valid()) { self->m_kv.reset(); return Void(); } @@ -4780,19 +4628,19 @@ bool validAtVersion(Version v) { return Void(); } - ACTOR static Future move(Cursor *self, bool fwd) { + ACTOR static Future move(Cursor* self, bool fwd) { debug_printf("Cursor::move(%d): Start %s\n", fwd, self->toString().c_str()); ASSERT(self->m_cur1.valid()); // If kv is present then the key/version at cur1 was already returned so move to a new key // Move cur1 until failure or a new key is found, keeping prior record visited in cur2 - if(self->m_kv.present()) { + if (self->m_kv.present()) { ASSERT(self->m_cur1.valid()); loop { self->m_cur2 = self->m_cur1; debug_printf("Cursor::move(%d): Advancing cur1 %s\n", fwd, self->toString().c_str()); bool valid = wait(self->m_cur1.move(fwd)); - if(!valid || self->m_cur1.get().key != self->m_cur2.get().key) { + if (!valid || self->m_cur1.get().key != self->m_cur2.get().key) { break; } } @@ -4806,36 +4654,33 @@ bool validAtVersion(Version v) { // exists at the version (but could be the empty string) while valid just means the internal // record is in effect at that version but it could indicate that the key was cleared and // no longer exists from the user's perspective at that version - if(self->m_cur1.valid()) { + if (self->m_cur1.valid()) { self->m_cur2 = self->m_cur1; debug_printf("Cursor::move(%d): Advancing cur2 %s\n", fwd, self->toString().c_str()); wait(success(self->m_cur2.move(true))); } - while(self->m_cur1.valid()) { + while (self->m_cur1.valid()) { - if(self->m_cur1.get().version == self->m_version || - (self->m_cur1.presentAtVersion(self->m_version) && - (!self->m_cur2.validAtVersion(self->m_version) || - self->m_cur2.get().key != self->m_cur1.get().key)) - ) { + if (self->m_cur1.get().version == self->m_version || + (self->m_cur1.presentAtVersion(self->m_version) && + 
(!self->m_cur2.validAtVersion(self->m_version) || + self->m_cur2.get().key != self->m_cur1.get().key))) { self->m_kv = self->m_cur1.get().toKeyValueRef(); return Void(); } - if(fwd) { + if (fwd) { // Moving forward, move cur2 forward and keep cur1 pointing to the prior (predecessor) record debug_printf("Cursor::move(%d): Moving forward %s\n", fwd, self->toString().c_str()); self->m_cur1 = self->m_cur2; wait(success(self->m_cur2.move(true))); - } - else { + } else { // Moving backward, move cur1 backward and keep cur2 pointing to the prior (successor) record debug_printf("Cursor::move(%d): Moving backward %s\n", fwd, self->toString().c_str()); self->m_cur2 = self->m_cur1; wait(success(self->m_cur1.move(false))); } - } debug_printf("Cursor::move(%d): Exit, end of db reached. Cursor = %s\n", fwd, self->toString().c_str()); @@ -4844,7 +4689,6 @@ bool validAtVersion(Version v) { return Void(); } }; - }; #include "art_impl.h" @@ -4857,16 +4701,14 @@ class KeyValueStoreRedwoodUnversioned : public IKeyValueStore { public: KeyValueStoreRedwoodUnversioned(std::string filePrefix, UID logID) : m_filePrefix(filePrefix) { // TODO: This constructor should really just take an IVersionedStore - IPager2 *pager = new DWALPager(4096, filePrefix, 0); + IPager2* pager = new DWALPager(4096, filePrefix, 0); m_tree = new VersionedBTree(pager, filePrefix); m_init = catchError(init_impl(this)); } - Future init() { - return m_init; - } + Future init() { return m_init; } - ACTOR Future init_impl(KeyValueStoreRedwoodUnversioned *self) { + ACTOR Future init_impl(KeyValueStoreRedwoodUnversioned* self) { TraceEvent(SevInfo, "RedwoodInit").detail("FilePrefix", self->m_filePrefix); wait(self->m_tree->init()); Version v = self->m_tree->getLatestVersion(); @@ -4875,34 +4717,30 @@ public: return Void(); } - ACTOR void shutdown(KeyValueStoreRedwoodUnversioned *self, bool dispose) { + ACTOR void shutdown(KeyValueStoreRedwoodUnversioned* self, bool dispose) { TraceEvent(SevInfo, 
"RedwoodShutdown").detail("FilePrefix", self->m_filePrefix).detail("Dispose", dispose); - if(self->m_error.canBeSet()) { - self->m_error.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress + if (self->m_error.canBeSet()) { + self->m_error.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress } self->m_init.cancel(); Future closedFuture = self->m_tree->onClosed(); - if(dispose) + if (dispose) self->m_tree->dispose(); else self->m_tree->close(); wait(closedFuture); self->m_closed.send(Void()); - TraceEvent(SevInfo, "RedwoodShutdownComplete").detail("FilePrefix", self->m_filePrefix).detail("Dispose", dispose); + TraceEvent(SevInfo, "RedwoodShutdownComplete") + .detail("FilePrefix", self->m_filePrefix) + .detail("Dispose", dispose); delete self; } - void close() { - shutdown(this, false); - } + void close() { shutdown(this, false); } - void dispose() { - shutdown(this, true); - } + void dispose() { shutdown(this, true); } - Future< Void > onClosed() { - return m_closed.getFuture(); - } + Future onClosed() { return m_closed.getFuture(); } Future commit(bool sequential = false) { Future c = m_tree->commit(); @@ -4911,40 +4749,35 @@ public: return catchError(c); } - KeyValueStoreType getType() { - return KeyValueStoreType::SSD_REDWOOD_V1; - } + KeyValueStoreType getType() { return KeyValueStoreType::SSD_REDWOOD_V1; } - StorageBytes getStorageBytes() { - return m_tree->getStorageBytes(); - } + StorageBytes getStorageBytes() { return m_tree->getStorageBytes(); } - Future< Void > getError() { - return delayed(m_error.getFuture()); - }; + Future getError() { return delayed(m_error.getFuture()); }; void clear(KeyRangeRef range, const Arena* arena = 0) { debug_printf("CLEAR %s\n", printable(range).c_str()); m_tree->clear(range); } - void set( KeyValueRef keyValue, const Arena* arena = NULL ) { + void set(KeyValueRef keyValue, const Arena* arena = NULL) { debug_printf("SET %s\n", printable(keyValue).c_str()); m_tree->set(keyValue); 
} - Future< Standalone< RangeResultRef > > readRange(KeyRangeRef keys, int rowLimit = 1<<30, int byteLimit = 1<<30) { + Future> readRange(KeyRangeRef keys, int rowLimit = 1 << 30, int byteLimit = 1 << 30) { debug_printf("READRANGE %s\n", printable(keys).c_str()); return catchError(readRange_impl(this, keys, rowLimit, byteLimit)); } - ACTOR static Future< Standalone< RangeResultRef > > readRange_impl(KeyValueStoreRedwoodUnversioned *self, KeyRange keys, int rowLimit, int byteLimit) { + ACTOR static Future> readRange_impl(KeyValueStoreRedwoodUnversioned* self, KeyRange keys, + int rowLimit, int byteLimit) { self->m_tree->counts.getRanges++; state Standalone result; state int accumulatedBytes = 0; - ASSERT( byteLimit > 0 ); + ASSERT(byteLimit > 0); - if(rowLimit == 0) { + if (rowLimit == 0) { return result; } @@ -4952,27 +4785,26 @@ public: // Prefetch is currently only done in the forward direction state int prefetchBytes = rowLimit > 1 ? byteLimit : 0; - if(rowLimit > 0) { + if (rowLimit > 0) { wait(cur->findFirstEqualOrGreater(keys.begin, prefetchBytes)); - while(cur->isValid() && cur->getKey() < keys.end) { + while (cur->isValid() && cur->getKey() < keys.end) { KeyValueRef kv(KeyRef(result.arena(), cur->getKey()), ValueRef(result.arena(), cur->getValue())); accumulatedBytes += kv.expectedSize(); result.push_back(result.arena(), kv); - if(--rowLimit == 0 || accumulatedBytes >= byteLimit) { + if (--rowLimit == 0 || accumulatedBytes >= byteLimit) { break; } wait(cur->next()); } } else { wait(cur->findLastLessOrEqual(keys.end)); - if(cur->isValid() && cur->getKey() == keys.end) - wait(cur->prev()); + if (cur->isValid() && cur->getKey() == keys.end) wait(cur->prev()); - while(cur->isValid() && cur->getKey() >= keys.begin) { + while (cur->isValid() && cur->getKey() >= keys.begin) { KeyValueRef kv(KeyRef(result.arena(), cur->getKey()), ValueRef(result.arena(), cur->getValue())); accumulatedBytes += kv.expectedSize(); result.push_back(result.arena(), kv); - if(++rowLimit 
== 0 || accumulatedBytes >= byteLimit) { + if (++rowLimit == 0 || accumulatedBytes >= byteLimit) { break; } wait(cur->prev()); @@ -4980,34 +4812,36 @@ public: } result.more = rowLimit == 0 || accumulatedBytes >= byteLimit; - if(result.more) { + if (result.more) { ASSERT(result.size() > 0); - result.readThrough = result[result.size()-1].key; + result.readThrough = result[result.size() - 1].key; } return result; } - ACTOR static Future< Optional > readValue_impl(KeyValueStoreRedwoodUnversioned *self, Key key, Optional< UID > debugID) { + ACTOR static Future> readValue_impl(KeyValueStoreRedwoodUnversioned* self, Key key, + Optional debugID) { self->m_tree->counts.gets++; state Reference cur = self->m_tree->readAtVersion(self->m_tree->getLastCommittedVersion()); wait(cur->findEqual(key)); - if(cur->isValid()) { + if (cur->isValid()) { return cur->getValue(); } return Optional(); } - Future< Optional< Value > > readValue(KeyRef key, Optional< UID > debugID = Optional()) { + Future> readValue(KeyRef key, Optional debugID = Optional()) { return catchError(readValue_impl(this, key, debugID)); } - ACTOR static Future< Optional > readValuePrefix_impl(KeyValueStoreRedwoodUnversioned *self, Key key, int maxLength, Optional< UID > debugID) { + ACTOR static Future> readValuePrefix_impl(KeyValueStoreRedwoodUnversioned* self, Key key, + int maxLength, Optional debugID) { self->m_tree->counts.gets++; state Reference cur = self->m_tree->readAtVersion(self->m_tree->getLastCommittedVersion()); wait(cur->findEqual(key)); - if(cur->isValid()) { + if (cur->isValid()) { Value v = cur->getValue(); int len = std::min(v.size(), maxLength); return Value(cur->getValue().substr(0, len)); @@ -5015,26 +4849,26 @@ public: return Optional(); } - Future< Optional< Value > > readValuePrefix(KeyRef key, int maxLength, Optional< UID > debugID = Optional()) { + Future> readValuePrefix(KeyRef key, int maxLength, Optional debugID = Optional()) { return catchError(readValuePrefix_impl(this, key, maxLength, 
debugID)); } - virtual ~KeyValueStoreRedwoodUnversioned() { - }; + virtual ~KeyValueStoreRedwoodUnversioned(){}; private: std::string m_filePrefix; - VersionedBTree *m_tree; + VersionedBTree* m_tree; Future m_init; Promise m_closed; Promise m_error; - template inline Future catchError(Future f) { + template + inline Future catchError(Future f) { return forwardError(f, m_error); } }; -IKeyValueStore* keyValueStoreRedwoodV1( std::string const& filename, UID logID) { +IKeyValueStore* keyValueStoreRedwoodV1(std::string const& filename, UID logID) { return new KeyValueStoreRedwoodUnversioned(filename, logID); } @@ -5043,18 +4877,18 @@ int randomSize(int max) { return n; } -StringRef randomString(Arena &arena, int len, char firstChar = 'a', char lastChar = 'z') { +StringRef randomString(Arena& arena, int len, char firstChar = 'a', char lastChar = 'z') { ++lastChar; StringRef s = makeString(len, arena); - for(int i = 0; i < len; ++i) { - *(uint8_t *)(s.begin() + i) = (uint8_t)deterministicRandom()->randomInt(firstChar, lastChar); + for (int i = 0; i < len; ++i) { + *(uint8_t*)(s.begin() + i) = (uint8_t)deterministicRandom()->randomInt(firstChar, lastChar); } return s; } Standalone randomString(int len, char firstChar = 'a', char lastChar = 'z') { Standalone s; - (StringRef &)s = randomString(s.arena(), len, firstChar, lastChar); + (StringRef&)s = randomString(s.arena(), len, firstChar, lastChar); return s; } @@ -5065,80 +4899,84 @@ KeyValue randomKV(int maxKeySize = 10, int maxValueSize = 5) { KeyValue kv; kv.key = randomString(kv.arena(), kLen, 'a', 'm'); - for(int i = 0; i < kLen; ++i) - mutateString(kv.key)[i] = (uint8_t)deterministicRandom()->randomInt('a', 'm'); + for (int i = 0; i < kLen; ++i) mutateString(kv.key)[i] = (uint8_t)deterministicRandom()->randomInt('a', 'm'); - if(vLen > 0) { + if (vLen > 0) { kv.value = randomString(kv.arena(), vLen, 'n', 'z'); - for(int i = 0; i < vLen; ++i) - mutateString(kv.value)[i] = (uint8_t)deterministicRandom()->randomInt('o', 
'z'); + for (int i = 0; i < vLen; ++i) mutateString(kv.value)[i] = (uint8_t)deterministicRandom()->randomInt('o', 'z'); } return kv; } -ACTOR Future verifyRange(VersionedBTree *btree, Key start, Key end, Version v, std::map, Optional> *written, int *pErrorCount) { +ACTOR Future verifyRange(VersionedBTree* btree, Key start, Key end, Version v, + std::map, Optional>* written, + int* pErrorCount) { state int errors = 0; - if(end <= start) - end = keyAfter(start); + if (end <= start) end = keyAfter(start); - state std::map, Optional>::const_iterator i = written->lower_bound(std::make_pair(start.toString(), 0)); - state std::map, Optional>::const_iterator iEnd = written->upper_bound(std::make_pair(end.toString(), 0)); + state std::map, Optional>::const_iterator i = + written->lower_bound(std::make_pair(start.toString(), 0)); + state std::map, Optional>::const_iterator iEnd = + written->upper_bound(std::make_pair(end.toString(), 0)); state std::map, Optional>::const_iterator iLast; state Reference cur = btree->readAtVersion(v); - debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Start cur=%p\n", v, start.toHexString().c_str(), end.toHexString().c_str(), cur.getPtr()); + debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Start cur=%p\n", v, start.toHexString().c_str(), + end.toHexString().c_str(), cur.getPtr()); // Randomly use the cursor for something else first. - if(deterministicRandom()->coinflip()) { + if (deterministicRandom()->coinflip()) { state Key randomKey = randomKV().key; - debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Dummy seek to '%s'\n", v, start.toHexString().c_str(), end.toHexString().c_str(), randomKey.toString().c_str()); - wait(deterministicRandom()->coinflip() ? cur->findFirstEqualOrGreater(randomKey) : cur->findLastLessOrEqual(randomKey)); + debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Dummy seek to '%s'\n", v, start.toHexString().c_str(), + end.toHexString().c_str(), randomKey.toString().c_str()); + wait(deterministicRandom()->coinflip() ? 
cur->findFirstEqualOrGreater(randomKey) + : cur->findLastLessOrEqual(randomKey)); } - debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Actual seek\n", v, start.toHexString().c_str(), end.toHexString().c_str()); + debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Actual seek\n", v, start.toHexString().c_str(), + end.toHexString().c_str()); wait(cur->findFirstEqualOrGreater(start)); state std::vector results; - while(cur->isValid() && cur->getKey() < end) { + while (cur->isValid() && cur->getKey() < end) { // Find the next written kv pair that would be present at this version - while(1) { + while (1) { iLast = i; - if(i == iEnd) - break; + if (i == iEnd) break; ++i; - if(iLast->first.second <= v - && iLast->second.present() - && ( - i == iEnd - || i->first.first != iLast->first.first - || i->first.second > v - ) - ) { - debug_printf("VerifyRange(@%" PRId64 ", %s, %s) Found key in written map: %s\n", v, start.toHexString().c_str(), end.toHexString().c_str(), iLast->first.first.c_str()); + if (iLast->first.second <= v && iLast->second.present() && + (i == iEnd || i->first.first != iLast->first.first || i->first.second > v)) { + debug_printf("VerifyRange(@%" PRId64 ", %s, %s) Found key in written map: %s\n", v, + start.toHexString().c_str(), end.toHexString().c_str(), iLast->first.first.c_str()); break; } } - if(iLast == iEnd) { + if (iLast == iEnd) { ++errors; ++*pErrorCount; - printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs nothing in written map.\n", v, start.toHexString().c_str(), end.toHexString().c_str(), cur->getKey().toString().c_str()); + printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs nothing in written map.\n", v, + start.toHexString().c_str(), end.toHexString().c_str(), cur->getKey().toString().c_str()); break; } - if(cur->getKey() != iLast->first.first) { + if (cur->getKey() != iLast->first.first) { ++errors; ++*pErrorCount; - printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs written '%s'\n", v, 
start.toHexString().c_str(), end.toHexString().c_str(), cur->getKey().toString().c_str(), iLast->first.first.c_str()); + printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs written '%s'\n", v, + start.toHexString().c_str(), end.toHexString().c_str(), cur->getKey().toString().c_str(), + iLast->first.first.c_str()); break; } - if(cur->getValue() != iLast->second.get()) { + if (cur->getValue() != iLast->second.get()) { ++errors; ++*pErrorCount; - printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' has tree value '%s' vs written '%s'\n", v, start.toHexString().c_str(), end.toHexString().c_str(), cur->getKey().toString().c_str(), cur->getValue().toString().c_str(), iLast->second.get().c_str()); + printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' has tree value '%s' vs written '%s'\n", v, + start.toHexString().c_str(), end.toHexString().c_str(), cur->getKey().toString().c_str(), + cur->getValue().toString().c_str(), iLast->second.get().c_str()); break; } @@ -5149,60 +4987,61 @@ ACTOR Future verifyRange(VersionedBTree *btree, Key start, Key end, Version } // Make sure there are no further written kv pairs that would be present at this version. 
- while(1) { + while (1) { iLast = i; - if(i == iEnd) - break; + if (i == iEnd) break; ++i; - if(iLast->first.second <= v - && iLast->second.present() - && ( - i == iEnd - || i->first.first != iLast->first.first - || i->first.second > v - ) - ) + if (iLast->first.second <= v && iLast->second.present() && + (i == iEnd || i->first.first != iLast->first.first || i->first.second > v)) break; } - if(iLast != iEnd) { + if (iLast != iEnd) { ++errors; ++*pErrorCount; - printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree range ended but written has @%" PRId64 " '%s'\n", v, start.toHexString().c_str(), end.toHexString().c_str(), iLast->first.second, iLast->first.first.c_str()); + printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree range ended but written has @%" PRId64 " '%s'\n", v, + start.toHexString().c_str(), end.toHexString().c_str(), iLast->first.second, iLast->first.first.c_str()); } - debug_printf("VerifyRangeReverse(@%" PRId64 ", %s, %s): start\n", v, start.toHexString().c_str(), end.toHexString().c_str()); + debug_printf("VerifyRangeReverse(@%" PRId64 ", %s, %s): start\n", v, start.toHexString().c_str(), + end.toHexString().c_str()); - // Randomly use a new cursor at the same version for the reverse range read, if the version is still available for opening new cursors - if(v >= btree->getOldestVersion() && deterministicRandom()->coinflip()) { + // Randomly use a new cursor at the same version for the reverse range read, if the version is still available for + // opening new cursors + if (v >= btree->getOldestVersion() && deterministicRandom()->coinflip()) { cur = btree->readAtVersion(v); } // Now read the range from the tree in reverse order and compare to the saved results wait(cur->findLastLessOrEqual(end)); - if(cur->isValid() && cur->getKey() == end) - wait(cur->prev()); + if (cur->isValid() && cur->getKey() == end) wait(cur->prev()); state std::vector::const_reverse_iterator r = results.rbegin(); - while(cur->isValid() && cur->getKey() >= start) { - if(r 
== results.rend()) { + while (cur->isValid() && cur->getKey() >= start) { + if (r == results.rend()) { ++errors; ++*pErrorCount; - printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs nothing in written map.\n", v, start.toHexString().c_str(), end.toHexString().c_str(), cur->getKey().toString().c_str()); + printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs nothing in written map.\n", v, + start.toHexString().c_str(), end.toHexString().c_str(), cur->getKey().toString().c_str()); break; } - if(cur->getKey() != r->key) { + if (cur->getKey() != r->key) { ++errors; ++*pErrorCount; - printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs written '%s'\n", v, start.toHexString().c_str(), end.toHexString().c_str(), cur->getKey().toString().c_str(), r->key.toString().c_str()); + printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs written '%s'\n", v, + start.toHexString().c_str(), end.toHexString().c_str(), cur->getKey().toString().c_str(), + r->key.toString().c_str()); break; } - if(cur->getValue() != r->value) { + if (cur->getValue() != r->value) { ++errors; ++*pErrorCount; - printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' has tree value '%s' vs written '%s'\n", v, start.toHexString().c_str(), end.toHexString().c_str(), cur->getKey().toString().c_str(), cur->getValue().toString().c_str(), r->value.toString().c_str()); + printf("VerifyRangeReverse(@%" PRId64 + ", %s, %s) ERROR: Tree key '%s' has tree value '%s' vs written '%s'\n", + v, start.toHexString().c_str(), end.toHexString().c_str(), cur->getKey().toString().c_str(), + cur->getValue().toString().c_str(), r->value.toString().c_str()); break; } @@ -5210,47 +5049,54 @@ ACTOR Future verifyRange(VersionedBTree *btree, Key start, Key end, Version wait(cur->prev()); } - if(r != results.rend()) { + if (r != results.rend()) { ++errors; ++*pErrorCount; - printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree range ended 
but written has '%s'\n", v, start.toHexString().c_str(), end.toHexString().c_str(), r->key.toString().c_str()); + printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree range ended but written has '%s'\n", v, + start.toHexString().c_str(), end.toHexString().c_str(), r->key.toString().c_str()); } return errors; } // Verify the result of point reads for every set or cleared key at the given version -ACTOR Future seekAll(VersionedBTree *btree, Version v, std::map, Optional> *written, int *pErrorCount) { +ACTOR Future seekAll(VersionedBTree* btree, Version v, + std::map, Optional>* written, int* pErrorCount) { state std::map, Optional>::const_iterator i = written->cbegin(); state std::map, Optional>::const_iterator iEnd = written->cend(); state int errors = 0; state Reference cur = btree->readAtVersion(v); - while(i != iEnd) { + while (i != iEnd) { state std::string key = i->first.first; state Version ver = i->first.second; - if(ver == v) { + if (ver == v) { state Optional val = i->second; debug_printf("Verifying @%" PRId64 " '%s'\n", ver, key.c_str()); state Arena arena; wait(cur->findEqual(KeyRef(arena, key))); - if(val.present()) { - if(!(cur->isValid() && cur->getKey() == key && cur->getValue() == val.get())) { + if (val.present()) { + if (!(cur->isValid() && cur->getKey() == key && cur->getValue() == val.get())) { ++errors; ++*pErrorCount; - if(!cur->isValid()) - printf("Verify ERROR: key_not_found: '%s' -> '%s' @%" PRId64 "\n", key.c_str(), val.get().c_str(), ver); - else if(cur->getKey() != key) - printf("Verify ERROR: key_incorrect: found '%s' expected '%s' @%" PRId64 "\n", cur->getKey().toString().c_str(), key.c_str(), ver); - else if(cur->getValue() != val.get()) - printf("Verify ERROR: value_incorrect: for '%s' found '%s' expected '%s' @%" PRId64 "\n", cur->getKey().toString().c_str(), cur->getValue().toString().c_str(), val.get().c_str(), ver); + if (!cur->isValid()) + printf("Verify ERROR: key_not_found: '%s' -> '%s' @%" PRId64 "\n", key.c_str(), + 
val.get().c_str(), ver); + else if (cur->getKey() != key) + printf("Verify ERROR: key_incorrect: found '%s' expected '%s' @%" PRId64 "\n", + cur->getKey().toString().c_str(), key.c_str(), ver); + else if (cur->getValue() != val.get()) + printf("Verify ERROR: value_incorrect: for '%s' found '%s' expected '%s' @%" PRId64 "\n", + cur->getKey().toString().c_str(), cur->getValue().toString().c_str(), val.get().c_str(), + ver); } } else { - if(cur->isValid() && cur->getKey() == key) { + if (cur->isValid() && cur->getKey() == key) { ++errors; ++*pErrorCount; - printf("Verify ERROR: cleared_key_found: '%s' -> '%s' @%" PRId64 "\n", key.c_str(), cur->getValue().toString().c_str(), ver); + printf("Verify ERROR: cleared_key_found: '%s' -> '%s' @%" PRId64 "\n", key.c_str(), + cur->getValue().toString().c_str(), ver); } } } @@ -5259,7 +5105,9 @@ ACTOR Future seekAll(VersionedBTree *btree, Version v, std::map verify(VersionedBTree *btree, FutureStream vStream, std::map, Optional> *written, int *pErrorCount, bool serial) { +ACTOR Future verify(VersionedBTree* btree, FutureStream vStream, + std::map, Optional>* written, int* pErrorCount, + bool serial) { state Future fRangeAll; state Future fRangeRandom; state Future fSeekAll; @@ -5273,33 +5121,37 @@ ACTOR Future verify(VersionedBTree *btree, FutureStream vStream, committedVersions.push_back(v); // Remove expired versions - while(!committedVersions.empty() && committedVersions.front() < btree->getOldestVersion()) { + while (!committedVersions.empty() && committedVersions.front() < btree->getOldestVersion()) { committedVersions.pop_front(); } - // Choose a random committed version, or sometimes the latest (which could be ahead of the latest version from vStream) - v = (committedVersions.empty() || deterministicRandom()->random01() < 0.25) ? 
btree->getLastCommittedVersion() : committedVersions[deterministicRandom()->randomInt(0, committedVersions.size())]; + // Choose a random committed version, or sometimes the latest (which could be ahead of the latest version + // from vStream) + v = (committedVersions.empty() || deterministicRandom()->random01() < 0.25) + ? btree->getLastCommittedVersion() + : committedVersions[deterministicRandom()->randomInt(0, committedVersions.size())]; debug_printf("Using committed version %" PRId64 "\n", v); // Get a cursor at v so that v doesn't get expired between the possibly serial steps below. state Reference cur = btree->readAtVersion(v); debug_printf("Verifying entire key range at version %" PRId64 "\n", v); fRangeAll = verifyRange(btree, LiteralStringRef(""), LiteralStringRef("\xff\xff"), v, written, pErrorCount); - if(serial) { + if (serial) { wait(success(fRangeAll)); } Key begin = randomKV().key; Key end = randomKV().key; - debug_printf("Verifying range (%s, %s) at version %" PRId64 "\n", toString(begin).c_str(), toString(end).c_str(), v); + debug_printf("Verifying range (%s, %s) at version %" PRId64 "\n", toString(begin).c_str(), + toString(end).c_str(), v); fRangeRandom = verifyRange(btree, begin, end, v, written, pErrorCount); - if(serial) { + if (serial) { wait(success(fRangeRandom)); } debug_printf("Verifying seeks to each changed key at version %" PRId64 "\n", v); fSeekAll = seekAll(btree, v, written, pErrorCount); - if(serial) { + if (serial) { wait(success(fSeekAll)); } @@ -5307,11 +5159,10 @@ ACTOR Future verify(VersionedBTree *btree, FutureStream vStream, printf("Verified version %" PRId64 ", %d errors\n", v, *pErrorCount); - if(*pErrorCount != 0) - break; + if (*pErrorCount != 0) break; } - } catch(Error &e) { - if(e.code() != error_code_end_of_stream && e.code() != error_code_transaction_too_old) { + } catch (Error& e) { + if (e.code() != error_code_end_of_stream && e.code() != error_code_transaction_too_old) { throw; } } @@ -5319,12 +5170,12 @@ ACTOR 
Future verify(VersionedBTree *btree, FutureStream vStream, } // Does a random range read, doesn't trap/report errors -ACTOR Future randomReader(VersionedBTree *btree) { +ACTOR Future randomReader(VersionedBTree* btree) { try { state Reference cur; loop { wait(yield()); - if(!cur || deterministicRandom()->random01() > .01) { + if (!cur || deterministicRandom()->random01() > .01) { Version v = btree->getLastCommittedVersion(); cur = btree->readAtVersion(v); } @@ -5332,14 +5183,13 @@ ACTOR Future randomReader(VersionedBTree *btree) { state KeyValue kv = randomKV(10, 0); wait(cur->findFirstEqualOrGreater(kv.key)); state int c = deterministicRandom()->randomInt(0, 100); - while(cur->isValid() && c-- > 0) { + while (cur->isValid() && c-- > 0) { wait(success(cur->next())); wait(yield()); } } - } - catch(Error &e) { - if(e.code() != error_code_transaction_too_old) { + } catch (Error& e) { + if (e.code() != error_code_transaction_too_old) { throw e; } } @@ -5351,9 +5201,7 @@ struct IntIntPair { IntIntPair() {} IntIntPair(int k, int v) : k(k), v(v) {} - IntIntPair(Arena &arena, const IntIntPair &toCopy) { - *this = toCopy; - } + IntIntPair(Arena& arena, const IntIntPair& toCopy) { *this = toCopy; } struct Delta { bool prefixSource; @@ -5361,39 +5209,28 @@ struct IntIntPair { int dk; int dv; - IntIntPair apply(const IntIntPair &base, Arena &arena) { - return {base.k + dk, base.v + dv}; - } + IntIntPair apply(const IntIntPair& base, Arena& arena) { return { base.k + dk, base.v + dv }; } - void setPrefixSource(bool val) { - prefixSource = val; - } + void setPrefixSource(bool val) { prefixSource = val; } - bool getPrefixSource() const { - return prefixSource; - } + bool getPrefixSource() const { return prefixSource; } - void setDeleted(bool val) { - deleted = val; - } + void setDeleted(bool val) { deleted = val; } - bool getDeleted() const { - return deleted; - } + bool getDeleted() const { return deleted; } - int size() const { - return sizeof(Delta); - } + int size() const { 
return sizeof(Delta); } std::string toString() const { - return format("DELTA{prefixSource=%d deleted=%d dk=%d(0x%x) dv=%d(0x%x)}", prefixSource, deleted, dk, dk, dv, dv); + return format("DELTA{prefixSource=%d deleted=%d dk=%d(0x%x) dv=%d(0x%x)}", prefixSource, deleted, dk, dk, + dv, dv); } }; // For IntIntPair, skipLen will be in units of fields, not bytes - int getCommonPrefixLen(const IntIntPair &other, int skip = 0) const { - if(k == other.k) { - if(v == other.v) { + int getCommonPrefixLen(const IntIntPair& other, int skip = 0) const { + if (k == other.k) { + if (v == other.v) { return 2; } return 1; @@ -5401,31 +5238,25 @@ struct IntIntPair { return 0; } - int compare(const IntIntPair &rhs, int skip = 0) const { - if(skip == 2) { + int compare(const IntIntPair& rhs, int skip = 0) const { + if (skip == 2) { return 0; } int cmp = (skip > 0) ? 0 : (k - rhs.k); - if(cmp == 0) { + if (cmp == 0) { cmp = v - rhs.v; } return cmp; } - bool operator==(const IntIntPair &rhs) const { - return compare(rhs) == 0; - } + bool operator==(const IntIntPair& rhs) const { return compare(rhs) == 0; } - bool operator<(const IntIntPair &rhs) const { - return compare(rhs) < 0; - } + bool operator<(const IntIntPair& rhs) const { return compare(rhs) < 0; } - int deltaSize(const IntIntPair &base, int skipLen, bool worstcase) const { - return sizeof(Delta); - } + int deltaSize(const IntIntPair& base, int skipLen, bool worstcase) const { return sizeof(Delta); } - int writeDelta(Delta &d, const IntIntPair &base, int commonPrefix = -1) const { + int writeDelta(Delta& d, const IntIntPair& base, int commonPrefix = -1) const { d.prefixSource = false; d.deleted = false; d.dk = k - base.k; @@ -5436,21 +5267,19 @@ struct IntIntPair { int k; int v; - std::string toString() const { - return format("{k=%d(0x%x) v=%d(0x%x)}", k, k, v, v); - } + std::string toString() const { return format("{k=%d(0x%x) v=%d(0x%x)}", k, k, v, v); } }; int deltaTest(RedwoodRecordRef rec, RedwoodRecordRef base) { 
std::vector buf(rec.key.size() + rec.value.orDefault(StringRef()).size() + 20); - RedwoodRecordRef::Delta &d = *(RedwoodRecordRef::Delta *)&buf.front(); + RedwoodRecordRef::Delta& d = *(RedwoodRecordRef::Delta*)&buf.front(); Arena mem; int expectedSize = rec.deltaSize(base, 0, false); int deltaSize = rec.writeDelta(d, base); RedwoodRecordRef decoded = d.apply(base, mem); - if(decoded != rec || expectedSize != deltaSize || d.size() != deltaSize) { + if (decoded != rec || expectedSize != deltaSize || d.size() != deltaSize) { printf("\n"); printf("Base: %s\n", base.toString().c_str()); printf("Record: %s\n", rec.toString().c_str()); @@ -5466,15 +5295,15 @@ int deltaTest(RedwoodRecordRef rec, RedwoodRecordRef base) { return deltaSize; } -RedwoodRecordRef randomRedwoodRecordRef(const std::string &keyBuffer, const std::string &valueBuffer) { +RedwoodRecordRef randomRedwoodRecordRef(const std::string& keyBuffer, const std::string& valueBuffer) { RedwoodRecordRef rec; - rec.key = StringRef((uint8_t *)keyBuffer.data(), deterministicRandom()->randomInt(0, keyBuffer.size())); - if(deterministicRandom()->coinflip()) { - rec.value = StringRef((uint8_t *)valueBuffer.data(), deterministicRandom()->randomInt(0, valueBuffer.size())); + rec.key = StringRef((uint8_t*)keyBuffer.data(), deterministicRandom()->randomInt(0, keyBuffer.size())); + if (deterministicRandom()->coinflip()) { + rec.value = StringRef((uint8_t*)valueBuffer.data(), deterministicRandom()->randomInt(0, valueBuffer.size())); } int versionIntSize = deterministicRandom()->randomInt(0, 8) * 8; - if(versionIntSize > 0) { + if (versionIntSize > 0) { --versionIntSize; int64_t max = ((int64_t)1 << versionIntSize) - 1; rec.version = deterministicRandom()->randomInt64(0, max); @@ -5496,7 +5325,7 @@ TEST_CASE("!/redwood/correctness/unit/RedwoodRecordRef") { // Test pageID stuff. 
{ - LogicalPageID ids[] = {1, 5}; + LogicalPageID ids[] = { 1, 5 }; BTreePageID id(ids, 2); RedwoodRecordRef r; r.setChildPage(id); @@ -5509,44 +5338,34 @@ TEST_CASE("!/redwood/correctness/unit/RedwoodRecordRef") { } deltaTest(RedwoodRecordRef(LiteralStringRef(""), 0, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef(""), 0, LiteralStringRef("")) - ); + RedwoodRecordRef(LiteralStringRef(""), 0, LiteralStringRef(""))); deltaTest(RedwoodRecordRef(LiteralStringRef("abc"), 0, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef("abc"), 0, LiteralStringRef("")) - ); + RedwoodRecordRef(LiteralStringRef("abc"), 0, LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef("abc"), 0, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef("abcd"), 0, LiteralStringRef("")) - ); + deltaTest(RedwoodRecordRef(LiteralStringRef("abc"), 0, LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef("abcd"), 0, LiteralStringRef(""))); deltaTest(RedwoodRecordRef(LiteralStringRef("abcd"), 2, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef("abc"), 2, LiteralStringRef("")) - ); + RedwoodRecordRef(LiteralStringRef("abc"), 2, LiteralStringRef(""))); deltaTest(RedwoodRecordRef(std::string(300, 'k'), 2, std::string(1e6, 'v')), - RedwoodRecordRef(std::string(300, 'k'), 2, LiteralStringRef("")) - ); + RedwoodRecordRef(std::string(300, 'k'), 2, LiteralStringRef(""))); deltaTest(RedwoodRecordRef(LiteralStringRef(""), 2, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef("")) - ); + RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef(""))); deltaTest(RedwoodRecordRef(LiteralStringRef(""), 0xffff, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef("")) - ); + RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef(""))); deltaTest(RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef(""), 0xffff, LiteralStringRef("")) - ); + 
RedwoodRecordRef(LiteralStringRef(""), 0xffff, LiteralStringRef(""))); deltaTest(RedwoodRecordRef(LiteralStringRef(""), 0xffffff, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef("")) - ); + RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef(""))); deltaTest(RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef("")), - RedwoodRecordRef(LiteralStringRef(""), 0xffffff, LiteralStringRef("")) - ); + RedwoodRecordRef(LiteralStringRef(""), 0xffffff, LiteralStringRef(""))); Arena mem; double start; @@ -5560,9 +5379,9 @@ TEST_CASE("!/redwood/correctness/unit/RedwoodRecordRef") { start = timer(); count = 1000; bytes = 0; - for(i = 0; i < count; ++i) { + for (i = 0; i < count; ++i) { RedwoodRecordRef a = randomRedwoodRecordRef(keyBuffer, valueBuffer); - RedwoodRecordRef b = randomRedwoodRecordRef(keyBuffer, valueBuffer); + RedwoodRecordRef b = randomRedwoodRecordRef(keyBuffer, valueBuffer); bytes += deltaTest(a, b); } double elapsed = timer() - start; @@ -5573,9 +5392,9 @@ TEST_CASE("!/redwood/correctness/unit/RedwoodRecordRef") { start = timer(); count = 1e6; bytes = 0; - for(i = 0; i < count; ++i) { + for (i = 0; i < count; ++i) { RedwoodRecordRef a = randomRedwoodRecordRef(keyBuffer, valueBuffer); - RedwoodRecordRef b = randomRedwoodRecordRef(keyBuffer, valueBuffer); + RedwoodRecordRef b = randomRedwoodRecordRef(keyBuffer, valueBuffer); bytes += deltaTest(a, b); } printf("DeltaTest() on random small records %g M/s %g MB/s\n", count / elapsed / 1e6, bytes / elapsed / 1e6); @@ -5592,7 +5411,7 @@ TEST_CASE("!/redwood/correctness/unit/RedwoodRecordRef") { start = timer(); total = 0; count = 100e6; - for(i = 0; i < count; ++i) { + for (i = 0; i < count; ++i) { total += rec1.getCommonPrefixLen(rec2, 50); } printf("%" PRId64 " getCommonPrefixLen(skip=50) %g M/s\n", total, count / (timer() - start) / 1e6); @@ -5600,20 +5419,20 @@ TEST_CASE("!/redwood/correctness/unit/RedwoodRecordRef") { start = timer(); total = 0; count = 100e6; - 
for(i = 0; i < count; ++i) { + for (i = 0; i < count; ++i) { total += rec1.getCommonPrefixLen(rec2, 0); } printf("%" PRId64 " getCommonPrefixLen(skip=0) %g M/s\n", total, count / (timer() - start) / 1e6); char buf[1000]; - RedwoodRecordRef::Delta &d = *(RedwoodRecordRef::Delta *)buf; + RedwoodRecordRef::Delta& d = *(RedwoodRecordRef::Delta*)buf; start = timer(); total = 0; count = 100e6; int commonPrefix = rec1.getCommonPrefixLen(rec2, 0); - for(i = 0; i < count; ++i) { + for (i = 0; i < count; ++i) { total += rec1.writeDelta(d, rec2, commonPrefix); } printf("%" PRId64 " writeDelta(commonPrefix=%d) %g M/s\n", total, commonPrefix, count / (timer() - start) / 1e6); @@ -5621,7 +5440,7 @@ TEST_CASE("!/redwood/correctness/unit/RedwoodRecordRef") { start = timer(); total = 0; count = 10e6; - for(i = 0; i < count; ++i) { + for (i = 0; i < count; ++i) { total += rec1.writeDelta(d, rec2); } printf("%" PRId64 " writeDelta() %g M/s\n", total, count / (timer() - start) / 1e6); @@ -5643,16 +5462,18 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { std::set uniqueItems; // Add random items to uniqueItems until its size is N - while(uniqueItems.size() < N) { + while (uniqueItems.size() < N) { std::string k = deterministicRandom()->randomAlphaNumeric(30); std::string v = deterministicRandom()->randomAlphaNumeric(30); RedwoodRecordRef rec; rec.key = StringRef(arena, k); - rec.version = deterministicRandom()->coinflip() ? deterministicRandom()->randomInt64(0, std::numeric_limits::max()) : invalidVersion; - if(deterministicRandom()->coinflip()) { + rec.version = deterministicRandom()->coinflip() + ? 
deterministicRandom()->randomInt64(0, std::numeric_limits::max()) + : invalidVersion; + if (deterministicRandom()->coinflip()) { rec.value = StringRef(arena, v); } - if(uniqueItems.count(rec) == 0) { + if (uniqueItems.count(rec) == 0) { uniqueItems.insert(rec); } } @@ -5660,19 +5481,20 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { int bufferSize = N * 100; bool largeTree = bufferSize > DeltaTree::SmallSizeLimit; - DeltaTree *tree = (DeltaTree *) new uint8_t[bufferSize]; + DeltaTree* tree = (DeltaTree*)new uint8_t[bufferSize]; tree->build(bufferSize, &items[0], &items[items.size()], &prev, &next); - printf("Count=%d Size=%d InitialHeight=%d largeTree=%d\n", (int)items.size(), (int)tree->size(), (int)tree->initialHeight, largeTree); - debug_printf("Data(%p): %s\n", tree, StringRef((uint8_t *)tree, tree->size()).toHexString().c_str()); + printf("Count=%d Size=%d InitialHeight=%d largeTree=%d\n", (int)items.size(), (int)tree->size(), + (int)tree->initialHeight, largeTree); + debug_printf("Data(%p): %s\n", tree, StringRef((uint8_t*)tree, tree->size()).toHexString().c_str()); DeltaTree::Mirror r(tree, &prev, &next); // Test delete/insert behavior for each item, making no net changes printf("Testing seek/delete/insert for existing keys with random values\n"); ASSERT(tree->numItems == items.size()); - for(auto rec : items) { + for (auto rec : items) { // Insert existing should fail ASSERT(!r.insert(rec)); ASSERT(tree->numItems == items.size()); @@ -5706,24 +5528,27 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { ASSERT(rev.moveLast()); int i = 0; - while(1) { - if(fwd.get() != items[i]) { - printf("forward iterator i=%d\n %s found\n %s expected\n", i, fwd.get().toString().c_str(), items[i].toString().c_str()); + while (1) { + if (fwd.get() != items[i]) { + printf("forward iterator i=%d\n %s found\n %s expected\n", i, fwd.get().toString().c_str(), + items[i].toString().c_str()); printf("Delta: %s\n", 
fwd.node->raw->delta(largeTree).toString().c_str()); ASSERT(false); } - if(rev.get() != items[items.size() - 1 - i]) { - printf("reverse iterator i=%d\n %s found\n %s expected\n", i, rev.get().toString().c_str(), items[items.size() - 1 - i].toString().c_str()); + if (rev.get() != items[items.size() - 1 - i]) { + printf("reverse iterator i=%d\n %s found\n %s expected\n", i, rev.get().toString().c_str(), + items[items.size() - 1 - i].toString().c_str()); printf("Delta: %s\n", rev.node->raw->delta(largeTree).toString().c_str()); ASSERT(false); } - if(fwdValueOnly.get().value != items[i].value) { - printf("forward values-only iterator i=%d\n %s found\n %s expected\n", i, fwdValueOnly.get().toString().c_str(), items[i].toString().c_str()); + if (fwdValueOnly.get().value != items[i].value) { + printf("forward values-only iterator i=%d\n %s found\n %s expected\n", i, + fwdValueOnly.get().toString().c_str(), items[i].toString().c_str()); printf("Delta: %s\n", fwdValueOnly.node->raw->delta(largeTree).toString().c_str()); ASSERT(false); } ++i; - + bool more = fwd.moveNext(); ASSERT(fwdValueOnly.moveNext() == more); ASSERT(rev.movePrev() == more); @@ -5732,7 +5557,7 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { ASSERT(fwdValueOnly.valid() == more); ASSERT(rev.valid() == more); - if(!fwd.valid()) { + if (!fwd.valid()) { break; } } @@ -5744,15 +5569,16 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { printf("Doing 20M random seeks using the same cursor from the same mirror.\n"); double start = timer(); - - for(int i = 0; i < 20000000; ++i) { - const RedwoodRecordRef &query = items[deterministicRandom()->randomInt(0, items.size())]; - if(!c.seekLessThanOrEqual(query)) { + + for (int i = 0; i < 20000000; ++i) { + const RedwoodRecordRef& query = items[deterministicRandom()->randomInt(0, items.size())]; + if (!c.seekLessThanOrEqual(query)) { printf("Not found! 
query=%s\n", query.toString().c_str()); ASSERT(false); } - if(c.get() != query) { - printf("Found incorrect node! query=%s found=%s\n", query.toString().c_str(), c.get().toString().c_str()); + if (c.get() != query) { + printf("Found incorrect node! query=%s found=%s\n", query.toString().c_str(), + c.get().toString().c_str()); ASSERT(false); } } @@ -5763,22 +5589,23 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { { printf("Doing 5M random seeks using 10k random cursors, each from a different mirror.\n"); double start = timer(); - std::vector::Mirror *> mirrors; + std::vector::Mirror*> mirrors; std::vector::Cursor> cursors; - for(int i = 0; i < 10000; ++i) { + for (int i = 0; i < 10000; ++i) { mirrors.push_back(new DeltaTree::Mirror(tree, &prev, &next)); cursors.push_back(mirrors.back()->getCursor()); } - for(int i = 0; i < 5000000; ++i) { - const RedwoodRecordRef &query = items[deterministicRandom()->randomInt(0, items.size())]; - DeltaTree::Cursor &c = cursors[deterministicRandom()->randomInt(0, cursors.size())]; - if(!c.seekLessThanOrEqual(query)) { + for (int i = 0; i < 5000000; ++i) { + const RedwoodRecordRef& query = items[deterministicRandom()->randomInt(0, items.size())]; + DeltaTree::Cursor& c = cursors[deterministicRandom()->randomInt(0, cursors.size())]; + if (!c.seekLessThanOrEqual(query)) { printf("Not found! query=%s\n", query.toString().c_str()); ASSERT(false); } - if(c.get() != query) { - printf("Found incorrect node! query=%s found=%s\n", query.toString().c_str(), c.get().toString().c_str()); + if (c.get() != query) { + printf("Found incorrect node! 
query=%s found=%s\n", query.toString().c_str(), + c.get().toString().c_str()); ASSERT(false); } } @@ -5791,18 +5618,19 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { TEST_CASE("!/redwood/correctness/unit/deltaTree/IntIntPair") { const int N = 200; - IntIntPair prev = {1, 0}; - IntIntPair next = {10000, 10000}; + IntIntPair prev = { 1, 0 }; + IntIntPair next = { 10000, 10000 }; state std::function randomPair = [&]() { - return IntIntPair({deterministicRandom()->randomInt(prev.k, next.k), deterministicRandom()->randomInt(prev.v, next.v)}); + return IntIntPair( + { deterministicRandom()->randomInt(prev.k, next.k), deterministicRandom()->randomInt(prev.v, next.v) }); }; // Build a set of N unique items std::set uniqueItems; - while(uniqueItems.size() < N) { + while (uniqueItems.size() < N) { IntIntPair p = randomPair(); - if(uniqueItems.count(p) == 0) { + if (uniqueItems.count(p) == 0) { uniqueItems.insert(p); } } @@ -5810,7 +5638,7 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/IntIntPair") { // Build tree of items std::vector items(uniqueItems.begin(), uniqueItems.end()); int bufferSize = N * 2 * 20; - DeltaTree *tree = (DeltaTree *) new uint8_t[bufferSize]; + DeltaTree* tree = (DeltaTree*)new uint8_t[bufferSize]; int builtSize = tree->build(bufferSize, &items[0], &items[items.size()], &prev, &next); ASSERT(builtSize <= bufferSize); @@ -5818,17 +5646,17 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/IntIntPair") { // Grow uniqueItems until tree is full, adding half of new items to toDelete std::vector toDelete; - while(1) { + while (1) { IntIntPair p = randomPair(); - if(uniqueItems.count(p) == 0) { - if(!r.insert(p)) { + if (uniqueItems.count(p) == 0) { + if (!r.insert(p)) { break; }; uniqueItems.insert(p); - if(deterministicRandom()->coinflip()) { + if (deterministicRandom()->coinflip()) { toDelete.push_back(p); } - //printf("Inserted %s size=%d\n", items.back().toString().c_str(), tree->size()); + // printf("Inserted %s size=%d\n", 
items.back().toString().c_str(), tree->size()); } } @@ -5839,13 +5667,14 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/IntIntPair") { items = std::vector(uniqueItems.begin(), uniqueItems.end()); auto printItems = [&] { - for(int k = 0; k < items.size(); ++k) { + for (int k = 0; k < items.size(); ++k) { printf("%d %s\n", k, items[k].toString().c_str()); } }; - printf("Count=%d Size=%d InitialHeight=%d MaxHeight=%d\n", (int)items.size(), (int)tree->size(), (int)tree->initialHeight, (int)tree->maxHeight); - debug_printf("Data(%p): %s\n", tree, StringRef((uint8_t *)tree, tree->size()).toHexString().c_str()); + printf("Count=%d Size=%d InitialHeight=%d MaxHeight=%d\n", (int)items.size(), (int)tree->size(), + (int)tree->initialHeight, (int)tree->maxHeight); + debug_printf("Data(%p): %s\n", tree, StringRef((uint8_t*)tree, tree->size()).toHexString().c_str()); // Iterate through items and tree forward and backward, verifying tree contents. auto scanAndVerify = [&]() { @@ -5856,15 +5685,17 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/IntIntPair") { ASSERT(fwd.moveFirst()); ASSERT(rev.moveLast()); - for(int i = 0; i < items.size(); ++i) { - if(fwd.get() != items[i]) { + for (int i = 0; i < items.size(); ++i) { + if (fwd.get() != items[i]) { printItems(); - printf("forward iterator i=%d\n %s found\n %s expected\n", i, fwd.get().toString().c_str(), items[i].toString().c_str()); + printf("forward iterator i=%d\n %s found\n %s expected\n", i, fwd.get().toString().c_str(), + items[i].toString().c_str()); ASSERT(false); } - if(rev.get() != items[items.size() - 1 - i]) { + if (rev.get() != items[items.size() - 1 - i]) { printItems(); - printf("reverse iterator i=%d\n %s found\n %s expected\n", i, rev.get().toString().c_str(), items[items.size() - 1 - i].toString().c_str()); + printf("reverse iterator i=%d\n %s found\n %s expected\n", i, rev.get().toString().c_str(), + items[items.size() - 1 - i].toString().c_str()); ASSERT(false); } @@ -5877,7 +5708,7 @@ 
TEST_CASE("!/redwood/correctness/unit/deltaTree/IntIntPair") { ASSERT(fwd.valid() == !end); ASSERT(rev.valid() == !end); - if(end) { + if (end) { break; } } @@ -5892,7 +5723,7 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/IntIntPair") { // For each randomly selected new item to be deleted, delete it from the DeltaTree and from uniqueItems printf("Deleting some items\n"); - for(auto p : toDelete) { + for (auto p : toDelete) { uniqueItems.erase(p); DeltaTree::Cursor c = r.getCursor(); ASSERT(c.seekLessThanOrEqual(p)); @@ -5906,7 +5737,7 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/IntIntPair") { printf("Verifying insert/erase behavior for existing items\n"); // Test delete/insert behavior for each item, making no net changes - for(auto p : items) { + for (auto p : items) { // Insert existing should fail ASSERT(!r.insert(p)); @@ -5930,80 +5761,85 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/IntIntPair") { DeltaTree::Cursor s = r.getCursor(); // SeekLTE to each element - for(int i = 0; i < items.size(); ++i) { + for (int i = 0; i < items.size(); ++i) { IntIntPair p = items[i]; IntIntPair q = p; ASSERT(s.seekLessThanOrEqual(q)); - if(s.get() != p) { + if (s.get() != p) { printItems(); - printf("seekLessThanOrEqual(%s) found %s expected %s\n", q.toString().c_str(), s.get().toString().c_str(), p.toString().c_str()); + printf("seekLessThanOrEqual(%s) found %s expected %s\n", q.toString().c_str(), s.get().toString().c_str(), + p.toString().c_str()); ASSERT(false); } } // SeekGTE to each element - for(int i = 0; i < items.size(); ++i) { + for (int i = 0; i < items.size(); ++i) { IntIntPair p = items[i]; IntIntPair q = p; ASSERT(s.seekGreaterThanOrEqual(q)); - if(s.get() != p) { + if (s.get() != p) { printItems(); - printf("seekGreaterThanOrEqual(%s) found %s expected %s\n", q.toString().c_str(), s.get().toString().c_str(), p.toString().c_str()); + printf("seekGreaterThanOrEqual(%s) found %s expected %s\n", q.toString().c_str(), + s.get().toString().c_str(), 
p.toString().c_str()); ASSERT(false); } } // SeekLTE to the next possible int pair value after each element to make sure the base element is found - for(int i = 0; i < items.size(); ++i) { + for (int i = 0; i < items.size(); ++i) { IntIntPair p = items[i]; IntIntPair q = p; q.v++; ASSERT(s.seekLessThanOrEqual(q)); - if(s.get() != p) { + if (s.get() != p) { printItems(); - printf("seekLessThanOrEqual(%s) found %s expected %s\n", q.toString().c_str(), s.get().toString().c_str(), p.toString().c_str()); + printf("seekLessThanOrEqual(%s) found %s expected %s\n", q.toString().c_str(), s.get().toString().c_str(), + p.toString().c_str()); ASSERT(false); } } // SeekGTE to the previous possible int pair value after each element to make sure the base element is found - for(int i = 0; i < items.size(); ++i) { + for (int i = 0; i < items.size(); ++i) { IntIntPair p = items[i]; IntIntPair q = p; q.v--; ASSERT(s.seekGreaterThanOrEqual(q)); - if(s.get() != p) { + if (s.get() != p) { printItems(); - printf("seekGreaterThanOrEqual(%s) found %s expected %s\n", q.toString().c_str(), s.get().toString().c_str(), p.toString().c_str()); + printf("seekGreaterThanOrEqual(%s) found %s expected %s\n", q.toString().c_str(), + s.get().toString().c_str(), p.toString().c_str()); ASSERT(false); } } // SeekLTE to each element N times, using every element as a hint - for(int i = 0; i < items.size(); ++i) { + for (int i = 0; i < items.size(); ++i) { IntIntPair p = items[i]; IntIntPair q = p; - for(int j = 0; j < items.size(); ++j) { + for (int j = 0; j < items.size(); ++j) { ASSERT(s.seekLessThanOrEqual(items[j])); ASSERT(s.seekLessThanOrEqual(q, 0, &s)); - if(s.get() != p) { + if (s.get() != p) { printItems(); printf("i=%d j=%d\n", i, j); - printf("seekLessThanOrEqual(%s) found %s expected %s\n", q.toString().c_str(), s.get().toString().c_str(), p.toString().c_str()); + printf("seekLessThanOrEqual(%s) found %s expected %s\n", q.toString().c_str(), + s.get().toString().c_str(), p.toString().c_str()); 
ASSERT(false); } } } // SeekLTE to each element's next possible value, using each element as a hint - for(int i = 0; i < items.size(); ++i) { + for (int i = 0; i < items.size(); ++i) { IntIntPair p = items[i]; IntIntPair q = p; q.v++; - for(int j = 0; j < items.size(); ++j) { + for (int j = 0; j < items.size(); ++j) { ASSERT(s.seekLessThanOrEqual(items[j])); ASSERT(s.seekLessThanOrEqual(q, 0, &s)); - if(s.get() != p) { + if (s.get() != p) { printItems(); printf("i=%d j=%d\n", i, j); ASSERT(false); @@ -6018,36 +5854,34 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/IntIntPair") { s.moveFirst(); auto first = s; int pos = 0; - for(int c = 0; c < count; ++c) { + for (int c = 0; c < count; ++c) { int jump = deterministicRandom()->randomInt(0, jumpMax); int newPos = pos + jump; - if(newPos >= items.size()) { + if (newPos >= items.size()) { pos = 0; newPos = jump; s = first; } IntIntPair q = items[newPos]; ++q.v; - if(old) { - if(useHint) { + if (old) { + if (useHint) { s.seekLessThanOrEqualOld(q, 0, &s, newPos - pos); - } - else { + } else { s.seekLessThanOrEqualOld(q, 0, nullptr, 0); } - } - else { - if(useHint) { + } else { + if (useHint) { s.seekLessThanOrEqual(q, 0, &s, newPos - pos); - } - else { + } else { s.seekLessThanOrEqual(q); } } pos = newPos; } double elapsed = timer() - start; - printf("Seek/skip test, jumpMax=%d, items=%d, oldSeek=%d useHint=%d: Elapsed %f s\n", jumpMax, items.size(), old, useHint, elapsed); + printf("Seek/skip test, jumpMax=%d, items=%d, oldSeek=%d useHint=%d: Elapsed %f s\n", jumpMax, items.size(), + old, useHint, elapsed); }; // Compare seeking to nearby elements with and without hints, using the old and new SeekLessThanOrEqual methods. @@ -6059,22 +5893,21 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/IntIntPair") { // Repeatedly seek for one of a set of pregenerated random pairs and time it. 
std::vector randomPairs; - for(int i = 0; i < 10 * N; ++i) { + for (int i = 0; i < 10 * N; ++i) { randomPairs.push_back(randomPair()); } // Random seeks double start = timer(); - for(int i = 0; i < 20000000; ++i) { + for (int i = 0; i < 20000000; ++i) { IntIntPair p = randomPairs[i % randomPairs.size()]; // Verify the result is less than or equal, and if seek fails then p must be lower than lowest (first) item - if(!s.seekLessThanOrEqual(p)) { - if(p >= items.front()) { + if (!s.seekLessThanOrEqual(p)) { + if (p >= items.front()) { printf("Seek failed! query=%s front=%s\n", p.toString().c_str(), items.front().toString().c_str()); ASSERT(false); } - } - else if(s.get() > p) { + } else if (s.get() > p) { printf("Found incorrect node! query=%s found=%s\n", p.toString().c_str(), s.get().toString().c_str()); ASSERT(false); } @@ -6112,14 +5945,14 @@ TEST_CASE("!/redwood/performance/mutationBuffer") { printf("Generating %d strings...\n", count); Arena arena; std::vector strings; - while(strings.size() < count) { + while (strings.size() < count) { strings.push_back(randomString(arena, 5)); } printf("Inserting and then finding each string...\n", count); double start = timer(); VersionedBTree::MutationBuffer m; - for(int i = 0; i < count; ++i) { + for (int i = 0; i < count; ++i) { KeyRef key = strings[i]; auto a = m.insert(key); auto b = m.lower_bound(key); @@ -6135,12 +5968,13 @@ TEST_CASE("!/redwood/performance/mutationBuffer") { TEST_CASE("!/redwood/correctness/btree") { state std::string pagerFile = "unittest_pageFile.redwood"; - IPager2 *pager; + IPager2* pager; state bool serialTest = deterministicRandom()->coinflip(); state bool shortTest = deterministicRandom()->coinflip(); - state int pageSize = shortTest ? 200 : (deterministicRandom()->coinflip() ? 4096 : deterministicRandom()->randomInt(200, 400)); + state int pageSize = + shortTest ? 200 : (deterministicRandom()->coinflip() ? 
4096 : deterministicRandom()->randomInt(200, 400)); // We must be able to fit at least two any two keys plus overhead in a page to prevent // a situation where the tree cannot be grown upward with decreasing level size. @@ -6176,7 +6010,7 @@ TEST_CASE("!/redwood/correctness/btree") { printf("Initializing...\n"); state double startTime = now(); pager = new DWALPager(pageSize, pagerFile, 0); - state VersionedBTree *btree = new VersionedBTree(pager, pagerFile); + state VersionedBTree* btree = new VersionedBTree(pager, pagerFile); wait(btree->init()); state std::map, Optional> written; @@ -6204,66 +6038,66 @@ TEST_CASE("!/redwood/correctness/btree") { state Future commit = Void(); - while(mutationBytes.get() < mutationBytesTarget && (now() - startTime) < maxDuration) { - if(now() - startTime > 600) { + while (mutationBytes.get() < mutationBytesTarget && (now() - startTime) < maxDuration) { + if (now() - startTime > 600) { mutationBytesTarget = mutationBytes.get(); } // Sometimes advance the version - if(deterministicRandom()->random01() < 0.10) { + if (deterministicRandom()->random01() < 0.10) { ++version; btree->setWriteVersion(version); } // Sometimes do a clear range - if(deterministicRandom()->random01() < clearProbability) { + if (deterministicRandom()->random01() < clearProbability) { Key start = randomKV(maxKeySize, 1).key; Key end = (deterministicRandom()->random01() < .01) ? 
keyAfter(start) : randomKV(maxKeySize, 1).key; // Sometimes replace start and/or end with a close actual (previously used) value - if(deterministicRandom()->random01() < .10) { + if (deterministicRandom()->random01() < .10) { auto i = keys.upper_bound(start); - if(i != keys.end()) - start = *i; + if (i != keys.end()) start = *i; } - if(deterministicRandom()->random01() < .10) { + if (deterministicRandom()->random01() < .10) { auto i = keys.upper_bound(end); - if(i != keys.end()) - end = *i; + if (i != keys.end()) end = *i; } - // Do a single key clear based on probability or end being randomly chosen to be the same as begin (unlikely) - if(deterministicRandom()->random01() < clearSingleKeyProbability || end == start) { + // Do a single key clear based on probability or end being randomly chosen to be the same as begin + // (unlikely) + if (deterministicRandom()->random01() < clearSingleKeyProbability || end == start) { end = keyAfter(start); - } - else if(end < start) { + } else if (end < start) { std::swap(end, start); } // Apply clear range to verification map ++rangeClears; KeyRangeRef range(start, end); - debug_printf(" Mutation: Clear '%s' to '%s' @%" PRId64 "\n", start.toString().c_str(), end.toString().c_str(), version); + debug_printf(" Mutation: Clear '%s' to '%s' @%" PRId64 "\n", start.toString().c_str(), + end.toString().c_str(), version); auto e = written.lower_bound(std::make_pair(start.toString(), 0)); - if(e != written.end()) { + if (e != written.end()) { auto last = e; auto eEnd = written.lower_bound(std::make_pair(end.toString(), 0)); - while(e != eEnd) { + while (e != eEnd) { auto w = *e; ++e; // If e key is different from last and last was present then insert clear for last's key at version - if(last != eEnd && ((e == eEnd || e->first.first != last->first.first) && last->second.present())) { - debug_printf(" Mutation: Clearing key '%s' @%" PRId64 "\n", last->first.first.c_str(), version); + if (last != eEnd && + ((e == eEnd || e->first.first != 
last->first.first) && last->second.present())) { + debug_printf(" Mutation: Clearing key '%s' @%" PRId64 "\n", last->first.first.c_str(), + version); keyBytesCleared += last->first.first.size(); mutationBytes += last->first.first.size(); mutationBytesThisCommit += last->first.first.size(); // If the last set was at version then just make it not present - if(last->first.second == version) { + if (last->first.second == version) { last->second.reset(); - } - else { + } else { written[std::make_pair(last->first.first, version)].reset(); } } @@ -6274,24 +6108,23 @@ TEST_CASE("!/redwood/correctness/btree") { btree->clear(range); // Sometimes set the range start after the clear - if(deterministicRandom()->random01() < clearPostSetProbability) { + if (deterministicRandom()->random01() < clearPostSetProbability) { KeyValue kv = randomKV(0, maxValueSize); kv.key = range.begin; btree->set(kv); written[std::make_pair(kv.key.toString(), version)] = kv.value.toString(); } - } - else { + } else { // Set a key KeyValue kv = randomKV(maxKeySize, maxValueSize); // Sometimes change key to a close previously used key - if(deterministicRandom()->random01() < .01) { + if (deterministicRandom()->random01() < .01) { auto i = keys.upper_bound(kv.key); - if(i != keys.end()) - kv.key = StringRef(kv.arena(), *i); + if (i != keys.end()) kv.key = StringRef(kv.arena(), *i); } - debug_printf(" Mutation: Set '%s' -> '%s' @%" PRId64 "\n", kv.key.toString().c_str(), kv.value.toString().c_str(), version); + debug_printf(" Mutation: Set '%s' -> '%s' @%" PRId64 "\n", kv.key.toString().c_str(), + kv.value.toString().c_str(), version); ++sets; keyBytesInserted += kv.key.size(); @@ -6305,24 +6138,24 @@ TEST_CASE("!/redwood/correctness/btree") { } // Commit at end or after this commit's mutation bytes are reached - if(mutationBytes.get() >= mutationBytesTarget || mutationBytesThisCommit >= mutationBytesTargetThisCommit) { + if (mutationBytes.get() >= mutationBytesTarget || mutationBytesThisCommit >= 
mutationBytesTargetThisCommit) { // Wait for previous commit to finish wait(commit); - printf("Committed. Next commit %d bytes, %" PRId64 "/%d (%.2f%%) Stats: Insert %.2f MB/s ClearedKeys %.2f MB/s Total %.2f\n", - mutationBytesThisCommit, - mutationBytes.get(), - mutationBytesTarget, - (double)mutationBytes.get() / mutationBytesTarget * 100, - (keyBytesInserted.rate() + valueBytesInserted.rate()) / 1e6, - keyBytesCleared.rate() / 1e6, - mutationBytes.rate() / 1e6 - ); + printf("Committed. Next commit %d bytes, %" PRId64 + "/%d (%.2f%%) Stats: Insert %.2f MB/s ClearedKeys %.2f MB/s Total %.2f\n", + mutationBytesThisCommit, mutationBytes.get(), mutationBytesTarget, + (double)mutationBytes.get() / mutationBytesTarget * 100, + (keyBytesInserted.rate() + valueBytesInserted.rate()) / 1e6, keyBytesCleared.rate() / 1e6, + mutationBytes.rate() / 1e6); - Version v = version; // Avoid capture of version as a member of *this + Version v = version; // Avoid capture of version as a member of *this - // Sometimes advance the oldest version to close the gap between the oldest and latest versions by a random amount. - if(deterministicRandom()->random01() < advanceOldVersionProbability) { - btree->setOldestVersion(btree->getLastCommittedVersion() - deterministicRandom()->randomInt(0, btree->getLastCommittedVersion() - btree->getOldestVersion() + 1)); + // Sometimes advance the oldest version to close the gap between the oldest and latest versions by a random + // amount. 
+ if (deterministicRandom()->random01() < advanceOldVersionProbability) { + btree->setOldestVersion(btree->getLastCommittedVersion() - + deterministicRandom()->randomInt(0, btree->getLastCommittedVersion() - + btree->getOldestVersion() + 1)); } commit = map(btree->commit(), [=](Void) { @@ -6332,7 +6165,7 @@ TEST_CASE("!/redwood/correctness/btree") { return Void(); }); - if(serialTest) { + if (serialTest) { // Wait for commit, wait for verification, then start new verification wait(commit); committedVersions.sendError(end_of_stream()); @@ -6346,7 +6179,7 @@ TEST_CASE("!/redwood/correctness/btree") { mutationBytesTargetThisCommit = randomSize(maxCommitSize); // Recover from disk at random - if(!serialTest && deterministicRandom()->random01() < coldStartProbability) { + if (!serialTest && deterministicRandom()->random01() < coldStartProbability) { printf("Recovering from disk after next commit.\n"); // Wait for outstanding commit @@ -6364,7 +6197,7 @@ TEST_CASE("!/redwood/correctness/btree") { wait(closedFuture); printf("Reopening btree from disk.\n"); - IPager2 *pager = new DWALPager(pageSize, pagerFile, 0); + IPager2* pager = new DWALPager(pageSize, pagerFile, 0); btree = new VersionedBTree(pager, pagerFile); wait(btree->init()); @@ -6383,8 +6216,7 @@ TEST_CASE("!/redwood/correctness/btree") { } // Check for errors - if(errorCount != 0) - throw internal_error(); + if (errorCount != 0) throw internal_error(); } debug_printf("Waiting for outstanding commit\n"); @@ -6395,8 +6227,7 @@ TEST_CASE("!/redwood/correctness/btree") { wait(verifyTask); // Check for errors - if(errorCount != 0) - throw internal_error(); + if (errorCount != 0) throw internal_error(); wait(btree->destroyAndCheckSanity()); @@ -6408,13 +6239,13 @@ TEST_CASE("!/redwood/correctness/btree") { return Void(); } -ACTOR Future randomSeeks(VersionedBTree *btree, int count, char firstChar, char lastChar) { +ACTOR Future randomSeeks(VersionedBTree* btree, int count, char firstChar, char lastChar) { state 
Version readVer = btree->getLatestVersion(); state int c = 0; state double readStart = timer(); printf("Executing %d random seeks\n", count); state Reference cur = btree->readAtVersion(readVer); - while(c < count) { + while (c < count) { state Key k = randomString(20, firstChar, lastChar); wait(success(cur->findFirstEqualOrGreater(k))); ++c; @@ -6424,7 +6255,8 @@ ACTOR Future randomSeeks(VersionedBTree *btree, int count, char firstChar, return Void(); } -ACTOR Future randomScans(VersionedBTree *btree, int count, int width, int readAhead, char firstChar, char lastChar) { +ACTOR Future randomScans(VersionedBTree* btree, int count, int width, int readAhead, char firstChar, + char lastChar) { state Version readVer = btree->getLatestVersion(); state int c = 0; state double readStart = timer(); @@ -6432,14 +6264,14 @@ ACTOR Future randomScans(VersionedBTree *btree, int count, int width, int state Reference cur = btree->readAtVersion(readVer); state bool adaptive = readAhead < 0; state int totalScanBytes = 0; - while(c++ < count) { + while (c++ < count) { state Key k = randomString(20, firstChar, lastChar); wait(success(cur->findFirstEqualOrGreater(k, readAhead))); - if(adaptive) { + if (adaptive) { readAhead = totalScanBytes / c; } state int w = width; - while(w > 0 && cur->isValid()) { + while (w > 0 && cur->isValid()) { totalScanBytes += cur->getKey().size(); totalScanBytes += cur->getValue().size(); wait(cur->next()); @@ -6447,7 +6279,8 @@ ACTOR Future randomScans(VersionedBTree *btree, int count, int width, int } } double elapsed = timer() - readStart; - printf("Completed %d scans: readAhead=%d width=%d bytesRead=%d scansRate=%d/s\n", count, readAhead, width, totalScanBytes, int(count / elapsed)); + printf("Completed %d scans: readAhead=%d width=%d bytesRead=%d scansRate=%d/s\n", count, readAhead, width, + totalScanBytes, int(count / elapsed)); return Void(); } @@ -6457,7 +6290,7 @@ TEST_CASE("!/redwood/correctness/pager/cow") { deleteFile(pagerFile); int pageSize = 
4096; - state IPager2 *pager = new DWALPager(pageSize, pagerFile, 0); + state IPager2* pager = new DWALPager(pageSize, pagerFile, 0); wait(success(pager->init())); state LogicalPageID id = wait(pager->newPageID()); @@ -6486,15 +6319,15 @@ TEST_CASE("!/redwood/performance/set") { state bool reload = getenv("TESTFILE") == nullptr; state std::string pagerFile = reload ? "unittest.redwood" : getenv("TESTFILE"); - if(reload) { + if (reload) { printf("Deleting old test data\n"); deleteFile(pagerFile); } state int pageSize = 4096; state int64_t pageCacheBytes = FLOW_KNOBS->PAGE_CACHE_4K; - DWALPager *pager = new DWALPager(pageSize, pagerFile, pageCacheBytes); - state VersionedBTree *btree = new VersionedBTree(pager, pagerFile); + DWALPager* pager = new DWALPager(pageSize, pagerFile, pageCacheBytes); + state VersionedBTree* btree = new VersionedBTree(pager, pagerFile); wait(btree->init()); state int nodeCount = 1e9; @@ -6534,8 +6367,8 @@ TEST_CASE("!/redwood/performance/set") { state double intervalStart = timer(); state double start = intervalStart; - if(reload) { - while(kvBytesTotal < kvBytesTarget) { + if (reload) { + while (kvBytesTotal < kvBytesTarget) { wait(yield()); Version lastVer = btree->getLatestVersion(); @@ -6543,15 +6376,19 @@ TEST_CASE("!/redwood/performance/set") { btree->setWriteVersion(version); int changes = deterministicRandom()->randomInt(0, maxChangesPerVersion); - while(changes > 0 && kvBytes < commitTarget) { + while (changes > 0 && kvBytes < commitTarget) { KeyValue kv; - kv.key = randomString(kv.arena(), deterministicRandom()->randomInt(minKeyPrefixBytes + sizeof(uint32_t), maxKeyPrefixBytes + sizeof(uint32_t) + 1), firstKeyChar, lastKeyChar); + kv.key = randomString(kv.arena(), + deterministicRandom()->randomInt(minKeyPrefixBytes + sizeof(uint32_t), + maxKeyPrefixBytes + sizeof(uint32_t) + 1), + firstKeyChar, lastKeyChar); int32_t index = deterministicRandom()->randomInt(0, nodeCount); int runLength = 
deterministicRandom()->randomInt(minConsecutiveRun, maxConsecutiveRun + 1); - while(runLength > 0 && changes > 0) { - *(uint32_t *)(kv.key.end() - sizeof(uint32_t)) = bigEndian32(index++); - kv.value = StringRef((uint8_t *)value.data(), deterministicRandom()->randomInt(minValueSize, maxValueSize + 1)); + while (runLength > 0 && changes > 0) { + *(uint32_t*)(kv.key.end() - sizeof(uint32_t)) = bigEndian32(index++); + kv.value = StringRef((uint8_t*)value.data(), + deterministicRandom()->randomInt(minValueSize, maxValueSize + 1)); btree->set(kv); @@ -6562,22 +6399,25 @@ TEST_CASE("!/redwood/performance/set") { } } - if(kvBytes >= commitTarget) { + if (kvBytes >= commitTarget) { btree->setOldestVersion(btree->getLastCommittedVersion()); wait(commit); - printf("Cumulative %.2f MB keyValue bytes written at %.2f MB/s\n", kvBytesTotal / 1e6, kvBytesTotal / (timer() - start) / 1e6); + printf("Cumulative %.2f MB keyValue bytes written at %.2f MB/s\n", kvBytesTotal / 1e6, + kvBytesTotal / (timer() - start) / 1e6); // Avoid capturing via this to freeze counter values int recs = records; int kvb = kvBytes; - // Capturing invervalStart via this->intervalStart makes IDE's unhappy as they do not know about the actor state object - double *pIntervalStart = &intervalStart; + // Capturing intervalStart via this->intervalStart makes IDE's unhappy as they do not know about the + // actor state object + double* pIntervalStart = &intervalStart; commit = map(btree->commit(), [=](Void result) { printf("Committed: %s\n", VersionedBTree::counts.toString(true).c_str()); double elapsed = timer() - *pIntervalStart; - printf("Committed %d kvBytes in %d records in %f seconds, %.2f MB/s\n", kvb, recs, elapsed, kvb / elapsed / 1e6); + printf("Committed %d kvBytes in %d records in %f seconds, %.2f MB/s\n", kvb, recs, elapsed, + kvb / elapsed / 1e6); *pIntervalStart = timer(); return Void(); }); @@ -6589,14 +6429,15 @@ TEST_CASE("!/redwood/performance/set") { } wait(commit); - printf("Cumulative %.2f
MB keyValue bytes written at %.2f MB/s\n", kvBytesTotal / 1e6, kvBytesTotal / (timer() - start) / 1e6); + printf("Cumulative %.2f MB keyValue bytes written at %.2f MB/s\n", kvBytesTotal / 1e6, + kvBytesTotal / (timer() - start) / 1e6); } int seeks = 1e6; printf("Warming cache with seeks\n"); - actors.add(randomSeeks(btree, seeks/3, firstKeyChar, lastKeyChar)); - actors.add(randomSeeks(btree, seeks/3, firstKeyChar, lastKeyChar)); - actors.add(randomSeeks(btree, seeks/3, firstKeyChar, lastKeyChar)); + actors.add(randomSeeks(btree, seeks / 3, firstKeyChar, lastKeyChar)); + actors.add(randomSeeks(btree, seeks / 3, firstKeyChar, lastKeyChar)); + actors.add(randomSeeks(btree, seeks / 3, firstKeyChar, lastKeyChar)); wait(actors.signalAndReset()); printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); @@ -6650,9 +6491,7 @@ struct PrefixSegment { int length; int cardinality; - std::string toString() const { - return format("{%d bytes, %d choices}", length, cardinality); - } + std::string toString() const { return format("{%d bytes, %d choices}", length, cardinality); } }; // Utility class for generating kv pairs under a prefix pattern @@ -6666,42 +6505,42 @@ struct KVSource { std::vector desc; std::vector> segments; std::vector prefixes; - std::vector prefixesSorted; + std::vector prefixesSorted; std::string valueData; int prefixLen; int lastIndex; - KVSource(const std::vector &desc, int numPrefixes = 0) : desc(desc) { - if(numPrefixes == 0) { + KVSource(const std::vector& desc, int numPrefixes = 0) : desc(desc) { + if (numPrefixes == 0) { numPrefixes = 1; - for(auto &p : desc) { + for (auto& p : desc) { numPrefixes *= p.cardinality; } } prefixLen = 0; - for(auto &s : desc) { + for (auto& s : desc) { prefixLen += s.length; std::vector parts; - while(parts.size() < s.cardinality) { + while (parts.size() < s.cardinality) { parts.push_back(deterministicRandom()->randomAlphaNumeric(s.length)); } segments.push_back(std::move(parts)); } - while(prefixes.size() < 
numPrefixes) { + while (prefixes.size() < numPrefixes) { std::string p; - for(auto &s : segments) { + for (auto& s : segments) { p.append(s[deterministicRandom()->randomInt(0, s.size())]); } - prefixes.push_back(PrefixRef((uint8_t *)p.data(), p.size())); + prefixes.push_back(PrefixRef((uint8_t*)p.data(), p.size())); } - for(auto &p : prefixes) { + for (auto& p : prefixes) { prefixesSorted.push_back(&p); } - std::sort(prefixesSorted.begin(), prefixesSorted.end(), [](const Prefix *a, const Prefix *b) { - return KeyRef((uint8_t *)a->begin(), a->size()) < KeyRef((uint8_t *)b->begin(), b->size()); + std::sort(prefixesSorted.begin(), prefixesSorted.end(), [](const Prefix* a, const Prefix* b) { + return KeyRef((uint8_t*)a->begin(), a->size()) < KeyRef((uint8_t*)b->begin(), b->size()); }); valueData = deterministicRandom()->randomAlphaNumeric(100000); @@ -6710,13 +6549,11 @@ struct KVSource { // Expands the chosen prefix in the prefix list to hold suffix, // fills suffix with random bytes, and returns a reference to the string - KeyRef getKeyRef(int suffixLen) { - return makeKey(randomPrefix(), suffixLen); - } + KeyRef getKeyRef(int suffixLen) { return makeKey(randomPrefix(), suffixLen); } // Like getKeyRef but uses the same prefix as the last randomly chosen prefix KeyRef getAnotherKeyRef(int suffixLen, bool sorted = false) { - Prefix &p = sorted ? *prefixesSorted[lastIndex] : prefixes[lastIndex]; + Prefix& p = sorted ? 
*prefixesSorted[lastIndex] : prefixes[lastIndex]; return makeKey(p, suffixLen); } @@ -6724,51 +6561,48 @@ struct KVSource { KeyRangeRef getRangeRef(int prefixesCovered, int suffixLen) { prefixesCovered = std::min(prefixesCovered, prefixes.size()); int i = deterministicRandom()->randomInt(0, prefixesSorted.size() - prefixesCovered); - Prefix *begin = prefixesSorted[i]; - Prefix *end = prefixesSorted[i + prefixesCovered]; + Prefix* begin = prefixesSorted[i]; + Prefix* end = prefixesSorted[i + prefixesCovered]; return KeyRangeRef(makeKey(*begin, suffixLen), makeKey(*end, suffixLen)); } - KeyRef getValue(int len) { - return KeyRef(valueData).substr(0, len); - } + KeyRef getValue(int len) { return KeyRef(valueData).substr(0, len); } // Move lastIndex to the next position, wrapping around to 0 void nextPrefix() { ++lastIndex; - if(lastIndex == prefixes.size()) { + if (lastIndex == prefixes.size()) { lastIndex = 0; } } - Prefix & randomPrefix() { + Prefix& randomPrefix() { lastIndex = deterministicRandom()->randomInt(0, prefixes.size()); return prefixes[lastIndex]; } - static KeyRef makeKey(Prefix &p, int suffixLen) { + static KeyRef makeKey(Prefix& p, int suffixLen) { p.reserve(p.arena(), p.size() + suffixLen); - uint8_t *wptr = p.end(); - for(int i = 0; i < suffixLen; ++i) { + uint8_t* wptr = p.end(); + for (int i = 0; i < suffixLen; ++i) { *wptr++ = (uint8_t)deterministicRandom()->randomAlphaNumeric(); } return KeyRef(p.begin(), p.size() + suffixLen); } - int numPrefixes() const { - return prefixes.size(); - }; + int numPrefixes() const { return prefixes.size(); }; std::string toString() const { return format("{prefixLen=%d prefixes=%d format=%s}", prefixLen, numPrefixes(), ::toString(desc).c_str()); } }; -std::string toString(const StorageBytes &sb) { - return format("{%.2f MB total, %.2f MB free, %.2f MB available, %.2f MB used}", sb.total / 1e6, sb.free / 1e6, sb.available / 1e6, sb.used / 1e6); +std::string toString(const StorageBytes& sb) { + return format("{%.2f 
MB total, %.2f MB free, %.2f MB available, %.2f MB used}", sb.total / 1e6, sb.free / 1e6, + sb.available / 1e6, sb.used / 1e6); } -ACTOR Future getStableStorageBytes(IKeyValueStore *kvs) { +ACTOR Future getStableStorageBytes(IKeyValueStore* kvs) { state StorageBytes sb = kvs->getStorageBytes(); // Wait for StorageBytes used metric to stabilize @@ -6777,7 +6611,7 @@ ACTOR Future getStableStorageBytes(IKeyValueStore *kvs) { StorageBytes sb2 = kvs->getStorageBytes(); bool stable = sb2.used == sb.used; sb = sb2; - if(stable) { + if (stable) { break; } } @@ -6785,7 +6619,8 @@ ACTOR Future getStableStorageBytes(IKeyValueStore *kvs) { return sb; } -ACTOR Future prefixClusteredInsert(IKeyValueStore *kvs, int suffixSize, int valueSize, KVSource source, int recordCountTarget, bool usePrefixesInOrder) { +ACTOR Future prefixClusteredInsert(IKeyValueStore* kvs, int suffixSize, int valueSize, KVSource source, + int recordCountTarget, bool usePrefixesInOrder) { state int commitTarget = 5e6; state int recordSize = source.prefixLen + suffixSize + valueSize; @@ -6816,26 +6651,27 @@ ACTOR Future prefixClusteredInsert(IKeyValueStore *kvs, int suffixSize, in state std::function stats = [&]() { double elapsed = timer() - start; - printf("Cumulative stats: %.2f seconds %.2f MB keyValue bytes %d records %.2f MB/s %.2f rec/s\r", elapsed, kvBytesTotal / 1e6, records, kvBytesTotal / elapsed / 1e6, records / elapsed); + printf("Cumulative stats: %.2f seconds %.2f MB keyValue bytes %d records %.2f MB/s %.2f rec/s\r", elapsed, + kvBytesTotal / 1e6, records, kvBytesTotal / elapsed / 1e6, records / elapsed); fflush(stdout); }; - while(kvBytesTotal < kvBytesTarget) { + while (kvBytesTotal < kvBytesTarget) { wait(yield()); state int i; - for(i = 0; i < recordsPerPrefix; ++i) { + for (i = 0; i < recordsPerPrefix; ++i) { KeyValueRef kv(source.getAnotherKeyRef(4, usePrefixesInOrder), source.getValue(valueSize)); kvs->set(kv); kvBytes += kv.expectedSize(); ++records; - if(kvBytes >= commitTarget) { + 
if (kvBytes >= commitTarget) { wait(commit); stats(); commit = kvs->commit(); kvBytesTotal += kvBytes; - if(kvBytesTotal >= kvBytesTarget) { + if (kvBytesTotal >= kvBytesTarget) { break; } kvBytes = 0; @@ -6858,15 +6694,16 @@ ACTOR Future prefixClusteredInsert(IKeyValueStore *kvs, int suffixSize, in intervalStart = timer(); kvs->clear(KeyRangeRef(LiteralStringRef(""), LiteralStringRef("\xff"))); state StorageBytes sbClear = wait(getStableStorageBytes(kvs)); - printf("Cleared all keys in %.2f seconds, final storageByte: %s\n", timer() - intervalStart, toString(sbClear).c_str()); + printf("Cleared all keys in %.2f seconds, final storageByte: %s\n", timer() - intervalStart, + toString(sbClear).c_str()); return Void(); } -ACTOR Future sequentialInsert(IKeyValueStore *kvs, int prefixLen, int valueSize, int recordCountTarget) { +ACTOR Future sequentialInsert(IKeyValueStore* kvs, int prefixLen, int valueSize, int recordCountTarget) { state int commitTarget = 5e6; - state KVSource source({{prefixLen, 1}}); + state KVSource source({ { prefixLen, 1 } }); state int recordSize = source.prefixLen + sizeof(uint64_t) + valueSize; state int64_t kvBytesTarget = (int64_t)recordCountTarget * recordSize; @@ -6890,27 +6727,28 @@ ACTOR Future sequentialInsert(IKeyValueStore *kvs, int prefixLen, int valu state std::function stats = [&]() { double elapsed = timer() - start; - printf("Cumulative stats: %.2f seconds %.2f MB keyValue bytes %d records %.2f MB/s %.2f rec/s\r", elapsed, kvBytesTotal / 1e6, records, kvBytesTotal / elapsed / 1e6, records / elapsed); + printf("Cumulative stats: %.2f seconds %.2f MB keyValue bytes %d records %.2f MB/s %.2f rec/s\r", elapsed, + kvBytesTotal / 1e6, records, kvBytesTotal / elapsed / 1e6, records / elapsed); fflush(stdout); }; state uint64_t c = 0; state Key key = source.getKeyRef(sizeof(uint64_t)); - while(kvBytesTotal < kvBytesTarget) { + while (kvBytesTotal < kvBytesTarget) { wait(yield()); - *(uint64_t *)(key.end() - sizeof(uint64_t)) = 
bigEndian64(c); + *(uint64_t*)(key.end() - sizeof(uint64_t)) = bigEndian64(c); KeyValueRef kv(key, source.getValue(valueSize)); kvs->set(kv); kvBytes += kv.expectedSize(); ++records; - if(kvBytes >= commitTarget) { + if (kvBytes >= commitTarget) { wait(commit); stats(); commit = kvs->commit(); kvBytesTotal += kvBytes; - if(kvBytesTotal >= kvBytesTarget) { + if (kvBytesTotal >= kvBytesTarget) { break; } kvBytes = 0; @@ -6925,18 +6763,19 @@ ACTOR Future sequentialInsert(IKeyValueStore *kvs, int prefixLen, int valu return Void(); } -Future closeKVS(IKeyValueStore *kvs) { +Future closeKVS(IKeyValueStore* kvs) { Future closed = kvs->onClosed(); kvs->close(); return closed; } -ACTOR Future doPrefixInsertComparison(int suffixSize, int valueSize, int recordCountTarget, bool usePrefixesInOrder, KVSource source) { +ACTOR Future doPrefixInsertComparison(int suffixSize, int valueSize, int recordCountTarget, + bool usePrefixesInOrder, KVSource source) { VersionedBTree::counts.clear(); deleteFile("test.redwood"); wait(delay(5)); - state IKeyValueStore *redwood = openKVStore(KeyValueStoreType::SSD_REDWOOD_V1, "test.redwood", UID(), 0); + state IKeyValueStore* redwood = openKVStore(KeyValueStoreType::SSD_REDWOOD_V1, "test.redwood", UID(), 0); wait(prefixClusteredInsert(redwood, suffixSize, valueSize, source, recordCountTarget, usePrefixesInOrder)); wait(closeKVS(redwood)); printf("\n"); @@ -6944,7 +6783,7 @@ ACTOR Future doPrefixInsertComparison(int suffixSize, int valueSize, int r deleteFile("test.sqlite"); deleteFile("test.sqlite-wal"); wait(delay(5)); - state IKeyValueStore *sqlite = openKVStore(KeyValueStoreType::SSD_BTREE_V2, "test.sqlite", UID(), 0); + state IKeyValueStore* sqlite = openKVStore(KeyValueStoreType::SSD_BTREE_V2, "test.sqlite", UID(), 0); wait(prefixClusteredInsert(sqlite, suffixSize, valueSize, source, recordCountTarget, usePrefixesInOrder)); wait(closeKVS(sqlite)); printf("\n"); @@ -6958,10 +6797,14 @@ TEST_CASE("!/redwood/performance/prefixSizeComparison") { 
state int recordCountTarget = 100e6; state int usePrefixesInOrder = false; - wait(doPrefixInsertComparison(suffixSize, valueSize, recordCountTarget, usePrefixesInOrder, KVSource({{10, 100000}}))); - wait(doPrefixInsertComparison(suffixSize, valueSize, recordCountTarget, usePrefixesInOrder, KVSource({{16, 100000}}))); - wait(doPrefixInsertComparison(suffixSize, valueSize, recordCountTarget, usePrefixesInOrder, KVSource({{32, 100000}}))); - wait(doPrefixInsertComparison(suffixSize, valueSize, recordCountTarget, usePrefixesInOrder, KVSource({{4, 5}, {12, 1000}, {8, 5}, {8, 4}}))); + wait(doPrefixInsertComparison(suffixSize, valueSize, recordCountTarget, usePrefixesInOrder, + KVSource({ { 10, 100000 } }))); + wait(doPrefixInsertComparison(suffixSize, valueSize, recordCountTarget, usePrefixesInOrder, + KVSource({ { 16, 100000 } }))); + wait(doPrefixInsertComparison(suffixSize, valueSize, recordCountTarget, usePrefixesInOrder, + KVSource({ { 32, 100000 } }))); + wait(doPrefixInsertComparison(suffixSize, valueSize, recordCountTarget, usePrefixesInOrder, + KVSource({ { 4, 5 }, { 12, 1000 }, { 8, 5 }, { 8, 4 } }))); return Void(); } @@ -6973,11 +6816,10 @@ TEST_CASE("!/redwood/performance/sequentialInsert") { deleteFile("test.redwood"); wait(delay(5)); - state IKeyValueStore *redwood = openKVStore(KeyValueStoreType::SSD_REDWOOD_V1, "test.redwood", UID(), 0); + state IKeyValueStore* redwood = openKVStore(KeyValueStoreType::SSD_REDWOOD_V1, "test.redwood", UID(), 0); wait(sequentialInsert(redwood, prefixLen, valueSize, recordCountTarget)); wait(closeKVS(redwood)); printf("\n"); return Void(); } -