foundationdb/fdbserver/PrefixTree.h

/*
 * PrefixTree.h
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "flow/flow.h"
#include "flow/Arena.h"
#include "fdbclient/FDBTypes.h"
#include "fdbserver/Knobs.h"
#include <string.h>

typedef uint64_t Word;

// Returns the number of leading bytes that ap and bp have in common, examining at most cl bytes
// of each.
//
// While at least one full Word (8 bytes) remains the inputs are compared a Word at a time, then
// the tail is compared byte by byte.  When two Words differ, the position of the first differing
// byte is taken from the lowest set bit of their XOR via ctzll (presumably count-trailing-zeros,
// defined elsewhere); that mapping from bit position to byte position assumes a little-endian host.
//
// Words are loaded with memcpy() instead of dereferencing a casted pointer because ap and bp are
// plain byte pointers with no alignment guarantee, and a misaligned Word dereference is undefined
// behavior.
static inline int commonPrefixLength(uint8_t const* ap, uint8_t const* bp, int cl) {
	// Keep the arithmetic signed: for cl < sizeof(Word) the bound must be <= 0 so that the
	// word loop is skipped entirely.
	const int wordSize = static_cast<int>(sizeof(Word));
	const int wordEnd = cl - wordSize + 1;
	int i = 0;

	for(; i < wordEnd; i += wordSize) {
		Word a;
		Word b;
		memcpy(&a, ap, sizeof(Word));
		memcpy(&b, bp, sizeof(Word));
		if(a != b) {
			return i + ctzll(a ^ b) / 8;
		}
		ap += wordSize;
		bp += wordSize;
	}

	// Fewer than sizeof(Word) bytes remain
	for (; i < cl; i++) {
		if (*ap != *bp) {
			return i;
		}
		++ap;
		++bp;
	}
	return cl;
}

// Returns the length of the longest common prefix of a and b, which is bounded by the
// length of the shorter of the two strings.
static int commonPrefixLength(StringRef a, StringRef b) {
	const int compareLen = std::min(a.size(), b.size());
	return commonPrefixLength(a.begin(), b.begin(), compareLen);
}

// Returns the largest power of two that is less than or equal to n.  For n < 1 the result is 1.
//
// This doubling loop appears to be the fastest version.  The loop bound is n / 2 rather than
// the test "p + p <= n" so that p + p is never evaluated when it would overflow int
// (which happened for n >= 2^30).
static int lessOrEqualPowerOfTwo(int n) {
	const int half = n / 2;
	int p = 1;
	while(p <= half)
		p += p;
	return p;
}

/*
static int _lessOrEqualPowerOfTwo(uint32_t n) {
	if(n == 0)
		return n;
	int trailing = __builtin_ctz(n);
	int leading = __builtin_clz(n);
	if(trailing + leading == ((sizeof(n) * 8) - 1))
		return n;
	return 1 << ( (sizeof(n) * 8) - leading - 1);
}

static int __lessOrEqualPowerOfTwo(unsigned int n) {
	int p = 1;
	for(; p <= n; p <<= 1);
	return p >> 1;
}
*/

static int perfectSubtreeSplitPoint(int subtree_size) {
	// Returns the inorder index of the root node of a subtree holding subtree_size nodes such
	// that the resulting binary search tree is "perfect": it has minimal height and all of its
	// missing nodes are as far right as possible.
	// There is probably a simpler way to compute this.

	// s is one less than a power of two, i.e. the node count of a completely filled tree.
	const int s = lessOrEqualPowerOfTwo((subtree_size - 1) / 2 + 1) - 1;

	// Size of the left subtree if it is a completely filled tree one level taller than s
	const int filledLeft = s * 2 + 1;
	// Size of the left subtree if the right subtree holds exactly s nodes
	const int remainderLeft = subtree_size - s - 1;

	return std::min(filledLeft, remainderLeft);
}

// Returns the same result as perfectSubtreeSplitPoint(subtree_size), answering sizes in the
// range [1, 500) from a table that is filled in on first use.
//
// The table is a function-local static object, so its one-time construction is synchronized by
// the language and it needs no manual allocation.  Sizes outside the cached range, including
// zero and negative sizes which must not be used as table indexes, are passed straight through
// to perfectSubtreeSplitPoint().
static int perfectSubtreeSplitPointCached(int subtree_size) {
	static const int max = 500;

	struct SplitPointTable {
		uint16_t points[max];

		SplitPointTable() {
			for(int i = 0; i < max; ++i)
				points[i] = perfectSubtreeSplitPoint(i);
		}
	};
	static const SplitPointTable table;

	// Entry 0 is never read: the split point for size 0 is negative and would not survive
	// being stored in a uint16_t.
	if(subtree_size > 0 && subtree_size < max)
		return table.points[subtree_size];
	return perfectSubtreeSplitPoint(subtree_size);
}

struct PrefixTree {
	// TODO: Make PrefixTree use a more complex record type with a multi column key
	typedef KeyValueRef EntryRef;
	typedef Standalone<EntryRef> Entry;

	// Largest value representable by a uint16_t, used as the upper bound for a tree's size.
	static int MaximumTreeSize() {
		const int limit = std::numeric_limits<uint16_t>::max();
		return limit;
	}

	struct Node {
		uint8_t flags;

/* 
 * Node fields
 * 
 * Logically, a node has the following things
 *  - Flags describing what is in the node
 *  - Optional left child
 *  - Optional right child
 *  - Prefix string, described by a length and a source (which is the most recent left or right ancestor)
 *  - Optional split string, which contains any bytes after prefix which are needed to make a branching decision
 *  - Optional suffix string, containing any remaining key bytes after the split string
 *  - Optional value string
 * 
 * The physical layout places the left child subtree immediately after the split string so that it is likely
 * that the bytes read to make a branching decision and then choosing left (as should happen half of the time)
 * will have a high cache hit rate.
 * 
 * If necessary, the flags byte could be an enumeration into a set of possible options, since not all options 
 * combinations are needed.  For example,
 * 
 *   - The tree is balanced and filled from the left at the last level, so a node cannot have only a right child.
 *   - If there are no children, there is no point in splitting any key bytes after the prefix into separate strings.
 *   - If there is exactly one child (left) then the key bytes after the prefix can all go in the split string.  The
 *     traversal decision is to either stop or go left and one of those options (stop) will still have good memory
 *     locality.
 * 
 *   8 valid/necessary option combinations for presence of (Left, Right, Split, Suffix) out of 16 possibilities
 * 
 *   L  R  Split  Suffix
 *
 *   N  N  N       N    # No children, key has no bytes after prefix
 *   N  N  Y       N    # No children, key has bytes after prefix
 *   Y  N  N       N    # One child, key has no bytes after prefix
 *   Y  N  Y       N    # One child, key has bytes after prefix
 *   Y  Y  N       N    # Two children, key has no bytes after prefix
 *   Y  Y  N       Y    # Two children, branch decision can be made using only prefix bytes but there are more key bytes after
 *   Y  Y  Y       N    # Two children, branch decision requires all key bytes after prefix
 *   Y  Y  Y       Y    # Two children, branch decision requires some but not all bytes after prefix
 * 
 *   This can be represented with just 3 bits, if necessary, but for now there is space in the flags byte for all 4.
 * 
 *   Flag Bits
 *
 *   prefix borrow from next
 *      true -  borrow from the closest ancestor greater than this node
 *      false - borrow from the closest ancestor less    than this node
 *   large lengths = use 2 byte ints instead of 1 byte for prefix, split, suffix, and value lengths
 *     (TODO: It might be better to just not use a suffix at all when large lengths is set)
 *   left child present
 *   right child present
 *   split string present
 *   suffix string present
 *   value string present
 * 
 *   Serialized format:
 *     All lengths are in the header, which has variable size
 *
 *     flags           1 byte
 *     prefix length   1-2 bytes based on large lengths flag
 *     split length    0-2 bytes based on split string present and large lengths flags
 *     left length     0 or 2 bytes depending on left child present
 *     suffix length   0-2 bytes based on suffix string present and large lengths flags
 *     value length    0-2 bytes based on value string present and large lengths flags
 *     split           0+ bytes
 *     left child      0+ bytes
 *     suffix          0+ bytes
 *     value           0+ bytes
 *     right child     0+ bytes
 *     
 */
		// Bit masks for the Node::flags byte.
		enum EFlags {
			USE_LARGE_LENGTHS   = 0x01,  // prefix/split/suffix/value lengths are 2 bytes each instead of 1
			PREFIX_SOURCE_NEXT  = 0x02,  // prefix is borrowed from the next (greater) ancestor, else the previous one
			HAS_LEFT_CHILD      = 0x04,  // a left child, preceded by a 2 byte left length in the header, is present
			HAS_RIGHT_CHILD     = 0x08,  // a right child is present
			HAS_SPLIT           = 0x10,  // a split string is present
			HAS_SUFFIX          = 0x20,  // a suffix string is present
			HAS_VALUE           = 0x40   // a value string is present
		};

		// Stores decoded offsets (from beginning) of Node components
		struct Parser {
			Parser() {}
			Parser(const Node *n) {
				init(n);
			}

			const Node *node;

			typedef uint16_t OffsetT;
			OffsetT headerLen;
			OffsetT prefixLen;
			OffsetT leftPos;
			OffsetT suffixPos;
			OffsetT valuePos;
			OffsetT rightPos;

			StringRef splitString() const {
				return StringRef((const uint8_t *)node + headerLen, leftPos);
			}
			StringRef suffixString() const {
				return StringRef((const uint8_t *)node + headerLen + suffixPos, valuePos - suffixPos);
			}
			StringRef valueString() const {
				return StringRef((const uint8_t *)node + headerLen + valuePos, rightPos - valuePos);
			}
			const Node *leftChild() const {
				if(node->flags & HAS_LEFT_CHILD)
					return (const Node *)((const uint8_t *)node + headerLen + leftPos);
				return nullptr;
			}
			const Node *rightChild() const {
				if(node->flags & HAS_RIGHT_CHILD)
					return (const Node *)((const uint8_t *)node + headerLen + rightPos);
				return nullptr;
			}
			int keyLen() const {
				int len = prefixLen + leftPos + (valuePos - suffixPos);
				ASSERT(len >= 0);
				return len;
			}

			void init(const Node *n) {
				node = n;
				union {
					const uint8_t *p8;
					const uint16_t *p16;
				};
				p8 = (const uint8_t *)&n->flags + 1;

				int flags = n->flags;
				bool large = flags & USE_LARGE_LENGTHS;

				prefixLen = large ? *p16++ : *p8++;

				if(flags & HAS_SPLIT)
					leftPos = large ? *p16++ : *p8++;
				else
					leftPos = 0;
				suffixPos = leftPos;
				if(flags & HAS_LEFT_CHILD) 
					suffixPos += *p16++;

				valuePos = suffixPos;
				if(flags & HAS_SUFFIX)
					valuePos += (large ? *p16++ : *p8++);

				rightPos = valuePos;
				if(flags & HAS_VALUE)
					rightPos += (large ? *p16++ : *p8++);

				int header = 2;    // flags byte, first prefix len byte
				if(large)
					++header;  // second prefix len byte
				if(flags & HAS_SPLIT)
					header += large ? 2 : 1;
				if(flags & HAS_LEFT_CHILD)
					header += 2;
				if(flags & HAS_SUFFIX)
					header += large ? 2 : 1;
				if(flags & HAS_VALUE)
					header += large ? 2 : 1;
				headerLen = header;
			}
		};

		// Returns an upper bound on the number of header bytes needed by the node at the given
		// index for a key and value of the given sizes.
		static inline int getMaxOverhead(int index, int keySize, int valueSize) {
			// Length fields take 2 bytes each if the key or value is too long for a 1 byte length
			const bool large = (keySize > 255) || (valueSize > 255);
			const int lengthBytes = large ? 2 : 1;

			// Flags byte and prefix length
			int total = 1 + lengthBytes;

			// Value length, if there is a value
			if(valueSize > 0)
				total += lengthBytes;

			// Worst case allowance of three more length fields
			total += 3 * lengthBytes;

			// Left child length, counted only for odd indexes.  One less than half of nodes will have one.
			if((index & 0x01) != 0)
				total += 2;

			return total;
		}

	public:

		// Methods for decoding specific Node members on-demand
		inline int getPrefixLen() const {
			return Parser(this).prefixLen;
		}

		// Parses this node's header and returns its split string
		inline StringRef getSplitString() const {
			const Parser parsed(this);
			return parsed.splitString();
		}

		// Parses this node's header and returns its suffix string
		inline StringRef getSuffixString() const {
			const Parser parsed(this);
			return parsed.suffixString();
		}

		// Parses this node's header and returns its value string
		inline StringRef getValueString() const {
			const Parser parsed(this);
			return parsed.valueString();
		}

		// Parses this node's header and returns its left child, or nullptr if there is none
		inline const Node * getLeftChild() const {
			const Parser parsed(this);
			return parsed.leftChild();
		}

		// Parses this node's header and returns its right child, or nullptr if there is none
		inline const Node * getRightChild() const {
			const Parser parsed(this);
			return parsed.rightChild();
		}

		inline int getKeySize() const {
			return Parser(this).keyLen();
		}
	};

#pragma pack(push,1)
	uint16_t size;   // size in bytes
	Node root;
#pragma pack(pop)

	// Number of bytes in a PrefixTree that precede the root node
	static inline int GetHeaderSize() {
		const int wholeTree = sizeof(PrefixTree);
		const int rootOnly = sizeof(root);
		return wholeTree - rootOnly;
	}

private:
	struct PathEntry {
		const Node *node;
		Node::Parser parser;

		// Key may or may not point to the space within keyBuffer.
		// Key will always contain at least the prefix bytes borrowed by node
		// KeyBuffer will always be large enough to hold the entire reconstituted key for node
		//
		// These are mutable because getting key bytes from this PathEntry can change these
		// but they're really just a read cache for reconstituted key bytes.
		mutable StringRef key;
		mutable Standalone<VectorRef<uint8_t>> keyBuffer;

		// Path entry was reached by going left from the previous node
		bool nodeIsLeftChild;
		// number of consecutive moves in same direction
		int moves;

		// Creates an entry that refers to no node.
		// nodeIsLeftChild and moves are given defined values because the assignment operator reads
		// them from its source unconditionally, including from entries on which init() was never
		// called, and reading an indeterminate bool or int is undefined behavior.
		PathEntry() : node(nullptr), nodeIsLeftChild(false), moves(0) {
		}
		PathEntry(const PathEntry &rhs) {
			*this = rhs;
		}

		// Make keyBuffer large enough to hold the whole key of the current node.  Does nothing
		// if there is no node.  If the arena behind the existing buffer is also held by someone
		// else then a fresh buffer is started so that bytes they reference are not overwritten.
		void initKeyBufferSpace() {
			if(node == nullptr)
				return;

			const bool arenaIsShared = keyBuffer.arena().impl && !keyBuffer.arena().impl->isSoleOwnerUnsafe();
			if(arenaIsShared)
				keyBuffer = Standalone<VectorRef<uint8_t>>();

			const int size = parser.keyLen();
			keyBuffer.reserve(keyBuffer.arena(), size);
		}

		// Copies rhs.  If rhs refers to a node then the key bytes rhs currently has are copied
		// into this entry's own keyBuffer, so the copy does not depend on rhs's buffer.
		// An entry without a node just shares rhs's key reference.
		PathEntry & operator= (const PathEntry &rhs) {
			// Self-assignment must change nothing.  Without this check the code below could
			// replace keyBuffer while rhs.key (our own key) still points at the old one, or
			// memcpy a buffer onto itself.
			if(this == &rhs)
				return *this;

			node = rhs.node;
			parser = rhs.parser;
			nodeIsLeftChild = rhs.nodeIsLeftChild;
			moves = rhs.moves;
			// New key buffer must be able to hold full reconstituted key, not just the
			// part of it referenced by rhs.key (which may not be the whole thing)
			initKeyBufferSpace();
			if(node != nullptr && rhs.key.size() > 0) {
				// Copy rhs.key into keyBuffer and set key to the destination bytes
				memcpy(keyBuffer.begin(), rhs.key.begin(), rhs.key.size());
				key = StringRef(keyBuffer.begin(), rhs.key.size());
			}
			else {
				key = rhs.key;
			}
			return *this;
		}

		// Make this a node-less entry whose key is the string s.  Only the reference is
		// stored; the bytes are not copied.
		void init(StringRef s) {
			key = s;
			node = nullptr;
		}

		// Point this entry at _node.
		//   prefixSource - the entry whose key supplies this node's borrowed prefix bytes
		//   isLeft       - whether _node was reached by going left from the previous node
		//   numMoves     - number of consecutive moves in that same direction
		void init(const Node *_node, const PathEntry *prefixSource, bool isLeft, int numMoves) {
			nodeIsLeftChild = isLeft;
			moves = numMoves;

			node = _node;
			parser.init(_node);

			// The buffer is sized for the complete key now, but to begin with key only
			// references the borrowed prefix bytes wherever prefixSource keeps them.
			// See comments near keyBuffer and key for more info.
			initKeyBufferSpace();
			const int borrowedLen = parser.prefixLen;
			key = prefixSource->getKeyRef(borrowedLen);
		}

		// An entry is valid if it refers to a node
		inline bool valid() const {
			return !(node == nullptr);
		}

		// Compare s to this node's full key without reconstituting the key.
		// The result is whatever StringRef::compare() yields with s on the left hand side, so
		// callers treat a negative result as "s sorts before this node's key".
		//
		// The comparison walks the key in up to three pieces: the bytes already present in key
		// (at least the borrowed prefix), the rest of the split string, and the rest of the suffix.
		int compareToKey(StringRef s) const {
			// Key has at least this node's borrowed prefix bytes in it.
			// If s is shorter than key, we only need to compare it to key
			if(s.size() < key.size())
				return s.compare(key);

			int cmp = s.substr(0, key.size()).compare(key);
			if(cmp != 0)
				return cmp;

			// The borrowed prefix bytes and possibly more have already been compared and were equal
			int comparedLen = key.size();
			s = s.substr(comparedLen);
			StringRef split = parser.splitString();
			int splitSizeOriginal = split.size();
			// Number of split string bytes that were already covered by key
			int splitStart = comparedLen - parser.prefixLen;
			if(splitStart < split.size()) {
				// Compare against the part of the split string that key did not cover
				split = split.substr(splitStart);
				if(s.size() < split.size())
					return s.compare(split);
				cmp = s.substr(0, split.size()).compare(split);
				if(cmp != 0)
					return cmp;
				s = s.substr(split.size());
				comparedLen += split.size();
			}

			// Number of suffix bytes that were already covered by key.  This is 0 if any
			// split string bytes had to be compared above.
			int suffixStart = comparedLen - (parser.prefixLen + splitSizeOriginal);
			StringRef suffix = parser.suffixString();
			ASSERT(suffixStart >= 0 && suffixStart <= suffix.size());
			return s.compare(suffix.substr(suffixStart));
		}

		// Ensure that key references the start of keyBuffer.  If it currently references
		// bytes stored anywhere else they are copied to the start of keyBuffer first.
		void ensureKeyInBuffer() const {
			uint8_t *dest = keyBuffer.begin();
			if(key.begin() == dest)
				return;

			const int len = key.size();
			memcpy(dest, key.begin(), len);
			key = StringRef(dest, len);
		}

		// Returns the borrowed prefix, which is the first prefixLen bytes of key (key always holds
		// at least that many bytes, possibly more).  An entry without a node returns all of key.
		StringRef getPrefix() const {
			return (node != nullptr) ? key.substr(0, parser.prefixLen) : key;
		}

		// Return a reference to the first size bytes of the key.  A negative size (the default)
		// means the whole key, whose length is taken from parser and therefore requires a node.
		//
		// If size <= key's size then a substring of key will be returned, but if alwaysUseKeyBuffer
		// is true then before returning the existing value of key (not just the first size bytes)
		// will be copied into keyBuffer and key will be updated to point there.
		//
		// If size is greater than key's size, then key will be moved into keyBuffer if it is not already there
		// and the remaining needed bytes will be copied into keyBuffer from the split and suffix strings.
		// This requires the entry to have a node, and size must not exceed the full key length.
		KeyRef getKeyRef(int size = -1, bool alwaysUseKeyBuffer = false) const {
			if(size < 0)
				size = parser.keyLen();

			// If size is less than key then return a substring of it, possibly after moving it to the keyBuffer.
			if(size <= key.size()) {
				if(alwaysUseKeyBuffer)
					ensureKeyInBuffer();
				return key.substr(0, size);
			}

			// More bytes are needed than key holds, and they can only come from a node
			ASSERT(node != nullptr);
			ensureKeyInBuffer();

			// The borrowed prefix bytes and possibly more must already be in key
			int writtenLen = key.size();
			StringRef split = parser.splitString();
			StringRef suffix = parser.suffixString();
			// Number of split string bytes that key already holds
			int splitStart = writtenLen - parser.prefixLen;
			if(splitStart < split.size()) {
				// Append split string bytes, stopping early if size is reached
				int splitLen = std::min(split.size() - splitStart, size - writtenLen);
				memcpy(mutateString(key) + writtenLen, split.begin() + splitStart, splitLen);
				writtenLen += splitLen;
			}
			// Number of suffix string bytes that key already holds
			int suffixStart = writtenLen - parser.prefixLen - split.size();
			if(suffixStart < suffix.size()) {
				// Append suffix string bytes, stopping early if size is reached
				int suffixLen = std::min(suffix.size() - suffixStart, size - writtenLen);
				memcpy(mutateString(key) + writtenLen, suffix.begin() + suffixStart, suffixLen);
				writtenLen += suffixLen;
			}
			ASSERT(writtenLen == size);
			// Remember the longer key so that later calls do not have to copy these bytes again
			key = StringRef(key.begin(), size);
			return key;
		}

		// Returns the first size bytes of the key (the whole key by default) together with the
		// arena that keyBuffer resides in.  The key bytes are always moved into keyBuffer first
		// so that the returned arena is the one that holds them.
		Key getKey(int size = -1) const {
			const StringRef bufferedKey = getKeyRef(size, true);
			return Key(bufferedKey, keyBuffer.arena());
		}
	};

public:
	// Cursor provides a way to seek into a PrefixTree and iterate over its content
	// Seek and move methods can return false if they fail to achieve the desired effect
	// but a cursor will remain 'valid' as long as the tree is not empty.
	//
	// It coalesces prefix bytes into a contiguous buffer for each node along the traversal
	// path to make iteration faster.
	struct Cursor {
		Cursor() : pathLen(0) {
		}

		Cursor(const Node *root, StringRef prevAncestor, StringRef nextAncestor) {
			init(root, prevAncestor, nextAncestor);
		}

		static const int initialPathLen = 3;
		static const int initialPathCapacity = 20;
		// This is a separate function so that Cursors can be reused to search different PrefixTrees
		// which avoids cursor destruction and creation which involves unnecessary memory churn.
		// The root node is arbitrarily assumed to be a right child of prevAncestor which itself is a left child of nextAncestor
		void init(const Node *root, StringRef prevAncestor, StringRef nextAncestor) {
			if(path.size() < initialPathCapacity)
				path.resize(initialPathCapacity);
			pathLen = initialPathLen;
			path[0].init(nextAncestor);
			path[1].init(prevAncestor);
			path[2].init(root, &path[root->flags & Node::PREFIX_SOURCE_NEXT ? 0 : 1], false, 1);
		}

		// Two cursors are equal if they point at the same node.
		// A cursor with an empty path has no current entry to compare, so pathBack() must not be
		// called on it; such a cursor is equal only to another cursor with an empty path.
		bool operator == (const Cursor &rhs) const {
			if(pathLen == 0 || rhs.pathLen == 0)
				return pathLen == rhs.pathLen;
			return pathBack().node == rhs.pathBack().node;
		}

		StringRef leftParentBoundary;
		StringRef rightParentBoundary;
		std::vector<PathEntry> path;
		// pathLen is the number of elements in path which are in use.  This is to prevent constantly destroying
		// and constructing PathEntry objects which would unnecessarily churn through memory in Arena for storing
		// coalesced prefixes.
		int pathLen;

		// A cursor is valid if its path is not empty and its current entry refers to a node
		bool valid() const {
			if(pathLen == 0)
				return false;
			return pathBack().valid();
		}

		// Returns a reference to the key of the current node.  The reference is valid until the Cursor is moved.
		KeyRef getKeyRef() const {
			const PathEntry &current = pathBack();
			return current.getKeyRef();
		}

		// Returns the key of the current node as a Standalone<KeyRef>, which will still be valid after the Cursor is moved.
		Key getKey() const {
			const PathEntry &current = pathBack();
			return current.getKey();
		}

		// Returns a reference to the value of the current node.  It points into the tree itself
		// so it is valid as long as the Cursor's page memory exists.
		ValueRef getValueRef() const {
			const Node::Parser &parsed = pathBack().parser;
			return parsed.valueString();
		}

		// Returns references to the current key and value, valid until the Cursor is moved.
		EntryRef getKVRef() const {
			const KeyRef k = getKeyRef();
			const ValueRef v = getValueRef();
			return EntryRef(k, v);
		}

		// Returns the current key and value as a standalone EntryRef.  The key always lives in the
		// standalone's arena.  The value is copied into that arena too, unless copyValue is false
		// in which case the value will be a reference into tree memory.
		Entry getKV(bool copyValue = true) const {
			Key ownedKey = getKey();
			const ValueRef treeValue = getValueRef();
			const ValueRef v = copyValue ? ValueRef(ownedKey.arena(), treeValue) : treeValue;
			return Entry(EntryRef(ownedKey, v), ownedKey.arena());
		}

		// Moves the cursor to the node with the greatest key less than or equal to s.  If successful,
		// returns true, otherwise returns false and the cursor will be at the node with the next key
		// greater than s.  Returns false without moving if the cursor's path is empty.
		//
		// The search restarts from the root and descends one node per loop iteration.
		bool seekLessThanOrEqual(StringRef s) {
			if(pathLen == 0)
				return false;

			// Truncate the path back to the root entry
			pathLen = initialPathLen;

			// TODO: Track position of difference and use prefix reuse bytes and prefix sources
			// to skip comparison of some prefix bytes when possible
			while(1) {
				const PathEntry &p = pathBack();
				// Request the right child's memory early, before it is known which way to go
				const Node *right = p.parser.rightChild();
				_mm_prefetch((const char*)right, _MM_HINT_T0);

				// cmp < 0 means s sorts before the current node's key
				int cmp = p.compareToKey(s);
				if(cmp == 0)
					return true;

				if(cmp < 0) {
					// Try to traverse left
					const Node *left = p.parser.leftChild();
					if(left == nullptr) {
						// If we're at the root, cursor should now be before the first element
						if(pathLen == initialPathLen) {
							return false;
						}

						if(p.nodeIsLeftChild) {
							// If we only went left, cursor should now be before the first element
							if((p.moves + initialPathLen) == pathLen) {
								return false;
							}

							// Otherwise, go to the parent of the last right child traversed,
							// which is the last node from which we went right
							popPath(p.moves + 1);
							return true;
						}

						// p.nodeIsLeftChild is false, so p.node is a right child, so go to its parent.
						popPath(1);
						return true;
					}

					// Going left extends a run of left moves or starts a new one.  The previous
					// (lesser) ancestor of the left child is the entry just before that run began.
					int newMoves = p.nodeIsLeftChild ? p.moves + 1 : 1;
					const PathEntry *borrowSource = (left->flags & Node::PREFIX_SOURCE_NEXT) ? &p : &p - newMoves;
					pushPath(left, borrowSource, true, newMoves);
				}
				else {
					// Try to traverse right
					if(right == nullptr) {
						return true;
					}

					// Going right extends a run of right moves or starts a new one.  The next
					// (greater) ancestor of the right child is the entry just before that run began.
					int newMoves = p.nodeIsLeftChild ? 1 : p.moves + 1;
					const PathEntry *borrowSource = (right->flags & Node::PREFIX_SOURCE_NEXT) ? &p - newMoves : &p;
					pushPath(right, borrowSource, false, newMoves);
				}
			}
		}

		// Returns the last in-use path entry, which is the cursor's current position.
		// Must not be called when pathLen is 0.
		inline const PathEntry &pathBack() const {
			const int last = pathLen - 1;
			return path[last];
		}

		// Mutable counterpart of the const pathBack().  Must not be called when pathLen is 0.
		inline PathEntry &pathBack() {
			const int last = pathLen - 1;
			return path[last];
		}

		// Append an entry for node to the in-use part of the path, reusing an existing PathEntry
		// object if there is one and growing the vector otherwise.
		//   borrowSource - entry that supplies node's borrowed prefix; must be an element of path
		//   left         - whether node is a left child
		//   moves        - number of consecutive moves in that direction, including this one
		inline void pushPath(const Node *node, const PathEntry *borrowSource, bool left, int moves) {
			++pathLen;
			if(path.size() < pathLen) {
				// Growing the vector may move its elements to new storage, which would leave
				// borrowSource pointing at a destroyed entry.  Remember which element it is and
				// find it again afterwards.
				const size_t sourceIndex = borrowSource - path.data();
				path.resize(pathLen);
				borrowSource = path.data() + sourceIndex;
			}
			pathBack().init(node, borrowSource, left, moves);
		}

		// Drop the last n entries from the in-use part of the path.  The PathEntry objects
		// themselves stay in the vector for reuse.
		inline void popPath(int n) {
			pathLen = pathLen - n;
		}

		// Returns a description of every in-use path entry, for debugging.  Each entry is
		// rendered as "(index: ...) ".  Boundary entries (those without a node) show only their
		// prefix; node entries also show the direction taken to reach them and their strings.
		std::string pathToString() const {
			std::string s;
			for(int i = 0; i < pathLen; ++i) {
				s += format("(%d: ", i);
				const Node *node = path[i].node;
				if(node != nullptr) {
					s += "childDir=";
					s += (path[i].nodeIsLeftChild ? "left " : "right ");
				}
				s += format("prefix='%s'", path[i].getPrefix().toHexString(20).c_str());
				if(node != nullptr) {
					s += format(" split='%s' suffix='%s' value='%s'", node->getSplitString().toHexString(20).c_str(), node->getSuffixString().toHexString(20).c_str(), node->getValueString().toHexString(20).c_str());
				}
				// Every entry opened a parenthesis above, so every entry must close it
				s += ") ";
			}
			return s;
		}

		bool moveFirst() {
			if(pathLen == 0)
				return false;

			pathLen = initialPathLen;

			while(1) {
				const PathEntry &p = pathBack();
				const Node *left = p.parser.leftChild();

				if(left == nullptr)
					break;

				// TODO:  This can be simpler since it only goes left
				int newMoves = p.nodeIsLeftChild ? p.moves + 1 : 1;
				const PathEntry *borrowSource = (left->flags & Node::PREFIX_SOURCE_NEXT) ? &p : &p - newMoves;
				pushPath(left, borrowSource, true, newMoves);
			}

			return true;
		}

		bool moveLast() {
			if(pathLen == 0)
				return false;

			pathLen = initialPathLen;

			while(1) {
				const PathEntry &p = pathBack();
				const Node *right = p.parser.rightChild();

				if(right == nullptr)
					break;

				// TODO:  This can be simpler since it only goes right
				int newMoves = p.nodeIsLeftChild ? 1 : p.moves + 1;
				const PathEntry *borrowSource = (right->flags & Node::PREFIX_SOURCE_NEXT) ? &p - newMoves : &p;
				pushPath(right, borrowSource, false, newMoves);
			}

			return true;
		}

		// Moves the cursor to the node with the next greater key.
		// Returns false without moving if the cursor's path is empty, if its current entry does
		// not refer to a node, or if it is already at the rightmost node.
		bool moveNext() {
			// An empty path has no current entry, so pathBack() must not be called
			if(pathLen == 0)
				return false;

			const PathEntry &p = pathBack();

			// If p isn't valid
			if(!p.valid()) {
				return false;
			}

			const Node *right = p.parser.rightChild();

			// If we can't go right, then go upward to the parent of the last left child
			if(right == nullptr) {
				// If current node was a left child then pop one node and we're done
				if(p.nodeIsLeftChild) {
					popPath(1);
					return true;
				}

				// Current node is a right child.
				// If we are at the rightmost tree node return false and don't move.
				// (The root counts as the first right move, so a path of only right moves
				// has initialPathLen - 1 entries before the run began.)
				if(p.moves + initialPathLen - 1 == pathLen) {
					return false;
				}

				// Truncate path to the parent of the last left child
				popPath(p.moves + 1);
				return true;
			}

			// Go right
			int newMoves = p.nodeIsLeftChild ? 1 : p.moves + 1;
			const PathEntry *borrowSource = (right->flags & Node::PREFIX_SOURCE_NEXT) ? &p - newMoves : &p;
			pushPath(right, borrowSource, false, newMoves);

			// Go left as far as possible
			while(1) {
				const PathEntry &p = pathBack();
				const Node *left = p.parser.leftChild();
				if(left == nullptr) {
					return true;
				}

				int newMoves = p.nodeIsLeftChild ? p.moves + 1 : 1;
				const PathEntry *borrowSource = (left->flags & Node::PREFIX_SOURCE_NEXT) ? &p : &p - newMoves;
				pushPath(left, borrowSource, true, newMoves);
			}
		}

		// Moves the cursor to the node with the next lesser key.
		// Returns false without moving if the cursor's path is empty, if its current entry does
		// not refer to a node, or if it is already at the leftmost node.
		bool movePrev() {
			// An empty path has no current entry, so pathBack() must not be called
			if(pathLen == 0)
				return false;

			const PathEntry &p = pathBack();

			// If p isn't valid
			if(!p.valid()) {
				return false;
			}

			const Node *left = p.parser.leftChild();

			// If we can't go left, then go upward to the parent of the last right child
			if(left == nullptr) {
				// If current node was a right child
				if(!p.nodeIsLeftChild) {
					// If we are at the root then don't move and return false.
					if(pathLen == initialPathLen)
						return false;

					// Otherwise, pop one node from the path and return true.
					popPath(1);
					return true;
				}

				// Current node is a left child.
				// If we are at the leftmost tree node then return false and don't move.
				if(p.moves + initialPathLen == pathLen) {
					return false;
				}

				// Truncate path to the parent of the last right child
				popPath(p.moves + 1);
				return true;
			}

			// Go left
			int newMoves = p.nodeIsLeftChild ? p.moves + 1 : 1;
			const PathEntry *borrowSource = (left->flags & Node::PREFIX_SOURCE_NEXT) ? &p : &p - newMoves;
			pushPath(left, borrowSource, true, newMoves);

			// Go right as far as possible
			while(1) {
				const PathEntry &p = pathBack();
				const Node *right = p.parser.rightChild();
				if(right == nullptr) {
					return true;
				}

				int newMoves = p.nodeIsLeftChild ? 1 : p.moves + 1;
				const PathEntry *borrowSource = (right->flags & Node::PREFIX_SOURCE_NEXT) ? &p - newMoves : &p;
				pushPath(right, borrowSource, false, newMoves);
			}
		}

	};

	// Returns a cursor positioned at the root of this tree, or a cursor with an empty path if
	// the tree is empty (size is 0).
	Cursor getCursor(StringRef prevAncestor, StringRef nextAncestor) const {
		if(size == 0)
			return Cursor();
		return Cursor(&root, prevAncestor, nextAncestor);
	}

	// Returns s as a double-quoted string for use in DOT output.
	//   - a newline becomes the two characters \n
	//   - other printable bytes are kept as they are
	//   - everything else is written as {XX} with XX being the byte's value in hex
	//
	// Bytes are handled as uint8_t rather than char: where char is signed, a byte above 0x7F
	// would be a negative value, which isprint() must not be given and which %02X would print
	// sign-extended.  The double quote and the backslash are also written in hex since a double
	// quote would end the quoted string and a backslash could escape the closing quote.
	static std::string escapeForDOT(StringRef s) {
		std::string r = "\"";
		for(uint8_t c : s) {
			if(c == '\n')
				r += "\\n";
			else if(isprint(c) && c != '"' && c != '\\')
				r += static_cast<char>(c);
			else
				r += format("{%02X}", c);
		}
		return r + '"';
	}

	// Returns a description of this tree as a DOT digraph with one graph node per tree node,
	// labeled with the tree node's prefix source, prefix, split and suffix, and with edges to
	// its children.  An empty tree yields a digraph with nothing in it.
	std::string toDOT(StringRef prevAncestor, StringRef nextAncestor) const {
		auto c = getCursor(prevAncestor, nextAncestor);

		std::string r;
		r += format("digraph PrefixTree%p {\n", this);

		// moveFirst() fails when the tree is empty.  The cursor then has no current entry,
		// so the loop body, which reads that entry, must not run at all.
		if(c.moveFirst()) {
			do {
				const PathEntry &p = c.pathBack();
				const Node *n = p.node;
				const Node *left = p.parser.leftChild();
				const Node *right = p.parser.rightChild();

				std::string label = escapeForDOT(format("PrefixSource: %s\nPrefix: [%s]\nSplit: %s\nSuffix: %s",
					n->flags & Node::PREFIX_SOURCE_NEXT ? "Left" : "Right",
					p.getPrefix().toString().c_str(),
					p.parser.splitString().toString().c_str(),
					p.parser.suffixString().toString().c_str()
				));

				r += format("node%p [ label = %s ];\nnode%p -> { %s %s };\n", n, label.c_str(), n,
					left ? format("node%p", left).c_str() : "",
					right ? format("node%p", right).c_str() : ""
				);

			} while(c.moveNext());
		}

		r += "}\n";

		return r;
	}

	// Serialize the sorted entries [begin, end) into this tree, which must have enough memory
	// following it to hold them.  prevAncestor and nextAncestor are the boundary keys that the
	// root node may borrow prefix bytes from.
	// Returns number of bytes written, which is 0 for an empty range.
	int build(const EntryRef *begin, const EntryRef *end, StringRef prevAncestor, StringRef nextAncestor) {
		int totalSize = 0;
		if(begin != end) {
			// The boundary leading to the new page acts as the last time we branched right
			totalSize = sizeof(size) + build(root, begin, end, nextAncestor, prevAncestor);
		}
		// This check must come before the total is stored in the uint16_t size field.  Once
		// stored, the value can no longer exceed the maximum and the check could never fail.
		ASSERT(totalSize <= MaximumTreeSize());
		size = totalSize;
		return size;
	}

private:
	// Serialize the sorted, non-empty range of entries [begin, end) as a subtree starting at root.
	// The middle entry, as chosen by perfectSubtreeSplitPointCached(), is stored in root itself,
	// the entries before it are built recursively as root's left child and the entries after it
	// as root's right child.
	//
	// nextAncestor and prevAncestor are the keys of the closest ancestors greater than and less
	// than this subtree.  The root's key borrows its prefix from whichever of the two it shares
	// more leading bytes with.
	//
	// Returns the number of bytes written, which must fit in a uint16_t.
	static uint16_t build(Node &root, const EntryRef *begin, const EntryRef *end, const StringRef &nextAncestor, const StringRef &prevAncestor) {
		ASSERT(end != begin);

		int count = end - begin;

		// Find key to be stored in root
		int mid = perfectSubtreeSplitPointCached(count);
		const StringRef &key = begin[mid].key;
		const StringRef &val = begin[mid].value;

		// Since key must be between prevAncestor and nextAncestor, any common prefix they share must be shared by key
		// so rather than comparing all of key to each one separately we can just compare prevAncestor and nextAncestor
		// to each other and then skip over the resulting length in key
		int nextPrevCommon = commonPrefixLength(nextAncestor.begin(), prevAncestor.begin(), std::min(nextAncestor.size(), prevAncestor.size()));

		// Pointer to remainder of key after the next/prev common bytes
		const uint8_t *keyExt = key.begin() + nextPrevCommon;

		// Find out how many bytes beyond nextPrevCommon key has in common with each of the next/prev ancestors separately
		int extNext = commonPrefixLength(keyExt, nextAncestor.begin() + nextPrevCommon, std::min(key.size(), nextAncestor.size()) - nextPrevCommon);
		int extPrev = commonPrefixLength(keyExt, prevAncestor.begin() + nextPrevCommon, std::min(key.size(), prevAncestor.size()) - nextPrevCommon);

		// Use the longer result
		bool prefixSourceNext = extNext > extPrev;
		
		int prefixLen = nextPrevCommon + (prefixSourceNext ? extNext : extPrev);

		int splitLen;   // Bytes after prefix required to make traversal decision
		int suffixLen;  // Remainder of key bytes after split key portion

		//printf("build: '%s'\n  prefixLen %d  prefixSourceNext %d\n", key.toHexString(20).c_str(), prefixLen, prefixSourceNext);

		// 2 entries or less means no right child, so just put all remaining key bytes into split string.
		if(count < 3) {
			splitLen = key.size() - prefixLen;
			suffixLen = 0;
		}
		else {
			// There are 2 children 
			// Avoid using the suffix at all if the remainder is small enough.
			splitLen = key.size() - prefixLen;
			if(splitLen < SERVER_KNOBS->PREFIX_TREE_IMMEDIATE_KEY_SIZE_LIMIT) {
				suffixLen = 0;
			}
			else {
				// Remainder of the key was not small enough to put entirely before the left child, so find the actual
				// number of bytes required to make the branch decision, which is one more than key has in common
				// with the key that immediately precedes it
				const StringRef &prevKey = begin[mid - 1].key;
				splitLen = commonPrefixLength(key.begin(), prevKey.begin(), std::min(key.size(), prevKey.size())) + 1 - prefixLen;

				// Put at least the minimum immediate byte count in the split key (before the left child)
				if(splitLen < SERVER_KNOBS->PREFIX_TREE_IMMEDIATE_KEY_SIZE_MIN)
					splitLen = std::min(key.size() - prefixLen, SERVER_KNOBS->PREFIX_TREE_IMMEDIATE_KEY_SIZE_MIN);

				suffixLen = key.size() - splitLen - prefixLen;
			}
		}

		// We now know enough about the fields present and their lengths to set the flag bits and write a header
		// If any int is more than 8 bits then use large ints
		bool large = prefixLen > 255 || splitLen > 255 || suffixLen > 255 || val.size() > 255;
		root.flags = large ? Node::USE_LARGE_LENGTHS : 0;

		if(prefixSourceNext)
			root.flags |= Node::PREFIX_SOURCE_NEXT;

		// p8 and p16 are two views of the same write position.  Advancing either one advances
		// both, by 1 or 2 bytes respectively.
		union {
			uint8_t *p8;
			uint16_t *p16;
		};
		p8 = &root.flags + 1;

		if(large)
			*p16++ = prefixLen;
		else
			*p8++ = prefixLen;

		if(splitLen > 0) {
			root.flags |= Node::HAS_SPLIT;
			if(large)
				*p16++ = splitLen;
			else
				*p8++ = splitLen;
		}

		// Reserve space for the left child length, which is not known until the left child
		// has been built
		uint16_t *pLeftLen = p16;
		if(count > 1) {
			++p16;
		}

		if(suffixLen > 0) {
			root.flags |= Node::HAS_SUFFIX;
			if(large)
				*p16++ = suffixLen;
			else
				*p8++ = suffixLen;
		}

		if(val.size() > 0) {
			root.flags |= Node::HAS_VALUE;
			if(large)
				*p16++ = val.size();
			else
				*p8++ = val.size();
		}

		// Header is written, now write strings and children in order.
		const uint8_t *keyPtr = key.begin() + prefixLen;

		// Serialize split bytes
		if(splitLen > 0) {
			memcpy(p8, keyPtr, splitLen);
			p8 += splitLen;
			keyPtr += splitLen;
		}

		// Serialize left child
		if(count > 1) {
			root.flags |= Node::HAS_LEFT_CHILD;
			// For the left subtree this node's key is the next ancestor
			int leftLen = build(*(Node *)(p8), begin, begin + mid, key, prevAncestor);
			*pLeftLen = leftLen;
			p8 += leftLen;
		}

		// Serialize suffix bytes
		if(suffixLen > 0) {
			memcpy(p8, keyPtr, suffixLen);
			p8 += suffixLen;
		}

		// Serialize value bytes
		if(val.size() > 0) {
			memcpy(p8, val.begin(), val.size());
			p8 += val.size();
		}

		// Serialize right child
		if(count > 2) {
			root.flags |= Node::HAS_RIGHT_CHILD;
			// For the right subtree this node's key is the previous ancestor
			int rightLen = build(*(Node *)(p8), begin + mid + 1, end, nextAncestor, key);
			p8 += rightLen;
		}

/*
printf("\nBuilt: key '%s'  c %d  p %d  spl %d  suf %d\nRaw: %s\n", key.toString().c_str(), count, prefixLen, splitLen, suffixLen, StringRef(&root.flags, p8 - &root.flags).toHexString(20).c_str());
Node::Parser p(&root);
printf("parser: headerLen %d prefixLen %d leftPos %d rightPos %d split %s suffix %s val %s\n", 
	   p.headerLen, p.prefixLen, p.leftPos, p.rightPos, p.splitString().toString().c_str(), p.suffixString().toString().c_str(), p.valueString().toString().c_str());
*/
		// The byte count is returned as, and for a left child stored as, a uint16_t.  Fail
		// here rather than letting a count that does not fit be silently truncated.
		const int bytesWritten = p8 - (uint8_t *)&root;
		ASSERT(bytesWritten <= MaximumTreeSize());
		return bytesWritten;
	}
};