Write path no longer uses non-caching reads because it is no longer necessary to avoid a page copy. Page copies are only done just before an actual change is made.
This commit is contained in:
parent
8e7a97f495
commit
751bac2271
|
@ -1046,6 +1046,10 @@ public:
|
|||
|
||||
Cursor(DecodeCache* cache, DeltaTree2* tree) : cache(cache), tree(tree), nodeIndex(-1) {}
|
||||
|
||||
Cursor(DecodeCache* cache, DeltaTree2* tree, int nodeIndex) : cache(cache), tree(tree), nodeIndex(nodeIndex) {
|
||||
updateItem();
|
||||
}
|
||||
|
||||
int rootIndex() {
|
||||
if (!cache->empty()) {
|
||||
return 0;
|
||||
|
@ -1109,6 +1113,13 @@ public:
|
|||
// the cursor's new current item.
|
||||
const T& get() const { return item; }
|
||||
|
||||
void switchTree(DeltaTree2* newTree) {
|
||||
tree = newTree;
|
||||
if (nodeIndex != -1) {
|
||||
updateItem();
|
||||
}
|
||||
}
|
||||
|
||||
// If the cursor is valid, return a reference to the cursor's internal T.
|
||||
// Otherwise, returns a reference to the cache's upper boundary.
|
||||
const T& getOrUpperBound() const { return valid() ? get() : cache->upperBound; }
|
||||
|
@ -1351,6 +1362,7 @@ public:
|
|||
// Returns true if successful, false if k does not fit in the space available
|
||||
// or if k is already in the tree (and was not already deleted).
|
||||
// Insertion on an empty tree returns false as well.
|
||||
// Insert does NOT change the cursor position.
|
||||
bool insert(const T& k, int skipLen = 0, int maxHeightAllowed = std::numeric_limits<int>::max()) {
|
||||
deltatree_printf("insert %s\n", k.toString().c_str());
|
||||
|
||||
|
|
|
@ -4388,6 +4388,18 @@ private:
|
|||
state BTreePageIDRef newID;
|
||||
newID.resize(*arena, oldID.size());
|
||||
|
||||
if (REDWOOD_DEBUG) {
|
||||
BTreePage* btPage = (BTreePage*)page->begin();
|
||||
BTreePage::BinaryTree::DecodeCache* cache = (BTreePage::BinaryTree::DecodeCache*)page->userData;
|
||||
debug_printf(
|
||||
"updateBTreePage(%s, %s) %s\n",
|
||||
::toString(oldID).c_str(),
|
||||
::toString(writeVersion).c_str(),
|
||||
cache == nullptr
|
||||
? "<noDecodeCache>"
|
||||
: btPage->toString(true, oldID, writeVersion, cache->lowerBound, cache->upperBound).c_str());
|
||||
}
|
||||
|
||||
if (oldID.size() == 1) {
|
||||
LogicalPageID id = wait(self->m_pager->atomicUpdatePage(oldID.front(), page, writeVersion));
|
||||
newID.front() = id;
|
||||
|
@ -4417,7 +4429,7 @@ private:
|
|||
}
|
||||
|
||||
// Copy page to a new page which shares the same DecodeCache with the old page
|
||||
Reference<ArenaPage> cloneForUpdate(Reference<const ArenaPage> page) {
|
||||
static Reference<ArenaPage> clonePageForUpdate(Reference<const ArenaPage> page) {
|
||||
Reference<ArenaPage> newPage = page->cloneContents();
|
||||
|
||||
BTreePage::BinaryTree::DecodeCache* cache = (BTreePage::BinaryTree::DecodeCache*)page->userData;
|
||||
|
@ -4425,6 +4437,7 @@ private:
|
|||
newPage->userData = cache;
|
||||
newPage->userDataDestructor = [](void* cache) { ((BTreePage::BinaryTree::DecodeCache*)cache)->delref(); };
|
||||
|
||||
debug_printf("cloneForUpdate(%p -> %p size=%d\n", page->begin(), newPage->begin(), page->size());
|
||||
return newPage;
|
||||
}
|
||||
|
||||
|
@ -4563,37 +4576,62 @@ private:
|
|||
|
||||
struct InternalPageModifier {
|
||||
InternalPageModifier() {}
|
||||
InternalPageModifier(BTreePage* p, BTreePage::BinaryTree::Cursor& c, bool updating, ParentInfo* parentInfo)
|
||||
: btPage(p), c(c), updating(updating), changesMade(false), parentInfo(parentInfo) {}
|
||||
InternalPageModifier(Reference<const ArenaPage> p,
|
||||
BTreePage::BinaryTree::Cursor& c,
|
||||
bool updating,
|
||||
ParentInfo* parentInfo)
|
||||
: page(p), clonedPage(false), cursor(c), updating(updating), changesMade(false), parentInfo(parentInfo) {}
|
||||
|
||||
// Whether updating the existing page is allowed
|
||||
bool updating;
|
||||
BTreePage* btPage;
|
||||
BTreePage::BinaryTree::Cursor c;
|
||||
Reference<const ArenaPage> page;
|
||||
|
||||
// Whether or not page has been cloned for update
|
||||
bool clonedPage;
|
||||
|
||||
BTreePage::BinaryTree::Cursor cursor;
|
||||
Standalone<VectorRef<RedwoodRecordRef>> rebuild;
|
||||
|
||||
// Whether there are any changes to the page, either made in place or staged in rebuild
|
||||
bool changesMade;
|
||||
ParentInfo* parentInfo;
|
||||
|
||||
BTreePage* btPage() { return (BTreePage*)page->begin(); }
|
||||
|
||||
bool empty() const {
|
||||
if (updating) {
|
||||
return c.tree->numItems == 0;
|
||||
return cursor.tree->numItems == 0;
|
||||
} else {
|
||||
return rebuild.empty();
|
||||
}
|
||||
}
|
||||
|
||||
void cloneForUpdate() {
|
||||
if (!clonedPage) {
|
||||
page = clonePageForUpdate(page);
|
||||
cursor.switchTree(&btPage()->tree());
|
||||
clonedPage = true;
|
||||
}
|
||||
}
|
||||
|
||||
// end is the cursor position of the first record of the unvisited child link range, which
|
||||
// is needed if the insert requires switching from update to rebuild mode.
|
||||
void insert(BTreePage::BinaryTree::Cursor end, const VectorRef<RedwoodRecordRef>& recs) {
|
||||
int i = 0;
|
||||
if (updating) {
|
||||
// Update must be done in the new tree, not the original tree where the end cursor will be from
|
||||
end.tree = cursor.tree;
|
||||
end.switchTree(cursor.tree);
|
||||
|
||||
// TODO: insert recs in a random order to avoid new subtree being entirely right child links
|
||||
while (i != recs.size()) {
|
||||
const RedwoodRecordRef& rec = recs[i];
|
||||
debug_printf("internal page (updating) insert: %s\n", rec.toString(false).c_str());
|
||||
|
||||
if (!c.insert(rec)) {
|
||||
if (!cursor.insert(rec)) {
|
||||
debug_printf("internal page: failed to insert %s, switching to rebuild\n",
|
||||
rec.toString(false).c_str());
|
||||
|
||||
// Update failed, so populate rebuild vector with everything up to but not including end, which
|
||||
// may include items from recs that were already added.
|
||||
auto c = end;
|
||||
|
@ -4608,7 +4646,7 @@ private:
|
|||
updating = false;
|
||||
break;
|
||||
}
|
||||
btPage->kvBytes += rec.kvBytes();
|
||||
btPage()->kvBytes += rec.kvBytes();
|
||||
++i;
|
||||
}
|
||||
}
|
||||
|
@ -4651,11 +4689,20 @@ private:
|
|||
if (u.childrenChanged) {
|
||||
if (updating) {
|
||||
auto c = u.cBegin;
|
||||
|
||||
if (c != u.cEnd) {
|
||||
cloneForUpdate();
|
||||
// must point c to the tree to erase from
|
||||
c.tree = cursor.tree;
|
||||
c.switchTree(cursor.tree);
|
||||
}
|
||||
|
||||
while (c != u.cEnd) {
|
||||
debug_printf("internal page (updating) erasing: %s\n", c.get().toString(false).c_str());
|
||||
btPage->kvBytes -= c.get().kvBytes();
|
||||
btPage()->kvBytes -= c.get().kvBytes();
|
||||
c.erase();
|
||||
}
|
||||
|
||||
// [cBegin, cEnd) is now erased, and cBegin is invalid, so cEnd represents the end
|
||||
// of the range that comes before any part of newLinks that can't be added if there
|
||||
// is not enough space.
|
||||
|
@ -4670,6 +4717,9 @@ private:
|
|||
changesMade = true;
|
||||
} else {
|
||||
|
||||
// If this was an in-place update, where the child page IDs do not change, notify the
|
||||
// parentInfo that those pages have been updated so it can possibly eliminate their
|
||||
// second writes later.
|
||||
if (u.inPlaceUpdate) {
|
||||
for (auto id : u.decodeLowerBound.getChildPage()) {
|
||||
parentInfo->pageUpdated(id);
|
||||
|
@ -4686,6 +4736,8 @@ private:
|
|||
!nextBoundary->sameExceptValue(u.expectedUpperBound.get()))) {
|
||||
RedwoodRecordRef rec = u.expectedUpperBound.get().withoutValue();
|
||||
debug_printf("applyUpdate adding dummy record %s\n", rec.toString(false).c_str());
|
||||
|
||||
cloneForUpdate();
|
||||
insert(u.cEnd, { &rec, 1 });
|
||||
changesMade = true;
|
||||
}
|
||||
|
@ -4729,11 +4781,15 @@ private:
|
|||
state Reference<FlowLock> commitReadLock = self->m_commitReadLock;
|
||||
wait(commitReadLock->take());
|
||||
state FlowLock::Releaser readLock(*commitReadLock);
|
||||
state bool fromCache = false;
|
||||
state Reference<const ArenaPage> page = wait(
|
||||
readPage(snapshot, rootID, update->decodeLowerBound, update->decodeUpperBound, false, false, &fromCache));
|
||||
state Reference<const ArenaPage> page =
|
||||
wait(readPage(snapshot, rootID, update->decodeLowerBound, update->decodeUpperBound, false, true));
|
||||
readLock.release();
|
||||
|
||||
// If in-place modification to the page is done, a copy of the page will be made in pageCopy
|
||||
// and the cursor will be pointed to it. The original page variable must stay in scope because
|
||||
// there could be RedwoodRecordRefs referencing its arenas.
|
||||
state Reference<const ArenaPage> pageCopy;
|
||||
|
||||
state BTreePage* btPage = (BTreePage*)page->begin();
|
||||
ASSERT(isLeaf == btPage->isLeaf());
|
||||
g_redwoodMetrics.level(btPage->height).pageCommitStart += 1;
|
||||
|
@ -4741,16 +4797,9 @@ private:
|
|||
// TODO: Decide if it is okay to update if the subtree boundaries are expanded. It can result in
|
||||
// records in a DeltaTree being outside its decode boundary range, which isn't actually invalid
|
||||
// though it is awkward to reason about.
|
||||
// TryToUpdate indicates insert and erase operations should be tried on the existing page first
|
||||
state bool tryToUpdate = btPage->tree().numItems > 0 && update->boundariesNormal();
|
||||
|
||||
// If trying to update the page and the page reference points into the cache,
|
||||
// we need to clone it so we don't modify the original version of the page.
|
||||
if (tryToUpdate && fromCache) {
|
||||
page = self->cloneForUpdate(page);
|
||||
btPage = (BTreePage*)page->begin();
|
||||
fromCache = false;
|
||||
}
|
||||
|
||||
debug_printf(
|
||||
"%s commitSubtree(): %s\n",
|
||||
context.c_str(),
|
||||
|
@ -4834,6 +4883,15 @@ private:
|
|||
debug_printf("%s Erasing %s [existing, boundary start]\n",
|
||||
context.c_str(),
|
||||
cursor.get().toString().c_str());
|
||||
|
||||
// Copy page for modification if not already copied
|
||||
if (!pageCopy.isValid()) {
|
||||
pageCopy = clonePageForUpdate(page);
|
||||
btPage = (BTreePage*)pageCopy->begin();
|
||||
cursor.tree = &btPage->tree();
|
||||
cursor.switchTree(&btPage->tree());
|
||||
}
|
||||
|
||||
btPage->kvBytes -= cursor.get().kvBytes();
|
||||
cursor.erase();
|
||||
} else {
|
||||
|
@ -4856,6 +4914,14 @@ private:
|
|||
|
||||
// If updating, add to the page, else add to the output set
|
||||
if (updating) {
|
||||
// Copy page for modification if not already copied
|
||||
if (!pageCopy.isValid()) {
|
||||
pageCopy = clonePageForUpdate(page);
|
||||
btPage = (BTreePage*)pageCopy->begin();
|
||||
cursor.tree = &btPage->tree();
|
||||
cursor.switchTree(&btPage->tree());
|
||||
}
|
||||
|
||||
if (cursor.insert(rec, update->skipLen, maxHeightAllowed)) {
|
||||
btPage->kvBytes += rec.kvBytes();
|
||||
debug_printf(
|
||||
|
@ -4911,6 +4977,15 @@ private:
|
|||
debug_printf("%s Erasing %s [existing, boundary start]\n",
|
||||
context.c_str(),
|
||||
cursor.get().toString().c_str());
|
||||
|
||||
// Copy page for modification if not already copied
|
||||
if (!pageCopy.isValid()) {
|
||||
pageCopy = clonePageForUpdate(page);
|
||||
btPage = (BTreePage*)pageCopy->begin();
|
||||
cursor.tree = &btPage->tree();
|
||||
cursor.switchTree(&btPage->tree());
|
||||
}
|
||||
|
||||
btPage->kvBytes -= cursor.get().kvBytes();
|
||||
cursor.erase();
|
||||
changesMade = true;
|
||||
|
@ -4947,6 +5022,15 @@ private:
|
|||
"%s Erasing %s and beyond [existing, matches changed upper mutation boundary]\n",
|
||||
context.c_str(),
|
||||
cursor.get().toString().c_str());
|
||||
|
||||
// Copy page for modification if not already copied
|
||||
if (!pageCopy.isValid()) {
|
||||
pageCopy = clonePageForUpdate(page);
|
||||
btPage = (BTreePage*)pageCopy->begin();
|
||||
cursor.tree = &btPage->tree();
|
||||
cursor.switchTree(&btPage->tree());
|
||||
}
|
||||
|
||||
btPage->kvBytes -= cursor.get().kvBytes();
|
||||
cursor.erase();
|
||||
} else {
|
||||
|
@ -4988,7 +5072,7 @@ private:
|
|||
} else {
|
||||
// Otherwise update it.
|
||||
BTreePageIDRef newID = wait(self->updateBTreePage(
|
||||
self, rootID, &update->newLinks.arena(), page.castTo<ArenaPage>(), writeVersion));
|
||||
self, rootID, &update->newLinks.arena(), pageCopy.castTo<ArenaPage>(), writeVersion));
|
||||
|
||||
update->updatedInPlace(newID, btPage, newID.size() * self->m_blockSize);
|
||||
debug_printf(
|
||||
|
@ -5206,13 +5290,13 @@ private:
|
|||
// Note: parentInfo could be invalid after a wait and must be re-initialized.
|
||||
// All uses below occur before waits so no reinitialization is done.
|
||||
state ParentInfo* parentInfo = &self->childUpdateTracker[rootID.front()];
|
||||
state InternalPageModifier m(btPage, cursor, tryToUpdate, parentInfo);
|
||||
state InternalPageModifier modifier(page, cursor, tryToUpdate, parentInfo);
|
||||
|
||||
// Apply the possible changes for each subtree range recursed to, except the last one.
|
||||
// For each range, the expected next record, if any, is checked against the first boundary
|
||||
// of the next range, if any.
|
||||
for (int i = 0, iEnd = slices.size() - 1; i < iEnd; ++i) {
|
||||
m.applyUpdate(*slices[i], slices[i + 1]->getFirstBoundary());
|
||||
modifier.applyUpdate(*slices[i], slices[i + 1]->getFirstBoundary());
|
||||
}
|
||||
|
||||
// The expected next record for the final range is checked against one of the upper boundaries passed to
|
||||
|
@ -5222,39 +5306,41 @@ private:
|
|||
// sole purpose of adding a dummy upper bound record.
|
||||
debug_printf("%s Applying final child range update. changesMade=%d Parent update is: %s\n",
|
||||
context.c_str(),
|
||||
m.changesMade,
|
||||
modifier.changesMade,
|
||||
update->toString().c_str());
|
||||
m.applyUpdate(*slices.back(), m.changesMade ? &update->subtreeUpperBound : &update->decodeUpperBound);
|
||||
modifier.applyUpdate(*slices.back(),
|
||||
modifier.changesMade ? &update->subtreeUpperBound : &update->decodeUpperBound);
|
||||
|
||||
state bool detachChildren = (parentInfo->count > 2);
|
||||
state bool forceUpdate = false;
|
||||
|
||||
// If no changes were made, but we should rewrite it to point directly to remapped child pages
|
||||
if (!m.changesMade && detachChildren) {
|
||||
if (!modifier.changesMade && detachChildren) {
|
||||
debug_printf(
|
||||
"%s Internal page forced rewrite because at least %d children have been updated in-place.\n",
|
||||
context.c_str(),
|
||||
parentInfo->count);
|
||||
forceUpdate = true;
|
||||
if (!m.updating) {
|
||||
m.updating = true;
|
||||
|
||||
// Copy the page before modification if the page references the cache
|
||||
if (fromCache) {
|
||||
page = self->cloneForUpdate(page);
|
||||
btPage = (BTreePage*)page->begin();
|
||||
m.btPage = btPage;
|
||||
cursor.tree = &btPage->tree();
|
||||
m.c.tree = cursor.tree;
|
||||
fromCache = false;
|
||||
}
|
||||
}
|
||||
forceUpdate = true;
|
||||
modifier.updating = true;
|
||||
|
||||
// Make sure the modifier cloned the page so we can update the child links in-place below.
|
||||
modifier.cloneForUpdate();
|
||||
|
||||
++g_redwoodMetrics.level(btPage->height).forceUpdate;
|
||||
}
|
||||
|
||||
// If the modifier cloned the page for updating, then update our local pageCopy, btPage, and cursor
|
||||
if (modifier.clonedPage) {
|
||||
pageCopy = modifier.page;
|
||||
btPage = modifier.btPage();
|
||||
cursor.tree = modifier.cursor.tree;
|
||||
cursor.switchTree(modifier.cursor.tree);
|
||||
}
|
||||
|
||||
// If page contents have changed
|
||||
if (m.changesMade || forceUpdate) {
|
||||
if (m.empty()) {
|
||||
if (modifier.changesMade || forceUpdate) {
|
||||
if (modifier.empty()) {
|
||||
update->cleared();
|
||||
debug_printf("%s All internal page children were deleted so deleting this page too, returning %s\n",
|
||||
context.c_str(),
|
||||
|
@ -5262,7 +5348,7 @@ private:
|
|||
self->freeBTreePage(rootID, writeVersion);
|
||||
self->childUpdateTracker.erase(rootID.front());
|
||||
} else {
|
||||
if (m.updating) {
|
||||
if (modifier.updating) {
|
||||
// Page was updated in place (or being forced to be updated in place to update child page ids)
|
||||
debug_printf(
|
||||
"%s Internal page modified in-place tryToUpdate=%d forceUpdate=%d detachChildren=%d\n",
|
||||
|
@ -5301,7 +5387,7 @@ private:
|
|||
}
|
||||
|
||||
BTreePageIDRef newID = wait(self->updateBTreePage(
|
||||
self, rootID, &update->newLinks.arena(), page.castTo<ArenaPage>(), writeVersion));
|
||||
self, rootID, &update->newLinks.arena(), pageCopy.castTo<ArenaPage>(), writeVersion));
|
||||
debug_printf(
|
||||
"%s commitSubtree(): Internal page updated in-place at version %s, new contents: %s\n",
|
||||
context.c_str(),
|
||||
|
@ -5325,7 +5411,7 @@ private:
|
|||
|
||||
if (detachChildren) {
|
||||
auto& stats = g_redwoodMetrics.level(btPage->height);
|
||||
for (auto& rec : m.rebuild) {
|
||||
for (auto& rec : modifier.rebuild) {
|
||||
if (rec.value.present()) {
|
||||
BTreePageIDRef oldPages = rec.getChildPage();
|
||||
BTreePageIDRef newPages;
|
||||
|
@ -5336,7 +5422,7 @@ private:
|
|||
if (newID != invalidLogicalPageID) {
|
||||
// Rebuild record values reference original page memory so make a copy
|
||||
if (newPages.empty()) {
|
||||
newPages = BTreePageIDRef(m.rebuild.arena(), oldPages);
|
||||
newPages = BTreePageIDRef(modifier.rebuild.arena(), oldPages);
|
||||
rec.setChildPage(newPages);
|
||||
}
|
||||
debug_printf("%s Detach updated %u -> %u\n", context.c_str(), p, newID);
|
||||
|
@ -5354,7 +5440,7 @@ private:
|
|||
wait(writePages(self,
|
||||
&update->subtreeLowerBound,
|
||||
&update->subtreeUpperBound,
|
||||
m.rebuild,
|
||||
modifier.rebuild,
|
||||
btPage->height,
|
||||
writeVersion,
|
||||
rootID));
|
||||
|
|
Loading…
Reference in New Issue