1181 lines
34 KiB
C++
1181 lines
34 KiB
C++
/*
 * SkipList.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
#include <stdint.h>
|
|
#include <memory.h>
|
|
#include <stdio.h>
|
|
#include <algorithm>
|
|
#include <numeric>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include "flow/Platform.h"
|
|
#include "fdbrpc/fdbrpc.h"
|
|
#include "fdbrpc/PerfMetric.h"
|
|
#include "fdbclient/FDBTypes.h"
|
|
#include "fdbclient/KeyRangeMap.h"
|
|
#include "fdbclient/SystemData.h"
|
|
#include "fdbserver/ConflictSet.h"
|
|
|
|
using std::max;
|
|
using std::min;
|
|
|
|
static std::vector<PerfDoubleCounter*> skc;
|
|
|
|
// Per-thread state of the generator driven by skfastrand(); starts at zero.
static thread_local uint32_t g_seed = 0;

// Advances the per-thread linear congruential generator by one step
// (state = state * 1664525 + 1013904223, truncated to 32 bits) and returns
// the new state converted to int.
static inline int skfastrand() {
    const auto advanced = g_seed * 1664525L + 1013904223L;
    g_seed = advanced;
    return g_seed;
}
|
|
|
|
// Timing counters for the individual phases of conflict detection. Each one is
// constructed with its display name and the shared skc vector.
PerfDoubleCounter g_buildTest("Build", skc);
PerfDoubleCounter g_add("Add", skc);
PerfDoubleCounter g_detectConflicts("Detect", skc);
PerfDoubleCounter g_sort("D.Sort", skc);
PerfDoubleCounter g_combine("D.Combine", skc);
PerfDoubleCounter g_checkRead("D.CheckRead", skc);
PerfDoubleCounter g_checkBatch("D.CheckIntraBatch", skc);
PerfDoubleCounter g_merge("D.MergeWrite", skc);
PerfDoubleCounter g_removeBefore("D.RemoveBefore", skc);
|
|
|
|
// Three-way bytewise comparison of two strings.
// Returns -1 if a orders before b, +1 if it orders after, and 0 if both are equal.
// When one string is a prefix of the other, the shorter one orders first.
static force_inline int compare(const StringRef& a, const StringRef& b) {
    const int prefixOrder = memcmp(a.begin(), b.begin(), min(a.size(), b.size()));
    if (prefixOrder != 0)
        return prefixOrder < 0 ? -1 : +1;

    // The common prefix is identical, so the lengths decide.
    if (a.size() == b.size())
        return 0;
    return a.size() < b.size() ? -1 : +1;
}
|
|
|
|
struct ReadConflictRange {
|
|
StringRef begin, end;
|
|
Version version;
|
|
int transaction;
|
|
int indexInTx;
|
|
VectorRef<int>* conflictingKeyRange;
|
|
Arena* cKRArena;
|
|
|
|
ReadConflictRange(StringRef begin,
|
|
StringRef end,
|
|
Version version,
|
|
int transaction,
|
|
int indexInTx,
|
|
VectorRef<int>* cKR = nullptr,
|
|
Arena* cKRArena = nullptr)
|
|
: begin(begin), end(end), version(version), transaction(transaction), indexInTx(indexInTx),
|
|
conflictingKeyRange(cKR), cKRArena(cKRArena) {}
|
|
bool operator<(const ReadConflictRange& rhs) const { return compare(begin, rhs.begin) < 0; }
|
|
};
|
|
|
|
struct KeyInfo {
|
|
StringRef key;
|
|
int* pIndex;
|
|
bool begin;
|
|
bool write;
|
|
int transaction;
|
|
|
|
KeyInfo() = default;
|
|
KeyInfo(StringRef key, bool begin, bool write, int transaction, int* pIndex)
|
|
: key(key), pIndex(pIndex), begin(begin), write(write), transaction(transaction) {}
|
|
};
|
|
|
|
// Tie-break rank for points that share the same key:
// end-of-read = 0, end-of-write = 1, begin-of-write = 2, begin-of-read = 3.
force_inline int extra_ordering(const KeyInfo& ki) {
    const int beginWeight = ki.begin ? 2 : 0;
    const int kindWeight = (ki.write != ki.begin) ? 1 : 0;
    return beginWeight + kindWeight;
}
|
|
|
|
// returns true if done with string
|
|
force_inline bool getCharacter(const KeyInfo& ki, int character, int& outputCharacter) {
|
|
// normal case
|
|
if (character < ki.key.size()) {
|
|
outputCharacter = 5 + ki.key.begin()[character];
|
|
return false;
|
|
}
|
|
|
|
// termination
|
|
if (character == ki.key.size()) {
|
|
outputCharacter = 0;
|
|
return false;
|
|
}
|
|
|
|
if (character == ki.key.size() + 1) {
|
|
// end/begin+read/write relative sorting
|
|
outputCharacter = extra_ordering(ki);
|
|
return false;
|
|
}
|
|
|
|
outputCharacter = 0;
|
|
return true;
|
|
}
|
|
|
|
// Strict ordering of points: by key bytes first, a shorter key before any longer key
// it is a prefix of, and extra_ordering() as the tie-break between identical keys.
bool operator<(const KeyInfo& lhs, const KeyInfo& rhs) {
    const int sharedLength = min(lhs.key.size(), rhs.key.size());
    const int prefixOrder = memcmp(lhs.key.begin(), rhs.key.begin(), sharedLength);
    if (prefixOrder != 0)
        return prefixOrder < 0;

    // Identical common prefix: the shorter key sorts first.
    if (lhs.key.size() != rhs.key.size())
        return lhs.key.size() < rhs.key.size();

    // Identical keys: fall back to the begin/end and read/write rank.
    return extra_ordering(lhs) < extra_ordering(rhs);
}
|
|
|
|
// Two points are equal when neither orders before the other under operator<.
bool operator==(const KeyInfo& lhs, const KeyInfo& rhs) {
    return !(lhs < rhs) && !(rhs < lhs);
}
|
|
|
|
void swapSort(std::vector<KeyInfo>& points, int a, int b) {
|
|
if (points[b] < points[a]) {
|
|
KeyInfo temp;
|
|
temp = points[a];
|
|
points[a] = points[b];
|
|
points[b] = temp;
|
|
}
|
|
}
|
|
|
|
void smallSort(std::vector<KeyInfo>& points, int start, int N) {
|
|
for (int i = 1; i < N; i++)
|
|
for (int j = i; j > 0; j -= 2)
|
|
swapSort(points, start + j - 1, start + j);
|
|
for (int i = N - 2; i > 0; i--)
|
|
for (int j = i; j > 0; j -= 2)
|
|
swapSort(points, start + j - 1, start + j);
|
|
}
|
|
|
|
// A pending unit of work for sortPoints(): the slice [begin, begin + size) of the
// point array still has to be ordered starting at symbol position `character`.
struct SortTask {
    int begin;
    int size;
    int character;

    SortTask(int sliceBegin, int sliceSize, int symbolIndex)
      : begin(sliceBegin), size(sliceSize), character(symbolIndex) {}
};
|
|
|
|
void sortPoints(std::vector<KeyInfo>& points) {
|
|
std::vector<SortTask> tasks;
|
|
std::vector<KeyInfo> newPoints;
|
|
std::vector<int> counts;
|
|
|
|
tasks.emplace_back(0, points.size(), 0);
|
|
|
|
while (tasks.size()) {
|
|
SortTask st = tasks.back();
|
|
tasks.pop_back();
|
|
|
|
if (st.size < 10) {
|
|
// smallSort(points, st.begin, st.size);
|
|
std::sort(points.begin() + st.begin, points.begin() + st.begin + st.size);
|
|
continue;
|
|
}
|
|
|
|
newPoints.resize(st.size);
|
|
counts.assign(256 + 5, 0);
|
|
|
|
// get counts
|
|
int c;
|
|
bool allDone = true;
|
|
for (int i = st.begin; i < st.begin + st.size; i++) {
|
|
allDone &= getCharacter(points[i], st.character, c);
|
|
counts[c]++;
|
|
}
|
|
if (allDone)
|
|
continue;
|
|
|
|
// calculate offsets from counts and build next level of tasks
|
|
int total = 0;
|
|
for (int i = 0; i < counts.size(); i++) {
|
|
int temp = counts[i];
|
|
if (temp > 1)
|
|
tasks.emplace_back(st.begin + total, temp, st.character + 1);
|
|
counts[i] = total;
|
|
total += temp;
|
|
}
|
|
|
|
// put in their places
|
|
for (int i = st.begin; i < st.begin + st.size; i++) {
|
|
getCharacter(points[i], st.character, c);
|
|
newPoints[counts[c]++] = points[i];
|
|
}
|
|
|
|
// copy back into original points array
|
|
for (int i = 0; i < st.size; i++)
|
|
points[st.begin + i] = newPoints[i];
|
|
}
|
|
}
|
|
|
|
class SkipList : NonCopyable {
|
|
private:
|
|
static constexpr int MaxLevels = 26;
|
|
|
|
int randomLevel() const {
|
|
uint32_t i = uint32_t(skfastrand()) >> (32 - (MaxLevels - 1));
|
|
int level = 0;
|
|
while (i & 1) {
|
|
i >>= 1;
|
|
level++;
|
|
}
|
|
ASSERT(level < MaxLevels);
|
|
return level;
|
|
}
|
|
|
|
// Represent a node in the SkipList. The node has multiple (i.e., level) pointers to
|
|
// other nodes, and keeps a record of the max versions for each level.
|
|
struct Node {
|
|
int level() const { return nPointers - 1; }
|
|
uint8_t* value() { return end() + nPointers * (sizeof(Node*) + sizeof(Version)); }
|
|
int length() const { return valueLength; }
|
|
|
|
// Returns the next node pointer at the given level.
|
|
Node* getNext(int level) { return *((Node**)end() + level); }
|
|
// Sets the next node pointer at the given level.
|
|
void setNext(int level, Node* n) { *((Node**)end() + level) = n; }
|
|
|
|
// Returns the max version at the given level.
|
|
Version getMaxVersion(int i) const { return ((Version*)(end() + nPointers * sizeof(Node*)))[i]; }
|
|
// Sets the max version at the given level.
|
|
void setMaxVersion(int i, Version v) { ((Version*)(end() + nPointers * sizeof(Node*)))[i] = v; }
|
|
|
|
// Return a node with initialized value but uninitialized pointers
|
|
// Memory layout: *this, (level+1) Node*, (level+1) Version, value
|
|
static Node* create(const StringRef& value, int level) {
|
|
int nodeSize = sizeof(Node) + value.size() + (level + 1) * (sizeof(Node*) + sizeof(Version));
|
|
|
|
Node* n;
|
|
if (nodeSize <= 64) {
|
|
n = (Node*)FastAllocator<64>::allocate();
|
|
INSTRUMENT_ALLOCATE("SkipListNode64");
|
|
} else if (nodeSize <= 128) {
|
|
n = (Node*)FastAllocator<128>::allocate();
|
|
INSTRUMENT_ALLOCATE("SkipListNode128");
|
|
} else {
|
|
n = (Node*)new char[nodeSize];
|
|
INSTRUMENT_ALLOCATE("SkipListNodeLarge");
|
|
}
|
|
|
|
n->nPointers = level + 1;
|
|
|
|
n->valueLength = value.size();
|
|
if (value.size() > 0) {
|
|
memcpy(n->value(), value.begin(), value.size());
|
|
}
|
|
return n;
|
|
}
|
|
|
|
// pre: level>0, all lower level nodes between this and getNext(level) have correct maxversions
|
|
void calcVersionForLevel(int level) {
|
|
Node* end = getNext(level);
|
|
Version v = getMaxVersion(level - 1);
|
|
for (Node* x = getNext(level - 1); x != end; x = x->getNext(level - 1))
|
|
v = max(v, x->getMaxVersion(level - 1));
|
|
setMaxVersion(level, v);
|
|
}
|
|
|
|
void destroy() {
|
|
int nodeSize = getNodeSize();
|
|
if (nodeSize <= 64) {
|
|
FastAllocator<64>::release(this);
|
|
INSTRUMENT_RELEASE("SkipListNode64");
|
|
} else if (nodeSize <= 128) {
|
|
FastAllocator<128>::release(this);
|
|
INSTRUMENT_RELEASE("SkipListNode128");
|
|
} else {
|
|
delete[](char*) this;
|
|
INSTRUMENT_RELEASE("SkipListNodeLarge");
|
|
}
|
|
}
|
|
|
|
private:
|
|
int getNodeSize() const { return sizeof(Node) + valueLength + nPointers * (sizeof(Node*) + sizeof(Version)); }
|
|
// Returns the first Node* pointer
|
|
uint8_t* end() { return (uint8_t*)(this + 1); }
|
|
uint8_t const* end() const { return (uint8_t const*)(this + 1); }
|
|
int nPointers, valueLength;
|
|
};
|
|
|
|
// True when the byte string (a, aLen) orders strictly before (b, bLen);
// a proper prefix orders before the longer string.
static force_inline bool less(const uint8_t* a, int aLen, const uint8_t* b, int bLen) {
    const int prefixOrder = memcmp(a, b, min(aLen, bLen));
    return prefixOrder != 0 ? prefixOrder < 0 : aLen < bLen;
}
|
|
|
|
Node* header;
|
|
|
|
void destroy() {
|
|
Node *next, *x;
|
|
for (x = header; x; x = next) {
|
|
next = x->getNext(0);
|
|
x->destroy();
|
|
}
|
|
}
|
|
|
|
public:
|
|
// Points the location (i.e., Node*) that value would appear in the SkipList.
|
|
// If the "value" is in the list, then finger[0] points to that exact node;
|
|
// otherwise, the finger points to Nodes that the value should be inserted before.
|
|
// Note the SkipList organizes all nodes at level 0, higher levels contain jump pointers.
|
|
struct Finger {
|
|
Node* finger[MaxLevels]; // valid for levels >= level
|
|
int level = MaxLevels;
|
|
Node* x = nullptr;
|
|
Node* alreadyChecked = nullptr;
|
|
StringRef value;
|
|
|
|
Finger() = default;
|
|
Finger(Node* header, const StringRef& ptr) : x(header), value(ptr) {}
|
|
|
|
void init(const StringRef& value, Node* header) {
|
|
this->value = value;
|
|
x = header;
|
|
alreadyChecked = nullptr;
|
|
level = MaxLevels;
|
|
}
|
|
|
|
// pre: !finished()
|
|
force_inline void prefetch() {
|
|
Node* next = x->getNext(level - 1);
|
|
if (next) {
|
|
_mm_prefetch((const char*)next, _MM_HINT_T0);
|
|
_mm_prefetch((const char*)next + 64, _MM_HINT_T0);
|
|
}
|
|
}
|
|
|
|
// pre: !finished()
|
|
// Advances the pointer at the current level to a Node that's >= finger's value
|
|
// if possible; or move to the next level (i.e., level--).
|
|
// Returns true if we have advanced to the next level
|
|
force_inline bool advance() {
|
|
Node* next = x->getNext(level - 1);
|
|
|
|
if (next == alreadyChecked || !less(next->value(), next->length(), value.begin(), value.size())) {
|
|
alreadyChecked = next;
|
|
level--;
|
|
finger[level] = x;
|
|
return true;
|
|
} else {
|
|
x = next;
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// pre: !finished()
|
|
force_inline void nextLevel() {
|
|
while (!advance())
|
|
;
|
|
}
|
|
|
|
force_inline bool finished() const { return level == 0; }
|
|
|
|
// Returns if the finger value is found in the SkipList.
|
|
force_inline Node* found() const {
|
|
// valid after finished returns true
|
|
Node* n = finger[0]->getNext(0); // or alreadyChecked, but that is more easily invalidated
|
|
if (n && n->length() == value.size() && !memcmp(n->value(), value.begin(), value.size()))
|
|
return n;
|
|
else
|
|
return nullptr;
|
|
}
|
|
|
|
StringRef getValue() const {
|
|
Node* n = finger[0]->getNext(0);
|
|
return n ? StringRef(n->value(), n->length()) : StringRef();
|
|
}
|
|
};
|
|
|
|
// Returns the total number of nodes in the list.
|
|
int count() const {
|
|
int count = 0;
|
|
Node* x = header->getNext(0);
|
|
while (x) {
|
|
x = x->getNext(0);
|
|
count++;
|
|
}
|
|
return count;
|
|
}
|
|
|
|
explicit SkipList(Version version = 0) {
|
|
header = Node::create(StringRef(), MaxLevels - 1);
|
|
for (int l = 0; l < MaxLevels; l++) {
|
|
header->setNext(l, nullptr);
|
|
header->setMaxVersion(l, version);
|
|
}
|
|
}
|
|
~SkipList() { destroy(); }
|
|
SkipList(SkipList&& other) noexcept : header(other.header) { other.header = nullptr; }
|
|
void operator=(SkipList&& other) noexcept {
|
|
destroy();
|
|
header = other.header;
|
|
other.header = nullptr;
|
|
}
|
|
void swap(SkipList& other) { std::swap(header, other.header); }
|
|
|
|
void addConflictRanges(const Finger* fingers, int rangeCount, Version version) {
|
|
for (int r = rangeCount - 1; r >= 0; r--) {
|
|
const Finger& startF = fingers[r * 2];
|
|
const Finger& endF = fingers[r * 2 + 1];
|
|
|
|
if (endF.found() == nullptr)
|
|
insert(endF, endF.finger[0]->getMaxVersion(0));
|
|
|
|
remove(startF, endF);
|
|
insert(startF, version);
|
|
}
|
|
}
|
|
|
|
// Checks `count` read ranges against the list and sets transactionConflictStatus[t]
// for every transaction t that owns a conflicting range. Up to M checks are kept in
// flight at once and stepped round-robin through a circular job list.
void detectConflicts(ReadConflictRange* ranges, int count, bool* transactionConflictStatus) {
    if (!count)
        return;

    const int M = 16;
    int nextJob[M];
    CheckMax inProgress[M];

    // (Re)initializes job slot `slot` to check ranges[rangeIndex].
    auto launch = [&](int slot, int rangeIndex) {
        const ReadConflictRange& r = ranges[rangeIndex];
        inProgress[slot].init(r, header, transactionConflictStatus, r.indexInTx, r.conflictingKeyRange, r.cKRArena);
    };

    int started = min(M, count);
    for (int i = 0; i < started; i++) {
        launch(i, i);
        nextJob[i] = i + 1;
    }
    nextJob[started - 1] = 0; // close the ring

    int prevJob = started - 1;
    int job = 0;
    // vtune: 340 parts
    for (;;) {
        if (inProgress[job].advance()) {
            if (started != count) {
                // Reuse the finished slot for the next range that has not been started yet.
                launch(job, started++);
            } else {
                // No ranges left: unlink the slot from the ring, or stop if it was the last one.
                if (prevJob == job)
                    break;
                nextJob[prevJob] = nextJob[job];
                job = prevJob;
            }
        }
        prevJob = job;
        job = nextJob[job];
    }
}
|
|
|
|
// Splits the version history represented by this skiplist into separate key ranges
|
|
// delimited by the given array of keys. This SkipList is left empty. this->partition
|
|
// is intended to be followed by a call to this->concatenate() recombining the same
|
|
// partitions. In between, operations on each partition must not touch any keys outside
|
|
// the partition. Specifically, the partition to the left of 'key' must not have a range
|
|
// [...,key) inserted, since that would insert an entry at 'key'.
|
|
// Note this function is not used.
|
|
void partition(StringRef* begin, int splitCount, SkipList* output) {
|
|
for (int i = splitCount - 1; i >= 0; i--) {
|
|
Finger f(header, begin[i]);
|
|
while (!f.finished())
|
|
f.nextLevel();
|
|
split(f, output[i + 1]);
|
|
}
|
|
swap(output[0]);
|
|
}
|
|
|
|
// Concatenates multiple SkipList objects into one and stores in input[0].
|
|
// Note this function is not used.
|
|
void concatenate(SkipList* input, int count) {
|
|
std::vector<Finger> ends(count - 1);
|
|
for (int i = 0; i < ends.size(); i++)
|
|
input[i].getEnd(ends[i]);
|
|
|
|
for (int l = 0; l < MaxLevels; l++) {
|
|
for (int i = ends.size() - 1; i >= 0; i--) {
|
|
ends[i].finger[l]->setNext(l, input[i + 1].header->getNext(l));
|
|
if (l && (!i || ends[i].finger[l] != input[i].header))
|
|
ends[i].finger[l]->calcVersionForLevel(l);
|
|
input[i + 1].header->setNext(l, nullptr);
|
|
}
|
|
}
|
|
swap(input[0]);
|
|
}
|
|
|
|
void find(const StringRef* values, Finger* results, int* temp, int count) {
|
|
// Relying on the ordering of values, descend until the values aren't all in the
|
|
// same part of the tree
|
|
|
|
// vtune: 11 parts
|
|
results[0].init(values[0], header);
|
|
const StringRef& endValue = values[count - 1];
|
|
while (results[0].level > 1) {
|
|
results[0].nextLevel();
|
|
Node* ac = results[0].alreadyChecked;
|
|
if (ac && less(ac->value(), ac->length(), endValue.begin(), endValue.size()))
|
|
break;
|
|
}
|
|
|
|
// Init all the other fingers to start descending where we stopped
|
|
// the first one
|
|
|
|
// SOMEDAY: this loop showed up on vtune, could be faster?
|
|
// vtune: 8 parts
|
|
int startLevel = results[0].level + 1;
|
|
Node* x = startLevel < MaxLevels ? results[0].finger[startLevel] : header;
|
|
for (int i = 1; i < count; i++) {
|
|
results[i].level = startLevel;
|
|
results[i].x = x;
|
|
results[i].alreadyChecked = nullptr;
|
|
results[i].value = values[i];
|
|
for (int j = startLevel; j < MaxLevels; j++)
|
|
results[i].finger[j] = results[0].finger[j];
|
|
}
|
|
|
|
int* nextJob = temp;
|
|
for (int i = 0; i < count - 1; i++)
|
|
nextJob[i] = i + 1;
|
|
nextJob[count - 1] = 0;
|
|
|
|
int prevJob = count - 1;
|
|
int job = 0;
|
|
|
|
// vtune: 225 parts
|
|
while (true) {
|
|
Finger* f = &results[job];
|
|
f->advance();
|
|
if (f->finished()) {
|
|
if (prevJob == job)
|
|
break;
|
|
nextJob[prevJob] = nextJob[job];
|
|
} else {
|
|
f->prefetch();
|
|
prevJob = job;
|
|
}
|
|
job = nextJob[job];
|
|
}
|
|
}
|
|
|
|
// Visits up to `nodeCount` nodes following f.finger[0] at level 0 and removes each
// node whose level-0 max version is below `v` when the node visited just before it
// also was below `v` (the first visited node is never removed). `f` is advanced past
// the nodes that are kept. Returns the number of nodes removed.
int removeBefore(Version v, Finger& f, int nodeCount) {
    // f.x, f.alreadyChecked?

    int removedCount = 0;
    bool wasAbove = true;
    for (int remaining = nodeCount; remaining-- != 0;) {
        Node* x = f.finger[0]->getNext(0);
        if (!x)
            break;

        // double prefetch gives +25% speed (single threaded)
        Node* ahead = x->getNext(0);
        _mm_prefetch((const char*)ahead, _MM_HINT_T0);
        ahead = x->getNext(1);
        _mm_prefetch((const char*)ahead, _MM_HINT_T0);

        const bool isAbove = x->getMaxVersion(0) >= v;
        const int top = x->level();
        if (isAbove || wasAbove) { // f.nextItem
            for (int l = 0; l <= top; l++)
                f.finger[l] = x;
        } else { // f.eraseItem
            removedCount++;
            for (int l = 0; l <= top; l++) {
                f.finger[l]->setNext(l, x->getNext(l));
                // The predecessor now also covers what the removed node covered above level 0.
                if (l >= 1)
                    f.finger[l]->setMaxVersion(l, max(f.finger[l]->getMaxVersion(l), x->getMaxVersion(l)));
            }
            x->destroy();
        }
        wasAbove = isAbove;
    }

    return removedCount;
}
|
|
|
|
private:
|
|
void remove(const Finger& start, const Finger& end) {
|
|
if (start.finger[0] == end.finger[0])
|
|
return;
|
|
|
|
Node* x = start.finger[0]->getNext(0);
|
|
|
|
// vtune says: this loop is the expensive parts (6 parts)
|
|
for (int i = 0; i < MaxLevels; i++)
|
|
if (start.finger[i] != end.finger[i])
|
|
start.finger[i]->setNext(i, end.finger[i]->getNext(i));
|
|
|
|
while (true) {
|
|
Node* next = x->getNext(0);
|
|
x->destroy();
|
|
if (x == end.finger[0])
|
|
break;
|
|
x = next;
|
|
}
|
|
}
|
|
|
|
void insert(const Finger& f, Version version) {
|
|
int level = randomLevel();
|
|
// cout << std::string((const char*)value,length) << " level: " << level << endl;
|
|
Node* x = Node::create(f.value, level);
|
|
x->setMaxVersion(0, version);
|
|
for (int i = 0; i <= level; i++) {
|
|
x->setNext(i, f.finger[i]->getNext(i));
|
|
f.finger[i]->setNext(i, x);
|
|
}
|
|
// vtune says: this loop is the costly part of this function
|
|
for (int i = 1; i <= level; i++) {
|
|
f.finger[i]->calcVersionForLevel(i);
|
|
x->calcVersionForLevel(i);
|
|
}
|
|
for (int i = level + 1; i < MaxLevels; i++) {
|
|
Version v = f.finger[i]->getMaxVersion(i);
|
|
if (v >= version)
|
|
break;
|
|
f.finger[i]->setMaxVersion(i, version);
|
|
}
|
|
}
|
|
|
|
void insert(const StringRef& value, Version version) {
|
|
Finger f(header, value);
|
|
while (!f.finished())
|
|
f.nextLevel();
|
|
// SOMEDAY: equality?
|
|
insert(f, version);
|
|
}
|
|
|
|
struct CheckMax {
|
|
Finger start, end;
|
|
Version version;
|
|
bool* result;
|
|
int state;
|
|
int indexInTx;
|
|
VectorRef<int>* conflictingKeyRange; // nullptr if report_conflicting_keys is not enabled.
|
|
Arena* cKRArena; // nullptr if report_conflicting_keys is not enabled.
|
|
|
|
void init(const ReadConflictRange& r,
|
|
Node* header,
|
|
bool* tCS,
|
|
int indexInTx,
|
|
VectorRef<int>* cKR,
|
|
Arena* cKRArena) {
|
|
this->start.init(r.begin, header);
|
|
this->end.init(r.end, header);
|
|
this->version = r.version;
|
|
this->indexInTx = indexInTx;
|
|
this->cKRArena = cKRArena;
|
|
result = &tCS[r.transaction];
|
|
conflictingKeyRange = cKR;
|
|
this->state = 0;
|
|
}
|
|
|
|
bool noConflict() const { return true; }
|
|
bool conflict() {
|
|
*result = true;
|
|
if (conflictingKeyRange != nullptr)
|
|
conflictingKeyRange->push_back(*cKRArena, indexInTx);
|
|
return true;
|
|
}
|
|
|
|
// Return true if finished
|
|
force_inline bool advance() {
|
|
switch (state) {
|
|
case 0:
|
|
// find where start and end fingers diverge
|
|
while (true) {
|
|
if (!start.advance()) {
|
|
start.prefetch();
|
|
return false;
|
|
}
|
|
end.x = start.x;
|
|
while (!end.advance())
|
|
;
|
|
|
|
int l = start.level;
|
|
if (start.finger[l] != end.finger[l])
|
|
break;
|
|
// accept if the range spans the check range, but does not have a greater version
|
|
if (start.finger[l]->getMaxVersion(l) <= version)
|
|
return noConflict();
|
|
if (l == 0)
|
|
return conflict();
|
|
}
|
|
state = 1;
|
|
case 1: {
|
|
// check the end side of the pyramid
|
|
Node* e = end.finger[end.level];
|
|
while (e->getMaxVersion(end.level) > version) {
|
|
if (end.finished())
|
|
return conflict();
|
|
end.nextLevel();
|
|
Node* f = end.finger[end.level];
|
|
while (e != f) {
|
|
if (e->getMaxVersion(end.level) > version)
|
|
return conflict();
|
|
e = e->getNext(end.level);
|
|
}
|
|
}
|
|
|
|
// check the start side of the pyramid
|
|
Node* s = end.finger[start.level];
|
|
while (true) {
|
|
Node* nextS = start.finger[start.level]->getNext(start.level);
|
|
Node* p = nextS;
|
|
while (p != s) {
|
|
if (p->getMaxVersion(start.level) > version)
|
|
return conflict();
|
|
p = p->getNext(start.level);
|
|
}
|
|
if (start.finger[start.level]->getMaxVersion(start.level) <= version)
|
|
return noConflict();
|
|
s = nextS;
|
|
if (start.finished()) {
|
|
if (nextS->length() == start.value.size() &&
|
|
!memcmp(nextS->value(), start.value.begin(), start.value.size()))
|
|
return noConflict();
|
|
else
|
|
return conflict();
|
|
}
|
|
start.nextLevel();
|
|
}
|
|
}
|
|
default:
|
|
__assume(false);
|
|
}
|
|
}
|
|
};
|
|
|
|
// Splits the SkipLists so that those after finger is moved to "right".
|
|
void split(const Finger& f, SkipList& right) {
|
|
ASSERT(!right.header->getNext(0)); // right must be empty
|
|
right.header->setMaxVersion(0, f.finger[0]->getMaxVersion(0));
|
|
for (int l = 0; l < MaxLevels; l++) {
|
|
right.header->setNext(l, f.finger[l]->getNext(l));
|
|
f.finger[l]->setNext(l, nullptr);
|
|
}
|
|
}
|
|
|
|
// Sets end's finger to the last nodes at all levels.
|
|
void getEnd(Finger& end) {
|
|
Node* node = header;
|
|
for (int l = MaxLevels - 1; l >= 0; l--) {
|
|
Node* next;
|
|
while ((next = node->getNext(l)) != nullptr)
|
|
node = next;
|
|
end.finger[l] = node;
|
|
}
|
|
end.level = 0;
|
|
}
|
|
};
|
|
|
|
struct ConflictSet {
|
|
ConflictSet() : removalKey(makeString(0)), oldestVersion(0) {}
|
|
~ConflictSet() {}
|
|
|
|
SkipList versionHistory;
|
|
Key removalKey;
|
|
Version oldestVersion;
|
|
};
|
|
|
|
// Allocates a new, empty ConflictSet; release it with destroyConflictSet().
ConflictSet* newConflictSet() {
    ConflictSet* created = new ConflictSet;
    return created;
}
|
|
// Replaces the version history of `cs` with an empty list whose header carries
// version `v`; the previous history is destroyed. removalKey and oldestVersion are
// left as they are.
void clearConflictSet(ConflictSet* cs, Version v) {
    SkipList fresh(v);
    fresh.swap(cs->versionHistory);
}
|
|
// Destroys a ConflictSet obtained from newConflictSet().
void destroyConflictSet(ConflictSet* cs) {
    delete cs;
}
|
|
|
|
// Starts an empty batch against `cs`. `conflictingKeyRangeMap` and
// `resolveBatchReplyArena` are only dereferenced for transactions that ask for
// conflicting keys to be reported.
ConflictBatch::ConflictBatch(ConflictSet* cs,
                             std::map<int, VectorRef<int>>* conflictingKeyRangeMap,
                             Arena* resolveBatchReplyArena)
  : cs(cs),
    transactionCount(0),
    conflictingKeyRangeMap(conflictingKeyRangeMap),
    resolveBatchReplyArena(resolveBatchReplyArena) {}

ConflictBatch::~ConflictBatch() = default;
|
|
|
|
struct TransactionInfo {
|
|
VectorRef<std::pair<int, int>> readRanges;
|
|
VectorRef<std::pair<int, int>> writeRanges;
|
|
bool tooOld;
|
|
bool reportConflictingKeys;
|
|
};
|
|
|
|
void ConflictBatch::addTransaction(const CommitTransactionRef& tr) {
|
|
const int t = transactionCount++;
|
|
|
|
Arena& arena = transactionInfo.arena();
|
|
TransactionInfo* info = new (arena) TransactionInfo;
|
|
info->reportConflictingKeys = tr.report_conflicting_keys;
|
|
|
|
if (tr.read_snapshot < cs->oldestVersion && tr.read_conflict_ranges.size()) {
|
|
info->tooOld = true;
|
|
} else {
|
|
info->tooOld = false;
|
|
info->readRanges.resize(arena, tr.read_conflict_ranges.size());
|
|
info->writeRanges.resize(arena, tr.write_conflict_ranges.size());
|
|
|
|
for (int r = 0; r < tr.read_conflict_ranges.size(); r++) {
|
|
const KeyRangeRef& range = tr.read_conflict_ranges[r];
|
|
points.emplace_back(range.begin, true, false, t, &info->readRanges[r].first);
|
|
points.emplace_back(range.end, false, false, t, &info->readRanges[r].second);
|
|
combinedReadConflictRanges.emplace_back(range.begin,
|
|
range.end,
|
|
tr.read_snapshot,
|
|
t,
|
|
r,
|
|
tr.report_conflicting_keys ? &(*conflictingKeyRangeMap)[t]
|
|
: nullptr,
|
|
tr.report_conflicting_keys ? resolveBatchReplyArena : nullptr);
|
|
}
|
|
for (int r = 0; r < tr.write_conflict_ranges.size(); r++) {
|
|
const KeyRangeRef& range = tr.write_conflict_ranges[r];
|
|
points.emplace_back(range.begin, true, true, t, &info->writeRanges[r].first);
|
|
points.emplace_back(range.end, false, true, t, &info->writeRanges[r].second);
|
|
}
|
|
}
|
|
|
|
transactionInfo.push_back(arena, info);
|
|
}
|
|
|
|
// SOMEDAY: This should probably be replaced with a roaring bitmap.
|
|
class MiniConflictSet : NonCopyable {
|
|
std::vector<bool> values;
|
|
|
|
public:
|
|
explicit MiniConflictSet(int size) { values.assign(size, false); }
|
|
void set(int begin, int end) {
|
|
for (int i = begin; i < end; i++)
|
|
values[i] = true;
|
|
}
|
|
bool any(int begin, int end) {
|
|
for (int i = begin; i < end; i++)
|
|
if (values[i])
|
|
return true;
|
|
return false;
|
|
}
|
|
};
|
|
|
|
void ConflictBatch::checkIntraBatchConflicts() {
|
|
int index = 0;
|
|
for (int p = 0; p < points.size(); p++)
|
|
*points[p].pIndex = index++;
|
|
|
|
MiniConflictSet mcs(index);
|
|
for (int t = 0; t < transactionInfo.size(); t++) {
|
|
const TransactionInfo& tr = *transactionInfo[t];
|
|
if (transactionConflictStatus[t])
|
|
continue;
|
|
bool conflict = tr.tooOld;
|
|
for (int i = 0; i < tr.readRanges.size(); i++) {
|
|
if (mcs.any(tr.readRanges[i].first, tr.readRanges[i].second)) {
|
|
if (tr.reportConflictingKeys) {
|
|
(*conflictingKeyRangeMap)[t].push_back(*resolveBatchReplyArena, i);
|
|
}
|
|
conflict = true;
|
|
break;
|
|
}
|
|
}
|
|
transactionConflictStatus[t] = conflict;
|
|
if (!conflict)
|
|
for (int i = 0; i < tr.writeRanges.size(); i++)
|
|
mcs.set(tr.writeRanges[i].first, tr.writeRanges[i].second);
|
|
}
|
|
}
|
|
|
|
// Appends the batch index of every transaction marked as too old to `tooOldTransactions`.
void ConflictBatch::GetTooOldTransactions(std::vector<int>& tooOldTransactions) {
    const int total = transactionInfo.size();
    for (int i = 0; i < total; i++) {
        if (!transactionInfo[i]->tooOld)
            continue;
        tooOldTransactions.push_back(i);
    }
}
|
|
|
|
// Resolves the batch: sorts the points, checks reads against the version history and
// against earlier transactions of the batch, merges the writes of the surviving
// transactions into the history at version `now`, and reports the outcome.
// Indices of too-old transactions go to `tooOldTransactions` (when given), indices of
// transactions without a conflict go to `nonConflicting`. Finally, if
// `newOldestVersion` advances the oldest version, a bounded stretch of old history is
// removed, resuming at the conflict set's removalKey.
void ConflictBatch::detectConflicts(Version now,
                                    Version newOldestVersion,
                                    std::vector<int>& nonConflicting,
                                    std::vector<int>* tooOldTransactions) {
    double t = timer();
    sortPoints(points);
    g_sort += timer() - t;

    // One flag per transaction, all initially false.
    transactionConflictStatus = new bool[transactionCount]();

    t = timer();
    checkReadConflictRanges();
    g_checkRead += timer() - t;

    t = timer();
    checkIntraBatchConflicts();
    g_checkBatch += timer() - t;

    t = timer();
    combineWriteConflictRanges();
    g_combine += timer() - t;

    t = timer();
    mergeWriteConflictRanges(now);
    g_merge += timer() - t;

    for (int i = 0; i < transactionCount; i++) {
        const bool reportAsTooOld = tooOldTransactions && transactionInfo[i]->tooOld;
        if (reportAsTooOld) {
            tooOldTransactions->push_back(i);
        } else if (!transactionConflictStatus[i]) {
            nonConflicting.push_back(i);
        }
    }

    delete[] transactionConflictStatus;

    t = timer();
    if (newOldestVersion > cs->oldestVersion) {
        cs->oldestVersion = newOldestVersion;

        SkipList::Finger finger;
        int temp;
        cs->versionHistory.find(&cs->removalKey, &finger, &temp, 1);
        // The amount of cleanup per batch is bounded relative to the amount of data just written.
        const int nodeBudget = combinedWriteConflictRanges.size() * 3 + 10;
        cs->versionHistory.removeBefore(cs->oldestVersion, finger, nodeBudget);
        cs->removalKey = finger.getValue();
    }
    g_removeBefore += timer() - t;
}
|
|
|
|
void ConflictBatch::checkReadConflictRanges() {
|
|
if (combinedReadConflictRanges.empty())
|
|
return;
|
|
|
|
cs->versionHistory.detectConflicts(
|
|
&combinedReadConflictRanges[0], combinedReadConflictRanges.size(), transactionConflictStatus);
|
|
}
|
|
|
|
// Merges the write ranges [begin, end) into `part` at version `now`. The pairs are
// viewed as one flat array of keys and processed in stripes of up to 16 keys, from
// the last stripe to the first.
void ConflictBatch::addConflictRanges(Version now,
                                      std::vector<std::pair<StringRef, StringRef>>::iterator begin,
                                      std::vector<std::pair<StringRef, StringRef>>::iterator end,
                                      SkipList* part) {
    static_assert(sizeof(*begin) == sizeof(StringRef) * 2,
                  "Write Conflict Range type not convertible to two StringPtrs");
    const int count = end - begin;
    const int stringCount = count * 2;
    const StringRef* strings = reinterpret_cast<const StringRef*>(&*begin);

    const int stripeSize = 16;
    SkipList::Finger fingers[stripeSize];
    int temp[stripeSize];
    const int stripes = (stringCount + stripeSize - 1) / stripeSize;

    // Only the last stripe, which is processed first, can be shorter than stripeSize.
    int stripeLength = stringCount - (stripes - 1) * stripeSize;
    for (int s = stripes; s-- > 0;) {
        part->find(strings + s * stripeSize, fingers, temp, stripeLength);
        part->addConflictRanges(fingers, stripeLength / 2, now);
        stripeLength = stripeSize;
    }
}
|
|
|
|
// Writes the combined write ranges of the batch into the version history at version
// `now`. No-op when there are none.
void ConflictBatch::mergeWriteConflictRanges(Version now) {
    if (!combinedWriteConflictRanges.empty()) {
        addConflictRanges(
            now, combinedWriteConflictRanges.begin(), combinedWriteConflictRanges.end(), &cs->versionHistory);
    }
}
|
|
|
|
void ConflictBatch::combineWriteConflictRanges() {
|
|
int activeWriteCount = 0;
|
|
for (const KeyInfo& point : points) {
|
|
if (point.write && !transactionConflictStatus[point.transaction]) {
|
|
if (point.begin) {
|
|
activeWriteCount++;
|
|
if (activeWriteCount == 1)
|
|
combinedWriteConflictRanges.emplace_back(point.key, KeyRef());
|
|
} else /*if (point.end)*/ {
|
|
activeWriteCount--;
|
|
if (activeWriteCount == 0)
|
|
combinedWriteConflictRanges.back().second = point.key;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
namespace {
|
|
StringRef setK(Arena& arena, int i) {
|
|
char t[sizeof(i)];
|
|
*(int*)t = i;
|
|
|
|
const int keySize = 16;
|
|
|
|
char* ss = new (arena) char[keySize];
|
|
for (int c = 0; c < keySize - sizeof(i); c++)
|
|
ss[c] = '.';
|
|
for (int c = 0; c < sizeof(i); c++)
|
|
ss[c + keySize - sizeof(i)] = t[sizeof(i) - 1 - c];
|
|
|
|
return StringRef((const uint8_t*)ss, keySize);
|
|
}
|
|
|
|
void miniConflictSetTest() {
|
|
for (int i = 0; i < 2000000; i++) {
|
|
int size = 64 * 5; // Also run 64*64*5 to test multiple words of andValues and orValues
|
|
MiniConflictSet mini(size);
|
|
for (int j = 0; j < 2; j++) {
|
|
int a = deterministicRandom()->randomInt(0, size);
|
|
int b = deterministicRandom()->randomInt(a, size);
|
|
mini.set(a, b);
|
|
}
|
|
for (int j = 0; j < 4; j++) {
|
|
int a = deterministicRandom()->randomInt(0, size);
|
|
int b = deterministicRandom()->randomInt(a, size);
|
|
mini.any(a, b); // Tests correctness internally
|
|
}
|
|
}
|
|
printf("miniConflictSetTest complete\n");
|
|
}
|
|
|
|
void operatorLessThanTest() {
|
|
{ // Longer strings before shorter strings.
|
|
KeyInfo a(LiteralStringRef("hello"), /*begin=*/false, /*write=*/true, 0, nullptr);
|
|
KeyInfo b(LiteralStringRef("hello\0"), /*begin=*/false, /*write=*/false, 0, nullptr);
|
|
ASSERT(a < b);
|
|
ASSERT(!(b < a));
|
|
ASSERT(!(a == b));
|
|
}
|
|
|
|
{ // Reads before writes.
|
|
KeyInfo a(LiteralStringRef("hello"), /*begin=*/false, /*write=*/false, 0, nullptr);
|
|
KeyInfo b(LiteralStringRef("hello"), /*begin=*/false, /*write=*/true, 0, nullptr);
|
|
ASSERT(a < b);
|
|
ASSERT(!(b < a));
|
|
ASSERT(!(a == b));
|
|
}
|
|
|
|
{ // Begin reads after writes.
|
|
KeyInfo a(LiteralStringRef("hello"), /*begin=*/false, /*write=*/true, 0, nullptr);
|
|
KeyInfo b(LiteralStringRef("hello"), /*begin=*/true, /*write=*/false, 0, nullptr);
|
|
ASSERT(a < b);
|
|
ASSERT(!(b < a));
|
|
ASSERT(!(a == b));
|
|
}
|
|
|
|
{ // Begin writes after writes.
|
|
KeyInfo a(LiteralStringRef("hello"), /*begin=*/false, /*write=*/true, 0, nullptr);
|
|
KeyInfo b(LiteralStringRef("hello"), /*begin=*/true, /*write=*/true, 0, nullptr);
|
|
ASSERT(a < b);
|
|
ASSERT(!(b < a));
|
|
ASSERT(!(a == b));
|
|
}
|
|
}
|
|
} // namespace
|
|
|
|
void skipListTest() {
|
|
printf("Skip list test\n");
|
|
|
|
miniConflictSetTest();
|
|
|
|
operatorLessThanTest();
|
|
|
|
setAffinity(0);
|
|
|
|
double start;
|
|
|
|
ConflictSet* cs = newConflictSet();
|
|
|
|
Arena testDataArena;
|
|
VectorRef<VectorRef<KeyRangeRef>> testData;
|
|
const int batches = 500; // deterministicRandom()->randomInt(500, 5000);
|
|
const int data_per_batch = 5000;
|
|
testData.resize(testDataArena, batches);
|
|
std::vector<std::vector<uint8_t>> success(batches);
|
|
std::vector<std::vector<uint8_t>> success2(batches);
|
|
for (int i = 0; i < batches; i++) {
|
|
testData[i].resize(testDataArena, data_per_batch);
|
|
success[i].assign(data_per_batch, false);
|
|
success2[i].assign(data_per_batch, false);
|
|
for (int j = 0; j < data_per_batch; j++) {
|
|
int key = deterministicRandom()->randomInt(0, 20000000);
|
|
int key2 = key + 1 + deterministicRandom()->randomInt(0, 10);
|
|
testData[i][j] = KeyRangeRef(setK(testDataArena, key), setK(testDataArena, key2));
|
|
}
|
|
}
|
|
printf("Test data generated: %d batches, %d/batch\n", batches, data_per_batch);
|
|
|
|
printf("Running\n");
|
|
|
|
int readCount = 1, writeCount = 1;
|
|
int cranges = 0, tcount = 0;
|
|
|
|
start = timer();
|
|
std::vector<std::vector<int>> nonConflict(batches);
|
|
Version version = 0;
|
|
for (const auto& data : testData) {
|
|
Arena buf;
|
|
std::vector<CommitTransactionRef> trs;
|
|
double t = timer();
|
|
for (int j = 0; j + readCount + writeCount <= data.size(); j += readCount + writeCount) {
|
|
CommitTransactionRef tr;
|
|
for (int k = 0; k < readCount; k++) {
|
|
KeyRangeRef r(buf, data[j + k]);
|
|
tr.read_conflict_ranges.push_back(buf, r);
|
|
}
|
|
for (int k = 0; k < writeCount; k++) {
|
|
KeyRangeRef r(buf, data[j + readCount + k]);
|
|
tr.write_conflict_ranges.push_back(buf, r);
|
|
}
|
|
cranges += tr.read_conflict_ranges.size() + tr.write_conflict_ranges.size();
|
|
tr.read_snapshot = version;
|
|
trs.push_back(tr);
|
|
}
|
|
tcount += trs.size();
|
|
g_buildTest += timer() - t;
|
|
|
|
t = timer();
|
|
ConflictBatch batch(cs);
|
|
for (const auto& tr : trs) {
|
|
batch.addTransaction(tr);
|
|
}
|
|
g_add += timer() - t;
|
|
|
|
t = timer();
|
|
batch.detectConflicts(version + 50, version, nonConflict[version]);
|
|
g_detectConflicts += timer() - t;
|
|
|
|
version++;
|
|
}
|
|
double elapsed = timer() - start;
|
|
printf("New conflict set: %0.3f sec\n", elapsed);
|
|
printf(" %0.3f Mtransactions/sec\n", tcount / elapsed / 1e6);
|
|
printf(" %0.3f Mkeys/sec\n", cranges * 2 / elapsed / 1e6);
|
|
|
|
elapsed = g_detectConflicts.getValue();
|
|
printf("Detect only: %0.3f sec\n", elapsed);
|
|
printf(" %0.3f Mtransactions/sec\n", tcount / elapsed / 1e6);
|
|
printf(" %0.3f Mkeys/sec\n", cranges * 2 / elapsed / 1e6);
|
|
|
|
elapsed = g_checkRead.getValue() + g_merge.getValue();
|
|
printf("Skiplist only: %0.3f sec\n", elapsed);
|
|
printf(" %0.3f Mtransactions/sec\n", tcount / elapsed / 1e6);
|
|
printf(" %0.3f Mkeys/sec\n", cranges * 2 / elapsed / 1e6);
|
|
|
|
printf("Performance counters:\n");
|
|
for (const auto& counter : skc) {
|
|
printf("%20s: %s\n", counter->getMetric().name().c_str(), counter->getMetric().formatted().c_str());
|
|
}
|
|
|
|
printf("%d entries in version history\n", cs->versionHistory.count());
|
|
}
|