foundationdb/fdbclient/ReadYourWrites.actor.cpp

2426 lines
84 KiB
C++

/*
* ReadYourWrites.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/Atomic.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/SpecialKeySpace.actor.h"
#include "fdbclient/StatusClient.h"
#include "fdbclient/MonitorLeader.h"
#include "flow/Util.h"
#include "flow/actorcompiler.h" // This must be the last #include.
class RYWImpl {
public:
// Debugging aid: starting from allKeys.begin, emits one "RYWDump" trace event per segment visited by `it`
// and stops after the segment whose end key equals allKeys.end.
template <class Iter>
static void dump(Iter it) {
    Arena arena;
    for (it.skip(allKeys.begin);; ++it) {
        // Defaults to an empty key so that key.get() below is always valid.
        Optional<StringRef> key = StringRef();
        if (it.is_kv()) {
            if (auto kv = it.kv(arena))
                key = kv->key;
        }
        TraceEvent("RYWDump")
            .detail("Begin", it.beginKey())
            .detail("End", it.endKey())
            .detail("Unknown", it.is_unknown_range())
            .detail("Empty", it.is_empty_range())
            .detail("KV", it.is_kv())
            .detail("Key", key.get());
        if (it.endKey() == allKeys.end)
            break;
    }
}
// Request descriptor for a single-key read; the read resolves to an optional value.
struct GetValueReq {
    using Result = Optional<Value>;

    Key key;

    explicit GetValueReq(Key key) : key(key) {}
};
// Request descriptor for resolving a key selector; the read resolves to a key.
struct GetKeyReq {
    using Result = Key;

    KeySelector key;

    explicit GetKeyReq(KeySelector key) : key(key) {}
};
// Request descriptor for a range read between two key selectors under the given limits.
// The Reverse template flag selects which read()/readThrough()/addConflictRange() overloads apply.
template <bool Reverse>
struct GetRangeReq {
    using Result = RangeResult;

    KeySelector begin;
    KeySelector end;
    GetRangeLimits limits;

    GetRangeReq(KeySelector begin, KeySelector end, GetRangeLimits limits)
      : begin(begin), end(end), limits(limits) {}
};
// read() performs a read (get, getKey, getRange, etc), in the context of the given transaction. Snapshot or RYW
// reads are distinguished by the type Iter being SnapshotCache::iterator or RYWIterator. Fills in the snapshot cache
// as a side effect but does not affect conflict ranges. Some (indicated) overloads of read are required to update
// the given *it to point to the key that was read, so that the corresponding overload of addConflictRange() can
// make use of it.
// Single-key read. Answers from the segment `it` lands on when that segment is a key-value or an empty range;
// otherwise performs a snapshot get on the underlying transaction and records the outcome in ryw->cache.
ACTOR template <class Iter>
static Future<Optional<Value>> read(ReadYourWritesTransaction* ryw, GetValueReq read, Iter* it) {
    // This overload is required to provide postcondition: it->extractWriteMapIterator().segmentContains(read.key)
    if (ryw->options.bypassUnreadable) {
        it->bypassUnreadableProtection();
    }
    it->skip(read.key);
    state bool dependent = it->is_dependent();

    if (it->is_kv()) {
        const KeyValueRef* cached = it->kv(ryw->arena);
        if (cached == nullptr) {
            return Optional<Value>();
        }
        return cached->value;
    }
    if (it->is_empty_range()) {
        return Optional<Value>();
    }

    // Neither a key-value nor an empty range: fetch from the underlying transaction as a snapshot read.
    Optional<Value> res = wait(ryw->tr.get(read.key, true));
    KeyRef k(ryw->arena, read.key);
    if (res.present()) {
        // Keep the value's memory alive only if the cache actually took the entry.
        if (ryw->cache.insert(k, res.get()))
            ryw->arena.dependsOn(res.get().arena());
    } else {
        ryw->cache.insert(k, Optional<ValueRef>());
    }
    if (!dependent) {
        // `res` is empty exactly when the key was absent, so it is the answer in both cases.
        return res;
    }

    // There was a dependent write at the key, so we need to lookup the iterator again
    it->skip(k);
    ASSERT(it->is_kv());
    const KeyValueRef* merged = it->kv(ryw->arena);
    if (merged == nullptr) {
        return Optional<Value>();
    }
    return merged->value;
}
// Key-selector read. Resolves the selector with a one-row range read: backward (after bumping the offset by
// one) when the offset is <= 0, forward otherwise. Results that run off either end of the readable keyspace
// are reported as allKeys.begin or ryw->getMaxReadKey().
ACTOR template <class Iter>
static Future<Key> read(ReadYourWritesTransaction* ryw, GetKeyReq read, Iter* it) {
    if (read.key.offset <= 0) {
        read.key.offset++;
        RangeResult backward =
            wait(getRangeValueBack(ryw, firstGreaterOrEqual(allKeys.begin), read.key, GetRangeLimits(1), it));
        if (backward.readThroughEnd) {
            return ryw->getMaxReadKey();
        }
        if (backward.readToBegin || !backward.size()) {
            return allKeys.begin;
        }
        return backward[0].key;
    }

    RangeResult forward =
        wait(getRangeValue(ryw, read.key, firstGreaterOrEqual(ryw->getMaxReadKey()), GetRangeLimits(1), it));
    if (forward.readToBegin) {
        return allKeys.begin;
    }
    if (forward.readThroughEnd || !forward.size()) {
        return ryw->getMaxReadKey();
    }
    return forward[0].key;
}
// Forward range read: forwards the request's selectors and limits to getRangeValue().
template <class Iter>
static Future<RangeResult> read(ReadYourWritesTransaction* ryw, GetRangeReq<false> read, Iter* it) {
    return getRangeValue(ryw,
                         read.begin,
                         read.end,
                         read.limits,
                         it);
}
// Reverse range read: forwards the request's selectors and limits to getRangeValueBack().
template <class Iter>
static Future<RangeResult> read(ReadYourWritesTransaction* ryw, GetRangeReq<true> read, Iter* it) {
    return getRangeValueBack(ryw,
                             read.begin,
                             read.end,
                             read.limits,
                             it);
}
// readThrough() performs a read in the RYW disabled case, passing it on relatively directly to the underlying
// transaction. Responsible for clipping results to the non-system keyspace when appropriate, since NativeAPI
// doesn't do that.
// Single-key variant: hands the key and the snapshot flag straight to the underlying transaction's get().
static Future<Optional<Value>> readThrough(ReadYourWritesTransaction* ryw,
                                           GetValueReq read,
                                           bool snapshot) {
    return ryw->tr.get(read.key, snapshot);
}
// Key-selector variant: resolves the selector on the underlying transaction and caps the answer at
// ryw->getMaxReadKey().
ACTOR static Future<Key> readThrough(ReadYourWritesTransaction* ryw, GetKeyReq read, bool snapshot) {
    Key resolved = wait(ryw->tr.getKey(read.key, snapshot));
    if (resolved > ryw->getMaxReadKey()) {
        // Filter out results in the system keys if they are not accessible
        return ryw->getMaxReadKey();
    }
    return resolved;
}
// Range variant: issues the range read on the underlying transaction. For reverse reads whose end selector
// has offset > 1 the end is first resolved and capped at ryw->getMaxReadKey(); for forward reads any trailing
// rows at or beyond that key are cut off the result.
ACTOR template <bool Reverse>
static Future<RangeResult> readThrough(ReadYourWritesTransaction* ryw, GetRangeReq<Reverse> read, bool snapshot) {
    if (Reverse && read.end.offset > 1) {
        // FIXME: Optimistically assume that this will not run into the system keys, and only reissue if the result
        // actually does.
        Key key = wait(ryw->tr.getKey(read.end, snapshot));
        if (key > ryw->getMaxReadKey()) {
            read.end = firstGreaterOrEqual(ryw->getMaxReadKey());
        } else {
            read.end = KeySelector(firstGreaterOrEqual(key), key.arena());
        }
    }

    RangeResult v = wait(ryw->tr.getRange(read.begin, read.end, read.limits, snapshot, Reverse));
    KeyRef maxKey = ryw->getMaxReadKey();

    // Nothing to trim for reverse reads, empty results, or results that end below maxKey.
    if (Reverse || v.size() == 0 || v[v.size() - 1].key < maxKey) {
        return v;
    }

    // The last row is known to be out of bounds; count how many leading rows stay below maxKey.
    state RangeResult _v = v;
    int keep = _v.size() - 1;
    while (keep > 0 && _v[keep - 1].key >= maxKey) {
        --keep;
    }
    return RangeResult(RangeResultRef(VectorRef<KeyValueRef>(&_v[0], keep), false), _v.arena());
}
// addConflictRange(ryw,read,result) is called after a serializable read and is responsible for adding the relevant
// conflict range
// Single-key variant: records a read conflict on read.key. The `result` argument is not consulted.
static void addConflictRange(ReadYourWritesTransaction* ryw,
                             GetValueReq read,
                             WriteMap::iterator& it,
                             Optional<Value> result) {
    // No it.skip() here: the matching read() overload leaves `it` on the segment containing read.key.
    // updateConflictMap copies read.key into ryw->arena itself if the key ends up being added.
    ryw->updateConflictMap(read.key, it);
}
// Key-selector variant: records a read conflict on the span between the selector's base key and the resolved
// key `result`. Both boundaries are copied into ryw->arena.
static void addConflictRange(ReadYourWritesTransaction* ryw, GetKeyReq read, WriteMap::iterator& it, Key result) {
    // Boundary contributed by the selector itself: the key after the base key for orEqual selectors,
    // otherwise the base key.
    KeyRef selectorBound = read.key.orEqual ? keyAfter(read.key.getKey(), ryw->arena)
                                            : KeyRef(ryw->arena, read.key.getKey());

    KeyRangeRef readRange;
    if (read.key.offset > 0) {
        // Forward-looking selector: from the selector bound up to and including the result.
        readRange = KeyRangeRef(selectorBound, keyAfter(result, ryw->arena));
    } else {
        // Backward-looking selector: from the result up to the selector bound.
        readRange = KeyRangeRef(KeyRef(ryw->arena, result), selectorBound);
    }

    it.skip(readRange.begin);
    ryw->updateConflictMap(readRange, it);
}
// Forward-range variant: derives the key range covered by the read from the two selectors and the returned
// rows, copies its boundaries into ryw->arena and records it as a read conflict range.
static void addConflictRange(ReadYourWritesTransaction* ryw,
                             GetRangeReq<false> read,
                             WriteMap::iterator& it,
                             RangeResult const& result) {
    const KeyRef beginKey = read.begin.getKey();
    const KeyRef endKey = read.end.getKey();

    KeyRef lo;
    KeyRef hi;
    // True once `hi` already points into ryw->arena and needs no further copy.
    bool hiInArena = false;

    if (beginKey < endKey) {
        lo = beginKey;
        if (read.end.offset > 0 && result.more) {
            hi = beginKey;
        } else {
            hi = endKey;
        }
    } else {
        lo = endKey;
        hi = beginKey;
    }

    if (result.readToBegin && read.begin.offset <= 0) {
        lo = allKeys.begin;
    }
    if (result.readThroughEnd && read.end.offset > 0) {
        hi = ryw->getMaxReadKey();
    }

    if (result.size()) {
        const KeyRef firstKey = result[0].key;
        const KeyRef lastKey = result[result.size() - 1].key;
        if (read.begin.offset <= 0 && firstKey < lo) {
            lo = firstKey;
        }
        // Make sure the range covers the last returned row.
        if (hi <= lastKey) {
            hi = keyAfter(lastKey, ryw->arena);
            hiInArena = true;
        }
    }

    KeyRangeRef readRange(KeyRef(ryw->arena, lo), hiInArena ? hi : KeyRef(ryw->arena, hi));
    it.skip(readRange.begin);
    ryw->updateConflictMap(readRange, it);
}
// Reverse-range variant: derives the key range covered by the read from the two selectors and the returned
// rows (which arrive in descending key order, so result[0] is the highest key), copies its boundaries into
// ryw->arena and records it as a read conflict range.
static void addConflictRange(ReadYourWritesTransaction* ryw,
                             GetRangeReq<true> read,
                             WriteMap::iterator& it,
                             RangeResult const& result) {
    const KeyRef beginKey = read.begin.getKey();
    const KeyRef endKey = read.end.getKey();

    KeyRef lo;
    KeyRef hi;
    // True once `hi` already points into ryw->arena and needs no further copy.
    bool hiInArena = false;

    if (beginKey < endKey) {
        if (read.begin.offset <= 0 && result.more) {
            lo = endKey;
        } else {
            lo = beginKey;
        }
        hi = endKey;
    } else {
        lo = endKey;
        hi = beginKey;
    }

    if (result.readToBegin && read.begin.offset <= 0) {
        lo = allKeys.begin;
    }
    if (result.readThroughEnd && read.end.offset > 0) {
        hi = ryw->getMaxReadKey();
    }

    if (result.size()) {
        const KeyRef lowestKey = result[result.size() - 1].key;
        const KeyRef highestKey = result[0].key;
        if (lowestKey < lo) {
            lo = lowestKey;
        }
        // Make sure the range covers the first (highest) returned row.
        if (read.end.offset > 0 && hi <= highestKey) {
            hi = keyAfter(highestKey, ryw->arena);
            hiInArena = true;
        }
    }

    KeyRangeRef readRange(KeyRef(ryw->arena, lo), hiInArena ? hi : KeyRef(ryw->arena, hi));
    it.skip(readRange.begin);
    ryw->updateConflictMap(readRange, it);
}
// Runs `req` through the matching readThrough() overload, i.e. directly against the underlying transaction
// (readWithConflictRange() selects this path when options.readYourWritesDisabled is set).
// The read is raced against ryw->resetPromise: whichever completes first wins. If the read wins its result is
// returned; if the reset promise fires first the actor throws internal_error().
// No conflict range is added by this function itself.
ACTOR template <class Req>
static Future<typename Req::Result> readWithConflictRangeThrough(ReadYourWritesTransaction* ryw,
Req req,
bool snapshot) {
choose {
when(typename Req::Result result = wait(readThrough(ryw, req, snapshot))) { return result; }
when(wait(ryw->resetPromise.getFuture())) { throw internal_error(); }
}
}
// Runs `req` through the matching read() overload using a SnapshotCache::iterator built over ryw->cache and
// ryw->writes (readWithConflictRange() selects this path for snapshot reads when options.snapshotRywEnabled <= 0).
// The read is raced against ryw->resetPromise: if the read completes first its result is returned; if the reset
// promise fires first the actor throws internal_error().
// No conflict range is added on this path.
ACTOR template <class Req>
static Future<typename Req::Result> readWithConflictRangeSnapshot(ReadYourWritesTransaction* ryw, Req req) {
// `state` keeps the iterator alive across the wait; read() receives a pointer to it.
state SnapshotCache::iterator it(&ryw->cache, &ryw->writes);
choose {
when(typename Req::Result result = wait(read(ryw, req, &it))) { return result; }
when(wait(ryw->resetPromise.getFuture())) { throw internal_error(); }
}
}
// Runs `req` through the matching read() overload using an RYWIterator built over ryw->cache and ryw->writes.
// When `snapshot` is false, the matching addConflictRange() overload is called with the result before returning.
// The read is raced against ryw->resetPromise: if the reset promise fires first the actor throws
// internal_error() and no conflict range is added.
ACTOR template <class Req>
static Future<typename Req::Result> readWithConflictRangeRYW(ReadYourWritesTransaction* ryw,
Req req,
bool snapshot) {
// `state` keeps the iterator alive across the wait; read() receives a pointer to it and may reposition it.
state RYWIterator it(&ryw->cache, &ryw->writes);
choose {
when(typename Req::Result result = wait(read(ryw, req, &it))) {
// Some overloads of addConflictRange() require it to point to the "right" key and others don't. The
// corresponding overloads of read() have to provide that guarantee!
if (!snapshot)
addConflictRange(ryw, req, it.extractWriteMapIterator(), result);
return result;
}
when(wait(ryw->resetPromise.getFuture())) { throw internal_error(); }
}
}
// Entry point for reads: picks one of the three read paths from the transaction options and the snapshot flag.
//   - RYW disabled                              -> readWithConflictRangeThrough()
//   - snapshot read with snapshotRywEnabled <= 0 -> readWithConflictRangeSnapshot()
//   - everything else                           -> readWithConflictRangeRYW()
template <class Req>
static inline Future<typename Req::Result> readWithConflictRange(ReadYourWritesTransaction* ryw,
                                                                 Req const& req,
                                                                 bool snapshot) {
    if (ryw->options.readYourWritesDisabled) {
        return readWithConflictRangeThrough(ryw, req, snapshot);
    }

    const bool snapshotWithoutRyw = snapshot && ryw->options.snapshotRywEnabled <= 0;
    if (snapshotWithoutRyw) {
        return readWithConflictRangeSnapshot(ryw, req);
    }

    return readWithConflictRangeRYW(ryw, req, snapshot);
}
// Resolves `key` as far as possible using only what `it` can see, without issuing any read.
//   key            - in/out: rewritten to an equivalent selector (see the comment below for the possible outcomes)
//   it             - in/out: repositioned while walking segments
//   maxKey         - upper bound of the walk; also the key used when the selector runs off the end
//   readToBegin    - out: set to true when the selector resolves off the beginning of the database
//   readThroughEnd - out: set to true when the selector resolves off the end (at maxKey)
//   actualOffset   - out: receives key.offset after the walk, unless the selector already had offset 1 on entry
// The flags are only ever set to true here, never cleared.
template <class Iter>
static void resolveKeySelectorFromCache(KeySelector& key,
Iter& it,
KeyRef const& maxKey,
bool* readToBegin,
bool* readThroughEnd,
int* actualOffset) {
// If the key indicated by `key` can be determined without reading unknown data from the snapshot, then
// it.kv().key is the resolved key. If the indicated key is determined to be "off the beginning or end" of the
// database, it points to the first or last segment in the DB,
// and key is an equivalent key selector relative to the beginning or end of the database.
// Otherwise it points to an unknown segment, and key is an equivalent key selector whose base key is in or
// adjoining the segment.
// NOTE(review): removeOrEqual presumably folds the orEqual flag into the base key - confirm in KeySelector.
key.removeOrEqual(key.arena());
// Captured after removeOrEqual: a selector that is already at offset 1 must not overwrite *actualOffset below.
bool alreadyExhausted = key.offset == 1;
it.skip(key.getKey()); // TODO: or precondition?
// A backward selector sitting exactly on a segment boundary belongs to the preceding segment.
if (key.offset <= 0 && it.beginKey() == key.getKey() && key.getKey() != allKeys.begin)
--it;
// keykey tracks the selector's new base key; keyNeedsCopy records that it now aliases iterator-owned data
// and must be copied into key's arena before being stored.
ExtStringRef keykey = key.getKey();
bool keyNeedsCopy = false;
// Invariant: it.beginKey() <= keykey && keykey <= it.endKey() && (key.isBackward() ? it.beginKey() != keykey :
// it.endKey() != keykey). Maintaining this invariant, we transform the key selector toward firstGreaterOrEqual
// form until we reach an unknown range or the result.
// Forward walk: each key-value segment passed consumes one unit of positive offset.
while (key.offset > 1 && !it.is_unreadable() && !it.is_unknown_range() && it.endKey() < maxKey) {
if (it.is_kv())
--key.offset;
++it;
keykey = it.beginKey();
keyNeedsCopy = true;
}
// Backward walk: each key-value segment passed adds one unit of offset; stop on the segment that brings it to 1.
while (key.offset < 1 && !it.is_unreadable() && !it.is_unknown_range() && it.beginKey() != allKeys.begin) {
if (it.is_kv()) {
++key.offset;
if (key.offset == 1) {
keykey = it.beginKey();
keyNeedsCopy = true;
break;
}
}
--it;
keykey = it.endKey();
keyNeedsCopy = true;
}
if (!alreadyExhausted) {
*actualOffset = key.offset;
}
// Still a backward selector on a known, readable segment: the walk stopped at the first segment, so the
// selector points off the beginning.
if (!it.is_unreadable() && !it.is_unknown_range() && key.offset < 1) {
*readToBegin = true;
key.setKey(allKeys.begin);
key.offset = 1;
return;
}
// Still offset > 1 on a known, readable segment: the walk stopped at maxKey, so the selector points off the end.
if (!it.is_unreadable() && !it.is_unknown_range() && key.offset > 1) {
*readThroughEnd = true;
key.setKey(maxKey); // maxKey is a KeyRef, but points to a LiteralStringRef. TODO: how can we ASSERT this?
key.offset = 1;
return;
}
// Skip forward over known-empty segments so the base key lands on the next non-empty or unknown segment.
while (!it.is_unreadable() && it.is_empty_range() && it.endKey() < maxKey) {
++it;
keykey = it.beginKey();
keyNeedsCopy = true;
}
if (keyNeedsCopy) {
key.setKey(keykey.toArena(key.arena()));
}
}
// Computes the key range that a forward snapshot read (`data`, issued with selectors `begin`/`end`) has made
// known, with both boundaries copied into `arena`. Returns an empty KeyRangeRef when begin >= end.
static KeyRangeRef getKnownKeyRange(RangeResultRef data, KeySelector begin, KeySelector end, Arena& arena) {
    // Start from the selectors: the begin key only counts when its offset is <= 1, the end key only when the
    // read was not truncated and its offset is >= 1. Otherwise start from the "nothing known" extremes.
    StringRef knownBegin = allKeys.end;
    if (begin.offset <= 1) {
        knownBegin = begin.getKey();
    }
    ExtStringRef knownEnd = allKeys.begin;
    if (!data.more && end.offset >= 1) {
        knownEnd = end.getKey();
    }

    if (data.readToBegin) {
        knownBegin = allKeys.begin;
    }
    if (data.readThroughEnd) {
        knownEnd = allKeys.end;
    }

    if (data.size()) {
        // Widen the range so that it covers the returned rows.
        if (data[0].key < knownBegin) {
            knownBegin = data[0].key;
        }
        if (data.readThrough.present()) {
            ExtStringRef through = data.readThrough.get();
            if (knownEnd < through) {
                knownEnd = through;
            }
        } else {
            const KeyRef lastKey = data[data.size() - 1].key;
            // NOTE(review): ExtStringRef(k, 1) presumably denotes k followed by one zero byte, i.e. the key
            // right after k - confirm in ExtStringRef.
            if (data.more || !(lastKey < knownEnd)) {
                knownEnd = ExtStringRef(lastKey, 1);
            }
        }
    }

    if (knownBegin >= knownEnd) {
        return KeyRangeRef();
    }
    return KeyRangeRef(StringRef(arena, knownBegin), knownEnd.toArena(arena));
}
// Pre: it points to an unknown range
// Increments it to point to the unknown range just before the next nontrivial known range (skips over trivial known
// ranges), but not more than iterationLimit ranges away
template <class Iter>
static int skipUncached(Iter& it, Iter const& end, int iterationLimit) {
ExtStringRef b = it.beginKey();
ExtStringRef e = it.endKey();
int singleEmpty = 0;
ASSERT(!it.is_unreadable() && it.is_unknown_range());
// b is the beginning of the most recent contiguous *empty* range
// e is it.endKey()
while (it != end && --iterationLimit >= 0) {
if (it.is_unreadable() || it.is_empty_range()) {
if (it.is_unreadable() || !e.isKeyAfter(b)) { // Assumes no degenerate ranges
while (it.is_unreadable() || !it.is_unknown_range())
--it;
return singleEmpty;
}
singleEmpty++;
} else
b = e;
++it;
e = it.endKey();
}
while (it.is_unreadable() || !it.is_unknown_range())
--it;
return singleEmpty;
}
// Pre: it points to an unknown range
// Returns the number of following empty single-key known ranges between it and the next nontrivial known range, but
// no more than maxClears. Leaves `it` in an indeterminate state.
template <class Iter>
static int countUncached(Iter&& it, KeyRef maxKey, int maxClears) {
if (maxClears <= 0)
return 0;
ExtStringRef b = it.beginKey();
ExtStringRef e = it.endKey();
int singleEmpty = 0;
while (e < maxKey) {
if (it.is_unreadable() || it.is_empty_range()) {
if (it.is_unreadable() || !e.isKeyAfter(b)) { // Assumes no degenerate ranges
return singleEmpty;
}
singleEmpty++;
if (singleEmpty >= maxClears)
return maxClears;
} else
b = e;
++it;
e = it.endKey();
}
return singleEmpty;
}
// Adjusts `requestLimit` in place for the next snapshot request.
//   additionalRows - extra rows to ask for on top of the caller's row limit
//   offset         - lower bound applied to both minRows and rows
//   requestCount   - number of requests already issued; scales the byte limit by 2^min(requestCount, 20)
static void setRequestLimits(GetRangeLimits& requestLimit, int64_t additionalRows, int offset, int requestCount) {
    const int64_t intMax = std::numeric_limits<int>::max();
    const int64_t minimumRows = offset;

    requestLimit.minRows = (int)std::min(std::max(1 + additionalRows, minimumRows), intMax);

    // Row limit padded by additionalRows, at least `offset`, clamped to what fits in an int.
    // Reads requestLimit.rows at call time.
    auto paddedRowLimit = [&]() {
        return (int)std::min(std::max(std::max(1, requestLimit.rows) + additionalRows, minimumRows), intMax);
    };

    if (requestLimit.hasRowLimit()) {
        requestLimit.rows = paddedRowLimit();
    }

    // Calculating request byte limit
    if (requestLimit.bytes == 0) {
        requestLimit.bytes = GetRangeLimits::BYTE_LIMIT_UNLIMITED;
        if (!requestLimit.hasRowLimit()) {
            requestLimit.rows = paddedRowLimit();
        }
    } else if (requestLimit.hasByteLimit()) {
        const int shift = std::min(requestCount, 20);
        requestLimit.bytes =
            std::min(int64_t(requestLimit.bytes) << shift, (int64_t)CLIENT_KNOBS->REPLY_BYTE_LIMIT);
    }
}
// TODO: read to begin, read through end flags for result
// Forward range read for the cache-backed (RYW / snapshot-cache) paths.
//   ryw        - transaction whose cache, arena and underlying `tr` are used
//   begin, end - key selectors delimiting the range; both are re-resolved against the cache as it fills in
//   limits     - row/byte limits; decremented as key-value pairs are added to the result
//   pit        - caller's iterator; used by reference as the scan cursor, so the caller sees its final position
// Walks segments from the resolved begin towards the resolved end. Known key-value segments are appended to the
// result; unknown segments trigger a snapshot getRange on ryw->tr whose result is inserted into ryw->cache, after
// which both selectors are resolved again. The returned result carries more/readToBegin/readThroughEnd and its
// arena depends on ryw->arena.
ACTOR template <class Iter>
static Future<RangeResult> getRangeValue(ReadYourWritesTransaction* ryw,
KeySelector begin,
KeySelector end,
GetRangeLimits limits,
Iter* pit) {
// `it` aliases the caller's iterator and tracks the begin selector; `itEnd` is a private copy tracking the end.
state Iter& it(*pit);
state Iter itEnd(*pit);
state RangeResult result;
// Extra rows to request from the next snapshot read; reset to 0 before every request is issued.
state int64_t additionalRows = 0;
// Number of known key-value pairs seen after the row/byte limits were already reached.
state int itemsPastEnd = 0;
state int requestCount = 0;
state bool readToBegin = false;
state bool readThroughEnd = false;
state int actualBeginOffset = begin.offset;
state int actualEndOffset = end.offset;
// state UID randomID = nondeterministicRandom()->randomUniqueID();
resolveKeySelectorFromCache(begin, it, ryw->getMaxReadKey(), &readToBegin, &readThroughEnd, &actualBeginOffset);
resolveKeySelectorFromCache(end, itEnd, ryw->getMaxReadKey(), &readToBegin, &readThroughEnd, &actualEndOffset);
// Early outs: begin does not precede end (by offset and by key), or a fully resolved selector already sits at
// the edge of the readable keyspace.
if (actualBeginOffset >= actualEndOffset && begin.getKey() >= end.getKey()) {
return RangeResultRef(false, false);
} else if ((begin.isFirstGreaterOrEqual() && begin.getKey() == ryw->getMaxReadKey()) ||
(end.isFirstGreaterOrEqual() && end.getKey() == allKeys.begin)) {
return RangeResultRef(readToBegin, readThroughEnd);
}
// The end selector is still unresolved and its base key lies before begin: resolve it with a getKey-style read
// first, then resolve both selectors again because that read may have changed the cache.
if (!end.isFirstGreaterOrEqual() && begin.getKey() > end.getKey()) {
Key resolvedEnd = wait(read(ryw, GetKeyReq(end), pit));
if (resolvedEnd == allKeys.begin)
readToBegin = true;
if (resolvedEnd == ryw->getMaxReadKey())
readThroughEnd = true;
if (begin.getKey() >= resolvedEnd && !begin.isBackward()) {
return RangeResultRef(false, false);
} else if (resolvedEnd == allKeys.begin) {
return RangeResultRef(readToBegin, readThroughEnd);
}
resolveKeySelectorFromCache(
begin, it, ryw->getMaxReadKey(), &readToBegin, &readThroughEnd, &actualBeginOffset);
resolveKeySelectorFromCache(
end, itEnd, ryw->getMaxReadKey(), &readToBegin, &readThroughEnd, &actualEndOffset);
}
//TraceEvent("RYWSelectorsStartForward", randomID).detail("ByteLimit", limits.bytes).detail("RowLimit", limits.rows);
loop {
/*TraceEvent("RYWSelectors", randomID).detail("Begin", begin.toString())
.detail("End", end.toString())
.detail("Reached", limits.isReached())
.detail("ItemsPastEnd", itemsPastEnd)
.detail("EndOffset", -end.offset)
.detail("ItBegin", it.beginKey())
.detail("ItEnd", itEnd.beginKey())
.detail("Unknown", it.is_unknown_range())
.detail("Requests", requestCount);*/
if (!result.size() && actualBeginOffset >= actualEndOffset && begin.getKey() >= end.getKey()) {
return RangeResultRef(false, false);
}
if (end.offset <= 1 && end.getKey() == allKeys.begin) {
return RangeResultRef(readToBegin, readThroughEnd);
}
// begin has caught up with end, or has reached the maximum readable key: finish, resolving end first if it
// is still an unresolved selector and some rows were collected.
if ((begin.offset >= end.offset && begin.getKey() >= end.getKey()) ||
(begin.offset >= 1 && begin.getKey() >= ryw->getMaxReadKey())) {
if (end.isFirstGreaterOrEqual())
break;
if (!result.size())
break;
Key resolvedEnd =
wait(read(ryw,
GetKeyReq(end),
pit)); // do not worry about iterator invalidation, because we are breaking for the loop
if (resolvedEnd == allKeys.begin)
readToBegin = true;
if (resolvedEnd == ryw->getMaxReadKey())
readThroughEnd = true;
end = firstGreaterOrEqual(resolvedEnd);
break;
}
// The cursor has moved past the end iterator on a known, readable segment.
if (!it.is_unreadable() && !it.is_unknown_range() && it.beginKey() > itEnd.beginKey()) {
if (end.isFirstGreaterOrEqual())
break;
return RangeResultRef(readToBegin, readThroughEnd);
}
if (limits.isReached() && itemsPastEnd >= 1 - end.offset)
break;
if (it == itEnd && ((!it.is_unreadable() && !it.is_unknown_range()) ||
(begin.offset > 0 && end.isFirstGreaterOrEqual() && end.getKey() == it.beginKey())))
break;
if (it.is_unknown_range()) {
// Unknown segment: fetch it from the underlying transaction and fold the rows into the cache.
if (limits.hasByteLimit() && result.size() && itemsPastEnd >= 1 - end.offset) {
result.more = true;
break;
}
Iter ucEnd(it);
int singleClears = 0;
// 0 on the first request, then 2^min(requestCount, 20).
int clearLimit = requestCount ? 1 << std::min(requestCount, 20) : 0;
if (it.beginKey() < itEnd.beginKey())
singleClears = std::min(skipUncached(ucEnd, itEnd, BUGGIFY ? 0 : clearLimit + 100), clearLimit);
// Choose the end selector of the snapshot request.
state KeySelector read_end;
if (ucEnd != itEnd) {
Key k = ucEnd.endKey().toStandaloneStringRef();
read_end = KeySelector(firstGreaterOrEqual(k), k.arena());
if (end.offset < 1)
additionalRows += 1 - end.offset; // extra for items past end
} else if (end.offset < 1) {
read_end = KeySelector(firstGreaterOrEqual(end.getKey()), end.arena());
additionalRows += 1 - end.offset;
} else {
read_end = end;
if (end.offset > 1) {
singleClears +=
countUncached(std::move(ucEnd), ryw->getMaxReadKey(), clearLimit - singleClears);
read_end.offset += singleClears;
}
}
additionalRows += singleClears;
// Choose the begin selector of the snapshot request.
state KeySelector read_begin;
if (begin.isFirstGreaterOrEqual()) {
Key k = it.beginKey() > begin.getKey() ? it.beginKey().toStandaloneStringRef()
: Key(begin.getKey(), begin.arena());
begin = KeySelector(firstGreaterOrEqual(k), k.arena());
read_begin = begin;
} else if (begin.offset > 1) {
read_begin = KeySelector(firstGreaterOrEqual(begin.getKey()), begin.arena());
additionalRows += begin.offset - 1;
} else {
read_begin = begin;
ucEnd = it;
singleClears = countUncachedBack(std::move(ucEnd), clearLimit);
read_begin.offset -= singleClears;
additionalRows += singleClears;
}
// Keep the request's base keys ordered: begin key <= end key.
if (read_end.getKey() < read_begin.getKey()) {
read_end.setKey(read_begin.getKey());
read_end.arena().dependsOn(read_begin.arena());
}
state GetRangeLimits requestLimit = limits;
setRequestLimits(requestLimit, additionalRows, 2 - read_begin.offset, requestCount);
requestCount++;
ASSERT(!requestLimit.hasRowLimit() || requestLimit.rows > 0);
ASSERT(requestLimit.hasRowLimit() || requestLimit.hasByteLimit());
//TraceEvent("RYWIssuing", randomID).detail("Begin", read_begin.toString()).detail("End", read_end.toString()).detail("Bytes", requestLimit.bytes).detail("Rows", requestLimit.rows).detail("Limits", limits.bytes).detail("Reached", limits.isReached()).detail("RequestCount", requestCount).detail("SingleClears", singleClears).detail("UcEnd", ucEnd.beginKey()).detail("MinRows", requestLimit.minRows);
additionalRows = 0;
RangeResult snapshot_read = wait(ryw->tr.getRange(read_begin, read_end, requestLimit, true, false));
KeyRangeRef range = getKnownKeyRange(snapshot_read, read_begin, read_end, ryw->arena);
//TraceEvent("RYWCacheInsert", randomID).detail("Range", range).detail("ExpectedSize", snapshot_read.expectedSize()).detail("Rows", snapshot_read.size()).detail("Results", snapshot_read).detail("More", snapshot_read.more).detail("ReadToBegin", snapshot_read.readToBegin).detail("ReadThroughEnd", snapshot_read.readThroughEnd).detail("ReadThrough", snapshot_read.readThrough);
// Keep the rows' memory alive only if the cache actually took them.
if (ryw->cache.insert(range, snapshot_read))
ryw->arena.dependsOn(snapshot_read.arena());
// TODO: Is there a more efficient way to deal with invalidation?
resolveKeySelectorFromCache(
begin, it, ryw->getMaxReadKey(), &readToBegin, &readThroughEnd, &actualBeginOffset);
resolveKeySelectorFromCache(
end, itEnd, ryw->getMaxReadKey(), &readToBegin, &readThroughEnd, &actualEndOffset);
} else if (it.is_kv()) {
// Known key-value segment: append the contiguous run of pairs starting here, as far as limits allow.
KeyValueRef const* start = it.kv(ryw->arena);
if (start == nullptr) {
++it;
continue;
}
it.skipContiguous(end.isFirstGreaterOrEqual()
? end.getKey()
: ryw->getMaxReadKey()); // not technically correct since this would add
// end.getKey(), but that is protected above
int maxCount = it.kv(ryw->arena) - start + 1;
int count = 0;
for (; count < maxCount && !limits.isReached(); count++) {
limits.decrement(start[count]);
}
// Pairs in the run that did not fit within the limits.
itemsPastEnd += maxCount - count;
//TraceEvent("RYWaddKV", randomID).detail("Key", it.beginKey()).detail("Count", count).detail("MaxCount", maxCount).detail("ItemsPastEnd", itemsPastEnd);
if (count)
result.append(result.arena(), start, count);
++it;
} else
++it;
}
result.more = result.more || limits.isReached();
// With a resolved end selector, drop every row at or beyond the end key; if anything was dropped the range
// was fully read, so `more` is cleared.
if (end.isFirstGreaterOrEqual()) {
int keepItems = std::lower_bound(result.begin(), result.end(), end.getKey(), KeyValueRef::OrderByKey()) -
result.begin();
if (keepItems < result.size())
result.more = false;
result.resize(result.arena(), keepItems);
}
result.readToBegin = readToBegin;
result.readThroughEnd = !result.more && readThroughEnd;
// The rows reference memory owned by ryw->arena, so the result must keep that arena alive.
result.arena().dependsOn(ryw->arena);
return result;
}
// Computes the key range that a reverse snapshot read (`data`, issued with selectors `begin`/`end`) has made
// known, with both boundaries copied into `arena`. Rows in `data` are in descending key order, so data[0] is
// the highest key. Returns an empty KeyRangeRef when begin >= end.
static KeyRangeRef getKnownKeyRangeBack(RangeResultRef data, KeySelector begin, KeySelector end, Arena& arena) {
    // Start from the selectors: the begin key only counts when the read was not truncated and its offset is
    // <= 1, the end key only when its offset is >= 1. Otherwise start from the "nothing known" extremes.
    StringRef knownBegin = allKeys.end;
    if (!data.more && begin.offset <= 1) {
        knownBegin = begin.getKey();
    }
    ExtStringRef knownEnd = allKeys.begin;
    if (end.offset >= 1) {
        knownEnd = end.getKey();
    }

    if (data.readToBegin) {
        knownBegin = allKeys.begin;
    }
    if (data.readThroughEnd) {
        knownEnd = allKeys.end;
    }

    if (data.size()) {
        // Widen the range so that it covers the returned rows.
        if (data.readThrough.present()) {
            if (!(knownBegin < data.readThrough.get())) {
                knownBegin = data.readThrough.get();
            }
        } else {
            const KeyRef lowestKey = data[data.size() - 1].key;
            if (data.more || !(lowestKey > knownBegin)) {
                knownBegin = lowestKey;
            }
        }
        // NOTE(review): ExtStringRef(k, 1) presumably denotes k followed by one zero byte, i.e. the key right
        // after k - confirm in ExtStringRef.
        if (!(data[0].key < knownEnd)) {
            knownEnd = ExtStringRef(data[0].key, 1);
        }
    }

    if (knownBegin >= knownEnd) {
        return KeyRangeRef();
    }
    return KeyRangeRef(StringRef(arena, knownBegin), knownEnd.toArena(arena));
}
// Pre: it points to an unknown range
// Decrements it to point to the unknown range just before the last nontrivial known range (skips over trivial known
// ranges), but not more than iterationLimit ranges away. Returns the number of single-key empty ranges skipped.
template <class Iter>
static int skipUncachedBack(Iter& it, Iter const& end, int iterationLimit) {
    // rangeStart mirrors it.beginKey(); rangeStop is the end of the contiguous empty range containing it.
    ExtStringRef rangeStart = it.beginKey();
    ExtStringRef rangeStop = it.endKey();
    int singleEmpty = 0;
    ASSERT(!it.is_unreadable() && it.is_unknown_range());

    while (it != end && --iterationLimit >= 0) {
        const bool unreadable = it.is_unreadable();
        if (unreadable || it.is_empty_range()) {
            // An unreadable segment, or an empty range wider than a single key, ends the scan.
            if (unreadable || !rangeStop.isKeyAfter(rangeStart)) { // Assumes no degenerate ranges
                break;
            }
            ++singleEmpty;
        } else {
            rangeStop = rangeStart;
        }
        --it;
        rangeStart = it.beginKey();
    }

    // However the scan ended, advance to the nearest readable unknown range.
    while (it.is_unreadable() || !it.is_unknown_range()) {
        ++it;
    }
    return singleEmpty;
}
// Pre: it points to an unknown range
// Returns the number of preceding empty single-key known ranges between it and the previous nontrivial known range,
// but no more than maxClears. Leaves `it` in an indeterminate state.
template <class Iter>
static int countUncachedBack(Iter&& it, int maxClears) {
    if (maxClears <= 0) {
        return 0;
    }

    // rangeStart mirrors it.beginKey(); rangeStop is the end of the contiguous empty range containing it.
    ExtStringRef rangeStart = it.beginKey();
    ExtStringRef rangeStop = it.endKey();
    int singleEmpty = 0;

    while (rangeStart > allKeys.begin) {
        const bool unreadable = it.is_unreadable();
        if (unreadable || it.is_empty_range()) {
            // An unreadable segment, or an empty range wider than a single key, ends the count.
            if (unreadable || !rangeStop.isKeyAfter(rangeStart)) { // Assumes no degenerate ranges
                break;
            }
            if (++singleEmpty >= maxClears) {
                return maxClears;
            }
        } else {
            rangeStop = rangeStart;
        }
        --it;
        rangeStart = it.beginKey();
    }
    return singleEmpty;
}
ACTOR template <class Iter>
static Future<RangeResult> getRangeValueBack(ReadYourWritesTransaction* ryw,
KeySelector begin,
KeySelector end,
GetRangeLimits limits,
Iter* pit) {
state Iter& it(*pit);
state Iter itEnd(*pit);
state RangeResult result;
state int64_t additionalRows = 0;
state int itemsPastBegin = 0;
state int requestCount = 0;
state bool readToBegin = false;
state bool readThroughEnd = false;
state int actualBeginOffset = begin.offset;
state int actualEndOffset = end.offset;
// state UID randomID = nondeterministicRandom()->randomUniqueID();
resolveKeySelectorFromCache(end, it, ryw->getMaxReadKey(), &readToBegin, &readThroughEnd, &actualEndOffset);
resolveKeySelectorFromCache(
begin, itEnd, ryw->getMaxReadKey(), &readToBegin, &readThroughEnd, &actualBeginOffset);
if (actualBeginOffset >= actualEndOffset && begin.getKey() >= end.getKey()) {
return RangeResultRef(false, false);
} else if ((begin.isFirstGreaterOrEqual() && begin.getKey() == ryw->getMaxReadKey()) ||
(end.isFirstGreaterOrEqual() && end.getKey() == allKeys.begin)) {
return RangeResultRef(readToBegin, readThroughEnd);
}
if (!begin.isFirstGreaterOrEqual() && begin.getKey() > end.getKey()) {
Key resolvedBegin = wait(read(ryw, GetKeyReq(begin), pit));
if (resolvedBegin == allKeys.begin)
readToBegin = true;
if (resolvedBegin == ryw->getMaxReadKey())
readThroughEnd = true;
if (resolvedBegin >= end.getKey() && end.offset <= 1) {
return RangeResultRef(false, false);
} else if (resolvedBegin == ryw->getMaxReadKey()) {
return RangeResultRef(readToBegin, readThroughEnd);
}
resolveKeySelectorFromCache(end, it, ryw->getMaxReadKey(), &readToBegin, &readThroughEnd, &actualEndOffset);
resolveKeySelectorFromCache(
begin, itEnd, ryw->getMaxReadKey(), &readToBegin, &readThroughEnd, &actualBeginOffset);
}
//TraceEvent("RYWSelectorsStartReverse", randomID).detail("ByteLimit", limits.bytes).detail("RowLimit", limits.rows);
loop {
/*TraceEvent("RYWSelectors", randomID).detail("Begin", begin.toString())
.detail("End", end.toString())
.detail("Reached", limits.isReached())
.detail("ItemsPastBegin", itemsPastBegin)
.detail("EndOffset", end.offset)
.detail("ItBegin", it.beginKey())
.detail("ItEnd", itEnd.beginKey())
.detail("Unknown", it.is_unknown_range())
.detail("Kv", it.is_kv())
.detail("Requests", requestCount);*/
if (!result.size() && actualBeginOffset >= actualEndOffset && begin.getKey() >= end.getKey()) {
return RangeResultRef(false, false);
}
if (!begin.isBackward() && begin.getKey() >= ryw->getMaxReadKey()) {
return RangeResultRef(readToBegin, readThroughEnd);
}
if ((begin.offset >= end.offset && begin.getKey() >= end.getKey()) ||
(end.offset <= 1 && end.getKey() == allKeys.begin)) {
if (begin.isFirstGreaterOrEqual())
break;
if (!result.size())
break;
Key resolvedBegin =
wait(read(ryw,
GetKeyReq(begin),
pit)); // do not worry about iterator invalidation, because we are breaking for the loop
if (resolvedBegin == allKeys.begin)
readToBegin = true;
if (resolvedBegin == ryw->getMaxReadKey())
readThroughEnd = true;
begin = firstGreaterOrEqual(resolvedBegin);
break;
}
if (itemsPastBegin >= begin.offset - 1 && !it.is_unreadable() && !it.is_unknown_range() &&
it.beginKey() < itEnd.beginKey()) {
if (begin.isFirstGreaterOrEqual())
break;
return RangeResultRef(readToBegin, readThroughEnd);
}
if (limits.isReached() && itemsPastBegin >= begin.offset - 1)
break;
if (end.isFirstGreaterOrEqual() && end.getKey() == it.beginKey()) {
if (itemsPastBegin >= begin.offset - 1 && it == itEnd)
break;
--it;
}
if (it.is_unknown_range()) {
if (limits.hasByteLimit() && result.size() && itemsPastBegin >= begin.offset - 1) {
result.more = true;
break;
}
Iter ucEnd(it);
int singleClears = 0;
int clearLimit = requestCount ? 1 << std::min(requestCount, 20) : 0;
if (it.beginKey() > itEnd.beginKey())
singleClears = std::min(skipUncachedBack(ucEnd, itEnd, BUGGIFY ? 0 : clearLimit + 100), clearLimit);
state KeySelector read_begin;
if (ucEnd != itEnd) {
Key k = ucEnd.beginKey().toStandaloneStringRef();
read_begin = KeySelector(firstGreaterOrEqual(k), k.arena());
if (begin.offset > 1)
additionalRows += begin.offset - 1; // extra for items past end
} else if (begin.offset > 1) {
read_begin = KeySelector(firstGreaterOrEqual(begin.getKey()), begin.arena());
additionalRows += begin.offset - 1;
} else {
read_begin = begin;
if (begin.offset < 1) {
singleClears += countUncachedBack(std::move(ucEnd), clearLimit - singleClears);
read_begin.offset -= singleClears;
}
}
additionalRows += singleClears;
state KeySelector read_end;
if (end.isFirstGreaterOrEqual()) {
Key k = it.endKey() < end.getKey() ? it.endKey().toStandaloneStringRef() : end.getKey();
end = KeySelector(firstGreaterOrEqual(k), k.arena());
read_end = end;
} else if (end.offset < 1) {
read_end = KeySelector(firstGreaterOrEqual(end.getKey()), end.arena());
additionalRows += 1 - end.offset;
} else {
read_end = end;
ucEnd = it;
singleClears = countUncached(std::move(ucEnd), ryw->getMaxReadKey(), clearLimit);
read_end.offset += singleClears;
additionalRows += singleClears;
}
if (read_begin.getKey() > read_end.getKey()) {
read_begin.setKey(read_end.getKey());
read_begin.arena().dependsOn(read_end.arena());
}
state GetRangeLimits requestLimit = limits;
setRequestLimits(requestLimit, additionalRows, read_end.offset, requestCount);
requestCount++;
ASSERT(!requestLimit.hasRowLimit() || requestLimit.rows > 0);
ASSERT(requestLimit.hasRowLimit() || requestLimit.hasByteLimit());
//TraceEvent("RYWIssuing", randomID).detail("Begin", read_begin.toString()).detail("End", read_end.toString()).detail("Bytes", requestLimit.bytes).detail("Rows", requestLimit.rows).detail("Limits", limits.bytes).detail("Reached", limits.isReached()).detail("RequestCount", requestCount).detail("SingleClears", singleClears).detail("UcEnd", ucEnd.beginKey()).detail("MinRows", requestLimit.minRows);
additionalRows = 0;
RangeResult snapshot_read = wait(ryw->tr.getRange(read_begin, read_end, requestLimit, true, true));
KeyRangeRef range = getKnownKeyRangeBack(snapshot_read, read_begin, read_end, ryw->arena);
//TraceEvent("RYWCacheInsert", randomID).detail("Range", range).detail("ExpectedSize", snapshot_read.expectedSize()).detail("Rows", snapshot_read.size()).detail("Results", snapshot_read).detail("More", snapshot_read.more).detail("ReadToBegin", snapshot_read.readToBegin).detail("ReadThroughEnd", snapshot_read.readThroughEnd).detail("ReadThrough", snapshot_read.readThrough);
RangeResultRef reversed;
reversed.resize(ryw->arena, snapshot_read.size());
for (int i = 0; i < snapshot_read.size(); i++) {
reversed[snapshot_read.size() - i - 1] = snapshot_read[i];
}
if (ryw->cache.insert(range, reversed))
ryw->arena.dependsOn(snapshot_read.arena());
// TODO: Is there a more efficient way to deal with invalidation?
resolveKeySelectorFromCache(
end, it, ryw->getMaxReadKey(), &readToBegin, &readThroughEnd, &actualEndOffset);
resolveKeySelectorFromCache(
begin, itEnd, ryw->getMaxReadKey(), &readToBegin, &readThroughEnd, &actualBeginOffset);
} else {
KeyValueRef const* end = it.is_kv() ? it.kv(ryw->arena) : nullptr;
if (end != nullptr) {
it.skipContiguousBack(begin.isFirstGreaterOrEqual() ? begin.getKey() : allKeys.begin);
KeyValueRef const* start = it.kv(ryw->arena);
ASSERT(start != nullptr);
int maxCount = end - start + 1;
int count = 0;
for (; count < maxCount && !limits.isReached(); count++) {
limits.decrement(start[maxCount - count - 1]);
}
itemsPastBegin += maxCount - count;
//TraceEvent("RYWaddKV", randomID).detail("Key", it.beginKey()).detail("Count", count).detail("MaxCount", maxCount).detail("ItemsPastBegin", itemsPastBegin);
if (count) {
int size = result.size();
result.resize(result.arena(), size + count);
for (int i = 0; i < count; i++) {
result[size + i] = start[maxCount - i - 1];
}
}
}
if (it == itEnd)
break;
--it;
}
}
result.more = result.more || limits.isReached();
if (begin.isFirstGreaterOrEqual()) {
int keepItems = result.rend() -
std::lower_bound(result.rbegin(), result.rend(), begin.getKey(), KeyValueRef::OrderByKey());
if (keepItems < result.size())
result.more = false;
result.resize(result.arena(), keepItems);
}
result.readToBegin = !result.more && readToBegin;
result.readThroughEnd = readThroughEnd;
result.arena().dependsOn(ryw->arena);
return result;
}
// Re-checks the watches registered in ryw->watchMap for keys in [range.begin, range.end) after a local write.
//   val        - value the affected key(s) now have locally (clear() passes an empty Optional).
//   valueKnown - when false, val is ignored and every watch in the range that has not fired yet is fired.
// Watches that have fired are removed from the map; map entries left without watches are erased.
static void triggerWatches(ReadYourWritesTransaction* ryw,
KeyRangeRef range,
Optional<ValueRef> val,
bool valueKnown = true) {
for (auto it = ryw->watchMap.lower_bound(range.begin); it != ryw->watchMap.end() && it->key < range.end;) {
// Step `it` forward before working on the entry, because the entry may be erased at the bottom of the loop.
auto itCopy = it;
++it;
ASSERT(itCopy->value.size());
TEST(itCopy->value.size() > 1); // Multiple watches on the same key triggered by RYOW
for (int i = 0; i < itCopy->value.size(); i++) {
if (itCopy->value[i]->onChangeTrigger.isSet()) {
// This watch already fired elsewhere; drop it. i-- makes the loop look at slot i again afterwards
// (swapAndPop presumably moves the last element into slot i).
swapAndPop(&itCopy->value, i--);
} else if (!valueKnown ||
(itCopy->value[i]->setPresent &&
(itCopy->value[i]->setValue.present() != val.present() ||
(val.present() && itCopy->value[i]->setValue.get() != val.get()))) ||
(itCopy->value[i]->valuePresent &&
(itCopy->value[i]->value.present() != val.present() ||
(val.present() && itCopy->value[i]->value.get() != val.get())))) {
// Fire when the new value is unknown, or when it differs from the previously recorded local set value
// (setValue) or from the value read for the watch (value).
itCopy->value[i]->onChangeTrigger.send(Void());
swapAndPop(&itCopy->value, i--);
} else {
// No observable change yet: remember val as the value set locally so later comparisons can use it.
itCopy->value[i]->setPresent = true;
itCopy->value[i]->setValue = val.castTo<Value>();
}
}
if (itCopy->value.size() == 0)
ryw->watchMap.erase(itCopy);
}
}
// Single-key convenience overload: builds the one-key range for `key` and forwards to the range overload.
static void triggerWatches(ReadYourWritesTransaction* ryw,
                           KeyRef key,
                           Optional<ValueRef> val,
                           bool valueKnown = true) {
	const auto oneKeyRange = singleKeyRange(key);
	triggerWatches(ryw, oneKeyRange, val, valueKnown);
}
// Implements a watch on `key` for a read-your-writes transaction.
// The returned future becomes ready when the watch fires (either through triggerWatches() on a local write or
// through the native transaction's watch) and fails if the transaction is reset or the read fails.
ACTOR static Future<Void> watch(ReadYourWritesTransaction* ryw, Key key) {
state Future<Optional<Value>> val;
state Future<Void> watchFuture;
state Reference<Watch> watch(new Watch(key));
state Promise<Void> done;
// commit() waits on ryw->reading, so it does not proceed until `done` is sent on one of the paths below.
ryw->reading.add(done.getFuture());
if (!ryw->options.readYourWritesDisabled) {
// Register the watch so that local writes can trigger it, then read the value through the RYW layer
// (third argument is the snapshot flag; false here).
ryw->watchMap[key].push_back(watch);
val = readWithConflictRange(ryw, GetValueReq(key), false);
} else {
ryw->approximateSize += 2 * key.expectedSize() + 1;
val = ryw->tr.get(key);
}
try {
// Wake up on whichever happens first: transaction reset, the read finishing, or a local write firing the watch.
wait(ryw->resetPromise.getFuture() || success(val) || watch->onChangeTrigger.getFuture());
} catch (Error& e) {
done.send(Void());
throw;
}
if (watch->onChangeTrigger.getFuture().isReady()) {
// The watch was already triggered (or failed) before the read completed; nothing more to register.
done.send(Void());
if (watch->onChangeTrigger.getFuture().isError())
throw watch->onChangeTrigger.getFuture().getError();
return Void();
}
// Record the value that was read so triggerWatches() can compare later writes against it.
watch->valuePresent = true;
watch->value = val.get();
// A local set recorded while the read was in flight that differs from the read value counts as a change.
if (watch->setPresent && (watch->setValue.present() != watch->value.present() ||
(watch->value.present() && watch->setValue.get() != watch->value.get()))) {
watch->onChangeTrigger.send(Void());
done.send(Void());
return Void();
}
try {
watchFuture = ryw->tr.watch(watch); // throws if there are too many outstanding watches
} catch (Error& e) {
done.send(Void());
throw;
}
// The watch is now registered with the native transaction; release commit() and wait for it to fire.
done.send(Void());
wait(watchFuture);
return Void();
}
// Fault-injection helper started from commit() under CLIENT_BUGGIFY: reports the transaction as timed out
// (unless a reset error is already set) and then, after a random delay, commits the native transaction anyway.
// Requires a non-zero timeout option (asserted below).
ACTOR static void simulateTimeoutInFlightCommit(ReadYourWritesTransaction* ryw_) {
// Take a reference on the transaction for the lifetime of this actor.
state Reference<ReadYourWritesTransaction> ryw = Reference<ReadYourWritesTransaction>::addRef(ryw_);
ASSERT(ryw->options.timeoutInSeconds > 0);
if (!ryw->resetPromise.isSet())
ryw->resetPromise.sendError(transaction_timed_out());
// Random delay in [0, 5) before the commit is actually issued.
wait(delay(deterministicRandom()->random01() * 5));
TraceEvent("ClientBuggifyInFlightCommit");
wait(ryw->tr.commit());
}
// Commits the transaction.
// Waits for outstanding reads, transfers buffered state to the native transaction (unless read-your-writes is
// disabled, in which case the native transaction already holds it) and commits the native transaction.
// Fails with the reset error if the transaction is reset while waiting.
ACTOR static Future<Void> commit(ReadYourWritesTransaction* ryw) {
try {
ryw->commitStarted = true;
if (ryw->options.specialKeySpaceChangeConfiguration)
wait(ryw->getDatabase()->specialKeySpace->commit(ryw));
// Wait until every registered read has finished, or the transaction is reset.
Future<Void> ready = ryw->reading;
wait(ryw->resetPromise.getFuture() || ready);
if (ryw->options.readYourWritesDisabled) {
// Stash away conflict ranges to read after commit
ryw->nativeReadRanges = ryw->tr.readConflictRanges();
ryw->nativeWriteRanges = ryw->tr.writeConflictRanges();
// Also stash the extra read conflict ranges that are ready and non-empty, stored under the
// readConflictRangeKeysRange prefix.
for (const auto& f : ryw->tr.getExtraReadConflictRanges()) {
if (f.isReady() && f.get().first < f.get().second)
ryw->nativeReadRanges.push_back(
ryw->nativeReadRanges.arena(),
KeyRangeRef(f.get().first, f.get().second)
.withPrefix(readConflictRangeKeysRange.begin, ryw->nativeReadRanges.arena()));
}
if (ryw->resetPromise.isSet())
throw ryw->resetPromise.getFuture().getError();
// Fault injection: report a timeout to the caller while the commit still happens in the background.
if (CLIENT_BUGGIFY && ryw->options.timeoutInSeconds > 0) {
simulateTimeoutInFlightCommit(ryw);
throw transaction_timed_out();
}
wait(ryw->resetPromise.getFuture() || ryw->tr.commit());
ryw->debugLogRetries();
// Before API version 410 a successful commit also resets the transaction.
if (!ryw->tr.apiVersionAtLeast(410)) {
ryw->reset();
}
return Void();
}
// Read-your-writes enabled: push all buffered writes and the recorded read conflict ranges to the
// native transaction before committing it.
ryw->writeRangeToNativeTransaction(KeyRangeRef(StringRef(), allKeys.end));
auto conflictRanges = ryw->readConflicts.ranges();
for (auto iter = conflictRanges.begin(); iter != conflictRanges.end(); ++iter) {
if (iter->value()) {
ryw->tr.addReadConflictRange(iter->range());
}
}
// Fault injection: report a timeout to the caller while the commit still happens in the background.
if (CLIENT_BUGGIFY && ryw->options.timeoutInSeconds > 0) {
simulateTimeoutInFlightCommit(ryw);
throw transaction_timed_out();
}
wait(ryw->resetPromise.getFuture() || ryw->tr.commit());
ryw->debugLogRetries();
if (!ryw->tr.apiVersionAtLeast(410)) {
ryw->reset();
}
return Void();
} catch (Error& e) {
// Before API version 410 a failed commit leaves the transaction usable again: clear commitStarted and,
// unless a reset error is already set, reset both the native transaction and the RYW state.
if (!ryw->tr.apiVersionAtLeast(410)) {
ryw->commitStarted = false;
if (!ryw->resetPromise.isSet()) {
ryw->tr.reset();
ryw->resetRyow();
}
}
throw;
}
}
// Handles error `e` for a retry loop: counts the retry, lets the native transaction decide whether `e` is
// retryable (and back off), and resets the read-your-writes state on success.
// Throws `e` when the retry limit has been reached, the reset error if the transaction was reset, or whatever
// the native onError throws.
ACTOR static Future<Void> onError(ReadYourWritesTransaction* ryw, Error e) {
try {
if (ryw->resetPromise.isSet()) {
throw ryw->resetPromise.getFuture().getError();
}
// The limit is checked against the count before this attempt is added; maxRetries == -1 means no limit.
bool retry_limit_hit = ryw->options.maxRetries != -1 && ryw->retries >= ryw->options.maxRetries;
// Saturate instead of overflowing the counter.
if (ryw->retries < std::numeric_limits<int>::max())
ryw->retries++;
if (retry_limit_hit) {
throw e;
}
wait(ryw->resetPromise.getFuture() || ryw->tr.onError(e));
ryw->debugLogRetries(e);
ryw->resetRyow();
return Void();
} catch (Error& e) {
if (!ryw->resetPromise.isSet()) {
// From API version 610 on, a failed onError leaves the transaction cancelled; older versions only
// reset the read-your-writes state.
if (ryw->tr.apiVersionAtLeast(610)) {
ryw->resetPromise.sendError(transaction_cancelled());
} else {
ryw->resetRyow();
}
}
// Callers see broken_promise as transaction_cancelled.
if (e.code() == error_code_broken_promise)
throw transaction_cancelled();
throw;
}
}
// Returns the native transaction's read version, but fails early if the transaction is reset first.
ACTOR static Future<Version> getReadVersion(ReadYourWritesTransaction* ryw) {
choose {
when(Version v = wait(ryw->tr.getReadVersion())) { return v; }
// NOTE(review): in the code visible here resetPromise is only ever completed with sendError(), so this
// wait is expected to throw the reset error; reaching the body would be an internal error.
when(wait(ryw->resetPromise.getFuture())) { throw internal_error(); }
}
}
};
// Creates a read-your-writes transaction on database `cx`.
// The database's default transaction options are copied into persistentOptions and applied immediately.
ReadYourWritesTransaction::ReadYourWritesTransaction(Database const& cx)
  : cache(&arena), writes(&arena), tr(cx), retries(0), approximateSize(0), creationTime(now()), commitStarted(false),
    options(tr), deferredError(cx->deferredError), versionStampFuture(tr.getVersionstamp()),
    specialKeySpaceWriteMap(std::make_pair(false, Optional<Value>()), specialKeys.end) {
	const auto& databaseDefaults = cx.getTransactionDefaults();
	for (auto defaultIt = databaseDefaults.begin(); defaultIt != databaseDefaults.end(); ++defaultIt) {
		persistentOptions.push_back(*defaultIt);
	}
	applyPersistentOptions();
}
// Transaction timeout actor: sleeps until `endTime`, then fails `resetPromise` with transaction_timed_out
// (unless it is already set) and throws transaction_timed_out itself.
ACTOR Future<Void> timebomb(double endTime, Promise<Void> resetPromise) {
while (now() < endTime) {
// Sleep in steps of at most TRANSACTION_TIMEOUT_DELAY_INTERVAL; the final step targets slightly past
// endTime so the loop condition is false on wake-up.
wait(delayUntil(std::min(endTime + 0.0001, now() + CLIENT_KNOBS->TRANSACTION_TIMEOUT_DELAY_INTERVAL)));
}
if (!resetPromise.isSet())
resetPromise.sendError(transaction_timed_out());
throw transaction_timed_out();
}
void ReadYourWritesTransaction::resetTimeout() {
timeoutActor =
options.timeoutInSeconds == 0.0 ? Void() : timebomb(options.timeoutInSeconds + creationTime, resetPromise);
}
// Returns the transaction's read version.
// Before API version 101 the request goes straight to the native transaction; from 101 on it additionally
// fails with the reset error when the transaction has been (or gets) reset.
Future<Version> ReadYourWritesTransaction::getReadVersion() {
	if (!tr.apiVersionAtLeast(101)) {
		return tr.getReadVersion();
	}
	if (resetPromise.isSet()) {
		return resetPromise.getFuture().getError();
	}
	return RYWImpl::getReadVersion(this);
}
// Serializes a status object to its JSON text and returns it as a value.
// Any std::exception raised while serializing is logged at SevError and reported as internal_error.
Optional<Value> getValueFromJSON(StatusObject statusObj) {
	try {
		const json_spirit::mValue document(statusObj);
		const std::string serialized = json_spirit::write_string(document, json_spirit::Output_options::none);
		// Value copies the bytes, so the result does not refer to `serialized` after this returns.
		return Optional<Value>(Value(StringRef(serialized)));
	} catch (std::exception& e) {
		TraceEvent(SevError, "UnableToUnparseStatusJSON").detail("What", e.what());
		throw internal_error();
	}
}
// Fetches the cluster status for `db` through StatusClient and returns it serialized as JSON text.
ACTOR Future<Optional<Value>> getJSON(Database db) {
StatusObject statusObj = wait(StatusClient::statusFetcher(db));
return getValueFromJSON(statusObj);
}
// Returns one key-value pair per client worker of the cluster described by `clusterFile`:
// key = the worker's address as a string, value = the serialized ClientWorkerInterface (with version).
ACTOR Future<RangeResult> getWorkerInterfaces(Reference<ClusterConnectionFile> clusterFile) {
state Reference<AsyncVar<Optional<ClusterInterface>>> clusterInterface(new AsyncVar<Optional<ClusterInterface>>);
// Keeps clusterInterface up to date for as long as this actor is alive.
state Future<Void> leaderMon = monitorLeader<ClusterInterface>(clusterFile, clusterInterface);
loop {
choose {
// Ask the current cluster interface for its workers; with no interface known yet, wait for a change.
// NOTE(review): judging by its name, brokenPromiseToNever turns a broken_promise into a future that
// never completes, so a lost request simply waits for the next interface change - confirm.
when(vector<ClientWorkerInterface> workers =
wait(clusterInterface->get().present()
? brokenPromiseToNever(
clusterInterface->get().get().getClientWorkers.getReply(GetClientWorkersRequest()))
: Never())) {
RangeResult result;
for (auto& it : workers) {
result.push_back_deep(
result.arena(),
KeyValueRef(it.address().toString(), BinaryWriter::toValue(it, IncludeVersion())));
}
return result;
}
// The interface changed: go around the loop and retry against the new one.
when(wait(clusterInterface->onChange())) {}
}
}
}
// Reads the value of `key`.
// From API version 630 on, keys inside the special key space are served by the special key space module.
// Older API versions only recognise three legacy special keys (status JSON, cluster file path and
// connection string), each of which yields an empty result when no connection file is available.
// Ordinary keys are read through the read-your-writes layer, and the read is registered in `reading`.
Future<Optional<Value>> ReadYourWritesTransaction::get(const Key& key, bool snapshot) {
	TEST(true); // ReadYourWritesTransaction::get

	if (getDatabase()->apiVersionAtLeast(630)) {
		if (specialKeys.contains(key)) {
			TEST(true); // Special keys get
			return getDatabase()->specialKeySpace->get(this, key);
		}
	} else if (key == LiteralStringRef("\xff\xff/status/json")) {
		if (!(tr.getDatabase().getPtr() && tr.getDatabase()->getConnectionFile())) {
			return Optional<Value>();
		}
		++tr.getDatabase()->transactionStatusRequests;
		return getJSON(tr.getDatabase());
	} else if (key == LiteralStringRef("\xff\xff/cluster_file_path")) {
		try {
			if (tr.getDatabase().getPtr() && tr.getDatabase()->getConnectionFile()) {
				Optional<Value> filePath = StringRef(tr.getDatabase()->getConnectionFile()->getFilename());
				return filePath;
			}
		} catch (Error& e) {
			return e;
		}
		return Optional<Value>();
	} else if (key == LiteralStringRef("\xff\xff/connection_string")) {
		try {
			if (tr.getDatabase().getPtr() && tr.getDatabase()->getConnectionFile()) {
				Reference<ClusterConnectionFile> connFile = tr.getDatabase()->getConnectionFile();
				Optional<Value> connString = StringRef(connFile->getConnectionString().toString());
				return connString;
			}
		} catch (Error& e) {
			return e;
		}
		return Optional<Value>();
	}

	if (checkUsedDuringCommit()) {
		return used_during_commit();
	}

	if (resetPromise.isSet()) {
		return resetPromise.getFuture().getError();
	}

	// metadataVersionKey is the one key at or beyond the maximum read key that may still be read.
	if (key >= getMaxReadKey() && key != metadataVersionKey) {
		return key_outside_legal_range();
	}

	// There are no keys in the database with size greater than KEY_SIZE_LIMIT
	const auto keySizeLimit =
	    key.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT;
	if (key.size() > keySizeLimit) {
		return Optional<Value>();
	}

	Future<Optional<Value>> pendingRead = RYWImpl::readWithConflictRange(this, RYWImpl::GetValueReq(key), snapshot);
	reading.add(success(pendingRead));
	return pendingRead;
}
// Resolves key selector `key` to a key through the read-your-writes layer.
// Fails with used_during_commit, the reset error, or key_outside_legal_range (selector key beyond the
// maximum read key). The read is registered in `reading`.
Future<Key> ReadYourWritesTransaction::getKey(const KeySelector& key, bool snapshot) {
	if (checkUsedDuringCommit()) {
		return used_during_commit();
	}

	if (resetPromise.isSet()) {
		return resetPromise.getFuture().getError();
	}

	if (key.getKey() > getMaxReadKey()) {
		return key_outside_legal_range();
	}

	Future<Key> pendingResolve = RYWImpl::readWithConflictRange(this, RYWImpl::GetKeyReq(key), snapshot);
	reading.add(success(pendingResolve));
	return pendingResolve;
}
// Reads the key-value pairs between selectors `begin` and `end`, subject to `limits`, optionally in reverse.
// From API version 630 on, ranges lying inside the special key space are served by the special key space
// module; older API versions only recognise the legacy worker-interfaces key as a range begin.
// Returns an empty result without touching the conflict ranges when the limits are already reached or the
// selectors describe an inverted range. The read is registered in `reading`.
Future<RangeResult> ReadYourWritesTransaction::getRange(KeySelector begin,
                                                        KeySelector end,
                                                        GetRangeLimits limits,
                                                        bool snapshot,
                                                        bool reverse) {
	if (getDatabase()->apiVersionAtLeast(630)) {
		const bool insideSpecialKeys = specialKeys.contains(begin.getKey()) && specialKeys.begin <= end.getKey() &&
		                               end.getKey() <= specialKeys.end;
		if (insideSpecialKeys) {
			TEST(true); // Special key space get range
			return getDatabase()->specialKeySpace->getRange(this, begin, end, limits, reverse);
		}
	} else if (begin.getKey() == LiteralStringRef("\xff\xff/worker_interfaces")) {
		if (tr.getDatabase().getPtr() && tr.getDatabase()->getConnectionFile()) {
			return getWorkerInterfaces(tr.getDatabase()->getConnectionFile());
		}
		return RangeResult();
	}

	if (checkUsedDuringCommit()) {
		return used_during_commit();
	}

	if (resetPromise.isSet()) {
		return resetPromise.getFuture().getError();
	}

	const KeyRef maxKey = getMaxReadKey();
	if (begin.getKey() > maxKey || end.getKey() > maxKey) {
		return key_outside_legal_range();
	}

	// Short-circuit reads that cannot return anything so they are not added to the conflict range.
	if (limits.isReached()) {
		TEST(true); // RYW range read limit 0
		return RangeResult();
	}

	if (!limits.isValid()) {
		return range_limits_invalid();
	}

	// Normalize both selectors so that neither carries the orEqual flag.
	if (begin.orEqual) {
		begin.removeOrEqual(begin.arena());
	}
	if (end.orEqual) {
		end.removeOrEqual(end.arena());
	}

	if (begin.offset >= end.offset && begin.getKey() >= end.getKey()) {
		TEST(true); // RYW range inverted
		return RangeResult();
	}

	Future<RangeResult> pendingRange;
	if (reverse) {
		pendingRange = RYWImpl::readWithConflictRange(this, RYWImpl::GetRangeReq<true>(begin, end, limits), snapshot);
	} else {
		pendingRange = RYWImpl::readWithConflictRange(this, RYWImpl::GetRangeReq<false>(begin, end, limits), snapshot);
	}
	reading.add(success(pendingRange));
	return pendingRange;
}
// Row-limit convenience overload: wraps `limit` in a GetRangeLimits and forwards to the main getRange().
Future<RangeResult> ReadYourWritesTransaction::getRange(const KeySelector& begin,
                                                        const KeySelector& end,
                                                        int limit,
                                                        bool snapshot,
                                                        bool reverse) {
	const GetRangeLimits rowLimits(limit);
	return getRange(begin, end, rowLimits, snapshot, reverse);
}
// Asks the native transaction for the addresses associated with `key`.
// Fails with used_during_commit or the reset error; the request is registered in `reading`.
Future<Standalone<VectorRef<const char*>>> ReadYourWritesTransaction::getAddressesForKey(const Key& key) {
	if (checkUsedDuringCommit()) {
		return used_during_commit();
	}

	if (resetPromise.isSet()) {
		return resetPromise.getFuture().getError();
	}

	// If key >= allKeys.end, then our resulting address vector will be empty.

	Future<Standalone<VectorRef<const char*>>> pendingAddresses =
	    waitOrError(tr.getAddressesForKey(key), resetPromise.getFuture());
	reading.add(success(pendingAddresses));
	return pendingAddresses;
}
// Returns the estimated size in bytes of `keys`, taken from the storage metrics reported through the native
// transaction.
// Errors are always delivered through the returned future: used_during_commit when a commit is in progress,
// or the reset error when the transaction has been (or gets) reset. Reporting used_during_commit through the
// future rather than by throwing matches the other Future-returning accessors of this class.
Future<int64_t> ReadYourWritesTransaction::getEstimatedRangeSizeBytes(const KeyRange& keys) {
	if (checkUsedDuringCommit()) {
		return used_during_commit();
	}
	if (resetPromise.isSet())
		return resetPromise.getFuture().getError();

	return map(waitOrError(tr.getStorageMetrics(keys, -1), resetPromise.getFuture()),
	           [](const StorageMetrics& m) { return m.bytes; });
}
// Asks the native transaction for split points of `range` using the given `chunkSize`.
// Fails with used_during_commit, the reset error, or key_outside_legal_range when either end of the range
// lies beyond the maximum read key.
Future<Standalone<VectorRef<KeyRef>>> ReadYourWritesTransaction::getRangeSplitPoints(const KeyRange& range,
                                                                                    int64_t chunkSize) {
	if (checkUsedDuringCommit()) {
		return used_during_commit();
	}

	if (resetPromise.isSet()) {
		return resetPromise.getFuture().getError();
	}

	const KeyRef readLimit = getMaxReadKey();
	const bool outsideLegalRange = range.begin > readLimit || range.end > readLimit;
	if (outsideLegalRange) {
		return key_outside_legal_range();
	}

	return waitOrError(tr.getRangeSplitPoints(range, chunkSize), resetPromise.getFuture());
}
// Adds `keys` to the transaction's read conflict ranges.
// Throws used_during_commit while a commit is in progress and, from API version 300 on,
// key_outside_legal_range when the range extends past the maximum read key (the metadata version key range
// is exempt). Empty ranges are ignored. With read-your-writes disabled the range goes straight to the native
// transaction; otherwise it is merged into the conflict map relative to the buffered writes.
void ReadYourWritesTransaction::addReadConflictRange(KeyRangeRef const& keys) {
	if (checkUsedDuringCommit()) {
		throw used_during_commit();
	}

	if (tr.apiVersionAtLeast(300)) {
		const bool beyondMaxReadKey = keys.begin > getMaxReadKey() || keys.end > getMaxReadKey();
		if (beyondMaxReadKey && !(keys.begin == metadataVersionKey && keys.end == metadataVersionKeyEnd)) {
			throw key_outside_legal_range();
		}
	}

	// There aren't any keys in the database with size larger than KEY_SIZE_LIMIT, so if range contains large keys
	// we can translate it to an equivalent one with smaller keys
	KeyRef begin = keys.begin;
	const auto beginSizeLimit =
	    begin.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT;
	if (begin.size() > beginSizeLimit) {
		begin = begin.substr(0, beginSizeLimit + 1);
	}

	KeyRef end = keys.end;
	const auto endSizeLimit =
	    end.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT;
	if (end.size() > endSizeLimit) {
		end = end.substr(0, endSizeLimit + 1);
	}

	const KeyRangeRef clipped(begin, end);
	if (clipped.empty()) {
		return;
	}

	if (options.readYourWritesDisabled) {
		approximateSize += clipped.expectedSize() + sizeof(KeyRangeRef);
		tr.addReadConflictRange(clipped);
		return;
	}

	WriteMap::iterator writeIt(&writes);
	KeyRangeRef readRange(arena, clipped);
	writeIt.skip(readRange.begin);
	updateConflictMap(readRange, writeIt);
}
// Records a read conflict on the single key `key`, unless the write-map segment `it` points at makes that
// unnecessary: a conflict is recorded only for unmodified segments and for operations that are not
// independent. The caller is expected to have positioned `it` on the segment containing `key`; the
// disabled checks below spell out that expectation.
void ReadYourWritesTransaction::updateConflictMap(KeyRef const& key, WriteMap::iterator& it) {
	// it.skip( key );
	// ASSERT( it.beginKey() <= key && key < it.endKey() );
	const bool needsConflict = it.is_unmodified_range() || (it.is_operation() && !it.is_independent());
	if (!needsConflict) {
		return;
	}
	approximateSize += 2 * key.expectedSize() + 1 + sizeof(KeyRangeRef);
	readConflicts.insert(singleKeyRange(key, arena), true);
}
// Records read conflicts for the parts of `keys` that overlap write-map segments which are unmodified or hold
// an operation that is not independent. Advances `it` until its segment begins at or after keys.end.
// The caller is expected to have positioned `it` on the segment containing keys.begin; the disabled checks
// below spell out that expectation.
void ReadYourWritesTransaction::updateConflictMap(KeyRangeRef const& keys, WriteMap::iterator& it) {
	// it.skip( keys.begin );
	// ASSERT( it.beginKey() <= keys.begin && keys.begin < it.endKey() );
	while (it.beginKey() < keys.end) {
		if (it.is_unmodified_range() || (it.is_operation() && !it.is_independent())) {
			// Clip the segment to the requested range.
			const KeyRef clippedBegin = std::max(keys.begin, it.beginKey().toArenaOrRef(arena));
			const KeyRef clippedEnd = std::min(keys.end, it.endKey().toArenaOrRef(arena));
			const KeyRangeRef overlap(clippedBegin, clippedEnd);
			if (!overlap.empty()) {
				approximateSize += keys.expectedSize() + sizeof(KeyRangeRef);
				readConflicts.insert(overlap, true);
			}
		}
		++it;
	}
}
// Replays the buffered writes that fall inside `keys` onto the native transaction `tr`.
// Two passes are made over the write map: the first issues the clears (adjacent cleared segments are merged
// into one clear), the second issues write conflict ranges (merged likewise) and the per-key operations.
// The mutations themselves are passed to `tr` with the add-conflict flag false; conflict ranges are added
// separately through addWriteConflictRange.
void ReadYourWritesTransaction::writeRangeToNativeTransaction(KeyRangeRef const& keys) {
WriteMap::iterator it(&writes);
it.skip(keys.begin);
bool inClearRange = false;
ExtStringRef clearBegin;
// Clear ranges must be done first because of keys that are both cleared and set to a new value
for (; it.beginKey() < keys.end; ++it) {
if (it.is_cleared_range() && !inClearRange) {
// Start of a run of cleared segments; clip its start to keys.begin.
clearBegin = std::max(ExtStringRef(keys.begin), it.beginKey());
inClearRange = true;
} else if (!it.is_cleared_range() && inClearRange) {
// The run ended at this segment's begin key: emit one clear for the whole run.
tr.clear(KeyRangeRef(clearBegin.toArenaOrRef(arena), it.beginKey().toArenaOrRef(arena)), false);
inClearRange = false;
}
}
// A run that is still open when the loop ends extends to keys.end.
if (inClearRange) {
tr.clear(KeyRangeRef(clearBegin.toArenaOrRef(arena), keys.end), false);
}
// Second pass, restarted at keys.begin: conflict ranges and individual operations.
it.skip(keys.begin);
bool inConflictRange = false;
ExtStringRef conflictBegin;
for (; it.beginKey() < keys.end; ++it) {
if (it.is_conflict_range() && !inConflictRange) {
conflictBegin = std::max(ExtStringRef(keys.begin), it.beginKey());
inConflictRange = true;
} else if (!it.is_conflict_range() && inConflictRange) {
tr.addWriteConflictRange(KeyRangeRef(conflictBegin.toArenaOrRef(arena), it.beginKey().toArenaOrRef(arena)));
inConflictRange = false;
}
// SOMEDAY: make atomicOp take set to avoid switch
if (it.is_operation()) {
// Replay the operations recorded for this key in order.
auto op = it.op();
for (int i = 0; i < op.size(); ++i) {
switch (op[i].type) {
case MutationRef::SetValue:
// A SetValue entry without a value is replayed as a clear of the key.
if (op[i].value.present()) {
tr.set(it.beginKey().assertRef(), op[i].value.get(), false);
} else {
tr.clear(it.beginKey().assertRef(), false);
}
break;
case MutationRef::AddValue:
case MutationRef::AppendIfFits:
case MutationRef::And:
case MutationRef::Or:
case MutationRef::Xor:
case MutationRef::Max:
case MutationRef::Min:
case MutationRef::SetVersionstampedKey:
case MutationRef::SetVersionstampedValue:
case MutationRef::ByteMin:
case MutationRef::ByteMax:
case MutationRef::MinV2:
case MutationRef::AndV2:
case MutationRef::CompareAndClear:
tr.atomicOp(it.beginKey().assertRef(), op[i].value.get(), op[i].type, false);
break;
default:
// Any other mutation type is skipped without being replayed.
break;
}
}
}
}
if (inConflictRange) {
tr.addWriteConflictRange(KeyRangeRef(conflictBegin.toArenaOrRef(arena), keys.end));
}
}
// Constructs the option set in its default state by delegating to reset().
ReadYourWritesTransactionOptions::ReadYourWritesTransactionOptions(Transaction const& tr) {
	this->reset(tr);
}
// Restores every option to its default: all bytes of the object are zeroed first, then the options whose
// defaults are set explicitly are assigned (no retry limit, no timeout, and the snapshot read-your-writes
// setting taken from the transaction's database).
void ReadYourWritesTransactionOptions::reset(Transaction const& tr) {
	memset(this, 0, sizeof(*this));

	maxRetries = -1;
	timeoutInSeconds = 0.0;
	snapshotRywEnabled = tr.getDatabase()->snapshotRywEnabled;
}
// Reports whether the next write should skip its write conflict range, and clears that one-shot flag so it
// applies to a single write only.
bool ReadYourWritesTransactionOptions::getAndResetWriteConflictDisabled() {
	const bool wasDisabled = nextWriteDisableConflictRange;
	if (wasDisabled) {
		nextWriteDisableConflictRange = false;
	}
	return wasDisabled;
}
// Marks in `result` (with value true) every maximal run of write-map segments that are write conflict
// ranges, scanning from allKeys.begin up to the maximum write key.
void ReadYourWritesTransaction::getWriteConflicts(KeyRangeMap<bool>* result) {
	WriteMap::iterator segment(&writes);
	segment.skip(allKeys.begin);

	bool runOpen = false;
	ExtStringRef runBegin;
	for (; segment.beginKey() < getMaxWriteKey(); ++segment) {
		const bool conflicting = segment.is_conflict_range();
		if (conflicting == runOpen) {
			// Still inside (or still outside) a run: nothing to record.
			continue;
		}
		if (conflicting) {
			runBegin = segment.beginKey();
			runOpen = true;
		} else {
			result->insert(KeyRangeRef(runBegin.toArenaOrRef(arena), segment.beginKey().toArenaOrRef(arena)), true);
			runOpen = false;
		}
	}

	// A run that reaches the end of the scan is closed at the maximum write key.
	if (runOpen) {
		result->insert(KeyRangeRef(runBegin.toArenaOrRef(arena), getMaxWriteKey()), true);
	}
}
void ReadYourWritesTransaction::setTransactionID(uint64_t id) {
tr.setTransactionID(id);
}
void ReadYourWritesTransaction::setToken(uint64_t token) {
tr.setToken(token);
}
// Serves reads of the read-conflict-range special keys.
// `kr` must lie inside readConflictRangeKeysRange (asserted). The result lists range boundaries inside `kr`
// as key-value pairs: the key is the boundary (with the readConflictRangeKeysRange prefix) and the value is
// "1" when the range starting there is a read conflict range and "0" otherwise.
RangeResult ReadYourWritesTransaction::getReadConflictRangeIntersecting(KeyRangeRef kr) {
TEST(true); // Special keys read conflict range
ASSERT(readConflictRangeKeysRange.contains(kr));
ASSERT(!tr.options.checkWritesEnabled);
RangeResult result;
if (!options.readYourWritesDisabled) {
// Read-your-writes enabled: the conflict ranges live in the member map, keyed without the prefix.
kr = kr.removePrefix(readConflictRangeKeysRange.begin);
auto iter = readConflicts.rangeContainingKeyBefore(kr.begin);
if (iter->begin() == allKeys.begin && !iter->value()) {
++iter; // Conventionally '' is missing from the result range if it's not part of a read conflict
}
for (; iter->begin() < kr.end; ++iter) {
// Only boundaries inside [kr.begin, kr.end) are reported; the starting range may begin before kr.begin.
if (kr.begin <= iter->begin() && iter->begin() < kr.end) {
result.push_back(result.arena(),
KeyValueRef(iter->begin().withPrefix(readConflictRangeKeysRange.begin, result.arena()),
iter->value() ? LiteralStringRef("1") : LiteralStringRef("0")));
}
}
} else {
// Read-your-writes disabled: build a temporary map (this local shadows the member of the same name)
// whose keys carry the prefix, from the native transaction's conflict ranges, the ranges stashed in
// nativeReadRanges, and the extra read conflict ranges that are ready and non-empty.
CoalescedKeyRefRangeMap<ValueRef> readConflicts{ LiteralStringRef("0"), specialKeys.end };
for (const auto& range : tr.readConflictRanges())
readConflicts.insert(range.withPrefix(readConflictRangeKeysRange.begin, result.arena()),
LiteralStringRef("1"));
for (const auto& range : nativeReadRanges)
readConflicts.insert(range.withPrefix(readConflictRangeKeysRange.begin, result.arena()),
LiteralStringRef("1"));
for (const auto& f : tr.getExtraReadConflictRanges()) {
if (f.isReady() && f.get().first < f.get().second)
readConflicts.insert(KeyRangeRef(f.get().first, f.get().second)
.withPrefix(readConflictRangeKeysRange.begin, result.arena()),
LiteralStringRef("1"));
}
// Start at the first boundary that is not before kr.begin.
auto beginIter = readConflicts.rangeContaining(kr.begin);
if (beginIter->begin() != kr.begin)
++beginIter;
for (auto it = beginIter; it->begin() < kr.end; ++it) {
result.push_back(result.arena(), KeyValueRef(it->begin(), it->value()));
}
}
return result;
}
// Serves reads of the write-conflict-range special keys.
// `kr` must lie inside writeConflictRangeKeysRange (asserted). The result lists range boundaries inside `kr`
// as key-value pairs: the key is the boundary (with the writeConflictRangeKeysRange prefix) and the value is
// "1" when the range starting there is a write conflict range and "0" otherwise.
RangeResult ReadYourWritesTransaction::getWriteConflictRangeIntersecting(KeyRangeRef kr) {
TEST(true); // Special keys write conflict range
ASSERT(writeConflictRangeKeysRange.contains(kr));
RangeResult result;
// Memory owned by result
CoalescedKeyRefRangeMap<ValueRef> writeConflicts{ LiteralStringRef("0"), specialKeys.end };
if (!options.readYourWritesDisabled) {
// Read-your-writes enabled: collect conflict segments from the write map, which is keyed without the prefix.
KeyRangeRef strippedWriteRangePrefix = kr.removePrefix(writeConflictRangeKeysRange.begin);
WriteMap::iterator it(&writes);
it.skip(strippedWriteRangePrefix.begin);
// Step back one segment (when there is one) before starting the scan.
if (it.beginKey() > allKeys.begin)
--it;
for (; it.beginKey() < strippedWriteRangePrefix.end; ++it) {
if (it.is_conflict_range())
writeConflicts.insert(
KeyRangeRef(it.beginKey().toArena(result.arena()), it.endKey().toArena(result.arena()))
.withPrefix(writeConflictRangeKeysRange.begin, result.arena()),
LiteralStringRef("1"));
}
} else {
// Read-your-writes disabled: use the native transaction's ranges plus the ranges stashed in nativeWriteRanges.
for (const auto& range : tr.writeConflictRanges())
writeConflicts.insert(range.withPrefix(writeConflictRangeKeysRange.begin, result.arena()),
LiteralStringRef("1"));
for (const auto& range : nativeWriteRanges)
writeConflicts.insert(range.withPrefix(writeConflictRangeKeysRange.begin, result.arena()),
LiteralStringRef("1"));
}
// Versionstamped keys were recorded without a write conflict range of their own (see atomicOp); add a range
// for each of them here.
for (const auto& k : versionStampKeys) {
KeyRange range;
if (versionStampFuture.isValid() && versionStampFuture.isReady() && !versionStampFuture.isError()) {
// The versionstamp is known: substitute it into the key at the offset stored in the key's last four
// bytes (little-endian) and report exactly that key.
const auto& stamp = versionStampFuture.get();
StringRef key(range.arena(), k); // Copy
ASSERT(k.size() >= 4);
int32_t pos;
memcpy(&pos, k.end() - sizeof(int32_t), sizeof(int32_t));
pos = littleEndian32(pos);
ASSERT(pos >= 0 && pos + stamp.size() <= key.size());
memcpy(mutateString(key) + pos, stamp.begin(), stamp.size());
// Overwrite the first byte of the trailing offset with \x00 so it can serve as the end key's last byte.
*(mutateString(key) + key.size() - 4) = '\x00';
// singleKeyRange, but share begin and end's memory
range = KeyRangeRef(key.substr(0, key.size() - 4), key.substr(0, key.size() - 3));
} else {
// The versionstamp is not known (yet): fall back to the range computed by getVersionstampKeyRange.
range = getVersionstampKeyRange(result.arena(), k, tr.getCachedReadVersion().orDefault(0), getMaxReadKey());
}
writeConflicts.insert(range.withPrefix(writeConflictRangeKeysRange.begin, result.arena()),
LiteralStringRef("1"));
}
// Emit the boundaries inside kr, starting at the first boundary that is not before kr.begin.
auto beginIter = writeConflicts.rangeContaining(kr.begin);
if (beginIter->begin() != kr.begin)
++beginIter;
for (auto it = beginIter; it->begin() < kr.end; ++it) {
result.push_back(result.arena(), KeyValueRef(it->begin(), it->value()));
}
return result;
}
// Applies the atomic operation `operationType` with `operand` to `key`.
//
// Throws:
//   used_during_commit        - a commit is in progress.
//   client_invalid_operation  - the metadata version key is targeted with anything other than the required
//                               SetVersionstampedValue operand, or a SetVersionstampedValue operand carries
//                               an invalid versionstamp offset.
//   key_outside_legal_range   - the key is at or beyond the maximum write key.
//   invalid_mutation_type     - the type is not a valid atomic operation.
//   key_too_large / value_too_large - the key or operand exceeds the configured size limit.
//
// With read-your-writes disabled the mutation is passed straight to the native transaction; otherwise it is
// buffered in the write map and watches on the key are fired (the resulting value is not known locally).
void ReadYourWritesTransaction::atomicOp(const KeyRef& key, const ValueRef& operand, uint32_t operationType) {
	// The one-shot "next write: no conflict range" option is consumed even if validation fails below.
	bool addWriteConflict = !options.getAndResetWriteConflictDisabled();

	if (checkUsedDuringCommit()) {
		throw used_during_commit();
	}

	if (key == metadataVersionKey) {
		if (operationType != MutationRef::SetVersionstampedValue || operand != metadataVersionRequiredValue) {
			throw client_invalid_operation();
		}
	} else if (key >= getMaxWriteKey()) {
		throw key_outside_legal_range();
	}

	if (!isValidMutationType(operationType) || !isAtomicOp((MutationRef::Type)operationType))
		throw invalid_mutation_type();

	if (key.size() >
	    (key.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
		throw key_too_large();
	if (operand.size() > CLIENT_KNOBS->VALUE_SIZE_LIMIT)
		throw value_too_large();

	// From API version 510 on, Min and And are mapped to their V2 variants.
	if (tr.apiVersionAtLeast(510)) {
		if (operationType == MutationRef::Min)
			operationType = MutationRef::MinV2;
		else if (operationType == MutationRef::And)
			operationType = MutationRef::AndV2;
	}

	// Copy key and operand into the transaction's arena. Before API version 520 versionstamped keys get two
	// and versionstamped values four zero bytes appended.
	KeyRef k;
	if (!tr.apiVersionAtLeast(520) && operationType == MutationRef::SetVersionstampedKey) {
		k = key.withSuffix(LiteralStringRef("\x00\x00"), arena);
	} else {
		k = KeyRef(arena, key);
	}

	ValueRef v;
	if (!tr.apiVersionAtLeast(520) && operationType == MutationRef::SetVersionstampedValue) {
		v = operand.withSuffix(LiteralStringRef("\x00\x00\x00\x00"), arena);
	} else {
		v = ValueRef(arena, operand);
	}

	if (operationType == MutationRef::SetVersionstampedKey) {
		TEST(options.readYourWritesDisabled); // SetVersionstampedKey without ryw enabled
		// this does validation of the key and needs to be performed before the readYourWritesDisabled path
		KeyRangeRef range = getVersionstampKeyRange(arena, k, tr.getCachedReadVersion().orDefault(0), getMaxReadKey());
		versionStampKeys.push_back(arena, k);
		// No per-key write conflict range for versionstamped keys; the key is remembered in versionStampKeys.
		addWriteConflict = false;
		if (!options.readYourWritesDisabled) {
			writeRangeToNativeTransaction(range);
			writes.addUnmodifiedAndUnreadableRange(range);
		}
		// k is the unversionstamped key provided by the user. If we've filled in a minimum bound
		// for the versionstamp, we need to make sure that's reflected when we insert it into the
		// WriteMap below.
		transformVersionstampKey(k, tr.getCachedReadVersion().orDefault(0), 0);
	}

	if (operationType == MutationRef::SetVersionstampedValue) {
		if (v.size() < 4)
			throw client_invalid_operation();
		// The last four bytes of the value hold the little-endian offset of the versionstamp.
		int32_t pos;
		memcpy(&pos, v.end() - sizeof(int32_t), sizeof(int32_t));
		pos = littleEndian32(pos);
		// Require pos + 10 <= v.size() - 4, i.e. ten bytes starting at pos must fit before the trailing
		// offset. The bound is computed on the size side: `pos` comes from caller-supplied bytes, and
		// `pos + 10` would overflow a signed int for offsets close to INT32_MAX. v.size() >= 4 was checked
		// above, so the subtraction cannot underflow.
		if (pos < 0 || pos > v.size() - 4 - 10)
			throw client_invalid_operation();
	}

	approximateSize += k.expectedSize() + v.expectedSize() + sizeof(MutationRef) +
	                   (addWriteConflict ? sizeof(KeyRangeRef) + 2 * key.expectedSize() + 1 : 0);
	if (options.readYourWritesDisabled) {
		return tr.atomicOp(k, v, (MutationRef::Type)operationType, addWriteConflict);
	}

	writes.mutate(k, (MutationRef::Type)operationType, v, addWriteConflict);
	// The value after an atomic operation is not known locally, so pending watches on k are fired.
	RYWImpl::triggerWatches(this, k, Optional<ValueRef>(), false);
}
void ReadYourWritesTransaction::set(const KeyRef& key, const ValueRef& value) {
if (key == metadataVersionKey) {
throw client_invalid_operation();
}
if (specialKeys.contains(key)) {
if (getDatabase()->apiVersionAtLeast(700)) {
return getDatabase()->specialKeySpace->set(this, key, value);
} else {
// These three special keys are deprecated in 7.0 and an alternative C API is added
// TODO : Rewrite related code using C api
if (key == LiteralStringRef("\xff\xff/reboot_worker")) {
BinaryReader::fromStringRef<ClientWorkerInterface>(value, IncludeVersion())
.reboot.send(RebootRequest());
return;
}
if (key == LiteralStringRef("\xff\xff/suspend_worker")) {
BinaryReader::fromStringRef<ClientWorkerInterface>(value, IncludeVersion())
.reboot.send(RebootRequest(false, false, options.timeoutInSeconds));
return;
}
if (key == LiteralStringRef("\xff\xff/reboot_and_check_worker")) {
BinaryReader::fromStringRef<ClientWorkerInterface>(value, IncludeVersion())
.reboot.send(RebootRequest(false, true));
return;
}
}
}
bool addWriteConflict = !options.getAndResetWriteConflictDisabled();
if (checkUsedDuringCommit()) {
throw used_during_commit();
}
if (key >= getMaxWriteKey())
throw key_outside_legal_range();
approximateSize += key.expectedSize() + value.expectedSize() + sizeof(MutationRef) +
(addWriteConflict ? sizeof(KeyRangeRef) + 2 * key.expectedSize() + 1 : 0);
if (options.readYourWritesDisabled) {
return tr.set(key, value, addWriteConflict);
}
// TODO: check transaction size here
if (key.size() >
(key.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
throw key_too_large();
if (value.size() > CLIENT_KNOBS->VALUE_SIZE_LIMIT)
throw value_too_large();
KeyRef k = KeyRef(arena, key);
ValueRef v = ValueRef(arena, value);
writes.mutate(k, MutationRef::SetValue, v, addWriteConflict);
RYWImpl::triggerWatches(this, key, value);
}
void ReadYourWritesTransaction::clear(const KeyRangeRef& range) {
bool addWriteConflict = !options.getAndResetWriteConflictDisabled();
if (checkUsedDuringCommit()) {
throw used_during_commit();
}
if (specialKeys.contains(range)) {
if (getDatabase()->apiVersionAtLeast(700)) {
return getDatabase()->specialKeySpace->clear(this, range);
}
}
KeyRef maxKey = getMaxWriteKey();
if (range.begin > maxKey || range.end > maxKey)
throw key_outside_legal_range();
approximateSize += range.expectedSize() + sizeof(MutationRef) +
(addWriteConflict ? sizeof(KeyRangeRef) + range.expectedSize() : 0);
if (options.readYourWritesDisabled) {
return tr.clear(range, addWriteConflict);
}
// There aren't any keys in the database with size larger than KEY_SIZE_LIMIT, so if range contains large keys
// we can translate it to an equivalent one with smaller keys
KeyRef begin = range.begin;
KeyRef end = range.end;
if (begin.size() >
(begin.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
begin = begin.substr(
0,
(begin.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT) +
1);
if (end.size() >
(end.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
end = end.substr(
0,
(end.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT) +
1);
KeyRangeRef r = KeyRangeRef(begin, end);
if (r.empty()) {
return;
}
r = KeyRangeRef(arena, r);
writes.clear(r, addWriteConflict);
RYWImpl::triggerWatches(this, r, Optional<ValueRef>());
}
// Clears the single key `key` as part of this transaction.
//
// Throws used_during_commit() if checkUsedDuringCommit() reports a commit in
// progress, and key_outside_legal_range() if `key >= getMaxWriteKey()`.
// Keys contained in `specialKeys` are handed to the special key space when
// the database API version is at least 700. A key longer than the applicable
// size limit is silently ignored.
void ReadYourWritesTransaction::clear(const KeyRef& key) {
    // Consume the one-shot "no write conflict for next write" flag before anything else.
    const bool addWriteConflict = !options.getAndResetWriteConflictDisabled();

    if (checkUsedDuringCommit()) {
        throw used_during_commit();
    }

    if (specialKeys.contains(key) && getDatabase()->apiVersionAtLeast(700)) {
        getDatabase()->specialKeySpace->clear(this, key);
        return;
    }

    if (key >= getMaxWriteKey()) {
        throw key_outside_legal_range();
    }

    const auto sizeLimit =
        key.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT;
    if (key.size() > sizeLimit) {
        // Over-long key: nothing is cleared and no error is raised.
        return;
    }

    if (options.readYourWritesDisabled) {
        tr.clear(key, addWriteConflict);
        return;
    }

    KeyRangeRef r = singleKeyRange(key, arena);

    // NOTE(review): unlike clear(const KeyRangeRef&), this accounting runs after the
    // read-your-writes-disabled return above and uses sizeof(KeyRangeRef) rather than
    // sizeof(MutationRef) for the mutation overhead -- confirm that is intended.
    const auto conflictBytes = addWriteConflict ? sizeof(KeyRangeRef) + r.expectedSize() : 0;
    approximateSize += r.expectedSize() + sizeof(KeyRangeRef) + conflictBytes;

    // SOMEDAY: add an optimized single key clear to write map
    writes.clear(r, addWriteConflict);
    RYWImpl::triggerWatches(this, r, Optional<ValueRef>());
}
// Registers a watch on `key` and returns the future produced by RYWImpl::watch().
//
// Instead of throwing, every rejection is reported through the returned future:
//   - used_during_commit     when checkUsedDuringCommit() reports a commit in progress
//   - the reset promise's error when the reset promise has already been set
//   - watches_disabled       when read-your-writes is disabled
//   - key_outside_legal_range / key_too_large for keys failing the checks below
Future<Void> ReadYourWritesTransaction::watch(const Key& key) {
    if (checkUsedDuringCommit()) {
        return used_during_commit();
    }

    if (resetPromise.isSet()) {
        return resetPromise.getFuture().getError();
    }

    if (options.readYourWritesDisabled) {
        return watches_disabled();
    }

    // Keys at or past the end of the whole key space are never watchable.
    if (key >= allKeys.end) {
        return key_outside_legal_range();
    }

    // From API version 300 on, keys at or past the readable limit are rejected,
    // with metadataVersionKey as the one exception.
    if (key >= getMaxReadKey() && key != metadataVersionKey && tr.apiVersionAtLeast(300)) {
        return key_outside_legal_range();
    }

    const auto sizeLimit =
        key.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT;
    if (key.size() > sizeLimit) {
        return key_too_large();
    }

    return RYWImpl::watch(this, key);
}
// Adds `keys` as a write conflict range of this transaction.
//
// Throws used_during_commit() if checkUsedDuringCommit() reports a commit in
// progress. From API version 300 on, throws key_outside_legal_range() when
// either endpoint is greater than getMaxWriteKey(). An empty range (after the
// endpoint truncation below) is ignored.
void ReadYourWritesTransaction::addWriteConflictRange(KeyRangeRef const& keys) {
    if (checkUsedDuringCommit()) {
        throw used_during_commit();
    }

    if (tr.apiVersionAtLeast(300)) {
        const KeyRef maxKey = getMaxWriteKey();
        if (keys.begin > maxKey || keys.end > maxKey) {
            throw key_outside_legal_range();
        }
    }

    // There aren't any keys in the database with size larger than KEY_SIZE_LIMIT, so if range contains large keys
    // we can translate it to an equivalent one with smaller keys
    KeyRef begin = keys.begin;
    const auto beginLimit =
        begin.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT;
    if (begin.size() > beginLimit) {
        begin = begin.substr(0, beginLimit + 1);
    }

    KeyRef end = keys.end;
    const auto endLimit =
        end.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT;
    if (end.size() > endLimit) {
        end = end.substr(0, endLimit + 1);
    }

    KeyRangeRef r(begin, end);
    if (r.empty()) {
        return;
    }

    approximateSize += r.expectedSize() + sizeof(KeyRangeRef);

    // With read-your-writes disabled the range goes straight to the underlying transaction.
    if (options.readYourWritesDisabled) {
        tr.addWriteConflictRange(r);
        return;
    }

    // Re-create the range through the (arena, range) constructor before recording it in the write map.
    r = KeyRangeRef(arena, r);
    writes.addConflictRange(r);
}
// Starts committing this transaction via RYWImpl::commit().
//
// Failures are reported through the returned future rather than thrown:
// used_during_commit when checkUsedDuringCommit() reports a commit already in
// progress, or the reset promise's error when that promise has been set.
Future<Void> ReadYourWritesTransaction::commit() {
    if (checkUsedDuringCommit()) {
        return used_during_commit();
    }

    if (resetPromise.isSet()) {
        return resetPromise.getFuture().getError();
    }

    return RYWImpl::commit(this);
}
// Returns a future for the versionstamp of the underlying transaction.
//
// Returns a used_during_commit error future when checkUsedDuringCommit()
// reports a commit in progress. Otherwise the underlying transaction's
// versionstamp future is combined with the reset promise's future through
// waitOrError() (presumably so a reset/cancel surfaces as an error -- confirm
// against waitOrError's definition).
Future<Standalone<StringRef>> ReadYourWritesTransaction::getVersionstamp() {
    if (checkUsedDuringCommit()) {
        return used_during_commit();
    }

    auto versionstamp = tr.getVersionstamp();
    return waitOrError(versionstamp, resetPromise.getFuture());
}
// Applies `option` (with optional `value`) to this transaction and, when the
// option's metadata marks it persistent, records it in `persistentOptions` so
// that applyPersistentOptions() can re-apply it later.
void ReadYourWritesTransaction::setOption(FDBTransactionOptions::Option option, Optional<StringRef> value) {
    // Apply first: if setOptionImpl() throws, nothing is recorded.
    setOptionImpl(option, value);

    const auto& info = FDBTransactionOptions::optionInfo.getMustExist(option);
    if (info.persistent) {
        // Store the value as a Standalone so the recorded entry owns its own copy of the bytes.
        persistentOptions.emplace_back(option, value.castTo<Standalone<StringRef>>());
    }
}
// Applies `option` to the read-your-writes layer's own state and then always
// forwards it to the underlying transaction via tr.setOption().
//
// Options without a case below only take effect through that forwarding.
// Most cases first call validateOptionValue(value, false) (presumably
// asserting that no value was supplied -- confirm against its definition).
void ReadYourWritesTransaction::setOptionImpl(FDBTransactionOptions::Option option, Optional<StringRef> value) {
    switch (option) {
    case FDBTransactionOptions::READ_YOUR_WRITES_DISABLE: {
        validateOptionValue(value, false);
        // Only allowed while no reads are outstanding and nothing has been cached or written.
        if (!reading.isReady() || !cache.empty() || !writes.empty()) {
            throw client_invalid_operation();
        }
        options.readYourWritesDisabled = true;
        break;
    }

    case FDBTransactionOptions::READ_AHEAD_DISABLE: {
        validateOptionValue(value, false);
        options.readAheadDisabled = true;
        break;
    }

    case FDBTransactionOptions::NEXT_WRITE_NO_WRITE_CONFLICT_RANGE: {
        validateOptionValue(value, false);
        options.nextWriteDisableConflictRange = true;
        break;
    }

    case FDBTransactionOptions::ACCESS_SYSTEM_KEYS: {
        validateOptionValue(value, false);
        // Grants both read and write access to system keys.
        options.readSystemKeys = true;
        options.writeSystemKeys = true;
        break;
    }

    case FDBTransactionOptions::READ_SYSTEM_KEYS: {
        validateOptionValue(value, false);
        options.readSystemKeys = true;
        break;
    }

    case FDBTransactionOptions::TIMEOUT: {
        // The option value is divided by 1000 to obtain seconds.
        const auto timeoutValue = extractIntOption(value, 0, std::numeric_limits<int>::max());
        options.timeoutInSeconds = timeoutValue / 1000.0;
        resetTimeout();
        break;
    }

    case FDBTransactionOptions::RETRY_LIMIT: {
        // -1 is accepted as the lower bound.
        const auto limit = extractIntOption(value, -1, std::numeric_limits<int>::max());
        options.maxRetries = static_cast<int>(limit);
        break;
    }

    case FDBTransactionOptions::DEBUG_RETRY_LOGGING: {
        options.debugRetryLogging = true;
        // Lazily create the shared debug-info record the first time logging is enabled.
        if (!transactionDebugInfo) {
            transactionDebugInfo = Reference<TransactionDebugInfo>::addRef(new TransactionDebugInfo());
            transactionDebugInfo->lastRetryLogTime = creationTime;
        }
        // The option value, when present, names the transaction in log output.
        if (value.present()) {
            transactionDebugInfo->transactionName = value.get().toString();
        } else {
            transactionDebugInfo->transactionName = "";
        }
        break;
    }

    case FDBTransactionOptions::SNAPSHOT_RYW_ENABLE: {
        validateOptionValue(value, false);
        // Enable/disable are counted rather than stored as a flag.
        options.snapshotRywEnabled++;
        break;
    }

    case FDBTransactionOptions::SNAPSHOT_RYW_DISABLE: {
        validateOptionValue(value, false);
        options.snapshotRywEnabled--;
        break;
    }

    case FDBTransactionOptions::USED_DURING_COMMIT_PROTECTION_DISABLE: {
        validateOptionValue(value, false);
        options.disableUsedDuringCommitProtection = true;
        break;
    }

    case FDBTransactionOptions::SPECIAL_KEY_SPACE_RELAXED: {
        validateOptionValue(value, false);
        options.specialKeySpaceRelaxed = true;
        break;
    }

    case FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES: {
        validateOptionValue(value, false);
        options.specialKeySpaceChangeConfiguration = true;
        break;
    }

    case FDBTransactionOptions::BYPASS_UNREADABLE: {
        validateOptionValue(value, false);
        options.bypassUnreadable = true;
        break;
    }

    default:
        // No read-your-writes state for this option; it is still forwarded below.
        break;
    }

    tr.setOption(option, value);
}
// Move-assignment: takes over the transaction state held by `r`.
//
// Containers, futures and the arena are moved out of `r`; the scalar fields,
// `options`, `timeoutActor` and `transactionDebugInfo` are copied.
//
// NOTE(review): `timeoutActor` is copied here while the move constructor moves
// it -- confirm whether the difference is intentional.
// NOTE(review): `versionStampFuture` and `specialKeySpaceErrorMsg` (both used
// elsewhere in this file) are not transferred here -- confirm that is intended.
void ReadYourWritesTransaction::operator=(ReadYourWritesTransaction&& r) noexcept {
cache = std::move(r.cache);
writes = std::move(r.writes);
arena = std::move(r.arena);
tr = std::move(r.tr);
readConflicts = std::move(r.readConflicts);
watchMap = std::move(r.watchMap);
reading = std::move(r.reading);
resetPromise = std::move(r.resetPromise);
// Give the moved-from transaction a fresh promise in place of the one just taken.
r.resetPromise = Promise<Void>();
deferredError = std::move(r.deferredError);
retries = r.retries;
approximateSize = r.approximateSize;
timeoutActor = r.timeoutActor;
creationTime = r.creationTime;
commitStarted = r.commitStarted;
options = r.options;
transactionDebugInfo = r.transactionDebugInfo;
// Point the moved-in cache and write map at this object's arena
// (presumably they would otherwise still refer to r's arena member -- confirm).
cache.arena = &arena;
writes.arena = &arena;
persistentOptions = std::move(r.persistentOptions);
nativeReadRanges = std::move(r.nativeReadRanges);
nativeWriteRanges = std::move(r.nativeWriteRanges);
versionStampKeys = std::move(r.versionStampKeys);
specialKeySpaceWriteMap = std::move(r.specialKeySpaceWriteMap);
}
// Move constructor: builds this transaction from the state held by `r`.
//
// Part of the state is taken in the member-initializer list and the rest is
// assigned in the body. Scalar fields, `options` and `transactionDebugInfo`
// are copied; everything else is moved.
//
// NOTE(review): `versionStampFuture` and `specialKeySpaceErrorMsg` (both used
// elsewhere in this file) are not transferred here -- confirm that is intended.
ReadYourWritesTransaction::ReadYourWritesTransaction(ReadYourWritesTransaction&& r) noexcept
: cache(std::move(r.cache)), writes(std::move(r.writes)), arena(std::move(r.arena)), reading(std::move(r.reading)),
retries(r.retries), approximateSize(r.approximateSize), creationTime(r.creationTime),
deferredError(std::move(r.deferredError)), timeoutActor(std::move(r.timeoutActor)),
resetPromise(std::move(r.resetPromise)), commitStarted(r.commitStarted), options(r.options),
transactionDebugInfo(r.transactionDebugInfo) {
// Point the moved-in cache and write map at this object's arena
// (presumably they would otherwise still refer to r's arena member -- confirm).
cache.arena = &arena;
writes.arena = &arena;
tr = std::move(r.tr);
readConflicts = std::move(r.readConflicts);
watchMap = std::move(r.watchMap);
// Give the moved-from transaction a fresh promise in place of the one taken above.
r.resetPromise = Promise<Void>();
persistentOptions = std::move(r.persistentOptions);
nativeReadRanges = std::move(r.nativeReadRanges);
nativeWriteRanges = std::move(r.nativeWriteRanges);
versionStampKeys = std::move(r.versionStampKeys);
specialKeySpaceWriteMap = std::move(r.specialKeySpaceWriteMap);
}
// Handles error `e` for this transaction by delegating to RYWImpl::onError()
// and returning its future unchanged.
Future<Void> ReadYourWritesTransaction::onError(Error const& e) {
return RYWImpl::onError(this, e);
}
// Re-applies every option recorded in `persistentOptions` through
// setOptionImpl(). TIMEOUT entries are not applied in order: only the last
// recorded timeout is applied, and only after all other options.
void ReadYourWritesTransaction::applyPersistentOptions() {
    Optional<StringRef> timeout;

    // Iterate by const reference: each entry carries an Optional<Standalone<StringRef>>,
    // and iterating by value copied every entry without need.
    for (const auto& option : persistentOptions) {
        if (option.first == FDBTransactionOptions::TIMEOUT) {
            // Remember only the latest timeout; it is applied after the loop.
            timeout = option.second.castTo<StringRef>();
        } else {
            setOptionImpl(option.first, option.second.castTo<StringRef>());
        }
    }

    // Setting a timeout can immediately cause a transaction to fail. The only timeout
    // that matters is the one most recently set, so we ignore any earlier set timeouts
    // that might inadvertently fail the transaction.
    if (timeout.present()) {
        setOptionImpl(FDBTransactionOptions::TIMEOUT, timeout);
    }
}
// Resets the read-your-writes state of this transaction: installs a fresh
// reset promise, cancels the timeout actor, replaces the arena and every
// per-attempt container with empty ones, and clears the size counter, the
// commit flag and the deferred error. For API version >= 16 the options are
// reset and the persistent options re-applied. Finally the previous reset
// promise, if not already set, is failed with transaction_cancelled().
void ReadYourWritesTransaction::resetRyow() {
// Keep a handle on the old promise so it can be failed at the very end,
// after all state has been replaced.
Promise<Void> oldReset = resetPromise;
resetPromise = Promise<Void>();
timeoutActor.cancel();
// A new arena is created first; the cache and write map are rebuilt on top of it.
arena = Arena();
cache = SnapshotCache(&arena);
writes = WriteMap(&arena);
readConflicts = CoalescedKeyRefRangeMap<bool>();
versionStampKeys = VectorRef<KeyRef>();
nativeReadRanges = Standalone<VectorRef<KeyRangeRef>>();
nativeWriteRanges = Standalone<VectorRef<KeyRangeRef>>();
// Rebuild the special-key-space write map with a default entry of (false, no value).
specialKeySpaceWriteMap =
KeyRangeMap<std::pair<bool, Optional<Value>>>(std::make_pair(false, Optional<Value>()), specialKeys.end);
specialKeySpaceErrorMsg.reset();
watchMap.clear();
reading = AndFuture();
approximateSize = 0;
commitStarted = false;
deferredError = Error();
// Options are only reset (and persistent ones re-applied) from API version 16 on.
if (tr.apiVersionAtLeast(16)) {
options.reset(tr);
applyPersistentOptions();
}
// Fail whatever was still waiting on the previous promise.
if (!oldReset.isSet())
oldReset.sendError(transaction_cancelled());
}
// Cancels this transaction by failing the reset promise with
// transaction_cancelled(). Does nothing if the promise has already been set.
void ReadYourWritesTransaction::cancel() {
    if (resetPromise.isSet()) {
        return;
    }
    resetPromise.sendError(transaction_cancelled());
}
// Fully resets this transaction: clears the retry count, size counter, timeout
// actor, recorded persistent options, option state and debug info, fully resets
// the underlying transaction, then re-seeds the persistent options from the
// database's transaction defaults and resets the read-your-writes state.
void ReadYourWritesTransaction::reset() {
retries = 0;
approximateSize = 0;
creationTime = now();
timeoutActor.cancel();
persistentOptions.clear();
options.reset(tr);
transactionDebugInfo.clear();
tr.fullReset();
// Fetch a new versionstamp future from the freshly reset underlying transaction.
versionStampFuture = tr.getVersionstamp();
// Seed the persistent options with the database-level transaction defaults;
// resetRyow() below re-applies them (for API version >= 16).
std::copy(tr.getDatabase().getTransactionDefaults().begin(),
tr.getDatabase().getTransactionDefaults().end(),
std::back_inserter(persistentOptions));
resetRyow();
}
// Returns the upper bound used by this file's read-side range checks: the end
// of the system key range when system-key reads are enabled, otherwise the end
// of the normal key range.
KeyRef ReadYourWritesTransaction::getMaxReadKey() {
    return options.readSystemKeys ? systemKeys.end : normalKeys.end;
}
// Returns the upper bound used by this file's write-side range checks: the end
// of the system key range when system-key writes are enabled, otherwise the end
// of the normal key range.
KeyRef ReadYourWritesTransaction::getMaxWriteKey() {
    return options.writeSystemKeys ? systemKeys.end : normalKeys.end;
}
// On destruction, fail the reset promise with transaction_cancelled() unless
// it has already been set.
ReadYourWritesTransaction::~ReadYourWritesTransaction() {
    if (resetPromise.isSet()) {
        return;
    }
    resetPromise.sendError(transaction_cancelled());
}
// Reports whether a commit has been started on this transaction.
//
// As a side effect, when a commit has been started, the reset promise is not
// yet set and used-during-commit protection has not been disabled, the reset
// promise is failed with used_during_commit(). The return value is the commit
// flag itself, regardless of whether protection is disabled.
bool ReadYourWritesTransaction::checkUsedDuringCommit() {
    if (!commitStarted) {
        return false;
    }

    if (!resetPromise.isSet() && !options.disableUsedDuringCommitProtection) {
        resetPromise.sendError(used_during_commit());
    }
    return true;
}
// Emits a "long transaction" warning when debug retry logging is enabled.
//
// `error` is the error of the attempt being reported; an absent error means
// the transaction committed. A report is produced when at least one second has
// passed since the last report, or when the transaction committed after more
// than one second in total. The report goes to stderr (outside simulation) and
// to a "LongTransaction" trace event, after which the last-report time is updated.
void ReadYourWritesTransaction::debugLogRetries(Optional<Error> error) {
    const bool committed = !error.present();
    if (!options.debugRetryLogging) {
        return;
    }

    const double sinceLastLog = now() - transactionDebugInfo->lastRetryLogTime;
    const double elapsed = now() - creationTime;
    const bool shouldLog = sinceLastLog >= 1 || (committed && elapsed > 1);
    if (!shouldLog) {
        return;
    }

    // Optional " in transaction '<name>'" fragment for the stderr message.
    std::string transactionNameStr;
    if (!transactionDebugInfo->transactionName.empty()) {
        transactionNameStr =
            format(" in transaction '%s'", printable(StringRef(transactionDebugInfo->transactionName)).c_str());
    }

    // Fuzz workload turns this on, but we do not want stderr output in simulation
    if (!g_network->isSimulated()) {
        fprintf(stderr,
                "fdb WARNING: long transaction (%.2fs elapsed%s, %d retries, %s)\n",
                elapsed,
                transactionNameStr.c_str(),
                retries,
                committed ? "committed" : error.get().what());
    }

    {
        // Scoped so the trace event is destroyed before the last-report time is updated.
        TraceEvent trace("LongTransaction");
        if (error.present()) {
            trace.error(error.get(), true);
        }
        if (!transactionDebugInfo->transactionName.empty()) {
            trace.detail("TransactionName", transactionDebugInfo->transactionName);
        }
        trace.detail("Elapsed", elapsed);
        trace.detail("Retries", retries);
        trace.detail("Committed", committed);
    }

    transactionDebugInfo->lastRetryLogTime = now();
}