Merge branch 'main' into feature-metacluster

A.J. Beamon 2022-07-21 14:48:53 -07:00
commit 3e1763e7ac
58 changed files with 1208 additions and 792 deletions


@ -194,11 +194,7 @@ if(NOT WIN32)
target_link_libraries(fdb_c_client_memory_test PRIVATE fdb_c Threads::Threads)
target_include_directories(fdb_c_api_tester_impl PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/ ${CMAKE_SOURCE_DIR}/flow/include ${CMAKE_BINARY_DIR}/flow/include)
if(USE_SANITIZER)
target_link_libraries(fdb_c_api_tester_impl PRIVATE fdb_cpp toml11_target Threads::Threads fmt::fmt boost_asan)
else()
target_link_libraries(fdb_c_api_tester_impl PRIVATE fdb_cpp toml11_target Threads::Threads fmt::fmt boost_target)
endif()
target_link_libraries(fdb_c_api_tester_impl PRIVATE fdb_cpp toml11_target Threads::Threads fmt::fmt boost_target)
target_link_libraries(fdb_c_api_tester_impl PRIVATE SimpleOpt)
target_include_directories(fdb_c_unit_tests_impl PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/)
@ -211,11 +207,7 @@ if(NOT WIN32)
# do not set RPATH for mako
set_property(TARGET mako PROPERTY SKIP_BUILD_RPATH TRUE)
if (USE_SANITIZER)
target_link_libraries(mako PRIVATE fdb_c fdbclient fmt::fmt Threads::Threads fdb_cpp boost_asan rapidjson)
else()
target_link_libraries(mako PRIVATE fdb_c fdbclient fmt::fmt Threads::Threads fdb_cpp boost_target rapidjson)
endif()
target_link_libraries(mako PRIVATE fdb_c fdbclient fmt::fmt Threads::Threads fdb_cpp boost_target rapidjson)
if(NOT OPEN_FOR_IDE)
# Make sure that fdb_c.h is compatible with c90
@ -439,7 +431,7 @@ if (OPEN_FOR_IDE)
add_library(fdb_c_shim OBJECT fdb_c_shim.cpp)
target_link_libraries(fdb_c_shim PUBLIC dl)
elseif(NOT WIN32 AND NOT APPLE AND NOT OPEN_FOR_IDE) # Linux Only
elseif(NOT WIN32 AND NOT APPLE AND NOT OPEN_FOR_IDE AND NOT USE_UBSAN) # Linux, non-ubsan only
set(SHIM_LIB_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR})
@ -472,7 +464,7 @@ elseif(NOT WIN32 AND NOT APPLE AND NOT OPEN_FOR_IDE) # Linux Only
--api-test-dir ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests
)
endif() # End Linux only
endif() # End Linux, non-ubsan only
# TODO: re-enable once the old vcxproj-based build system is removed.
#generate_export_header(fdb_c EXPORT_MACRO_NAME "DLLEXPORT"


@ -941,13 +941,13 @@ static Value dataOfRecord(const int i) {
return Value(format("data-of-record-%08d", i));
}
static std::string indexEntryKey(const int i) {
return Tuple().append(StringRef(prefix)).append(INDEX).append(indexKey(i)).append(primaryKey(i)).pack().toString();
return Tuple::makeTuple(prefix, INDEX, indexKey(i), primaryKey(i)).pack().toString();
}
static std::string recordKey(const int i, const int split) {
return Tuple().append(prefix).append(RECORD).append(primaryKey(i)).append(split).pack().toString();
return Tuple::makeTuple(prefix, RECORD, primaryKey(i), split).pack().toString();
}
static std::string recordValue(const int i, const int split) {
return Tuple().append(dataOfRecord(i)).append(split).pack().toString();
return Tuple::makeTuple(dataOfRecord(i), split).pack().toString();
}
const static int SPLIT_SIZE = 3;
@ -993,13 +993,8 @@ GetMappedRangeResult getMappedIndexEntries(int beginId,
fdb::Transaction& tr,
int matchIndex,
bool allMissing) {
std::string mapper = Tuple()
.append(prefix)
.append(RECORD)
.append(allMissing ? "{K[2]}"_sr : "{K[3]}"_sr)
.append("{...}"_sr)
.pack()
.toString();
std::string mapper =
Tuple::makeTuple(prefix, RECORD, (allMissing ? "{K[2]}"_sr : "{K[3]}"_sr), "{...}"_sr).pack().toString();
return getMappedIndexEntries(beginId, endId, tr, mapper, matchIndex);
}
@ -1037,7 +1032,7 @@ TEST_CASE("tuple_support_versionstamp") {
// a random 12 bytes long StringRef as a versionstamp
StringRef str = "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x10\x11\x12"_sr;
Versionstamp vs(str);
const Tuple t = Tuple().append(prefix).append(RECORD).appendVersionstamp(vs).append("{K[3]}"_sr).append("{...}"_sr);
const Tuple t = Tuple::makeTuple(prefix, RECORD, vs, "{K[3]}"_sr, "{...}"_sr);
ASSERT(t.getVersionstamp(2) == vs);
// verify the round-way pack-unpack path for a Tuple containing a versionstamp
@ -1181,7 +1176,7 @@ TEST_CASE("fdb_transaction_get_mapped_range_missing_all_secondary") {
}
TEST_CASE("fdb_transaction_get_mapped_range_restricted_to_serializable") {
std::string mapper = Tuple().append(prefix).append(RECORD).append("{K[3]}"_sr).pack().toString();
std::string mapper = Tuple::makeTuple(prefix, RECORD, "{K[3]}"_sr).pack().toString();
fdb::Transaction tr(db);
auto result = get_mapped_range(
tr,
@ -1200,7 +1195,7 @@ TEST_CASE("fdb_transaction_get_mapped_range_restricted_to_serializable") {
}
TEST_CASE("fdb_transaction_get_mapped_range_restricted_to_ryw_enable") {
std::string mapper = Tuple().append(prefix).append(RECORD).append("{K[3]}"_sr).pack().toString();
std::string mapper = Tuple::makeTuple(prefix, RECORD, "{K[3]}"_sr).pack().toString();
fdb::Transaction tr(db);
fdb_check(tr.set_option(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE, nullptr, 0)); // Not disable RYW
auto result = get_mapped_range(


@ -38,7 +38,10 @@ function(compile_boost)
set(BOOST_LINK_FLAGS "")
if(APPLE OR CLANG OR ICX OR USE_LIBCXX)
list(APPEND BOOST_COMPILER_FLAGS -stdlib=libc++ -nostdlib++)
list(APPEND BOOST_LINK_FLAGS -static-libgcc -lc++ -lc++abi)
list(APPEND BOOST_LINK_FLAGS -lc++ -lc++abi)
if (NOT APPLE)
list(APPEND BOOST_LINK_FLAGS -static-libgcc)
endif()
endif()
# Update the user-config.jam
@ -46,9 +49,9 @@ function(compile_boost)
foreach(flag IN LISTS BOOST_COMPILER_FLAGS COMPILE_BOOST_CXXFLAGS)
string(APPEND BOOST_ADDITIONAL_COMPILE_OPTIONS "<cxxflags>${flag} ")
endforeach()
#foreach(flag IN LISTS BOOST_LINK_FLAGS COMPILE_BOOST_LDFLAGS)
# string(APPEND BOOST_ADDITIONAL_COMPILE_OPTIONS "<linkflags>${flag} ")
#endforeach()
foreach(flag IN LISTS BOOST_LINK_FLAGS COMPILE_BOOST_LDFLAGS)
string(APPEND BOOST_ADDITIONAL_COMPILE_OPTIONS "<linkflags>${flag} ")
endforeach()
configure_file(${CMAKE_SOURCE_DIR}/cmake/user-config.jam.cmake ${CMAKE_BINARY_DIR}/user-config.jam)
set(USER_CONFIG_FLAG --user-config=${CMAKE_BINARY_DIR}/user-config.jam)
@ -92,10 +95,10 @@ if(USE_SANITIZER)
endif()
message(STATUS "A sanitizer is enabled, need to build boost from source")
if (USE_VALGRIND)
compile_boost(TARGET boost_asan BUILD_ARGS valgrind=on
compile_boost(TARGET boost_target BUILD_ARGS valgrind=on
CXXFLAGS ${SANITIZER_COMPILE_OPTIONS} LDFLAGS ${SANITIZER_LINK_OPTIONS})
else()
compile_boost(TARGET boost_asan BUILD_ARGS context-impl=ucontext
compile_boost(TARGET boost_target BUILD_ARGS context-impl=ucontext
CXXFLAGS ${SANITIZER_COMPILE_OPTIONS} LDFLAGS ${SANITIZER_LINK_OPTIONS})
endif()
return()


@ -0,0 +1,227 @@
# Load Balancing in FoundationDB
## Introduction
FoundationDB is a distributed key-value database. A FoundationDB cluster consists of one or more processes running on one or more physical machines, where each process is a *worker* that takes on certain *role*s in the system, such as coordinator, proxy, TLog, or storage server.
The interprocess communication (IPC) between the processes is supported by the [`flow`](https://github.com/apple/foundationdb/tree/main/flow) infrastructure. In the `flow` context, each process exposes one or more *interface*s. Each interface accepts a given type of *request* and *replies* with `Void`, the requested data, or an error. The interfaces and their corresponding request/reply pairs form the IPC protocol of FoundationDB.
In many cases, the same request can be processed by multiple processes, e.g. all commit proxies can accept commit requests, and multiple storage server processes can provide values for a given key in double/triple redundancy mode. A load balancer (LB) can be used to distribute the requests over the possible interfaces, preventing one or a few processes from becoming overloaded. The interface candidates are also referred to as *alternative*s. The LB is also able to react when one or more interfaces are (temporarily) unavailable, by retrying or by re-routing the request to other candidates.
Two LBs are provided in FoundationDB: `basicLoadBalance` and `loadBalance`, both defined in [`LoadBalance.actor.h`](https://github.com/apple/foundationdb/blob/main/fdbrpc/include/fdbrpc/LoadBalance.actor.h). `basicLoadBalance` is a simple load balancer in which each interface is equally likely to be chosen, while `loadBalance` accepts a model object that provides [datacenter](https://apple.github.io/foundationdb/configuration.html#configuring-regions) (DC) aware balancing algorithms, allowing requests to be sent preferentially to interfaces in the same DC.
In the following sections, the two LBs are discussed in detail.
## `basicLoadBalance`
`basicLoadBalance` implements a simple load balancing algorithm. It applies to
* Commit proxy interface
* GetReadVersion proxy interface
* ConfigFollower interface
Here, the interfaces are assumed to be always *fresh*, i.e. the list of servers is fixed.
```mermaid
graph LR
H0{Has alternatives?}
H1[Pick an alternative]
H2[Backoff]
H3[Request]
H4([Reply])
H5([Error])
H6([Never])
H((Start)) --> H0
H0 --No--> H6
H0 --Yes--> H1
H1 --No healthy alternatives--> H2 --Retry--> H1
H1 --Has alternative--> H3 --Success--> H4
H3 --Exception--> H5
H3 --Broken Promise --> H2
```
### Alternative pick algorithm
In `basicLoadBalance`, a *best* alternative is picked and used at the beginning. At this stage, the best alternative is chosen randomly among all alternatives. If the best alternative does not work, the load balancer iteratively tries the other interfaces; see [here](#picking-an-alternative-in-basic-load-balancing-algorithm).
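As a rough illustration of the flow diagram above, the following sketch condenses the loop into plain Python. The `healthy` flag, the `make_request` callable, and the `BrokenPromise` stand-in are all hypothetical; the real actor operates on `flow` futures and interfaces in `LoadBalance.actor.h`.
```python
import random
import time


class BrokenPromise(Exception):
    """Stand-in for flow's broken_promise: the interface disappeared mid-request."""


def basic_load_balance(alternatives, make_request, max_backoff=1.0):
    # "Never": with no alternatives the real actor never returns; we just bail out.
    if not alternatives:
        return None
    next_alt = random.randrange(len(alternatives))  # the randomly chosen *best* alternative
    backoff = 0.01
    while True:
        alt = alternatives[next_alt]
        if alt["healthy"]:
            try:
                return make_request(alt)  # "Reply": hand the response to the caller
            except BrokenPromise:
                pass  # fall through to the backoff and try another alternative
        # No usable alternative right now: back off, then retry with the next one.
        time.sleep(backoff)
        backoff = min(backoff * 2.0, max_backoff)
        next_alt = (next_alt + 1) % len(alternatives)


alts = [{"name": "proxy-0", "healthy": False}, {"name": "proxy-1", "healthy": True}]
print(basic_load_balance(alts, lambda alt: "reply from " + alt["name"]))
```
Any other exception raised by the request propagates to the caller, matching the "Error" terminal in the diagram.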
## `loadBalance`
`loadBalance` provides a more sophisticated implementation of load balancing. In addition to the basic load balancing, it provides a variety of features:
* Support for Test Storage Servers ([TSS](https://github.com/apple/foundationdb/blob/main/documentation/sphinx/source/tss.rst))
* Datacenter-aware alternative selection
* Recording the latency and penalty from interfaces, and [prioritizing the interfaces based on previously stored data](#with-queuemodel)
* Handling timeouts and storage server exceptions with retries
Currently it is used for
* Storage Server interface
* BlobWorker interface
```mermaid
graph LR
H((Start))
H0{Has alternatives?}
H1[Choose initial candidates]
H4([Never])
H5[pick an alternative]
H6[Send request]
H7[Wait for available alternative]
H8([Response])
H9([All alternatives failed])
H --> H0 --No--> H4
H0 --Yes--> H1
H1 --> H5
H5 --Has alternative--> H6
H5 --No alternative-->H7
H6 --Success--> H8
H6 --Failure--> H5
H7 --At least one alternative--> H5
H7 --> H9
```
Note:
* The response could be either a reply or an `Error`, e.g. `process_behind` or `request_maybe_delivered`.
### Choose initial candidates
Two initial candidates are picked before the requests start; they are used as the first two alternatives for the load balancer. If both of them fail, the other alternatives are used in a round-robin way.
#### No `QueueModel`
If no `QueueModel` is provided, the initial candidates are picked randomly. The first candidate, or the *best* alternative, will be one in the same DC when possible.
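A minimal sketch of this selection, assuming each alternative carries a hypothetical `dc_id` field (the actual selection logic lives in `LoadBalance.actor.h`):
```python
import random


def choose_initial_candidates(alternatives, local_dc):
    """Pick two distinct starting indices, preferring a same-DC alternative as *best*."""
    local = [i for i, alt in enumerate(alternatives) if alt["dc_id"] == local_dc]
    best = random.choice(local) if local else random.randrange(len(alternatives))
    # The second candidate is any other alternative, chosen uniformly.
    second = random.randrange(len(alternatives) - 1)
    if second >= best:
        second += 1
    return best, second


alts = [{"dc_id": "dc1"}, {"dc_id": "dc2"}, {"dc_id": "dc1"}]
print(choose_initial_candidates(alts, local_dc="dc1"))
```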
#### With `QueueModel`
`QueueModel` holds information about each candidate, including future version errors, latency, and penalty:
* If a storage server is returning future version errors, it is marked as unavailable until a certain time.
* Penalty is reported by the storage server in each response (see `storageserver.actor.cpp:StorageServer::getPenalty`). It is determined by the write queue length and the durability lag.
If a `QueueModel` exists, the candidates are picked based on the penalty. Workers with high penalties are avoided when picking the first two candidates.
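The sketch below shows one way the first two candidates could be ordered by penalty while skipping servers still in a future-version backoff. The dictionary shape of the `QueueModel` and the penalty numbers are made up for illustration; this is not the exact ordering logic in `LoadBalance.actor.h`.
```python
import time


def pick_initial_candidates(queue_model, now=None):
    """Prefer low-penalty servers; skip ones marked unavailable after
    returning future_version errors."""
    now = time.time() if now is None else now
    usable = [server for server, info in queue_model.items()
              if info["unavailable_until"] <= now]
    usable.sort(key=lambda server: queue_model[server]["penalty"])
    return usable[:2]  # the first two alternatives for the load balancer


queue_model = {
    "ss-0": {"penalty": 2.5, "unavailable_until": 0},  # long write queue
    "ss-1": {"penalty": 1.0, "unavailable_until": 0},
    "ss-2": {"penalty": 1.0, "unavailable_until": time.time() + 60},  # future_version backoff
}
print(pick_initial_candidates(queue_model))  # -> ['ss-1', 'ss-0']
```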
### Pick an alternative
The alternatives are chosen in a round-robin way when the first two candidates fail. If all alternatives fail, a flag is set, so that if the next request fails with `process_behind`, the caller receives the `process_behind` error.
### Send requests to workers
Here it is assumed that at least one alternative is available. If no alternative is available, the LB waits.
```mermaid
graph LR
H((start))
H0{Is first request}
H1[Send first request]
H2([Response])
H3[Pick up next alternative]
H4[Send additional request]
H --> H3
H3 -->H0
H0 --Yes--> H1
H1 --Success--> H2
H1 --Timeout--> H3
H0 --No--> H4
H4 --First request succeed--> H2
H4 --Second request succeed--> H2
H4 --Additional request failed--> H3
```
The first request has a timeout option. If the LB is not able to retrieve the response within the timeout, additional requests are sent to the secondary and other available interfaces. If the first request fails, it is reset and the next request is considered the first request. Certain types of errors can also be returned as a response, e.g. `request_maybe_delivered` or `process_behind`, which may not trigger a load-balancer retry.
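A toy `asyncio` version of this hedging behavior, assuming a simple `request(alternative)` coroutine (error handling and the first-request reset described above are elided):
```python
import asyncio


async def load_balanced_get(alternatives, request, first_request_timeout=0.25):
    """Send to the first alternative; on timeout, hedge with additional
    requests and take whichever reply arrives first."""
    pending = {asyncio.create_task(request(alternatives[0]))}
    done, pending = await asyncio.wait(pending, timeout=first_request_timeout)
    if done:
        return done.pop().result()
    # Timed out: send additional requests to the remaining alternatives. The
    # slow first request is kept running, since any reply is acceptable.
    for alt in alternatives[1:]:
        pending.add(asyncio.create_task(request(alt)))
    done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
    for task in pending:
        task.cancel()
    return done.pop().result()


async def demo():
    async def request(alt):
        await asyncio.sleep(1.0 if alt == "ss-0" else 0.05)
        return "reply from " + alt

    print(await load_balanced_get(["ss-0", "ss-1"], request))


asyncio.run(demo())
```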
### Wait for available alternative
When no alternatives are available, the load balancer may wait until at least one interface is up.
```mermaid
graph LR
H0((start))
H1{Is first request in-flight}
H2[Wait for the first request]
H3([Response])
H4([Retry])
H5[Wait for alternatives]
H6([all_alternatives_failed])
H0 --> H1
H1 --Yes--> H2
H1 --No--> H5
H5 --Timeout-->H6
H5 --Success-->H4
H2 --Success-->H3
H2 --Failed-->H4
```
Note that "Wait for alternatives" will only timeout if the alternatives are always not fresh, i.e. this only happens when accessing storage servers. LB will throw `all_alternatives_failed` when timeout in this case.
#### Requests
Original requests in `loadBalance` are wrapped by `LoadBalance.actor.h:RequestData`. It provides the following additional operations on top of the original `flow` request:
* TSS support, if a `QueueModel` is available
* Translating some errors into `request_maybe_delivered` or `process_behind`, or into retries
* Updating the `QueueModel` information, including latency, penalty, etc.
## Appendix
### Picking an alternative in basic load balancing algorithm
The following script simulates the alternative-picking algorithm. The chosen alternatives are printed one by one. The `loadBalance` function uses a similar approach, though interfaces in the same DC are tried first.
```python
#! /usr/bin/env python3

import random
import time


class Alternatives:

    def __init__(self, num_alternatives):
        self._size = num_alternatives

    def size(self):
        return self._size

    def get_best(self):
        return random.randint(0, self._size - 1)


# Entry
NUM_ALTERNATIVES = 10
alts = Alternatives(NUM_ALTERNATIVES)

best_alt = alts.get_best()
next_alt = random.randint(0, alts.size() - 2)
if next_alt >= best_alt:
    next_alt += 1
start_alt = next_alt
start_distance = (best_alt + alts.size() - start_alt) % alts.size()

use_alt = None

print("best_alt = {}".format(best_alt))
print("start_alt = {}".format(start_alt))
print("start_distance = {}".format(start_distance))

while True:
    for alt_num in range(0, alts.size()):
        use_alt = next_alt
        if next_alt == start_alt:
            print("  Going back to the start_alt")
            use_alt = best_alt
        elif (next_alt + alts.size() - start_alt) % alts.size() <= start_distance:
            print("  Entering start_distance")
            use_alt = (next_alt + alts.size() - 1) % alts.size()
        print("Attempting alt: {}".format(use_alt))

        # Next loop
        next_alt = (next_alt + 1) % alts.size()
        time.sleep(.2)
```



@ -85,7 +85,7 @@ Values must always be encoded according to the :ref:`api-python-tuple-layer`.
const KeyRef myGlobalConfigKey = LiteralStringRef("config/key");
// When you want to set the value..
Tuple value = Tuple().appendDouble(1.5);
Tuple value = Tuple::makeTuple((double)1.5);
FDBTransaction* tr = ...;
tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);


@ -97,8 +97,8 @@ ACTOR Future<bool> profileCommandActor(Database db,
}
}
Tuple rate = Tuple().appendDouble(sampleRate);
Tuple size = Tuple().append(sizeLimit);
Tuple rate = Tuple::makeTuple(sampleRate);
Tuple size = Tuple::makeTuple(sizeLimit);
tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSampleRate), rate.pack());
tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSizeLimit), size.pack());


@ -1216,6 +1216,8 @@ TEST_CASE("/blobgranule/files/snapshotFormatUnitTest") {
std::unordered_set<std::string> usedKeys;
Standalone<GranuleSnapshot> data;
int totalDataBytes = 0;
const int maxKeyGenAttempts = 1000;
int nAttempts = 0;
while (totalDataBytes < targetDataBytes) {
int keySize = deterministicRandom()->randomInt(targetKeyLength / 2, targetKeyLength * 3 / 2);
keySize = std::min(keySize, uidSize);
@ -1232,6 +1234,13 @@ TEST_CASE("/blobgranule/files/snapshotFormatUnitTest") {
data.push_back_deep(data.arena(), KeyValueRef(KeyRef(key), ValueRef(value)));
totalDataBytes += key.size() + value.size();
nAttempts = 0;
} else if (nAttempts > maxKeyGenAttempts) {
// KeySpace exhausted, avoid infinite loop
break;
} else {
// Keep exploring the KeySpace
nAttempts++;
}
}


@ -71,13 +71,6 @@ if(WITH_AWS_BACKUP)
include(awssdk)
endif()
find_package(ZLIB)
if(ZLIB_FOUND)
add_compile_definitions(ZLIB_LIB_SUPPORTED)
else()
message(STATUS "ZLIB package not found")
endif()
add_flow_target(STATIC_LIBRARY NAME fdbclient SRCS ${FDBCLIENT_SRCS} ADDL_SRCS ${options_srcs})
target_include_directories(fdbclient PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_BINARY_DIR}/include")
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/versions.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/include/fdbclient/versions.h)


@ -144,10 +144,7 @@ std::string configDBTypeToString(ConfigDBType configDBType) {
}
TEST_CASE("/fdbclient/ConfigDB/ConfigKey/EncodeDecode") {
Tuple tuple;
tuple << "class-A"_sr
<< "test_long"_sr;
auto packed = tuple.pack();
auto packed = Tuple::makeTuple("class-A"_sr, "test_long"_sr).pack();
auto unpacked = ConfigKeyRef::decodeKey(packed);
ASSERT(unpacked.configClass.get() == "class-A"_sr);
ASSERT(unpacked.knobName == "test_long"_sr);
@ -169,18 +166,8 @@ void decodeFailureTest(KeyRef key) {
} // namespace
TEST_CASE("/fdbclient/ConfigDB/ConfigKey/DecodeFailure") {
{
Tuple tuple;
tuple << "s1"_sr
<< "s2"_sr
<< "s3"_sr;
decodeFailureTest(tuple.pack());
}
{
Tuple tuple;
tuple << "s1"_sr << 5;
decodeFailureTest(tuple.pack());
}
decodeFailureTest(Tuple::makeTuple("s1"_sr, "s2"_sr, "s3"_sr).pack());
decodeFailureTest(Tuple::makeTuple("s1"_sr, 5).pack());
decodeFailureTest("non-tuple-key"_sr);
return Void();
}


@ -200,13 +200,7 @@ public:
Version endVersion{ ::invalidVersion }; // not meaningful for range files
Tuple pack() const {
return Tuple()
.append(version)
.append(StringRef(fileName))
.append(isRange)
.append(fileSize)
.append(blockSize)
.append(endVersion);
return Tuple::makeTuple(version, fileName, (int)isRange, fileSize, blockSize, endVersion);
}
static RestoreFile unpack(Tuple const& t) {
RestoreFile r;


@ -183,13 +183,13 @@ ACTOR Future<Void> GlobalConfig::migrate(GlobalConfig* self) {
if (sampleRate.present()) {
const double sampleRateDbl =
BinaryReader::fromStringRef<double>(sampleRate.get().contents(), Unversioned());
Tuple rate = Tuple().appendDouble(sampleRateDbl);
Tuple rate = Tuple::makeTuple(sampleRateDbl);
tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSampleRate), rate.pack());
}
if (sizeLimit.present()) {
const int64_t sizeLimitInt =
BinaryReader::fromStringRef<int64_t>(sizeLimit.get().contents(), Unversioned());
Tuple size = Tuple().append(sizeLimitInt);
Tuple size = Tuple::makeTuple(sizeLimitInt);
tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSizeLimit), size.pack());
}


@ -1132,10 +1132,9 @@ ACTOR static Future<Void> handleTssMismatches(DatabaseContext* cx) {
for (const DetailedTSSMismatch& d : data.second) {
// <tssid, time, mismatchid> -> mismatch data
tssMismatchDB.set(
tr,
Tuple().append(data.first.toString()).append(d.timestamp).append(d.mismatchId.toString()),
d.traceString);
tssMismatchDB.set(tr,
Tuple::makeTuple(data.first.toString(), d.timestamp, d.mismatchId.toString()),
d.traceString);
}
wait(tr->commit());


@ -916,7 +916,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BG_ENABLE_MERGING, true ); if (randomize && BUGGIFY) BG_ENABLE_MERGING = false;
init( BG_MERGE_CANDIDATE_THRESHOLD_SECONDS, isSimulated ? 20.0 : 30 * 60 ); if (randomize && BUGGIFY) BG_MERGE_CANDIDATE_THRESHOLD_SECONDS = 5.0;
init( BG_MERGE_CANDIDATE_DELAY_SECONDS, BG_MERGE_CANDIDATE_THRESHOLD_SECONDS / 10.0 );
init( BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM, 8 ); if( randomize && BUGGIFY ) BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM = 1;
init( BLOB_WORKER_TIMEOUT, 10.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_TIMEOUT = 1.0;
@ -928,6 +928,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BLOB_MANAGER_STATUS_EXP_BACKOFF_MIN, 0.1 );
init( BLOB_MANAGER_STATUS_EXP_BACKOFF_MAX, 5.0 );
init( BLOB_MANAGER_STATUS_EXP_BACKOFF_EXPONENT, 1.5 );
init( BLOB_MANAGER_CONCURRENT_MERGE_CHECKS, 64 ); if( randomize && BUGGIFY ) BLOB_MANAGER_CONCURRENT_MERGE_CHECKS = 1 << deterministicRandom()->randomInt(0, 7);
init( BGCC_TIMEOUT, isSimulated ? 10.0 : 120.0 );
init( BGCC_MIN_INTERVAL, isSimulated ? 1.0 : 10.0 );


@ -679,13 +679,14 @@ Future<RangeResult> ConflictingKeysImpl::getRange(ReadYourWritesTransaction* ryw
if (ryw->getTransactionState()->conflictingKeys) {
auto krMapPtr = ryw->getTransactionState()->conflictingKeys.get();
auto beginIter = krMapPtr->rangeContaining(kr.begin);
if (beginIter->begin() != kr.begin)
++beginIter;
auto endIter = krMapPtr->rangeContaining(kr.end);
if (!kr.contains(beginIter->begin()) && beginIter != endIter)
++beginIter;
for (auto it = beginIter; it != endIter; ++it) {
result.push_back_deep(result.arena(), KeyValueRef(it->begin(), it->value()));
}
if (endIter->begin() != kr.end)
if (kr.contains(endIter->begin()))
result.push_back_deep(result.arena(), KeyValueRef(endIter->begin(), endIter->value()));
}
return result;
@ -2005,7 +2006,7 @@ Future<Optional<std::string>> ClientProfilingImpl::commit(ReadYourWritesTransact
} else {
try {
double sampleRate = boost::lexical_cast<double>(sampleRateStr);
Tuple rate = Tuple().appendDouble(sampleRate);
Tuple rate = Tuple::makeTuple(sampleRate);
insertions.push_back_deep(insertions.arena(), KeyValueRef(fdbClientInfoTxnSampleRate, rate.pack()));
} catch (boost::bad_lexical_cast& e) {
return Optional<std::string>(ManagementAPIError::toJsonString(
@ -2024,7 +2025,7 @@ Future<Optional<std::string>> ClientProfilingImpl::commit(ReadYourWritesTransact
} else {
try {
int64_t sizeLimit = boost::lexical_cast<int64_t>(sizeLimitStr);
Tuple size = Tuple().append(sizeLimit);
Tuple size = Tuple::makeTuple(sizeLimit);
insertions.push_back_deep(insertions.arena(), KeyValueRef(fdbClientInfoTxnSizeLimit, size.pack()));
} catch (boost::bad_lexical_cast& e) {
return Optional<std::string>(ManagementAPIError::toJsonString(
@ -2731,3 +2732,85 @@ Future<Optional<std::string>> FailedLocalitiesRangeImpl::commit(ReadYourWritesTr
// exclude locality with failed option as true.
return excludeLocalityCommitActor(ryw, true);
}
ACTOR Future<Void> validateSpecialSubrangeRead(ReadYourWritesTransaction* ryw,
KeySelector begin,
KeySelector end,
GetRangeLimits limits,
Reverse reverse,
RangeResult result) {
if (!result.size()) {
RangeResult testResult = wait(ryw->getRange(begin, end, limits, Snapshot::True, reverse));
ASSERT(testResult == result);
return Void();
}
if (reverse) {
ASSERT(std::is_sorted(result.begin(), result.end(), KeyValueRef::OrderByKeyBack{}));
} else {
ASSERT(std::is_sorted(result.begin(), result.end(), KeyValueRef::OrderByKey{}));
}
// Generate a keyrange where we can determine the expected result based solely on the previous readrange, and whose
// boundaries may or may not be keys in result.
std::vector<Key> candidateKeys;
if (reverse) {
for (int i = result.size() - 1; i >= 0; --i) {
candidateKeys.emplace_back(result[i].key);
if (i - 1 >= 0) {
candidateKeys.emplace_back(keyBetween(KeyRangeRef(result[i].key, result[i - 1].key)));
}
}
} else {
for (int i = 0; i < result.size(); ++i) {
candidateKeys.emplace_back(result[i].key);
if (i + 1 < result.size()) {
candidateKeys.emplace_back(keyBetween(KeyRangeRef(result[i].key, result[i + 1].key)));
}
}
}
std::sort(candidateKeys.begin(), candidateKeys.end());
int originalSize = candidateKeys.size();
// Add more candidate keys so that we might read a range between two adjacent result keys.
for (int i = 0; i < originalSize - 1; ++i) {
candidateKeys.emplace_back(keyBetween(KeyRangeRef(candidateKeys[i], candidateKeys[i + 1])));
}
std::vector<Key> keys;
keys = { deterministicRandom()->randomChoice(candidateKeys), deterministicRandom()->randomChoice(candidateKeys) };
std::sort(keys.begin(), keys.end());
state KeySelector testBegin = firstGreaterOrEqual(keys[0]);
state KeySelector testEnd = firstGreaterOrEqual(keys[1]);
// Generate expected result. Linear time is ok here since we're in simulation, and there's a benefit to keeping this
// simple (as we're using it as an test oracle)
state RangeResult expectedResult;
// The reverse parameter should be the same as for the original read, so if
// reverse is true then the results are _already_ in reverse order.
for (const auto& kr : result) {
if (kr.key >= keys[0] && kr.key < keys[1]) {
expectedResult.push_back(expectedResult.arena(), kr);
}
}
// Test
RangeResult testResult = wait(ryw->getRange(testBegin, testEnd, limits, Snapshot::True, reverse));
if (testResult != expectedResult) {
fmt::print("Reverse: {}\n", reverse);
fmt::print("Original range: [{}, {})\n", begin.toString(), end.toString());
fmt::print("Original result:\n");
for (const auto& kr : result) {
fmt::print(" {} -> {}\n", kr.key.printable(), kr.value.printable());
}
fmt::print("Test range: [{}, {})\n", testBegin.getKey().printable(), testEnd.getKey().printable());
fmt::print("Expected:\n");
for (const auto& kr : expectedResult) {
fmt::print(" {} -> {}\n", kr.key.printable(), kr.value.printable());
}
fmt::print("Got:\n");
for (const auto& kr : testResult) {
fmt::print(" {} -> {}\n", kr.key.printable(), kr.value.printable());
}
ASSERT(testResult == expectedResult);
}
return Void();
}


@ -1524,9 +1524,9 @@ std::pair<BlobGranuleSplitState, Version> decodeBlobGranuleSplitValue(const Valu
const Value blobGranuleMergeValueFor(KeyRange mergeKeyRange,
std::vector<UID> parentGranuleIDs,
std::vector<KeyRange> parentGranuleRanges,
std::vector<Key> parentGranuleRanges,
std::vector<Version> parentGranuleStartVersions) {
ASSERT(parentGranuleIDs.size() == parentGranuleRanges.size());
ASSERT(parentGranuleIDs.size() == parentGranuleRanges.size() - 1);
ASSERT(parentGranuleIDs.size() == parentGranuleStartVersions.size());
BinaryWriter wr(IncludeVersion(ProtocolVersion::withBlobGranule()));
@ -1536,12 +1536,12 @@ const Value blobGranuleMergeValueFor(KeyRange mergeKeyRange,
wr << parentGranuleStartVersions;
return addVersionStampAtEnd(wr.toValue());
}
std::tuple<KeyRange, Version, std::vector<UID>, std::vector<KeyRange>, std::vector<Version>>
decodeBlobGranuleMergeValue(ValueRef const& value) {
std::tuple<KeyRange, Version, std::vector<UID>, std::vector<Key>, std::vector<Version>> decodeBlobGranuleMergeValue(
ValueRef const& value) {
KeyRange range;
Version v;
std::vector<UID> parentGranuleIDs;
std::vector<KeyRange> parentGranuleRanges;
std::vector<Key> parentGranuleRanges;
std::vector<Version> parentGranuleStartVersions;
BinaryReader reader(value, IncludeVersion());
@ -1551,7 +1551,7 @@ decodeBlobGranuleMergeValue(ValueRef const& value) {
reader >> parentGranuleStartVersions;
reader >> v;
ASSERT(parentGranuleIDs.size() == parentGranuleRanges.size());
ASSERT(parentGranuleIDs.size() == parentGranuleRanges.size() - 1);
ASSERT(parentGranuleIDs.size() == parentGranuleStartVersions.size());
ASSERT(bigEndian64(v) >= 0);
@ -1581,6 +1581,8 @@ const KeyRange blobGranuleHistoryKeyRangeFor(KeyRangeRef const& range) {
}
const Value blobGranuleHistoryValueFor(Standalone<BlobGranuleHistoryValue> const& historyValue) {
ASSERT(historyValue.parentVersions.empty() ||
historyValue.parentBoundaries.size() - 1 == historyValue.parentVersions.size());
BinaryWriter wr(IncludeVersion(ProtocolVersion::withBlobGranule()));
wr << historyValue;
return wr.toValue();
@ -1590,6 +1592,8 @@ Standalone<BlobGranuleHistoryValue> decodeBlobGranuleHistoryValue(const ValueRef
Standalone<BlobGranuleHistoryValue> historyValue;
BinaryReader reader(value, IncludeVersion());
reader >> historyValue;
ASSERT(historyValue.parentVersions.empty() ||
historyValue.parentBoundaries.size() - 1 == historyValue.parentVersions.size());
return historyValue;
}


@ -141,12 +141,7 @@ bool ThrottleApi::TagQuotaValue::isValid() const {
}
Value ThrottleApi::TagQuotaValue::toValue() const {
Tuple tuple;
tuple.appendDouble(reservedReadQuota);
tuple.appendDouble(totalReadQuota);
tuple.appendDouble(reservedWriteQuota);
tuple.appendDouble(totalWriteQuota);
return tuple.pack();
return Tuple::makeTuple(reservedReadQuota, totalReadQuota, reservedWriteQuota, totalWriteQuota).pack();
}
ThrottleApi::TagQuotaValue ThrottleApi::TagQuotaValue::fromValue(ValueRef value) {


@ -651,10 +651,7 @@ public:
Reference<Task> task) {
taskBucket->setOptions(tr);
Tuple t;
t.append(task->timeoutVersion);
t.append(task->key);
Tuple t = Tuple::makeTuple(task->timeoutVersion, task->key);
RangeResult values = wait(tr->getRange(taskBucket->timeouts.range(t), 1));
if (values.size() > 0)
return false;
@ -996,9 +993,7 @@ Future<bool> TaskBucket::isEmpty(Reference<ReadYourWritesTransaction> tr) {
Future<Void> TaskBucket::finish(Reference<ReadYourWritesTransaction> tr, Reference<Task> task) {
setOptions(tr);
Tuple t;
t.append(task->timeoutVersion);
t.append(task->key);
Tuple t = Tuple::makeTuple(task->timeoutVersion, task->key);
tr->atomicOp(prefix.pack(LiteralStringRef("task_count")),
LiteralStringRef("\xff\xff\xff\xff\xff\xff\xff\xff"),


@ -19,6 +19,7 @@
*/
#include "fdbclient/Tuple.h"
#include "flow/UnitTest.h"
const uint8_t VERSIONSTAMP_96_CODE = 0x33;
@ -103,7 +104,7 @@ Tuple& Tuple::append(Tuple const& tuple) {
return *this;
}
Tuple& Tuple::appendVersionstamp(Versionstamp const& vs) {
Tuple& Tuple::append(Versionstamp const& vs) {
offsets.push_back(data.size());
data.push_back(data.arena(), VERSIONSTAMP_96_CODE);
@ -134,6 +135,10 @@ Tuple& Tuple::append(StringRef const& str, bool utf8) {
return *this;
}
Tuple& Tuple::append(UnicodeStr const& str) {
return append(str.str, true);
}
Tuple& Tuple::appendRaw(StringRef const& str) {
offsets.push_back(data.size());
@ -166,7 +171,11 @@ Tuple& Tuple::append(int64_t value) {
return *this;
}
Tuple& Tuple::appendBool(bool value) {
Tuple& Tuple::append(int32_t value) {
return append((int64_t)value);
}
Tuple& Tuple::append(bool value) {
offsets.push_back(data.size());
if (value) {
data.push_back(data.arena(), 0x27);
@ -176,7 +185,7 @@ Tuple& Tuple::appendBool(bool value) {
return *this;
}
Tuple& Tuple::appendFloat(float value) {
Tuple& Tuple::append(float value) {
offsets.push_back(data.size());
float swap = bigEndianFloat(value);
uint8_t* bytes = (uint8_t*)&swap;
@ -187,7 +196,7 @@ Tuple& Tuple::appendFloat(float value) {
return *this;
}
Tuple& Tuple::appendDouble(double value) {
Tuple& Tuple::append(double value) {
offsets.push_back(data.size());
double swap = value;
swap = bigEndianDouble(swap);
@ -199,12 +208,16 @@ Tuple& Tuple::appendDouble(double value) {
return *this;
}
Tuple& Tuple::appendNull() {
Tuple& Tuple::append(nullptr_t) {
offsets.push_back(data.size());
data.push_back(data.arena(), (uint8_t)'\x00');
return *this;
}
Tuple& Tuple::appendNull() {
return append(nullptr);
}
Tuple::ElementType Tuple::getType(size_t index) const {
if (index >= offsets.size()) {
throw invalid_tuple_index();
@ -426,3 +439,30 @@ StringRef Tuple::subTupleRawString(size_t index) const {
size_t endPos = end < offsets.size() ? offsets[end] : data.size();
return StringRef(data.begin() + offsets[index], endPos - offsets[index]);
}
TEST_CASE("fdbclient/Tuple/makeTuple") {
Tuple t1 = Tuple::makeTuple(
1, 1.0f, 1.0, false, "byteStr"_sr, Tuple::UnicodeStr("str"_sr), nullptr, Versionstamp("000000000000"_sr));
Tuple t2 = Tuple()
.append(1)
.append(1.0f)
.append(1.0)
.append(false)
.append("byteStr"_sr)
.append(Tuple::UnicodeStr("str"_sr))
.append(nullptr)
.append(Versionstamp("000000000000"_sr));
ASSERT(t1.pack() == t2.pack());
ASSERT(t1.getType(0) == Tuple::INT);
ASSERT(t1.getType(1) == Tuple::FLOAT);
ASSERT(t1.getType(2) == Tuple::DOUBLE);
ASSERT(t1.getType(3) == Tuple::BOOL);
ASSERT(t1.getType(4) == Tuple::BYTES);
ASSERT(t1.getType(5) == Tuple::UTF8);
ASSERT(t1.getType(6) == Tuple::NULL_TYPE);
ASSERT(t1.getType(7) == Tuple::VERSIONSTAMP);
ASSERT(t1.size() == 8);
return Void();
}

View File

@ -333,7 +333,10 @@ Future<Void> BackupContainerAzureBlobStore::create() {
TraceEvent(SevDebug, "BCAzureBlobStoreCreateContainer").detail("ContainerName", containerName);
Future<Void> createContainerFuture =
asyncTaskThread.execAsync([containerName = this->containerName, client = this->client] {
waitAzureFuture(client->create_container(containerName), "create_container");
auto outcome = client->get_container_properties(containerName).get();
if (!outcome.success()) {
waitAzureFuture(client->create_container(containerName), "create_container");
}
return Void();
});
Future<Void> encryptionSetupFuture = usesEncryption() ? encryptionSetupComplete() : Void();


@ -361,7 +361,7 @@ public:
template <>
inline Standalone<StringRef> TupleCodec<FileBackupAgent::ERestoreState>::pack(
FileBackupAgent::ERestoreState const& val) {
return Tuple().append(val).pack();
return Tuple::makeTuple(static_cast<int>(val)).pack();
}
template <>
inline FileBackupAgent::ERestoreState TupleCodec<FileBackupAgent::ERestoreState>::unpack(
@ -578,7 +578,7 @@ ACTOR Future<Void> cleanupBackup(Database cx, DeleteData deleteData);
using EBackupState = BackupAgentBase::EnumState;
template <>
inline Standalone<StringRef> TupleCodec<EBackupState>::pack(EBackupState const& val) {
return Tuple().append(static_cast<int>(val)).pack();
return Tuple::makeTuple(static_cast<int>(val)).pack();
}
template <>
inline EBackupState TupleCodec<EBackupState>::unpack(Standalone<StringRef> const& val) {
@ -727,8 +727,7 @@ protected:
template <>
inline Standalone<StringRef> TupleCodec<Reference<IBackupContainer>>::pack(Reference<IBackupContainer> const& bc) {
Tuple tuple;
tuple.append(StringRef(bc->getURL()));
Tuple tuple = Tuple::makeTuple(bc->getURL());
if (bc->getEncryptionKeyFileName().present()) {
tuple.append(bc->getEncryptionKeyFileName().get());
@ -775,9 +774,7 @@ public:
Version version;
std::string fileName;
int64_t fileSize;
Tuple pack() const {
return Tuple().append(begin).append(version).append(StringRef(fileName)).append(fileSize);
}
Tuple pack() const { return Tuple::makeTuple(begin, version, fileName, fileSize); }
static RangeSlice unpack(Tuple const& t) {
RangeSlice r;
int i = 0;


@ -244,11 +244,13 @@ enum BlobGranuleSplitState { Unknown = 0, Initialized = 1, Assigned = 2, Done =
struct BlobGranuleHistoryValue {
constexpr static FileIdentifier file_identifier = 991434;
UID granuleID;
VectorRef<std::pair<KeyRangeRef, Version>> parentGranules;
// VectorRef<std::pair<KeyRangeRef, Version>> parentGranules;
VectorRef<KeyRef> parentBoundaries;
VectorRef<Version> parentVersions;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, granuleID, parentGranules);
serializer(ar, granuleID, parentBoundaries, parentVersions);
}
};


@ -134,7 +134,7 @@ private:
if (!candidateValueFuture.get().present()) {
tr->addWriteConflictRange(singleKeyRange(self->recent.get(candidate).key()));
return Tuple().append(candidate).pack();
return Tuple::makeTuple(candidate).pack();
}
}
}


@ -59,7 +59,7 @@ inline Tuple TupleCodec<Tuple>::unpack(Standalone<StringRef> const& val) {
template <>
inline Standalone<StringRef> TupleCodec<int64_t>::pack(int64_t const& val) {
return Tuple().append(val).pack();
return Tuple::makeTuple(val).pack();
}
template <>
inline int64_t TupleCodec<int64_t>::unpack(Standalone<StringRef> const& val) {
@ -68,7 +68,7 @@ inline int64_t TupleCodec<int64_t>::unpack(Standalone<StringRef> const& val) {
template <>
inline Standalone<StringRef> TupleCodec<bool>::pack(bool const& val) {
return Tuple().append(val ? 1 : 0).pack();
return Tuple::makeTuple(val ? 1 : 0).pack();
}
template <>
inline bool TupleCodec<bool>::unpack(Standalone<StringRef> const& val) {
@ -77,7 +77,7 @@ inline bool TupleCodec<bool>::unpack(Standalone<StringRef> const& val) {
template <>
inline Standalone<StringRef> TupleCodec<Standalone<StringRef>>::pack(Standalone<StringRef> const& val) {
return Tuple().append(val).pack();
return Tuple::makeTuple(val).pack();
}
template <>
inline Standalone<StringRef> TupleCodec<Standalone<StringRef>>::unpack(Standalone<StringRef> const& val) {
@ -96,7 +96,7 @@ inline UID TupleCodec<UID>::unpack(Standalone<StringRef> const& val) {
// This is backward compatible with TupleCodec<Standalone<StringRef>>
template <>
inline Standalone<StringRef> TupleCodec<std::string>::pack(std::string const& val) {
return Tuple().append(StringRef(val)).pack();
return Tuple::makeTuple(val).pack();
}
template <>
inline std::string TupleCodec<std::string>::unpack(Standalone<StringRef> const& val) {
@ -143,7 +143,7 @@ struct TupleCodec<std::vector<T>> {
template <>
inline Standalone<StringRef> TupleCodec<KeyRange>::pack(KeyRange const& val) {
return Tuple().append(val.begin).append(val.end).pack();
return Tuple::makeTuple(val.begin, val.end).pack();
}
template <>
inline KeyRange TupleCodec<KeyRange>::unpack(Standalone<StringRef> const& val) {


@ -892,6 +892,7 @@ public:
int BG_CONSISTENCY_CHECK_TARGET_SPEED_KB;
bool BG_ENABLE_MERGING;
int BG_MERGE_CANDIDATE_THRESHOLD_SECONDS;
int BG_MERGE_CANDIDATE_DELAY_SECONDS;
int BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM;
double BLOB_WORKER_TIMEOUT; // Blob Manager's reaction time to a blob worker failure
@ -902,6 +903,7 @@ public:
double BLOB_MANAGER_STATUS_EXP_BACKOFF_MIN;
double BLOB_MANAGER_STATUS_EXP_BACKOFF_MAX;
double BLOB_MANAGER_STATUS_EXP_BACKOFF_EXPONENT;
int BLOB_MANAGER_CONCURRENT_MERGE_CHECKS;
double BGCC_TIMEOUT;
double BGCC_MIN_INTERVAL;


@ -539,5 +539,15 @@ public:
Future<Optional<std::string>> commit(ReadYourWritesTransaction* ryw) override;
};
// If the underlying set of key-value pairs of a key space is not changing, then we expect repeating a read to give the
// same result. Additionally, we can generate the expected result of any read if that read is reading a subrange. This
// actor performs a read of an arbitrary subrange of [begin, end) and validates the results.
ACTOR Future<Void> validateSpecialSubrangeRead(ReadYourWritesTransaction* ryw,
KeySelector begin,
KeySelector end,
GetRangeLimits limits,
Reverse reverse,
RangeResult result);
#include "flow/unactorcompiler.h"
#endif


@ -605,7 +605,6 @@ struct StorageMetrics {
int64_t bytesPerKSecond = 0; // network bandwidth (average over 10s)
int64_t iosPerKSecond = 0;
int64_t bytesReadPerKSecond = 0;
Optional<KeyRange> keys; // this metric belongs to which range
static const int64_t infinity = 1LL << 60;


@ -42,9 +42,7 @@ public:
template <class T>
Key pack(T const& item) const {
Tuple t;
t.append(item);
return pack(t);
return pack(Tuple::makeTuple(item));
}
Key pack(StringRef const& item, bool utf8 = false) const {
@ -58,9 +56,7 @@ public:
template <class T>
Subspace get(T const& item) const {
Tuple t;
t.append(item);
return get(t);
return get(Tuple::makeTuple(item));
}
Subspace get(StringRef const& item, bool utf8 = false) const {


@ -658,11 +658,11 @@ std::pair<BlobGranuleSplitState, Version> decodeBlobGranuleSplitValue(ValueRef c
const Value blobGranuleMergeValueFor(KeyRange mergeKeyRange,
std::vector<UID> parentGranuleIDs,
std::vector<KeyRange> parentGranuleRanges,
std::vector<Key> parentGranuleRanges,
std::vector<Version> parentGranuleStartVersions);
// FIXME: probably just define object type for this?
std::tuple<KeyRange, Version, std::vector<UID>, std::vector<KeyRange>, std::vector<Version>>
decodeBlobGranuleMergeValue(ValueRef const& value);
std::tuple<KeyRange, Version, std::vector<UID>, std::vector<Key>, std::vector<Version>> decodeBlobGranuleMergeValue(
ValueRef const& value);
const Key blobGranuleHistoryKeyFor(KeyRangeRef const& range, Version version);
std::pair<KeyRange, Version> decodeBlobGranuleHistoryKey(KeyRef const& key);


@ -28,6 +28,11 @@
#include "fdbclient/Versionstamp.h"
struct Tuple {
struct UnicodeStr {
StringRef str;
explicit UnicodeStr(StringRef str) : str(str) {}
};
Tuple() {}
// Tuple parsing normally does not care of the final value is a numeric type and is incomplete.
@ -41,14 +46,15 @@ struct Tuple {
// the str needs to be a Tuple encoded string.
Tuple& appendRaw(StringRef const& str);
Tuple& append(StringRef const& str, bool utf8 = false);
Tuple& append(UnicodeStr const& str);
Tuple& append(int32_t);
Tuple& append(int64_t);
// There are some ambiguous append calls in fdbclient, so to make it easier
// to add append for floats and doubles, name them differently for now.
Tuple& appendBool(bool);
Tuple& appendFloat(float);
Tuple& appendDouble(double);
Tuple& append(bool);
Tuple& append(float);
Tuple& append(double);
Tuple& append(std::nullptr_t);
Tuple& appendNull();
Tuple& appendVersionstamp(Versionstamp const&);
Tuple& append(Versionstamp const&);
StringRef pack() const { return StringRef(data.begin(), data.size()); }
@ -84,10 +90,15 @@ struct Tuple {
Standalone<VectorRef<uint8_t>> getData() { return data; }
Standalone<StringRef> getDataAsStandalone() { return Standalone<StringRef>(pack(), data.arena()); }
// Create a tuple from a parameter pack
template <class... Types>
static Tuple makeTuple(Types&&... args) {
Tuple t;
(t << ... << args);
// Use a fold expression to append each argument using the << operator.
// https://en.cppreference.com/w/cpp/language/fold
(t << ... << std::forward<Types>(args));
return t;
}


@ -165,4 +165,35 @@ bool compareFDBAndBlob(RangeResult fdb,
}
}
return correct;
}
}
ACTOR Future<Void> clearAndAwaitMerge(Database cx, KeyRange range) {
// clear key range and check whether it is merged or not, repeatedly
state Transaction tr(cx);
state int reClearCount = 1;
state int reClearInterval = 1; // do quadratic backoff on clear rate, b/c large keys can keep it not write-cold
loop {
try {
Standalone<VectorRef<KeyRangeRef>> ranges = wait(tr.getBlobGranuleRanges(range));
if (ranges.size() == 1) {
return Void();
}
CODE_PROBE(true, "ClearAndAwaitMerge doing clear");
reClearCount--;
if (reClearCount <= 0) {
tr.clear(range);
wait(tr.commit());
fmt::print("ClearAndAwaitMerge cleared [{0} - {1}) @ {2}\n",
range.begin.printable(),
range.end.printable(),
tr.getCommittedVersion());
reClearCount = reClearInterval;
reClearInterval++;
}
wait(delay(30.0)); // sleep a bit before checking on merge again
tr.reset();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}


@ -25,6 +25,7 @@
#include <vector>
#include <unordered_map>
#include "fdbrpc/simulator.h"
#include "fmt/format.h"
#include "fdbclient/BackupContainerFileSystem.h"
#include "fdbclient/BlobGranuleCommon.h"
@ -281,6 +282,43 @@ struct BlobManagerStats {
}
};
enum MergeCandidateState {
MergeCandidateCannotMerge,
MergeCandidateCanMerge,
MergeCandidateUnknown,
MergeCandidateMerging
};
// The current merge algorithm, skipping just granules that will be merge-eligible on the next pass, but not
// their neighbors, is optimal for guaranteeing merges to make progress where possible, with decently
// optimal but not globally optimal merge behavior.
// Alternative algorithms include not doing a two-pass consideration at all and immediately considering
// all merge candidates, which guarantees the most progress but pretty much guarantees undesirably
// suboptimal merge decisions, because of the time variance of granules becoming merge candidates. Or,
// also skipping adjacent eligible granules in addition to the one that will be eligible next pass,
// which ensures optimally large merges in a future pass, but adds decent delay to doing the merge. Or,
// smarter considering of merge candidates adjacent to the one that will be eligible next pass
// (depending on whether potential future merges with adjacent ones could include this candidate), which
// would be the best of both worlds, but would add a decent amount of code complexity.
struct MergeCandidateInfo {
MergeCandidateState st;
UID granuleID;
Version startVersion;
bool mergeNow;
MergeCandidateInfo() : st(MergeCandidateUnknown), startVersion(invalidVersion), mergeNow(false) {}
MergeCandidateInfo(MergeCandidateState st) : st(st), startVersion(invalidVersion), mergeNow(false) {
ASSERT(st != MergeCandidateCanMerge);
}
MergeCandidateInfo(UID granuleID, Version startVersion)
: st(MergeCandidateCanMerge), granuleID(granuleID), startVersion(startVersion), mergeNow(false) {}
bool canMerge() const { return st == MergeCandidateCanMerge; }
bool canMergeNow() const { return st == MergeCandidateCanMerge && mergeNow; }
};
struct BlobManagerData : NonCopyable, ReferenceCounted<BlobManagerData> {
UID id;
Database db;
@ -301,11 +339,13 @@ struct BlobManagerData : NonCopyable, ReferenceCounted<BlobManagerData> {
KeyRangeMap<BoundaryEvaluation> boundaryEvaluations;
KeyRangeMap<bool> knownBlobRanges;
BGTenantMap tenantData;
KeyRangeMap<Optional<std::pair<UID, Version>>> mergeCandidates; // granule range to granule id + start version.
KeyRangeMap<MergeCandidateInfo> mergeCandidates; // granule range to granule id + start version.
KeyRangeMap<Version> activeGranuleMerges; // range map of active granule merges, because range in boundaryEval
// doesn't correspond to merge range. invalidVersion is no merge,
// 0 is no merge version determined yet
FlowLock concurrentMergeChecks;
AsyncTrigger startRecruiting;
Debouncer restartRecruiting;
std::set<NetworkAddress> recruitingLocalities; // the addrs of the workers being recruited on
@ -321,9 +361,10 @@ struct BlobManagerData : NonCopyable, ReferenceCounted<BlobManagerData> {
BlobManagerData(UID id, Reference<AsyncVar<ServerDBInfo> const> dbInfo, Database db, Optional<Key> dcId)
: id(id), db(db), dcId(dcId), stats(id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &workersById),
knownBlobRanges(false, normalKeys.end), tenantData(BGTenantMap(dbInfo)),
mergeCandidates(Optional<std::pair<UID, Version>>(), normalKeys.end),
activeGranuleMerges(invalidVersion, normalKeys.end), restartRecruiting(SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY),
recruitingStream(0) {}
mergeCandidates(MergeCandidateInfo(MergeCandidateUnknown), normalKeys.end),
activeGranuleMerges(invalidVersion, normalKeys.end),
concurrentMergeChecks(SERVER_KNOBS->BLOB_MANAGER_CONCURRENT_MERGE_CHECKS),
restartRecruiting(SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY), recruitingStream(0) {}
// only initialize blob store if actually needed
void initBStore() {
@ -347,6 +388,7 @@ struct BlobManagerData : NonCopyable, ReferenceCounted<BlobManagerData> {
}
return false;
}
Version activeMergeVersion(const KeyRangeRef& range) {
auto ranges = activeGranuleMerges.intersectingRanges(range);
Version v = invalidVersion;
@ -355,6 +397,30 @@ struct BlobManagerData : NonCopyable, ReferenceCounted<BlobManagerData> {
}
return v;
}
void setMergeCandidate(const KeyRangeRef& range, UID granuleID, Version startVersion) {
// Want this to be idempotent. If a granule was already reported as merge-eligible, we want to use the existing
// merge and mergeNow state.
auto it = mergeCandidates.rangeContaining(range.begin);
if (it->begin() == range.begin && it.end() == range.end) {
if (it->cvalue().st != MergeCandidateCanMerge) {
// same range, just update
it->value() = MergeCandidateInfo(granuleID, startVersion);
} else {
// else no-op, but validate data
ASSERT(granuleID == it->cvalue().granuleID);
ASSERT(startVersion == it->cvalue().startVersion);
}
} else if (it->cvalue().st != MergeCandidateMerging) {
mergeCandidates.insert(range, MergeCandidateInfo(granuleID, startVersion));
}
}
void clearMergeCandidate(const KeyRangeRef& range, MergeCandidateState st) {
ASSERT(st != MergeCandidateCanMerge);
mergeCandidates.insert(range, MergeCandidateInfo(st));
}
};
ACTOR Future<Standalone<VectorRef<KeyRef>>> splitRange(Reference<BlobManagerData> bmData,
@ -1273,8 +1339,9 @@ ACTOR Future<Void> maybeSplitRange(Reference<BlobManagerData> bmData,
Standalone<BlobGranuleHistoryValue> historyValue;
historyValue.granuleID = newGranuleIDs[i];
historyValue.parentGranules.push_back(historyValue.arena(),
std::pair(granuleRange, granuleStartVersion));
historyValue.parentBoundaries.push_back(historyValue.arena(), granuleRange.begin);
historyValue.parentBoundaries.push_back(historyValue.arena(), granuleRange.end);
historyValue.parentVersions.push_back(historyValue.arena(), granuleStartVersion);
tr->set(historyKey, blobGranuleHistoryValueFor(historyValue));
@ -1452,7 +1519,8 @@ ACTOR Future<Void> forceGranuleFlush(Reference<BlobManagerData> bmData, KeyRange
break;
} else {
if (BM_DEBUG) {
fmt::print("ERROR: Error flushing range [{0} - {1}): {2}!\n",
fmt::print("ERROR: BM {0} Error flushing range [{1} - {2}): {3}!\n",
bmData->epoch,
blobGranuleMapping[j].key.printable(),
blobGranuleMapping[j + 1].key.printable(),
e.name());
@ -1491,7 +1559,7 @@ ACTOR Future<Void> forceGranuleFlush(Reference<BlobManagerData> bmData, KeyRange
ACTOR Future<std::pair<UID, Version>> persistMergeGranulesStart(Reference<BlobManagerData> bmData,
KeyRange mergeRange,
std::vector<UID> parentGranuleIDs,
std::vector<KeyRange> parentGranuleRanges,
std::vector<Key> parentGranuleRanges,
std::vector<Version> parentGranuleStartVersions) {
state UID mergeGranuleID = deterministicRandom()->randomUniqueID();
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(bmData->db);
@ -1549,7 +1617,7 @@ ACTOR Future<Void> persistMergeGranulesDone(Reference<BlobManagerData> bmData,
KeyRange mergeRange,
Version mergeVersion,
std::vector<UID> parentGranuleIDs,
std::vector<KeyRange> parentGranuleRanges,
std::vector<Key> parentGranuleRanges,
std::vector<Version> parentGranuleStartVersions) {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(bmData->db);
// pick worker that has part of old range, it will soon get overridden anyway
@ -1580,13 +1648,14 @@ ACTOR Future<Void> persistMergeGranulesDone(Reference<BlobManagerData> bmData,
state int parentIdx;
// TODO: could parallelize these
for (parentIdx = 0; parentIdx < parentGranuleIDs.size(); parentIdx++) {
state Key lockKey = blobGranuleLockKeyFor(parentGranuleRanges[parentIdx]);
KeyRange parentRange(KeyRangeRef(parentGranuleRanges[parentIdx], parentGranuleRanges[parentIdx + 1]));
state Key lockKey = blobGranuleLockKeyFor(parentRange);
state Future<Optional<Value>> oldLockFuture = tr->get(lockKey);
wait(updateChangeFeed(tr,
granuleIDToCFKey(parentGranuleIDs[parentIdx]),
ChangeFeedStatus::CHANGE_FEED_DESTROY,
parentGranuleRanges[parentIdx]));
parentRange));
if (BM_DEBUG) {
fmt::print("Granule merge destroying CF {0} ({1})!\n",
parentGranuleIDs[parentIdx].shortString().substr(0, 6),
@ -1615,10 +1684,10 @@ ACTOR Future<Void> persistMergeGranulesDone(Reference<BlobManagerData> bmData,
Standalone<BlobGranuleHistoryValue> historyValue;
historyValue.granuleID = mergeGranuleID;
for (parentIdx = 0; parentIdx < parentGranuleIDs.size(); parentIdx++) {
historyValue.parentGranules.push_back(
historyValue.arena(),
std::pair(parentGranuleRanges[parentIdx], parentGranuleStartVersions[parentIdx]));
historyValue.parentBoundaries.push_back(historyValue.arena(), parentGranuleRanges[parentIdx]);
historyValue.parentVersions.push_back(historyValue.arena(), parentGranuleStartVersions[parentIdx]);
}
historyValue.parentBoundaries.push_back(historyValue.arena(), parentGranuleRanges.back());
tr->set(historyKey, blobGranuleHistoryValueFor(historyValue));
@ -1646,7 +1715,7 @@ ACTOR Future<Void> finishMergeGranules(Reference<BlobManagerData> bmData,
KeyRange mergeRange,
Version mergeVersion,
std::vector<UID> parentGranuleIDs,
std::vector<KeyRange> parentGranuleRanges,
std::vector<Key> parentGranuleRanges,
std::vector<Version> parentGranuleStartVersions) {
// wait for BM to be fully recovered before starting actual merges
@ -1685,307 +1754,196 @@ ACTOR Future<Void> finishMergeGranules(Reference<BlobManagerData> bmData,
bmData->boundaryEvaluations.insert(mergeRange,
BoundaryEvaluation(bmData->epoch, seqnoForEval, BoundaryEvalType::MERGE, 0, 0));
bmData->clearMergeCandidate(mergeRange, MergeCandidateMerging);
return Void();
}
// Make a decision on whether to merge this granule with surrounding ones.
ACTOR Future<Void> maybeMergeRange(Reference<BlobManagerData> bmData,
UID granuleID,
KeyRange granuleRange,
Version granuleStartVersion) {
state std::deque<std::tuple<UID, KeyRange, Version>> beforeCandidates, afterCandidates;
ACTOR Future<Void> doMerge(Reference<BlobManagerData> bmData,
KeyRange mergeRange,
std::vector<std::tuple<UID, KeyRange, Version>> toMerge) {
// switch to format persist merge wants
state std::vector<UID> ids;
state std::vector<Key> ranges;
state std::vector<Version> startVersions;
for (auto& it : toMerge) {
ids.push_back(std::get<0>(it));
ranges.push_back(std::get<1>(it).begin);
startVersions.push_back(std::get<2>(it));
}
ranges.push_back(std::get<1>(toMerge.back()).end);
try {
std::pair<UID, Version> persistMerge =
wait(persistMergeGranulesStart(bmData, mergeRange, ids, ranges, startVersions));
wait(finishMergeGranules(
bmData, persistMerge.first, mergeRange, persistMerge.second, ids, ranges, startVersions));
return Void();
} catch (Error& e) {
if (e.code() == error_code_operation_cancelled || e.code() == error_code_blob_manager_replaced) {
throw;
}
TraceEvent(SevError, "UnexpectedErrorGranuleMerge").error(e).detail("Range", mergeRange);
throw e;
}
}
// Needs to not be an actor to run synchronously for the race checking.
// Technically this could just be the first part of doMerge, but this guarantees no waits happen for the checks before
// the logic starts
static void attemptStartMerge(Reference<BlobManagerData> bmData,
const std::vector<std::tuple<UID, KeyRange, Version>>& toMerge) {
if (toMerge.size() < 2) {
return;
}
// TODO REMOVE validation eventually
for (int i = 0; i < toMerge.size() - 1; i++) {
ASSERT(std::get<1>(toMerge[i]).end == std::get<1>(toMerge[i + 1]).begin);
}
KeyRange mergeRange(KeyRangeRef(std::get<1>(toMerge.front()).begin, std::get<1>(toMerge.back()).end));
// merge/merge races should not be possible because granuleMergeChecker should only start attemptMerges() for
// disjoint ranges, and merge candidate is not updated if it is already in the state MergeCandidateMerging
ASSERT(!bmData->isMergeActive(mergeRange));
// Check to avoid races where a split eval came in while merge was evaluating. This also effectively checks
// boundaryEvals because they're both updated before maybeSplitRange is called. This handles split/merge races.
auto reCheckMergeCandidates = bmData->mergeCandidates.intersectingRanges(mergeRange);
for (auto it : reCheckMergeCandidates) {
if (!it->cvalue().canMergeNow()) {
CODE_PROBE(true, " granule no longer merge candidate after checking metrics, aborting merge");
return;
}
}
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): Start\n",
fmt::print("BM {0} Starting merge of [{1} - {2}) ({3})\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable());
mergeRange.begin.printable(),
mergeRange.end.printable(),
toMerge.size());
}
CODE_PROBE(true, "Doing granule merge");
bmData->activeGranuleMerges.insert(mergeRange, 0);
bmData->clearMergeCandidate(mergeRange, MergeCandidateMerging);
// Now, after setting activeGranuleMerges, we have committed to doing the merge, so any subsequent split eval for
// any of the ranges will be ignored. This handles merge/split races.
bmData->addActor.send(doMerge(bmData, mergeRange, toMerge));
}
	// look for candidates to the left
	if (granuleRange.begin != normalKeys.begin) {
		auto rangeBefore = bmData->mergeCandidates.rangeContainingKeyBefore(granuleRange.begin);
		while (rangeBefore.cvalue().present() && beforeCandidates.size() < SERVER_KNOBS->BG_MAX_MERGE_FANIN - 1) {
			// if it is a merge candidate, add it to the list
			beforeCandidates.push_front(
			    std::tuple(rangeBefore.cvalue().get().first, rangeBefore.range(), rangeBefore.cvalue().get().second));
			if (BM_DEBUG) {
				fmt::print("BM {0} maybe merge [{1} - {2}): Before candidate [{3} - {4})\n",
				           bmData->epoch,
				           granuleRange.begin.printable(),
				           granuleRange.end.printable(),
				           rangeBefore.begin().printable(),
				           rangeBefore.end().printable());
			}
			ASSERT(rangeBefore.begin() >= normalKeys.begin);
			if (rangeBefore.begin() == normalKeys.begin) {
				break;
			} else {
				--rangeBefore;
			}
		}
	}
// Greedily merges any consecutive 2+ granules in a row that are mergeable
ACTOR Future<Void> attemptMerges(Reference<BlobManagerData> bmData,
                                 std::vector<std::tuple<UID, KeyRange, Version>> candidates) {
	ASSERT(candidates.size() >= 2);
	// TODO REMOVE validation eventually
	for (int i = 0; i < candidates.size() - 1; i++) {
		ASSERT(std::get<1>(candidates[i]).end == std::get<1>(candidates[i + 1]).begin);
	}
	CODE_PROBE(true, "Candidate ranges to merge");
	wait(bmData->concurrentMergeChecks.take());
	state FlowLock::Releaser holdingDVL(bmData->concurrentMergeChecks);
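For readers unfamiliar with FlowLock: it acts here as a counting semaphore that bounds how many attemptMerges actors run expensive metrics queries at once, with the Releaser returning the permit on every exit path. A rough standard-C++ analogue of take()/Releaser, illustrative only:

#include <semaphore>

std::counting_semaphore<64> concurrentMergeChecks(8); // allow 8 concurrent checks

struct Releaser { // RAII: the permit is returned on every exit path, including exceptions
	std::counting_semaphore<64>& sem;
	~Releaser() { sem.release(); }
};

void mergeCheck() {
	concurrentMergeChecks.acquire(); // like wait(bmData->concurrentMergeChecks.take())
	Releaser holding{ concurrentMergeChecks };
	// ... query metrics and decide merges under the concurrency limit ...
}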
// look for candidates to right
if (granuleRange.end != normalKeys.end) {
auto rangeAfter = bmData->mergeCandidates.rangeContaining(granuleRange.end);
while (rangeAfter.cvalue().present() && afterCandidates.size() < SERVER_KNOBS->BG_MAX_MERGE_FANIN - 1) {
// if it is a merge candidate, add it to the list
afterCandidates.push_back(
std::tuple(rangeAfter.cvalue().get().first, rangeAfter.range(), rangeAfter.cvalue().get().second));
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): After candidate [{3} - {4})\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable(),
rangeAfter.begin().printable(),
rangeAfter.end().printable());
}
ASSERT(rangeAfter.end() <= normalKeys.end);
if (rangeAfter.end() == normalKeys.end) {
break;
} else {
++rangeAfter;
}
}
}
if (beforeCandidates.empty() && afterCandidates.empty()) {
CODE_PROBE(true, "no consecutive merge candidates");
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): No merge candidates\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable());
}
return Void();
}
CODE_PROBE(true, "consecutive granule merge candidates");
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): Checking metrics for {3} candidates ({4} - {5})\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable(),
beforeCandidates.size() + afterCandidates.size() + 1,
beforeCandidates.size(),
afterCandidates.size());
}
// get metrics for current granule to see if it is still mergeable
StorageMetrics targetGranuleMetrics = wait(bmData->db->getStorageMetrics(granuleRange, CLIENT_KNOBS->TOO_MANY));
if (targetGranuleMetrics.bytesPerKSecond >= SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC ||
targetGranuleMetrics.bytes >= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES) {
CODE_PROBE(true, "granule merge candidate no longer mergeable");
return Void();
}
// best set of granules to merge
state std::vector<UID> bestGranuleIDs;
state std::vector<KeyRange> bestGranuleRanges;
state std::vector<Version> bestGranuleStartVersions;
state KeyRange bestGranuleRange;
// current set of granules being evaluated
state std::deque<std::tuple<UID, KeyRange, Version, int64_t>> windowGranules;
state int64_t windowBytes = targetGranuleMetrics.bytes;
windowGranules.push_back(std::tuple(granuleID, granuleRange, granuleStartVersion, windowBytes));
// first walk backwards through before candidates until combined granule would be too large to merge, or we hit a
// granule that has too high bytesPerKSec and isn't mergeable
// start merging any set of 2+ consecutive granules that can be merged
state int64_t currentBytes = 0;
	// with large keys, a merge spanning many granules can exceed the maximum value size
state int currentKeySumBytes = 0;
state std::vector<std::tuple<UID, KeyRange, Version>> currentCandidates;
state int i;
for (i = beforeCandidates.size() - 1; i >= 0; i--) {
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): Checking before candidate [{3} - {4})\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable(),
std::get<1>(beforeCandidates[i]).begin.printable(),
std::get<1>(beforeCandidates[i]).end.printable());
for (i = 0; i < candidates.size(); i++) {
StorageMetrics metrics =
wait(bmData->db->getStorageMetrics(std::get<1>(candidates[i]), CLIENT_KNOBS->TOO_MANY));
if (metrics.bytes >= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES ||
metrics.bytesPerKSecond >= SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC) {
// This granule cannot be merged with any neighbors.
// If current candidates up to here can be merged, merge them and skip over this one
attemptStartMerge(bmData, currentCandidates);
currentCandidates.clear();
currentBytes = 0;
currentKeySumBytes = 0;
continue;
}
StorageMetrics beforeMetrics =
wait(bmData->db->getStorageMetrics(std::get<1>(beforeCandidates[i]), CLIENT_KNOBS->TOO_MANY));
if (windowBytes + beforeMetrics.bytes >= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES ||
beforeMetrics.bytesPerKSecond >= SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC) {
break;
		// if the current window is already at the maximum merge size, or adding this granule would push the window
		// over the limit, merge the existing candidates if possible
ASSERT(currentCandidates.size() <= SERVER_KNOBS->BG_MAX_MERGE_FANIN);
if (currentCandidates.size() == SERVER_KNOBS->BG_MAX_MERGE_FANIN ||
currentBytes + metrics.bytes > SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES ||
currentKeySumBytes >= CLIENT_KNOBS->VALUE_SIZE_LIMIT / 2) {
ASSERT(currentBytes <= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES);
CODE_PROBE(currentKeySumBytes >= CLIENT_KNOBS->VALUE_SIZE_LIMIT / 2, "merge early because of key size");
attemptStartMerge(bmData, currentCandidates);
currentCandidates.clear();
currentBytes = 0;
currentKeySumBytes = 0;
}
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): Before Candidate [{3} - {4}): {5} bytes\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable(),
std::get<1>(beforeCandidates[i]).begin.printable(),
std::get<1>(beforeCandidates[i]).end.printable(),
beforeMetrics.bytes);
// add this granule to the window
if (currentCandidates.empty()) {
currentKeySumBytes += std::get<1>(candidates[i]).begin.size();
}
windowBytes += beforeMetrics.bytes;
windowGranules.push_front(std::tuple(std::get<0>(beforeCandidates[i]),
std::get<1>(beforeCandidates[i]),
std::get<2>(beforeCandidates[i]),
beforeMetrics.bytes));
currentKeySumBytes += std::get<1>(candidates[i]).end.size();
currentCandidates.push_back(candidates[i]);
}
// set first window as the best range
bestGranuleRange = KeyRangeRef(std::get<1>(windowGranules.front()).begin, std::get<1>(windowGranules.back()).end);
for (auto& it : windowGranules) {
bestGranuleIDs.push_back(std::get<0>(it));
bestGranuleRanges.push_back(std::get<1>(it));
bestGranuleStartVersions.push_back(std::get<2>(it));
}
// Do moving window algorithm where we add the next afterCandidate to the merge window, and then remove the tail end
// of beforeCandidates until we are down to a mergeable granule
for (i = 0; i < afterCandidates.size(); i++) {
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): Checking after candidate [{3} - {4})\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable(),
std::get<1>(afterCandidates[i]).begin.printable(),
std::get<1>(afterCandidates[i]).end.printable());
}
// include this granule in the window
StorageMetrics afterMetrics =
wait(bmData->db->getStorageMetrics(std::get<1>(afterCandidates[i]), CLIENT_KNOBS->TOO_MANY));
if (afterMetrics.bytes >= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES ||
afterMetrics.bytesPerKSecond >= SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC) {
break;
}
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): After Candidate [{3} - {4}): {5} bytes\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable(),
std::get<1>(afterCandidates[i]).begin.printable(),
std::get<1>(afterCandidates[i]).end.printable(),
afterMetrics.bytes);
}
windowBytes += afterMetrics.bytes;
windowGranules.push_back(std::tuple(std::get<0>(afterCandidates[i]),
std::get<1>(afterCandidates[i]),
std::get<2>(afterCandidates[i]),
afterMetrics.bytes));
// slide the window forward back down to mergeable size
while (windowBytes >= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES) {
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): window bytes {3} >= target {4}\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable(),
windowBytes,
SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES);
}
ASSERT(!windowGranules.empty());
if (std::get<0>(windowGranules.front()) == granuleID) {
// merge must include target granule
break;
}
if (BM_DEBUG) {
fmt::print(
"BM {0} maybe merge [{1} - {2}): After Candidate [{3} - {4}) popping [{5} - {6}): {7} bytes\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable(),
std::get<1>(afterCandidates[i]).begin.printable(),
std::get<1>(afterCandidates[i]).end.printable(),
std::get<1>(windowGranules.front()).begin.printable(),
std::get<1>(windowGranules.front()).end.printable(),
std::get<3>(windowGranules.front()));
}
windowBytes -= std::get<3>(windowGranules.front());
windowGranules.pop_front();
}
// compare this candidate window to previous best
if (windowBytes >= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES) {
break;
} else if (windowGranules.size() > bestGranuleIDs.size()) {
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): new best granules {3}\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable(),
windowGranules.size());
}
bestGranuleRange =
KeyRangeRef(std::get<1>(windowGranules.front()).begin, std::get<1>(windowGranules.back()).end);
bestGranuleIDs.clear();
bestGranuleRanges.clear();
bestGranuleStartVersions.clear();
for (auto& it : windowGranules) {
bestGranuleIDs.push_back(std::get<0>(it));
bestGranuleRanges.push_back(std::get<1>(it));
bestGranuleStartVersions.push_back(std::get<2>(it));
}
}
}
CODE_PROBE(bestGranuleIDs.size() == 1, "Cannot combine merge candidates into mergeable granule");
CODE_PROBE(bestGranuleIDs.size() > 1, "Granule ready for merge!");
if (bestGranuleIDs.size() > 1) {
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): Found {3} consecutive granules in range [{4} - {5}):\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable(),
bestGranuleIDs.size(),
bestGranuleRange.begin.printable(),
bestGranuleRange.end.printable());
}
	// This code block must execute without a wait, so that the lock checks (isMergeActive, mergeCandidates) cannot
	// deadlock and merge-merge races are avoided.
if ((!g_network->isSimulated() || !g_simulator.speedUpSimulation) && !bmData->isMergeActive(bestGranuleRange)) {
// check to avoid races where a split eval came in while merge was evaluating
auto reCheckMergeCandidates = bmData->mergeCandidates.intersectingRanges(bestGranuleRange);
bool mergeStillOk = true;
for (auto it : reCheckMergeCandidates) {
if (!it->cvalue().present()) {
CODE_PROBE(true, "granule no longer merge candidate after checking metrics, because of split eval");
mergeStillOk = false;
break;
}
}
if (mergeStillOk) {
fmt::print("BM {0} maybe merge [{1} - {2}): Starting merge of [{3} - {4}) ({5})\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable(),
bestGranuleRange.begin.printable(),
bestGranuleRange.end.printable(),
bestGranuleIDs.size());
CODE_PROBE(true, "Doing granule merge!");
bmData->activeGranuleMerges.insert(bestGranuleRange, 0);
bmData->mergeCandidates.insert(bestGranuleRange, Optional<std::pair<UID, Version>>());
state std::pair<UID, Version> persistMerge = wait(persistMergeGranulesStart(
bmData, bestGranuleRange, bestGranuleIDs, bestGranuleRanges, bestGranuleStartVersions));
wait(finishMergeGranules(bmData,
persistMerge.first,
bestGranuleRange,
persistMerge.second,
bestGranuleIDs,
bestGranuleRanges,
bestGranuleStartVersions));
}
}
} else {
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): No mergeable granules after checking metrics\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable());
}
}
attemptStartMerge(bmData, currentCandidates);
return Void();
}
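The loop above amounts to a greedy partition of a sorted, contiguous run of candidates: a batch is cut whenever a granule is individually too large or too hot, or when adding it would exceed the fan-in, byte, or key-size budget. A simplified stand-alone version of that batching logic; the knob values below are made up:

#include <string>
#include <vector>

struct Candidate {
	std::string begin, end; // contiguous: candidates[i].end == candidates[i+1].begin
	long bytes;             // granule size from storage metrics
	bool tooHot;            // write rate above the merge threshold
};

// Cuts the candidate run into mergeable batches of 2+ granules.
std::vector<std::vector<Candidate>> planMerges(const std::vector<Candidate>& candidates) {
	const size_t maxFanIn = 64;        // stand-in for BG_MAX_MERGE_FANIN
	const long maxBytes = 100'000'000; // stand-in for BG_SNAPSHOT_FILE_TARGET_BYTES
	const long maxKeyBytes = 50'000;   // stand-in for VALUE_SIZE_LIMIT / 2

	std::vector<std::vector<Candidate>> batches;
	std::vector<Candidate> current;
	long bytes = 0, keyBytes = 0;
	auto flush = [&] {
		if (current.size() >= 2)
			batches.push_back(current);
		current.clear();
		bytes = keyBytes = 0;
	};
	for (const Candidate& c : candidates) {
		if (c.tooHot || c.bytes >= maxBytes) { // unmergeable granule splits the run
			flush();
			continue;
		}
		if (current.size() == maxFanIn || bytes + c.bytes > maxBytes || keyBytes >= maxKeyBytes)
			flush(); // budget reached: emit what we have, start a new batch
		if (current.empty())
			keyBytes += c.begin.size(); // first granule also contributes its begin key
		bytes += c.bytes;
		keyBytes += c.end.size();
		current.push_back(c);
	}
	flush();
	return batches;
}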
// Uses a single-pass algorithm to identify mergeable sections of granules.
// To ensure each granule waits to see whether all of its neighbors are merge-eligible before merging it, a newly
// merge-eligible granule is ignored on the first pass
ACTOR Future<Void> granuleMergeChecker(Reference<BlobManagerData> bmData) {
// initial sleep
wait(delayJittered(SERVER_KNOBS->BG_MERGE_CANDIDATE_DELAY_SECONDS));
// TODO could optimize to not check if there are no new merge-eligible granules and none in merge pending state
loop {
double sleepTime = SERVER_KNOBS->BG_MERGE_CANDIDATE_DELAY_SECONDS;
		// Check more frequently if speedUpSimulation is set.
if (g_network->isSimulated() && g_simulator.speedUpSimulation) {
sleepTime = std::min(5.0, sleepTime);
}
		// start the delay at the start of the loop, to account for time spent in the calculation
state Future<Void> intervalDelay = delayJittered(sleepTime);
// go over granule states, and start a findMergeableGranules for each sub-range of mergeable granules
// FIXME: avoid SlowTask by breaking this up periodically
// Break it up into parallel chunks. This makes it possible to process large ranges, but does mean the merges
// can be slightly suboptimal at boundaries. Use relatively large chunks to minimize the impact of this.
int maxRangeSize = SERVER_KNOBS->BG_MAX_MERGE_FANIN * 10;
state std::vector<Future<Void>> mergeChecks;
auto allRanges = bmData->mergeCandidates.ranges();
std::vector<std::tuple<UID, KeyRange, Version>> currentCandidates;
for (auto& it : allRanges) {
if (!it->cvalue().canMergeNow() || currentCandidates.size() == maxRangeSize) {
if (currentCandidates.size() >= 2) {
mergeChecks.push_back(attemptMerges(bmData, currentCandidates));
}
currentCandidates.clear();
}
if (it->cvalue().canMergeNow()) {
currentCandidates.push_back(std::tuple(it->cvalue().granuleID, it->range(), it->cvalue().startVersion));
} else if (it->cvalue().canMerge()) {
// set flag so this can get merged on the next pass
it->value().mergeNow = true;
}
}
if (currentCandidates.size() >= 2) {
mergeChecks.push_back(attemptMerges(bmData, currentCandidates));
}
CODE_PROBE(mergeChecks.size() > 1, "parallel merge checks");
wait(waitForAll(mergeChecks));
// if the calculation took longer than the desired interval, still wait a bit
wait(intervalDelay && delay(5.0));
}
}
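The canMerge/canMergeNow split is what implements the one-pass delay described above: a granule that just became eligible only has mergeNow set at the end of the pass, so it is first considered on the next pass, by which time its neighbors may be eligible too. A compact sketch of that delayed-run scan, over invented types:

#include <utility>
#include <vector>

struct Granule {
	bool eligible = false; // granule asked to be a merge candidate
	bool mergeNow = false; // set one pass after becoming eligible
};

// One checker pass: collect runs of mergeNow granules, promote the rest.
std::vector<std::pair<int, int>> checkerPass(std::vector<Granule>& gs) {
	std::vector<std::pair<int, int>> runs; // [first, last] index of each mergeable run
	int runStart = -1;
	for (int i = 0; i <= (int)gs.size(); i++) {
		bool now = i < (int)gs.size() && gs[i].eligible && gs[i].mergeNow;
		if (now && runStart < 0)
			runStart = i;
		if (!now) {
			if (runStart >= 0 && i - runStart >= 2)
				runs.push_back({ runStart, i - 1 }); // 2+ consecutive mergeable granules
			runStart = -1;
		}
		if (i < (int)gs.size() && gs[i].eligible && !gs[i].mergeNow)
			gs[i].mergeNow = true; // newly eligible: wait until the next pass
	}
	return runs;
}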
ACTOR Future<Void> deregisterBlobWorker(Reference<BlobManagerData> bmData, BlobWorkerInterface interf) {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(bmData->db);
loop {
@ -2310,34 +2268,22 @@ ACTOR Future<Void> monitorBlobWorkerStatus(Reference<BlobManagerData> bmData, Bl
// clear merge candidates for range, if not already merging
if (clearMergeCandidate) {
bmData->mergeCandidates.insert(rep.granuleRange, Optional<std::pair<UID, Version>>());
bmData->clearMergeCandidate(rep.granuleRange, MergeCandidateCannotMerge);
}
}
if (rep.mergeCandidate && !ignore) {
// mark granule as merge candidate
ASSERT(!rep.doSplit);
// TODO: do we need any sort of validation that this is coming from the worker that currently owns
// the granule?
if (existingInProgress.present()) {
// TODO LOG?
} else {
if (BM_DEBUG) {
fmt::print("Manager {0} evaluating [{1} - {2}) {3}\n",
bmData->epoch,
rep.granuleRange.begin.printable().c_str(),
rep.granuleRange.end.printable().c_str(),
newEval.toString());
}
if (!bmData->isMergeActive(rep.granuleRange)) {
ASSERT(rep.mergeCandidate);
CODE_PROBE(true, "Granule merge candidate");
bmData->mergeCandidates.insert(rep.granuleRange,
std::pair(rep.granuleID, rep.startVersion));
newEval.inProgress =
maybeMergeRange(bmData, rep.granuleID, rep.granuleRange, rep.startVersion);
// still update epoch/seqno even if not doing a merge eval
bmData->boundaryEvaluations.insert(rep.granuleRange, newEval);
}
CODE_PROBE(true, "Granule merge candidate");
if (BM_DEBUG) {
fmt::print("Manager {0} merge candidate granule [{1} - {2}) {3}\n",
bmData->epoch,
rep.granuleRange.begin.printable().c_str(),
rep.granuleRange.end.printable().c_str(),
newEval.toString());
}
bmData->boundaryEvaluations.insert(rep.granuleRange, newEval);
bmData->setMergeCandidate(rep.granuleRange, rep.granuleID, rep.startVersion);
}
}
} catch (Error& e) {
@ -2580,7 +2526,7 @@ ACTOR Future<Void> resumeActiveMerges(Reference<BlobManagerData> bmData) {
UID mergeGranuleID = decodeBlobGranuleMergeKey(it.key);
KeyRange mergeRange;
std::vector<UID> parentGranuleIDs;
std::vector<KeyRange> parentGranuleRanges;
std::vector<Key> parentGranuleRanges;
std::vector<Version> parentGranuleStartVersions;
Version mergeVersion;
std::tie(mergeRange, mergeVersion, parentGranuleIDs, parentGranuleRanges, parentGranuleStartVersions) =
@ -2598,15 +2544,16 @@ ACTOR Future<Void> resumeActiveMerges(Reference<BlobManagerData> bmData) {
// report updated status. Start with early (epoch, seqno) to guarantee lower than later status
BoundaryEvaluation eval(1, 0, BoundaryEvalType::MERGE, 1, 0);
ASSERT(!bmData->isMergeActive(mergeRange));
eval.inProgress = finishMergeGranules(bmData,
mergeGranuleID,
mergeRange,
mergeVersion,
parentGranuleIDs,
parentGranuleRanges,
parentGranuleStartVersions);
bmData->addActor.send(finishMergeGranules(bmData,
mergeGranuleID,
mergeRange,
mergeVersion,
parentGranuleIDs,
parentGranuleRanges,
parentGranuleStartVersions));
bmData->boundaryEvaluations.insert(mergeRange, eval);
bmData->activeGranuleMerges.insert(mergeRange, mergeVersion);
bmData->clearMergeCandidate(mergeRange, MergeCandidateMerging);
}
if (result.more) {
@ -3564,27 +3511,30 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
}
// add all of the node's parents to the queue
for (auto& parent : currHistoryNode.parentGranules) {
for (int i = 0; i < currHistoryNode.parentVersions.size(); i++) {
// for (auto& parent : currHistoryNode.parentVersions.size()) {
// if we already added this node to queue, skip it; otherwise, mark it as visited
if (visited.count({ parent.first.begin.begin(), parent.second })) {
KeyRangeRef parentRange(currHistoryNode.parentBoundaries[i], currHistoryNode.parentBoundaries[i + 1]);
Version parentVersion = currHistoryNode.parentVersions[i];
if (visited.count({ parentRange.begin.begin(), parentVersion })) {
if (BM_DEBUG) {
fmt::print("Already added {0} to queue, so skipping it\n", currHistoryNode.granuleID.toString());
}
continue;
}
visited.insert({ parent.first.begin.begin(), parent.second });
visited.insert({ parentRange.begin.begin(), parentVersion });
if (BM_DEBUG) {
fmt::print("Adding parent [{0} - {1}) with versions [{2} - {3}) to queue\n",
parent.first.begin.printable(),
parent.first.end.printable(),
parent.second,
parentRange.begin.printable(),
parentRange.end.printable(),
parentVersion,
startVersion);
}
			// the parent's end version is this node's startVersion,
			// since this node must have started where its parent finished
historyEntryQueue.push({ parent.first, parent.second, startVersion });
historyEntryQueue.push({ parentRange, parentVersion, startVersion });
}
}
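The history format change replaces (range, version) parent pairs with N+1 parentBoundaries plus N parentVersions, so the traversal reconstructs each parent range as [boundaries[i], boundaries[i+1]). A minimal sketch of that walk with the visited-set dedup, using simplified stand-in types:

#include <queue>
#include <set>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

struct HistoryNode {
	std::vector<std::string> parentBoundaries; // N+1 boundaries for N parents
	std::vector<long> parentVersions;          // start version of each parent
};

using WorkItem = std::tuple<std::string, std::string, long, long>; // begin, end, version, endVersion

void enqueueParents(const HistoryNode& node,
                    long startVersion,
                    std::set<std::pair<std::string, long>>& visited,
                    std::queue<WorkItem>& work) {
	for (size_t i = 0; i < node.parentVersions.size(); i++) {
		const std::string& begin = node.parentBoundaries[i];
		const std::string& end = node.parentBoundaries[i + 1];
		long parentVersion = node.parentVersions[i];
		if (!visited.insert({ begin, parentVersion }).second)
			continue; // already queued this parent
		// the parent's end version is this node's start version
		work.emplace(begin, end, parentVersion, startVersion);
	}
}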
@ -3984,6 +3934,9 @@ ACTOR Future<Void> blobManager(BlobManagerInterface bmInterf,
if (SERVER_KNOBS->BG_CONSISTENCY_CHECK_ENABLED) {
self->addActor.send(bgConsistencyCheck(self));
}
if (SERVER_KNOBS->BG_ENABLE_MERGING) {
self->addActor.send(granuleMergeChecker(self));
}
if (BUGGIFY) {
self->addActor.send(chaosRangeMover(self));

View File

@ -1188,13 +1188,13 @@ ACTOR Future<Void> granuleCheckMergeCandidate(Reference<BlobWorkerData> bwData,
}
// wait for the last snapshot to finish, so that the delay is from the last snapshot
wait(waitStart);
wait(delayJittered(SERVER_KNOBS->BG_MERGE_CANDIDATE_THRESHOLD_SECONDS));
double jitter = deterministicRandom()->random01() * 0.8 * SERVER_KNOBS->BG_MERGE_CANDIDATE_DELAY_SECONDS;
wait(delay(SERVER_KNOBS->BG_MERGE_CANDIDATE_THRESHOLD_SECONDS + jitter));
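The explicit jitter term spreads re-candidacy over an extra 0-80% of BG_MERGE_CANDIDATE_DELAY_SECONDS, so neighboring granules do not all report back in the same checker pass. The computation, isolated as a sketch:

#include <random>

// Delay before a quiescent granule re-announces itself as a merge candidate.
double mergeCandidateDelay(double thresholdSeconds, double checkerIntervalSeconds, std::mt19937& rng) {
	std::uniform_real_distribution<double> u(0.0, 1.0);
	double jitter = u(rng) * 0.8 * checkerIntervalSeconds; // 0% to 80% of the checker interval
	return thresholdSeconds + jitter;
}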
loop {
// this actor will be cancelled if a split check happened, or if the granule was moved away, so this
// being here means that granule is cold enough during that period. Now we just need to check if it is
// also small enough to be a merge candidate.
StorageMetrics currentMetrics = wait(bwData->db->getStorageMetrics(metadata->keyRange, CLIENT_KNOBS->TOO_MANY));
state int64_t granuleBytes = currentMetrics.bytes;
// FIXME: maybe separate knob and/or value for write rate?
if (currentMetrics.bytes >= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES / 2 ||
@ -1241,11 +1241,9 @@ ACTOR Future<Void> granuleCheckMergeCandidate(Reference<BlobWorkerData> bwData,
metadata->originalEpoch,
metadata->originalSeqno));
// if a new manager appears, also tell it about this granule being mergeable
state int64_t lastSendEpoch = bwData->currentManagerEpoch;
while (lastSendEpoch == bwData->currentManagerEpoch) {
wait(bwData->currentManagerStatusStream.onChange());
wait(delay(0));
}
			// or, if the existing manager opens a new stream, re-send, since it may have missed the message due to a
			// network issue
wait(bwData->currentManagerStatusStream.onChange());
wait(delay(0));
CODE_PROBE(true, "Blob worker re-sending merge candidate to new manager");
} catch (Error& e) {
if (e.code() == error_code_operation_cancelled) {
@ -1926,6 +1924,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
.detail("RollbackVersion", rollbackVersion);
}
Version oldPendingSnapshot = metadata->pendingSnapshotVersion;
Version cfRollbackVersion = doGranuleRollback(metadata,
deltas.version,
rollbackVersion,
@ -1933,6 +1932,23 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
rollbacksInProgress,
rollbacksCompleted);
if (oldPendingSnapshot > metadata->pendingSnapshotVersion) {
// If rollback cancelled in-flight snapshot, merge candidate checker also got
// cancelled. Restart it
CODE_PROBE(true,
"Restarting merge candidate checker after rolling back snapshot");
checkMergeCandidate = granuleCheckMergeCandidate(
bwData,
metadata,
startState.granuleID,
inFlightFiles.empty() ? Future<Void>(Void())
: success(inFlightFiles.back().future));
}
// reset force flush state, requests should retry and add it back once feed is ready
forceFlushVersions.clear();
lastForceFlushVersion = 0;
metadata->forceFlushVersion = NotifiedVersion();
Reference<ChangeFeedData> cfData = makeReference<ChangeFeedData>();
if (!readOldChangeFeed && cfRollbackVersion < startState.changeFeedStartVersion) {
@ -2352,7 +2368,7 @@ ACTOR Future<Void> blobGranuleLoadHistory(Reference<BlobWorkerData> bwData,
GranuleStartState startState = wait(assignFuture);
state Optional<GranuleHistory> activeHistory = startState.history;
if (activeHistory.present() && activeHistory.get().value.parentGranules.size() > 0) {
if (activeHistory.present() && activeHistory.get().value.parentVersions.size() > 0) {
state int64_t loadId = nextHistoryLoadId++;
if (BW_HISTORY_DEBUG) {
fmt::print("HL {0} {1}) Loading history data for [{2} - {3})\n",
@ -2368,7 +2384,7 @@ ACTOR Future<Void> blobGranuleLoadHistory(Reference<BlobWorkerData> bwData,
std::priority_queue<OrderedHistoryKey, std::vector<OrderedHistoryKey>, std::greater<OrderedHistoryKey>>
rootGranules;
state Transaction tr(bwData->db);
if (!activeHistory.get().value.parentGranules.empty()) {
if (!activeHistory.get().value.parentVersions.empty()) {
if (BW_HISTORY_DEBUG) {
fmt::print("HL {0} {1}) Starting history [{2} - {3}) @ {4}\n",
bwData->id.shortString().substr(0, 5),
@ -2437,17 +2453,16 @@ ACTOR Future<Void> blobGranuleLoadHistory(Reference<BlobWorkerData> bwData,
state bool noParentsPresent = true;
// FIXME: parallelize this for all parents/all entries in queue?
loop {
if (pIdx >= curHistory.value.parentGranules.size()) {
if (pIdx >= curHistory.value.parentVersions.size()) {
break;
}
try {
Optional<Value> v =
wait(tr.get(blobGranuleHistoryKeyFor(curHistory.value.parentGranules[pIdx].first,
curHistory.value.parentGranules[pIdx].second)));
state KeyRangeRef parentRange(curHistory.value.parentBoundaries[pIdx],
curHistory.value.parentBoundaries[pIdx + 1]);
state Version parentVersion = curHistory.value.parentVersions[pIdx];
Optional<Value> v = wait(tr.get(blobGranuleHistoryKeyFor(parentRange, parentVersion)));
if (v.present()) {
next = GranuleHistory(curHistory.value.parentGranules[pIdx].first,
curHistory.value.parentGranules[pIdx].second,
decodeBlobGranuleHistoryValue(v.get()));
next = GranuleHistory(parentRange, parentVersion, decodeBlobGranuleHistoryValue(v.get()));
ASSERT(next.version != invalidVersion);
auto inserted = forwardHistory.insert({ next.value.granuleID, ForwardHistoryValue() });
@ -3410,12 +3425,13 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
// If anything in previousGranules, need to do the handoff logic and set
// ret.previousChangeFeedId, and the previous durable version will come from the previous
// granules
if (info.history.present() && info.history.get().value.parentGranules.size() > 0) {
if (info.history.present() && info.history.get().value.parentVersions.size() > 0) {
CODE_PROBE(true, "Granule open found parent");
if (info.history.get().value.parentGranules.size() == 1) { // split
state Key parentHistoryKey =
blobGranuleHistoryKeyFor(info.history.get().value.parentGranules[0].first,
info.history.get().value.parentGranules[0].second);
if (info.history.get().value.parentVersions.size() == 1) { // split
state KeyRangeRef parentRange(info.history.get().value.parentBoundaries[0],
info.history.get().value.parentBoundaries[1]);
state Version parentVersion = info.history.get().value.parentVersions[0];
state Key parentHistoryKey = blobGranuleHistoryKeyFor(parentRange, parentVersion);
Optional<Value> historyParentValue = wait(tr.get(parentHistoryKey));
@ -3424,8 +3440,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
decodeBlobGranuleHistoryValue(historyParentValue.get());
UID parentGranuleID = val.granuleID;
info.splitParentGranule =
std::pair(info.history.get().value.parentGranules[0].first, parentGranuleID);
info.splitParentGranule = std::pair(parentRange, parentGranuleID);
state std::pair<BlobGranuleSplitState, Version> granuleSplitState =
std::pair(BlobGranuleSplitState::Initialized, invalidVersion);
@ -3479,8 +3494,12 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
// Can't roll back past re-snapshot version
info.changeFeedStartVersion = info.history.get().version;
for (auto& it : info.history.get().value.parentGranules) {
parentGranulesToSnapshot.push_back(loadParentGranuleForMergeSnapshot(&tr, it.first, it.second));
for (int i = 0; i < info.history.get().value.parentVersions.size(); i++) {
KeyRangeRef parentRange(info.history.get().value.parentBoundaries[i],
info.history.get().value.parentBoundaries[i + 1]);
Version parentVersion = info.history.get().value.parentVersions[i];
parentGranulesToSnapshot.push_back(
loadParentGranuleForMergeSnapshot(&tr, parentRange, parentVersion));
}
state int pIdx;
@ -4062,94 +4081,103 @@ ACTOR Future<Void> handleFlushGranuleReq(Reference<BlobWorkerData> self, FlushGr
}
}
// force granule to flush at this version, and wait
if (req.flushVersion > metadata->pendingDeltaVersion) {
// first, wait for granule active
loop {
// force granule to flush at this version, and wait
if (req.flushVersion > metadata->pendingDeltaVersion) {
// first, wait for granule active
// wait for change feed version to catch up to ensure we have all data
if (metadata->activeCFData.get()->getVersion() < req.flushVersion) {
if (BW_DEBUG) {
fmt::print("BW {0} flushing granule [{1} - {2}) @ {3}: waiting for CF version "
"(currently {4})\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable(),
req.flushVersion,
metadata->activeCFData.get()->getVersion());
}
// wait for change feed version to catch up to ensure we have all data
if (metadata->activeCFData.get()->getVersion() < req.flushVersion) {
if (BW_DEBUG) {
fmt::print("BW {0} flushing granule [{1} - {2}) @ {3}: waiting for CF version "
"(currently {4})\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable(),
req.flushVersion,
metadata->activeCFData.get()->getVersion());
}
loop {
choose {
when(wait(metadata->activeCFData.get().isValid()
? metadata->activeCFData.get()->whenAtLeast(req.flushVersion)
: Never())) {
break;
}
when(wait(metadata->activeCFData.onChange())) {}
when(wait(granuleCancelled.getFuture())) {
if (BW_DEBUG) {
fmt::print("BW {0} flush granule [{1} - {2}) cancelled 2\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable());
loop {
choose {
when(wait(metadata->activeCFData.get().isValid()
? metadata->activeCFData.get()->whenAtLeast(req.flushVersion)
: Never())) {
break;
}
when(wait(metadata->activeCFData.onChange())) {}
when(wait(granuleCancelled.getFuture())) {
if (BW_DEBUG) {
fmt::print("BW {0} flush granule [{1} - {2}) cancelled 2\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable());
}
req.reply.sendError(wrong_shard_server());
return Void();
}
req.reply.sendError(wrong_shard_server());
return Void();
}
}
ASSERT(metadata->activeCFData.get()->getVersion() >= req.flushVersion);
if (BW_DEBUG) {
fmt::print("BW {0} flushing granule [{1} - {2}) @ {3}: got CF version\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable(),
req.flushVersion);
}
}
ASSERT(metadata->activeCFData.get()->getVersion() >= req.flushVersion);
if (BW_DEBUG) {
fmt::print("BW {0} flushing granule [{1} - {2}) @ {3}: got CF version\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable(),
req.flushVersion);
if (req.flushVersion > metadata->pendingDeltaVersion) {
if (BW_DEBUG) {
fmt::print("BW {0} flushing granule [{1} - {2}) @ {3}: setting force flush version\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable(),
req.flushVersion);
}
// if after waiting for CF version, flushVersion still higher than pendingDeltaVersion,
// set forceFlushVersion
metadata->forceFlushVersion.set(req.flushVersion);
}
}
if (req.flushVersion > metadata->pendingDeltaVersion) {
if (BW_DEBUG) {
fmt::print("BW {0} flushing granule [{1} - {2}) @ {3}: setting force flush version\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable(),
req.flushVersion);
}
// if after waiting for CF version, flushVersion still higher than pendingDeltaVersion,
// set forceFlushVersion
metadata->forceFlushVersion.set(req.flushVersion);
if (BW_DEBUG) {
fmt::print("BW {0} flushing granule [{1} - {2}) @ {3}: waiting durable\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable(),
req.flushVersion);
}
}
choose {
when(wait(metadata->durableDeltaVersion.whenAtLeast(req.flushVersion))) {
if (BW_DEBUG) {
fmt::print("BW {0} flushing granule [{1} - {2}) @ {3}: got durable\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable(),
req.flushVersion);
}
if (BW_DEBUG) {
fmt::print("BW {0} flushing granule [{1} - {2}) @ {3}: waiting durable\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable(),
req.flushVersion);
}
choose {
when(wait(metadata->durableDeltaVersion.whenAtLeast(req.flushVersion))) {}
when(wait(granuleCancelled.getFuture())) {
						if (BW_DEBUG) {
							fmt::print("BW {0} flush granule [{1} - {2}) cancelled 3\n",
							           self->id.toString().substr(0, 5),
							           req.granuleRange.begin.printable(),
							           req.granuleRange.end.printable());
						}
						req.reply.send(Void());
						return Void();
					}
when(wait(metadata->activeCFData.onChange())) {
// if a rollback happens, need to restart flush process
}
when(wait(granuleCancelled.getFuture())) {
if (BW_DEBUG) {
fmt::print("BW {0} flush granule [{1} - {2}) cancelled 3\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable());
}
req.reply.sendError(wrong_shard_server());
return Void();
}
req.reply.sendError(wrong_shard_server());
return Void();
}
}
if (BW_DEBUG) {
fmt::print("BW {0} flushing granule [{1} - {2}) @ {3}: got durable\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable(),
req.flushVersion);
}
} catch (Error& e) {
if (BW_DEBUG) {
fmt::print("BW {0} flushing granule [{1} - {2}) @ {3}: got unexpected error {4}\n",
@ -4169,10 +4197,10 @@ ACTOR Future<Void> handleFlushGranuleReq(Reference<BlobWorkerData> self, FlushGr
req.granuleRange.end.printable(),
req.flushVersion);
}
}
req.reply.send(Void());
return Void();
req.reply.send(Void());
return Void();
}
}
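Structurally, this change wraps the whole flush sequence in a retry loop: a change-feed replacement (for example after a rollback) falls through and re-runs the catch-up wait and force-flush from the top, and only a successful durable wait leaves the loop and replies. The control flow, schematically, with trivial stand-ins for the actor's wait primitives (not real FDB calls):

enum class WaitResult { Durable, FeedChanged, Cancelled };

// Trivial stand-ins for the actor's wait primitives.
static bool feedCaughtUp(long) { return true; }
static void setForceFlush(long) {}
static WaitResult waitDurableOrChange() { return WaitResult::Durable; }

bool flushGranule(long flushVersion) {
	for (;;) { // restart from the top whenever the change feed is replaced
		while (!feedCaughtUp(flushVersion)) {
			// wait for the change feed to catch up to flushVersion
		}
		setForceFlush(flushVersion); // only needed if still ahead of pendingDeltaVersion
		WaitResult r = waitDurableOrChange();
		if (r == WaitResult::Durable)
			return true; // reply success
		if (r == WaitResult::Cancelled)
			return false; // reply wrong_shard_server
		// FeedChanged: a rollback happened, redo the flush sequence
	}
}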
ACTOR Future<Void> blobWorker(BlobWorkerInterface bwInterf,

View File

@ -23,13 +23,6 @@ file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/workloads)
add_flow_target(EXECUTABLE NAME fdbserver SRCS ${FDBSERVER_SRCS})
find_package(ZLIB)
if(ZLIB_FOUND)
add_compile_definitions(ZLIB_LIB_SUPPORTED)
else()
message(STATUS "ZLIB package not found")
endif()
target_include_directories(fdbserver PRIVATE
${CMAKE_SOURCE_DIR}/bindings/c
${CMAKE_BINARY_DIR}/bindings/c

View File

@ -1639,6 +1639,10 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self,
healthyDestinations.addDataInFlightToTeam(-metrics.bytes);
auto readLoad = metrics.bytesReadPerKSecond;
			// Note: It's equivalent to trigger([healthyDestinations, readLoad], ...), which would value-capture
			// healthyDestinations. We have to create a reference to healthyDestinations because in an ACTOR the
			// state variable is actually a member variable, so trigger([healthyDestinations, readLoad], ...) can't
			// be written directly.
auto& destinationRef = healthyDestinations;
self->noErrorActors.add(
trigger([destinationRef, readLoad]() mutable { destinationRef.addReadInFlightToTeam(-readLoad); },
@ -1778,10 +1782,6 @@ ACTOR Future<bool> rebalanceReadLoad(DDQueueData* self,
state Future<HealthMetrics> healthMetrics = self->cx->getHealthMetrics(true);
state GetTopKMetricsRequest req(
shards, topK, (srcLoad - destLoad) * SERVER_KNOBS->READ_REBALANCE_MAX_SHARD_FRAC, srcLoad / shards.size());
req.comparator = [](const StorageMetrics& a, const StorageMetrics& b) {
return a.bytesReadPerKSecond / std::max(a.bytes * 1.0, 1.0) >
b.bytesReadPerKSecond / std::max(b.bytes * 1.0, 1.0);
};
state GetTopKMetricsReply reply = wait(brokenPromiseToNever(self->getTopKMetrics.getReply(req)));
wait(ready(healthMetrics));
auto cpu = getWorstCpu(healthMetrics.get(), sourceTeam->getServerIDs());
@ -1790,31 +1790,24 @@ ACTOR Future<bool> rebalanceReadLoad(DDQueueData* self,
return false;
}
auto& metricsList = reply.metrics;
auto& metricsList = reply.shardMetrics;
	// NOTE: randomization is important here since we don't want to always push the same shard into the queue
deterministicRandom()->randomShuffle(metricsList);
traceEvent->detail("MinReadLoad", reply.minReadLoad).detail("MaxReadLoad", reply.maxReadLoad);
int chosenIdx = -1;
for (int i = 0; i < metricsList.size(); ++i) {
if (metricsList[i].keys.present()) {
chosenIdx = i;
break;
}
}
if (chosenIdx == -1) {
if (metricsList.empty()) {
traceEvent->detail("SkipReason", "NoEligibleShards");
return false;
}
auto& metrics = metricsList[chosenIdx];
auto& [shard, metrics] = metricsList[0];
traceEvent->detail("ShardReadBandwidth", metrics.bytesReadPerKSecond);
// Verify the shard is still in ShardsAffectedByTeamFailure
shards = self->shardsAffectedByTeamFailure->getShardsFor(
ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary));
for (int i = 0; i < shards.size(); i++) {
if (metrics.keys == shards[i]) {
self->output.send(RelocateShard(metrics.keys.get(), priority, RelocateReason::REBALANCE_READ));
if (shard == shards[i]) {
self->output.send(RelocateShard(shard, priority, RelocateReason::REBALANCE_READ));
self->updateLastAsSource(sourceTeam->getServerIDs());
return true;
}

View File

@ -831,9 +831,8 @@ ACTOR Future<Void> trackInitialShards(DataDistributionTracker* self, Reference<I
}
ACTOR Future<Void> fetchTopKShardMetrics_impl(DataDistributionTracker* self, GetTopKMetricsRequest req) {
ASSERT(req.comparator);
state Future<Void> onChange;
state std::vector<StorageMetrics> returnMetrics;
state std::vector<GetTopKMetricsReply::KeyRangeStorageMetrics> returnMetrics;
// random pick a portion of shard
if (req.keys.size() > SERVER_KNOBS->DD_SHARD_COMPARE_LIMIT) {
deterministicRandom()->randomShuffle(req.keys, SERVER_KNOBS->DD_SHARD_COMPARE_LIMIT);
@ -867,8 +866,7 @@ ACTOR Future<Void> fetchTopKShardMetrics_impl(DataDistributionTracker* self, Get
maxReadLoad = std::max(metrics.bytesReadPerKSecond, maxReadLoad);
if (req.minBytesReadPerKSecond <= metrics.bytesReadPerKSecond &&
metrics.bytesReadPerKSecond <= req.maxBytesReadPerKSecond) {
metrics.keys = range;
returnMetrics.push_back(metrics);
returnMetrics.emplace_back(range, metrics);
}
}
@ -882,11 +880,11 @@ ACTOR Future<Void> fetchTopKShardMetrics_impl(DataDistributionTracker* self, Get
std::nth_element(returnMetrics.begin(),
returnMetrics.begin() + req.topK - 1,
returnMetrics.end(),
req.comparator);
req.reply.send(GetTopKMetricsReply(
std::vector<StorageMetrics>(returnMetrics.begin(), returnMetrics.begin() + req.topK),
minReadLoad,
maxReadLoad));
GetTopKMetricsRequest::compare);
req.reply.send(GetTopKMetricsReply(std::vector<GetTopKMetricsReply::KeyRangeStorageMetrics>(
returnMetrics.begin(), returnMetrics.begin() + req.topK),
minReadLoad,
maxReadLoad));
}
return Void();
}
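fetchTopKShardMetrics_impl leans on std::nth_element for the top-K selection: after the call, the first topK entries are the largest under the comparator in O(n), though unsorted among themselves. A self-contained illustration of that selection step:

#include <algorithm>
#include <cassert>
#include <functional>
#include <vector>

int main() {
	std::vector<int> load = { 7, 42, 3, 19, 25 };
	const size_t topK = 2;
	// After nth_element the first topK entries are the largest under the
	// comparator, in unspecified relative order; everything after is smaller.
	std::nth_element(load.begin(), load.begin() + topK - 1, load.end(), std::greater<int>());
	assert(load[0] + load[1] == 42 + 25);
	return 0;
}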

View File

@ -688,7 +688,7 @@ public:
return status;
}
metadataShard->readIterPool->update();
TraceEvent(SevVerbose, "InitializeMetaDataShard", this->logId)
TraceEvent(SevInfo, "InitializeMetaDataShard", this->logId)
.detail("MetadataShardCF", metadataShard->cf->GetID());
}
physicalShards["kvs-metadata"] = metadataShard;
@ -2063,7 +2063,9 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
? true
: false) {
for (const DataShard* shard : shards) {
shardRanges.emplace_back(shard->physicalShard, keys & shard->range);
if (shard != nullptr) {
shardRanges.emplace_back(shard->physicalShard, keys & shard->range);
}
}
}
double getTimeEstimate() const override { return SERVER_KNOBS->READ_RANGE_TIME_ESTIMATE; }
@ -2105,7 +2107,12 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
int accumulatedBytes = 0;
int numShards = 0;
for (auto& [shard, range] : a.shardRanges) {
ASSERT(shard != nullptr && shard->initialized());
if (shard == nullptr || !shard->initialized()) {
TraceEvent(SevWarn, "ShardedRocksReadRangeShardNotReady", logId)
.detail("Range", range)
.detail("Reason", shard == nullptr ? "Not Exist" : "Not Initialized");
continue;
}
auto bytesRead = readRangeInDb(shard, range, rowLimit, byteLimit, &result);
if (bytesRead < 0) {
// Error reading an instance.
@ -2293,7 +2300,9 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
auto* shard = shardManager.getDataShard(key);
if (shard == nullptr || !shard->physicalShard->initialized()) {
		// TODO: reading a non-existent system key range should not cause an error.
TraceEvent(SevError, "ShardedRocksDB").detail("Detail", "Read non-exist key range").detail("ReadKey", key);
TraceEvent(SevWarnAlways, "ShardedRocksDB")
.detail("Detail", "Read non-exist key range")
.detail("ReadKey", key);
return Optional<Value>();
}
@ -2367,12 +2376,6 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
TraceEvent(SevVerbose, "ShardedRocksReadRangeBegin", this->id).detail("Range", keys);
auto shards = shardManager.getDataShardsByRange(keys);
for (DataShard* shard : shards) {
if (shard == nullptr || !shard->physicalShard->initialized()) {
return RangeResult();
}
}
if (!shouldThrottle(type, keys.begin)) {
auto a = new Reader::ReadRangeAction(keys, shards, rowLimit, byteLimit);
auto res = a->result.getFuture();
@ -2511,9 +2514,10 @@ TEST_CASE("noSim/ShardedRocksDB/RangeOps") {
addRangeFutures.push_back(kvStore->addRange(KeyRangeRef("0"_sr, "3"_sr), "shard-1"));
addRangeFutures.push_back(kvStore->addRange(KeyRangeRef("4"_sr, "7"_sr), "shard-2"));
kvStore->persistRangeMapping(KeyRangeRef("0"_sr, "7"_sr), true);
wait(waitForAll(addRangeFutures));
kvStore->persistRangeMapping(KeyRangeRef("0"_sr, "7"_sr), true);
// write to shard 1
state RangeResult expectedRows;
for (int i = 0; i < 30; ++i) {
@ -2701,6 +2705,7 @@ TEST_CASE("noSim/ShardedRocksDB/ShardOps") {
mapping.push_back(std::make_pair(KeyRange(KeyRangeRef("m"_sr, "n"_sr)), "shard-3"));
mapping.push_back(std::make_pair(KeyRange(KeyRangeRef("u"_sr, "v"_sr)), "shard-3"));
mapping.push_back(std::make_pair(KeyRange(KeyRangeRef("x"_sr, "z"_sr)), "shard-1"));
mapping.push_back(std::make_pair(specialKeys, "default"));
for (auto it = dataMap.begin(); it != dataMap.end(); ++it) {
std::cout << "Begin " << it->first.begin.toString() << ", End " << it->first.end.toString() << ", id "
@ -2738,7 +2743,7 @@ TEST_CASE("noSim/ShardedRocksDB/ShardOps") {
wait(kvStore->cleanUpShardsIfNeeded(shardsToCleanUp));
auto dataMap = rocksdbStore->getDataMapping();
ASSERT_EQ(dataMap.size(), 1);
ASSERT_EQ(dataMap.size(), 2);
ASSERT(dataMap[0].second == "shard-2");
Future<Void> closed = kvStore->onClosed();

View File

@ -40,13 +40,7 @@ struct MetricsRule {
int minLevel;
Tuple pack() const {
return Tuple()
.append(namePattern)
.append(typePattern)
.append(addressPattern)
.append(idPattern)
.append(enabled ? 1 : 0)
.append(minLevel);
return Tuple::makeTuple(namePattern, typePattern, addressPattern, idPattern, enabled ? 1 : 0, minLevel);
}
static inline MetricsRule unpack(Tuple const& t) {

View File

@ -458,24 +458,6 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
specialCounter(cc, "Version", [this]() { return this->version.get(); });
specialCounter(cc, "SharedBytesInput", [tLogData]() { return tLogData->bytesInput; });
specialCounter(cc, "SharedBytesDurable", [tLogData]() { return tLogData->bytesDurable; });
specialCounter(
cc, "KvstoreBytesUsed", [tLogData]() { return tLogData->persistentData->getStorageBytes().used; });
specialCounter(
cc, "KvstoreBytesFree", [tLogData]() { return tLogData->persistentData->getStorageBytes().free; });
specialCounter(cc, "KvstoreBytesAvailable", [tLogData]() {
return tLogData->persistentData->getStorageBytes().available;
});
specialCounter(
cc, "KvstoreBytesTotal", [tLogData]() { return tLogData->persistentData->getStorageBytes().total; });
specialCounter(
cc, "QueueDiskBytesUsed", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().used; });
specialCounter(
cc, "QueueDiskBytesFree", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().free; });
specialCounter(cc, "QueueDiskBytesAvailable", [tLogData]() {
return tLogData->rawPersistentQueue->getStorageBytes().available;
});
specialCounter(
cc, "QueueDiskBytesTotal", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().total; });
specialCounter(cc, "ActivePeekStreams", [tLogData]() { return tLogData->activePeekStreams; });
}
@ -1424,7 +1406,23 @@ ACTOR Future<Void> tLogCore(TLogData* self, Reference<LogData> logData) {
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&logData->cc,
logData->logId.toString() + "/TLogMetrics"));
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
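The counter rework replaces ten per-field specialCounters with one decorator callback that samples StorageBytes when the trace event is built, so each event carries a consistent snapshot instead of re-querying per field. The shape of that API, sketched with invented generic types:

#include <functional>
#include <iostream>
#include <string>

struct Event {
	void detail(const std::string& k, long v) { std::cout << k << "=" << v << " "; }
};

struct StorageBytesSnapshot {
	long used, free, available, total, temp;
};

// Periodic logger that lets the caller attach extra details to each event.
void logEvent(const std::function<void(Event&)>& decorate) {
	Event te;
	decorate(te); // called once per log interval, like the lambda passed to traceCounters
	std::cout << "\n";
}

int main() {
	logEvent([](Event& te) {
		StorageBytesSnapshot sb{ 100, 50, 40, 200, 10 }; // one snapshot per event
		te.detail("KvstoreBytesUsed", sb.used);
		te.detail("KvstoreBytesFree", sb.free);
		te.detail("KvstoreBytesTemp", sb.temp);
	});
	return 0;
}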
logData->addActor.send(serveTLogInterface(self, logData->tli, logData, warningCollectorInput));
try {

View File

@ -554,24 +554,6 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
specialCounter(cc, "SharedBytesDurable", [tLogData]() { return tLogData->bytesDurable; });
specialCounter(cc, "SharedOverheadBytesInput", [tLogData]() { return tLogData->overheadBytesInput; });
specialCounter(cc, "SharedOverheadBytesDurable", [tLogData]() { return tLogData->overheadBytesDurable; });
specialCounter(
cc, "KvstoreBytesUsed", [tLogData]() { return tLogData->persistentData->getStorageBytes().used; });
specialCounter(
cc, "KvstoreBytesFree", [tLogData]() { return tLogData->persistentData->getStorageBytes().free; });
specialCounter(cc, "KvstoreBytesAvailable", [tLogData]() {
return tLogData->persistentData->getStorageBytes().available;
});
specialCounter(
cc, "KvstoreBytesTotal", [tLogData]() { return tLogData->persistentData->getStorageBytes().total; });
specialCounter(
cc, "QueueDiskBytesUsed", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().used; });
specialCounter(
cc, "QueueDiskBytesFree", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().free; });
specialCounter(cc, "QueueDiskBytesAvailable", [tLogData]() {
return tLogData->rawPersistentQueue->getStorageBytes().available;
});
specialCounter(
cc, "QueueDiskBytesTotal", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().total; });
specialCounter(cc, "ActivePeekStreams", [tLogData]() { return tLogData->activePeekStreams; });
}
@ -2241,7 +2223,23 @@ ACTOR Future<Void> tLogCore(TLogData* self,
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&logData->cc,
logData->logId.toString() + "/TLogMetrics"));
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
logData->addActor.send(serveTLogInterface(self, tli, logData, warningCollectorInput));
logData->addActor.send(cleanupPeekTrackers(logData.getPtr()));
logData->addActor.send(logPeekTrackers(logData.getPtr()));
@ -2770,8 +2768,10 @@ ACTOR Future<Void> tLogStart(TLogData* self, InitializeTLogRequest req, Locality
req.reply.send(recruited);
TraceEvent("TLogReady", logData->logId)
.detail("AllTags", describe(req.allTags))
.detail("Locality", logData->locality);
.detail("Locality", logData->locality)
.setMaxEventLength(11000)
.setMaxFieldLength(10000)
.detail("AllTags", describe(req.allTags));
updater = Void();
wait(tLogCore(self, logData, recruited, pulledRecoveryVersions));

View File

@ -641,24 +641,6 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
specialCounter(cc, "SharedBytesDurable", [tLogData]() { return tLogData->bytesDurable; });
specialCounter(cc, "SharedOverheadBytesInput", [tLogData]() { return tLogData->overheadBytesInput; });
specialCounter(cc, "SharedOverheadBytesDurable", [tLogData]() { return tLogData->overheadBytesDurable; });
specialCounter(
cc, "KvstoreBytesUsed", [tLogData]() { return tLogData->persistentData->getStorageBytes().used; });
specialCounter(
cc, "KvstoreBytesFree", [tLogData]() { return tLogData->persistentData->getStorageBytes().free; });
specialCounter(cc, "KvstoreBytesAvailable", [tLogData]() {
return tLogData->persistentData->getStorageBytes().available;
});
specialCounter(
cc, "KvstoreBytesTotal", [tLogData]() { return tLogData->persistentData->getStorageBytes().total; });
specialCounter(
cc, "QueueDiskBytesUsed", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().used; });
specialCounter(
cc, "QueueDiskBytesFree", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().free; });
specialCounter(cc, "QueueDiskBytesAvailable", [tLogData]() {
return tLogData->rawPersistentQueue->getStorageBytes().available;
});
specialCounter(
cc, "QueueDiskBytesTotal", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().total; });
specialCounter(cc, "PeekMemoryReserved", [tLogData]() { return tLogData->peekMemoryLimiter.activePermits(); });
specialCounter(cc, "PeekMemoryRequestsStalled", [tLogData]() { return tLogData->peekMemoryLimiter.waiters(); });
specialCounter(cc, "ActivePeekStreams", [tLogData]() { return tLogData->activePeekStreams; });
@ -2701,7 +2683,23 @@ ACTOR Future<Void> tLogCore(TLogData* self,
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&logData->cc,
logData->logId.toString() + "/TLogMetrics"));
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
logData->addActor.send(serveTLogInterface(self, tli, logData, warningCollectorInput));
logData->addActor.send(cleanupPeekTrackers(logData.getPtr()));
logData->addActor.send(logPeekTrackers(logData.getPtr()));
@ -3261,8 +3259,10 @@ ACTOR Future<Void> tLogStart(TLogData* self, InitializeTLogRequest req, Locality
req.reply.send(recruited);
TraceEvent("TLogReady", logData->logId)
.detail("AllTags", describe(req.allTags))
.detail("Locality", logData->locality);
.detail("Locality", logData->locality)
.setMaxEventLength(11000)
.setMaxFieldLength(10000)
.detail("AllTags", describe(req.allTags));
updater = Void();
wait(tLogCore(self, logData, recruited, pulledRecoveryVersions));

View File

@ -472,7 +472,6 @@ ACTOR Future<Void> getValueQ(StorageCacheData* data, GetValueRequest req) {
try {
++data->counters.getValueQueries;
++data->counters.allQueries;
//++data->readQueueSizeMetric;
// TODO later
// data->maxQueryQueue = std::max<int>( data->maxQueryQueue, data->counters.allQueries.getValue() -
// data->counters.finishedQueries.getValue());
@ -544,7 +543,6 @@ ACTOR Future<Void> getValueQ(StorageCacheData* data, GetValueRequest req) {
}
++data->counters.finishedQueries;
//--data->readQueueSizeMetric;
// if(data->latencyBandConfig.present()) {
// int maxReadBytes =
// data->latencyBandConfig.get().readConfig.maxReadBytes.orDefault(std::numeric_limits<int>::max());
@ -728,7 +726,6 @@ ACTOR Future<Void> getKeyValues(StorageCacheData* data, GetKeyValuesRequest req)
++data->counters.getRangeQueries;
++data->counters.allQueries;
// printf("\nSCGetKeyValues\n");
//++data->readQueueSizeMetric;
// data->maxQueryQueue = std::max<int>( data->maxQueryQueue, data->counters.allQueries.getValue() -
// data->counters.finishedQueries.getValue());

View File

@ -3502,8 +3502,10 @@ ACTOR Future<Void> tLogStart(TLogData* self, InitializeTLogRequest req, Locality
req.reply.send(recruited);
TraceEvent("TLogReady", logData->logId)
.detail("AllTags", describe(req.allTags))
.detail("Locality", logData->locality);
.detail("Locality", logData->locality)
.setMaxEventLength(11000)
.setMaxFieldLength(10000)
.detail("AllTags", describe(req.allTags));
updater = Void();
wait(tLogCore(self, logData, recruited, pulledRecoveryVersions));

View File

@ -2614,6 +2614,8 @@ ACTOR Future<Void> TagPartitionedLogSystem::newRemoteEpoch(TagPartitionedLogSyst
req.tLogLocalities = localities;
req.tLogPolicy = logSet->tLogPolicy;
req.locality = remoteLocality;
TraceEvent("RemoteTLogRouterReplies", self->dbgid)
.detail("WorkerID", remoteWorkers.logRouters[i % remoteWorkers.logRouters.size()].id());
logRouterInitializationReplies.push_back(transformErrors(
throwErrorOr(
remoteWorkers.logRouters[i % remoteWorkers.logRouters.size()].logRouter.getReplyUnlessFailedFor(
@ -2693,11 +2695,13 @@ ACTOR Future<Void> TagPartitionedLogSystem::newRemoteEpoch(TagPartitionedLogSyst
}
remoteTLogInitializationReplies.reserve(remoteWorkers.remoteTLogs.size());
for (int i = 0; i < remoteWorkers.remoteTLogs.size(); i++)
for (int i = 0; i < remoteWorkers.remoteTLogs.size(); i++) {
TraceEvent("RemoteTLogReplies", self->dbgid).detail("WorkerID", remoteWorkers.remoteTLogs[i].id());
remoteTLogInitializationReplies.push_back(transformErrors(
throwErrorOr(remoteWorkers.remoteTLogs[i].tLog.getReplyUnlessFailedFor(
remoteTLogReqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)),
cluster_recovery_failed()));
}
TraceEvent("RemoteLogRecruitment_InitializingRemoteLogs")
.detail("StartVersion", logSet->startVersion)
@ -2966,11 +2970,13 @@ ACTOR Future<Reference<ILogSystem>> TagPartitionedLogSystem::newEpoch(
}
initializationReplies.reserve(recr.tLogs.size());
for (int i = 0; i < recr.tLogs.size(); i++)
for (int i = 0; i < recr.tLogs.size(); i++) {
TraceEvent("PrimaryTLogReplies", logSystem->getDebugID()).detail("WorkerID", recr.tLogs[i].id());
initializationReplies.push_back(transformErrors(
throwErrorOr(recr.tLogs[i].tLog.getReplyUnlessFailedFor(
reqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)),
cluster_recovery_failed()));
}
state std::vector<Future<Void>> recoveryComplete;
@ -3034,11 +3040,14 @@ ACTOR Future<Reference<ILogSystem>> TagPartitionedLogSystem::newEpoch(
}
satelliteInitializationReplies.reserve(recr.satelliteTLogs.size());
for (int i = 0; i < recr.satelliteTLogs.size(); i++)
for (int i = 0; i < recr.satelliteTLogs.size(); i++) {
TraceEvent("PrimarySatelliteTLogReplies", logSystem->getDebugID())
.detail("WorkerID", recr.satelliteTLogs[i].id());
satelliteInitializationReplies.push_back(transformErrors(
throwErrorOr(recr.satelliteTLogs[i].tLog.getReplyUnlessFailedFor(
sreqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)),
cluster_recovery_failed()));
}
wait(waitForAll(satelliteInitializationReplies) || oldRouterRecruitment);

View File

@ -51,6 +51,8 @@ bool compareFDBAndBlob(RangeResult fdb,
Version v,
bool debug);
ACTOR Future<Void> clearAndAwaitMerge(Database cx, KeyRange range);
#include "flow/unactorcompiler.h"
#endif

View File

@ -228,17 +228,20 @@ struct GetMetricsRequest {
};
struct GetTopKMetricsReply {
std::vector<StorageMetrics> metrics;
struct KeyRangeStorageMetrics {
KeyRange range;
StorageMetrics metrics;
KeyRangeStorageMetrics() = default;
KeyRangeStorageMetrics(const KeyRange& range, const StorageMetrics& s) : range(range), metrics(s) {}
};
std::vector<KeyRangeStorageMetrics> shardMetrics;
double minReadLoad = -1, maxReadLoad = -1;
GetTopKMetricsReply() {}
GetTopKMetricsReply(std::vector<StorageMetrics> const& m, double minReadLoad, double maxReadLoad)
: metrics(m), minReadLoad(minReadLoad), maxReadLoad(maxReadLoad) {}
GetTopKMetricsReply(std::vector<KeyRangeStorageMetrics> const& m, double minReadLoad, double maxReadLoad)
: shardMetrics(m), minReadLoad(minReadLoad), maxReadLoad(maxReadLoad) {}
};
struct GetTopKMetricsRequest {
// whether a > b
typedef std::function<bool(const StorageMetrics& a, const StorageMetrics& b)> MetricsComparator;
int topK = 1; // default only return the top 1 shard based on the comparator
MetricsComparator comparator; // Return true if a.score > b.score, return the largest topK in keys
int topK = 1; // default only return the top 1 shard based on the GetTopKMetricsRequest::compare function
std::vector<KeyRange> keys;
Promise<GetTopKMetricsReply> reply; // topK storage metrics
double maxBytesReadPerKSecond = 0, minBytesReadPerKSecond = 0; // all returned shards won't exceed this read load
@ -250,6 +253,20 @@ struct GetTopKMetricsRequest {
double minBytesReadPerKSecond = 0)
: topK(topK), keys(keys), maxBytesReadPerKSecond(maxBytesReadPerKSecond),
minBytesReadPerKSecond(minBytesReadPerKSecond) {}
// Return true if a.score > b.score, return the largest topK in keys
static bool compare(const GetTopKMetricsReply::KeyRangeStorageMetrics& a,
const GetTopKMetricsReply::KeyRangeStorageMetrics& b) {
return compareByReadDensity(a, b);
}
private:
// larger read density means higher score
static bool compareByReadDensity(const GetTopKMetricsReply::KeyRangeStorageMetrics& a,
const GetTopKMetricsReply::KeyRangeStorageMetrics& b) {
return a.metrics.bytesReadPerKSecond / std::max(a.metrics.bytes * 1.0, 1.0) >
b.metrics.bytesReadPerKSecond / std::max(b.metrics.bytes * 1.0, 1.0);
}
};
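Dividing bytesReadPerKSecond by max(bytes, 1.0) ranks shards by read density rather than raw read volume, so a small hot shard outranks a huge shard with the same total traffic, and the max() guard avoids dividing by zero on empty shards. A quick worked check of the comparator's intent:

#include <algorithm>
#include <cassert>

struct M {
	double bytes, bytesReadPerKSecond;
};

// Same shape as compareByReadDensity: higher read density ranks first.
static bool denser(const M& a, const M& b) {
	return a.bytesReadPerKSecond / std::max(a.bytes, 1.0) >
	       b.bytesReadPerKSecond / std::max(b.bytes, 1.0);
}

int main() {
	M hotShard{ 1e6, 5e5 };  // density 0.5: hot for its size
	M coldShard{ 1e9, 5e5 }; // density 0.0005: same traffic, far colder per byte
	assert(denser(hotShard, coldShard));
	M emptyShard{ 0, 100 }; // max(bytes, 1.0) guard: no division by zero
	assert(denser(emptyShard, coldShard));
	return 0;
}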
struct GetMetricsListRequest {

View File

@ -112,13 +112,7 @@ public:
Tuple pack() const {
// fprintf(stderr, "Filename:%s\n", fileName.c_str());
return Tuple()
.append(version)
.append(StringRef(fileName))
.append(isRange)
.append(fileSize)
.append(blockSize)
.append(endVersion);
return Tuple::makeTuple(version, fileName, (int)isRange, fileSize, blockSize, endVersion);
}
static RestoreFile unpack(Tuple const& t) {
RestoreFile r;
@ -190,17 +184,16 @@ struct RestoreFileFR {
int partitionId = -1; // Partition ID (Log Router Tag ID) for mutation files.
Tuple pack() const {
return Tuple()
.append(version)
.append(StringRef(fileName))
.append(isRange)
.append(fileSize)
.append(blockSize)
.append(endVersion)
.append(beginVersion)
.append(cursor)
.append(fileIndex)
.append(partitionId);
return Tuple::makeTuple(version,
fileName,
(int)isRange,
fileSize,
blockSize,
endVersion,
beginVersion,
cursor,
fileIndex,
partitionId);
}
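
Tuple::makeTuple, used throughout this diff, is a variadic shorthand for a chain of append calls, so the rewritten pack() bodies encode exactly the bytes the old ones did. A hedged sketch of the equivalence (Tuple and ASSERT are the FDB facilities shown elsewhere in this diff; this is not standalone code):

Tuple byAppend = Tuple().append(version).append(StringRef(fileName)).append(isRange);
Tuple byMake = Tuple::makeTuple(version, fileName, (int)isRange);
ASSERT(byAppend.pack() == byMake.pack()); // identical packed representation
// Note the (int)isRange cast: makeTuple deduces element types from its
// arguments, so the bool is widened explicitly, presumably to preserve the
// integer encoding the append chain produced.
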
static RestoreFileFR unpack(Tuple const& t) {
RestoreFileFR r;

View File

@ -839,8 +839,6 @@ public:
AsyncVar<bool> noRecentUpdates;
double lastUpdate;
Int64MetricHandle readQueueSizeMetric;
std::string folder;
// defined only during splitMutations()/addMutation()
@ -951,6 +949,9 @@ public:
LatencySample readLatencySample;
LatencyBands readLatencyBands;
LatencySample mappedRangeSample; // Samples getMappedRange latency
LatencySample mappedRangeRemoteSample; // Samples getMappedRange remote subquery latency
LatencySample mappedRangeLocalSample; // Samples getMappedRange local subquery latency
Counters(StorageServer* self)
: cc("StorageServer", self->thisServerID.toString()), allQueries("QueryQueue", cc),
@ -982,7 +983,19 @@ public:
self->thisServerID,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
readLatencyBands("ReadLatencyBands", self->thisServerID, SERVER_KNOBS->STORAGE_LOGGING_DELAY) {
readLatencyBands("ReadLatencyBands", self->thisServerID, SERVER_KNOBS->STORAGE_LOGGING_DELAY),
mappedRangeSample("GetMappedRangeMetrics",
self->thisServerID,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
mappedRangeRemoteSample("GetMappedRangeRemoteMetrics",
self->thisServerID,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
mappedRangeLocalSample("GetMappedRangeLocalMetrics",
self->thisServerID,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SAMPLE_SIZE) {
specialCounter(cc, "LastTLogVersion", [self]() { return self->lastTLogVersion; });
specialCounter(cc, "Version", [self]() { return self->version.get(); });
specialCounter(cc, "StorageVersion", [self]() { return self->storageVersion(); });
@ -1056,8 +1069,7 @@ public:
primaryLocality(tagLocalityInvalid), knownCommittedVersion(0), versionLag(0), logProtocol(0),
thisServerID(ssi.id()), tssInQuarantine(false), db(db), actors(false),
byteSampleClears(false, LiteralStringRef("\xff\xff\xff")), durableInProgress(Void()), watchBytes(0),
numWatches(0), noRecentUpdates(false), lastUpdate(now()),
readQueueSizeMetric(LiteralStringRef("StorageServer.ReadQueueSize")), updateEagerReads(nullptr),
numWatches(0), noRecentUpdates(false), lastUpdate(now()), updateEagerReads(nullptr),
fetchKeysParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM),
fetchChangeFeedParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM),
fetchKeysBytesBudget(SERVER_KNOBS->STORAGE_FETCH_BYTES), fetchKeysBudgetUsed(false),
@ -1561,7 +1573,6 @@ ACTOR Future<Void> getValueQ(StorageServer* data, GetValueRequest req) {
try {
++data->counters.getValueQueries;
++data->counters.allQueries;
++data->readQueueSizeMetric;
data->maxQueryQueue = std::max<int>(
data->maxQueryQueue, data->counters.allQueries.getValue() - data->counters.finishedQueries.getValue());
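
This hunk (and several like it below) drops the separate readQueueSizeMetric gauge: with monotonic started/finished counters, the instantaneous queue depth is just their difference, which the maxQueryQueue line above already computes. A standalone sketch of that derivation:

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    int64_t allQueries = 0, finishedQueries = 0, maxQueryQueue = 0;
    auto onStart = [&] {
        ++allQueries;
        // depth of the in-flight queue, derived from the two counters
        maxQueryQueue = std::max(maxQueryQueue, allQueries - finishedQueries);
    };
    auto onFinish = [&] { ++finishedQueries; };
    onStart(); onStart(); onFinish(); onStart();
    printf("max queue depth seen: %lld\n", (long long)maxQueryQueue); // prints 2
    return 0;
}
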
@ -1669,7 +1680,6 @@ ACTOR Future<Void> getValueQ(StorageServer* data, GetValueRequest req) {
data->transactionTagCounter.addRequest(req.tags, resultSize);
++data->counters.finishedQueries;
--data->readQueueSizeMetric;
double duration = g_network->timer() - req.requestTime();
data->counters.readLatencySample.addMeasurement(duration);
@ -2950,6 +2960,7 @@ ACTOR Future<GetValueReqAndResultRef> quickGetValue(StorageServer* data,
// To provide span context, tags, debug ID to underlying lookups.
GetMappedKeyValuesRequest* pOriginalReq) {
state GetValueReqAndResultRef getValue;
state double getValueStart = g_network->timer();
getValue.key = key;
if (data->shards[key]->isReadable()) {
@ -2970,6 +2981,8 @@ ACTOR Future<GetValueReqAndResultRef> quickGetValue(StorageServer* data,
if (!reply.error.present()) {
++data->counters.quickGetValueHit;
copyOptionalValue(a, getValue, reply.value);
const double duration = g_network->timer() - getValueStart;
data->counters.mappedRangeLocalSample.addMeasurement(duration);
return getValue;
}
// Otherwise fallback.
@ -2989,6 +3002,8 @@ ACTOR Future<GetValueReqAndResultRef> quickGetValue(StorageServer* data,
// TODO: async in case it needs to read from other servers.
Optional<Value> valueOption = wait(valueFuture);
copyOptionalValue(a, getValue, valueOption);
double duration = g_network->timer() - getValueStart;
data->counters.mappedRangeRemoteSample.addMeasurement(duration);
return getValue;
} else {
throw quick_get_value_miss();
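
The pattern added in quickGetValue (and mirrored in quickGetKeyValues below) is: stamp the start time before the lookup, then credit the elapsed time to the local sample on a local-shard hit or to the remote sample on fallback. A standalone sketch with hypothetical stand-ins for g_network->timer() and LatencySample (the real LatencySample aggregates measurements and logs them periodically):

#include <algorithm>
#include <chrono>
#include <cstdio>

struct SampleSketch { // hypothetical stand-in for LatencySample
    int n = 0;
    double maxSeen = 0;
    void addMeasurement(double d) {
        n++;
        maxSeen = std::max(maxSeen, d);
    }
};

// Stand-in for g_network->timer(): seconds as a double.
static double timerNow() {
    using namespace std::chrono;
    return duration<double>(steady_clock::now().time_since_epoch()).count();
}

int main() {
    SampleSketch localSample, remoteSample;
    double start = timerNow();
    bool servedLocally = true; // in the real actor: did the local shard satisfy the lookup?
    double duration = timerNow() - start;
    (servedLocally ? localSample : remoteSample).addMeasurement(duration);
    printf("local measurements=%d remote measurements=%d\n", localSample.n, remoteSample.n);
    return 0;
}
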
@ -3375,7 +3390,6 @@ ACTOR Future<Void> getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req)
++data->counters.getRangeQueries;
++data->counters.allQueries;
++data->readQueueSizeMetric;
data->maxQueryQueue = std::max<int>(
data->maxQueryQueue, data->counters.allQueries.getValue() - data->counters.finishedQueries.getValue());
@ -3530,7 +3544,6 @@ ACTOR Future<Void> getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req)
data->transactionTagCounter.addRequest(req.tags, resultSize);
++data->counters.finishedQueries;
--data->readQueueSizeMetric;
double duration = g_network->timer() - req.requestTime();
data->counters.readLatencySample.addMeasurement(duration);
@ -3556,6 +3569,7 @@ ACTOR Future<GetRangeReqAndResultRef> quickGetKeyValues(
// To provide span context, tags, debug ID to underlying lookups.
GetMappedKeyValuesRequest* pOriginalReq) {
state GetRangeReqAndResultRef getRange;
state double getValuesStart = g_network->timer();
getRange.begin = firstGreaterOrEqual(KeyRef(*a, prefix));
getRange.end = firstGreaterOrEqual(strinc(prefix, *a));
try {
@ -3586,6 +3600,8 @@ ACTOR Future<GetRangeReqAndResultRef> quickGetKeyValues(
// Convert GetKeyValuesReply to RangeResult.
a->dependsOn(reply.arena);
getRange.result = RangeResultRef(reply.data, reply.more);
const double duration = g_network->timer() - getValuesStart;
data->counters.mappedRangeLocalSample.addMeasurement(duration);
return getRange;
}
// Otherwise fallback.
@ -3605,6 +3621,8 @@ ACTOR Future<GetRangeReqAndResultRef> quickGetKeyValues(
RangeResult rangeResult = wait(rangeResultFuture);
a->dependsOn(rangeResult.arena());
getRange.result = rangeResult;
const double duration = g_network->timer() - getValuesStart;
data->counters.mappedRangeRemoteSample.addMeasurement(duration);
return getRange;
} else {
throw quick_get_key_values_miss();
@ -3676,9 +3694,7 @@ void preprocessMappedKey(Tuple& mappedKeyFormatTuple, std::vector<Optional<Tuple
bool escaped = unescapeLiterals(s, "{{", "{");
escaped = unescapeLiterals(s, "}}", "}") || escaped;
if (escaped) {
Tuple escapedTuple;
escapedTuple.append(s);
vt.emplace_back(escapedTuple);
vt.emplace_back(Tuple::makeTuple(s));
} else if (singleKeyOrValue(s, sz)) {
// when it is SingleKeyOrValue, insert an empty Tuple into the vector as a placeholder
vt.emplace_back(Tuple());
@ -3750,16 +3766,12 @@ Key constructMappedKey(KeyValueRef* keyValue,
}
TEST_CASE("/fdbserver/storageserver/constructMappedKey") {
Key key = Tuple().append("key-0"_sr).append("key-1"_sr).append("key-2"_sr).getDataAsStandalone();
Value value = Tuple().append("value-0"_sr).append("value-1"_sr).append("value-2"_sr).getDataAsStandalone();
Key key = Tuple::makeTuple("key-0"_sr, "key-1"_sr, "key-2"_sr).getDataAsStandalone();
Value value = Tuple::makeTuple("value-0"_sr, "value-1"_sr, "value-2"_sr).getDataAsStandalone();
state KeyValueRef kvr(key, value);
{
Tuple mappedKeyFormatTuple = Tuple()
.append("normal"_sr)
.append("{{escaped}}"_sr)
.append("{K[2]}"_sr)
.append("{V[0]}"_sr)
.append("{...}"_sr);
Tuple mappedKeyFormatTuple =
Tuple::makeTuple("normal"_sr, "{{escaped}}"_sr, "{K[2]}"_sr, "{V[0]}"_sr, "{...}"_sr);
Tuple mappedKeyTuple;
std::vector<Optional<Tuple>> vt;
@ -3768,19 +3780,15 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") {
Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, mappedKeyFormatTuple);
Key expectedMappedKey = Tuple()
.append("normal"_sr)
.append("{escaped}"_sr)
.append("key-2"_sr)
.append("value-0"_sr)
.getDataAsStandalone();
Key expectedMappedKey =
Tuple::makeTuple("normal"_sr, "{escaped}"_sr, "key-2"_sr, "value-0"_sr).getDataAsStandalone();
// std::cout << printable(mappedKey) << " == " << printable(expectedMappedKey) << std::endl;
ASSERT(mappedKey.compare(expectedMappedKey) == 0);
ASSERT(isRangeQuery == true);
}
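
Spelled out, the substitution that the assertions above check: "{K[i]}" and "{V[i]}" pull the i-th element out of the packed key and value tuples, doubled braces unescape to literal braces, and a trailing "{...}" only flags the mapped read as a range query. Restated with the same FDB Tuple calls (a restatement of the test above, not standalone code):

// key   = ("key-0", "key-1", "key-2")        -> "{K[2]}" resolves to "key-2"
// value = ("value-0", "value-1", "value-2")  -> "{V[0]}" resolves to "value-0"
// "{{escaped}}" unescapes to the literal "{escaped}"; "{...}" is dropped from
// the mapped key and merely sets isRangeQuery.
Tuple fmt = Tuple::makeTuple("normal"_sr, "{{escaped}}"_sr, "{K[2]}"_sr, "{V[0]}"_sr, "{...}"_sr);
Tuple resolved = Tuple::makeTuple("normal"_sr, "{escaped}"_sr, "key-2"_sr, "value-0"_sr);
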
{
Tuple mappedKeyFormatTuple = Tuple().append("{{{{}}"_sr).append("}}"_sr);
Tuple mappedKeyFormatTuple = Tuple::makeTuple("{{{{}}"_sr, "}}"_sr);
Tuple mappedKeyTuple;
std::vector<Optional<Tuple>> vt;
@ -3788,13 +3796,13 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") {
preprocessMappedKey(mappedKeyFormatTuple, vt, isRangeQuery);
Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, mappedKeyFormatTuple);
Key expectedMappedKey = Tuple().append("{{}"_sr).append("}"_sr).getDataAsStandalone();
Key expectedMappedKey = Tuple::makeTuple("{{}"_sr, "}"_sr).getDataAsStandalone();
// std::cout << printable(mappedKey) << " == " << printable(expectedMappedKey) << std::endl;
ASSERT(mappedKey.compare(expectedMappedKey) == 0);
ASSERT(isRangeQuery == false);
}
{
Tuple mappedKeyFormatTuple = Tuple().append("{{{{}}"_sr).append("}}"_sr);
Tuple mappedKeyFormatTuple = Tuple::makeTuple("{{{{}}"_sr, "}}"_sr);
Tuple mappedKeyTuple;
std::vector<Optional<Tuple>> vt;
@ -3802,13 +3810,13 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") {
preprocessMappedKey(mappedKeyFormatTuple, vt, isRangeQuery);
Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, mappedKeyFormatTuple);
Key expectedMappedKey = Tuple().append("{{}"_sr).append("}"_sr).getDataAsStandalone();
Key expectedMappedKey = Tuple::makeTuple("{{}"_sr, "}"_sr).getDataAsStandalone();
// std::cout << printable(mappedKey) << " == " << printable(expectedMappedKey) << std::endl;
ASSERT(mappedKey.compare(expectedMappedKey) == 0);
ASSERT(isRangeQuery == false);
}
{
Tuple mappedKeyFormatTuple = Tuple().append("{K[100]}"_sr);
Tuple mappedKeyFormatTuple = Tuple::makeTuple("{K[100]}"_sr);
state bool throwException = false;
try {
Tuple mappedKeyTuple;
@ -3824,7 +3832,7 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") {
ASSERT(throwException);
}
{
Tuple mappedKeyFormatTuple = Tuple().append("{...}"_sr).append("last-element"_sr);
Tuple mappedKeyFormatTuple = Tuple::makeTuple("{...}"_sr, "last-element"_sr);
state bool throwException2 = false;
try {
Tuple mappedKeyTuple;
@ -3840,7 +3848,7 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") {
ASSERT(throwException2);
}
{
Tuple mappedKeyFormatTuple = Tuple().append("{K[not-a-number]}"_sr);
Tuple mappedKeyFormatTuple = Tuple::makeTuple("{K[not-a-number]}"_sr);
state bool throwException3 = false;
try {
Tuple mappedKeyTuple;
@ -4097,7 +4105,6 @@ ACTOR Future<Void> getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe
++data->counters.getMappedRangeQueries;
++data->counters.allQueries;
++data->readQueueSizeMetric;
data->maxQueryQueue = std::max<int>(
data->maxQueryQueue, data->counters.allQueries.getValue() - data->counters.finishedQueries.getValue());
@ -4270,10 +4277,10 @@ ACTOR Future<Void> getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe
data->transactionTagCounter.addRequest(req.tags, resultSize);
++data->counters.finishedQueries;
--data->readQueueSizeMetric;
double duration = g_network->timer() - req.requestTime();
data->counters.readLatencySample.addMeasurement(duration);
data->counters.mappedRangeSample.addMeasurement(duration);
if (data->latencyBandConfig.present()) {
int maxReadBytes =
data->latencyBandConfig.get().readConfig.maxReadBytes.orDefault(std::numeric_limits<int>::max());
@ -4304,7 +4311,6 @@ ACTOR Future<Void> getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe
req.reply.setByteLimit(SERVER_KNOBS->RANGESTREAM_LIMIT_BYTES);
++data->counters.getRangeStreamQueries;
++data->counters.allQueries;
++data->readQueueSizeMetric;
data->maxQueryQueue = std::max<int>(
data->maxQueryQueue, data->counters.allQueries.getValue() - data->counters.finishedQueries.getValue());
@ -4490,7 +4496,6 @@ ACTOR Future<Void> getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe
data->transactionTagCounter.addRequest(req.tags, resultSize);
++data->counters.finishedQueries;
--data->readQueueSizeMetric;
return Void();
}
@ -4505,7 +4510,6 @@ ACTOR Future<Void> getKeyQ(StorageServer* data, GetKeyRequest req) {
++data->counters.getKeyQueries;
++data->counters.allQueries;
++data->readQueueSizeMetric;
data->maxQueryQueue = std::max<int>(
data->maxQueryQueue, data->counters.allQueries.getValue() - data->counters.finishedQueries.getValue());
@ -4577,7 +4581,6 @@ ACTOR Future<Void> getKeyQ(StorageServer* data, GetKeyRequest req) {
data->transactionTagCounter.addRequest(req.tags, resultSize);
++data->counters.finishedQueries;
--data->readQueueSizeMetric;
double duration = g_network->timer() - req.requestTime();
data->counters.readLatencySample.addMeasurement(duration);

View File

@ -149,6 +149,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
// parameters global across all clients
int64_t targetByteRate;
bool doMergeCheckAtEnd;
std::vector<Reference<ThreadData>> directories;
std::vector<Future<Void>> clients;
@ -162,6 +163,9 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
// different parameters within those constraints
int64_t randomness = sharedRandomNumber;
doMergeCheckAtEnd = randomness % 10 == 0;
randomness /= 10;
// randomize between low and high directory count
int64_t targetDirectories = 1 + (randomness % 8);
randomness /= 8;
@ -912,7 +916,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
}
wait(self->checkTenantRanges(self, cx, threadData));
bool initialCheck = result;
state bool initialCheck = result;
result &= threadData->mismatches == 0 && (threadData->timeTravelTooOld == 0);
fmt::print("Blob Granule Workload Directory {0} {1}:\n", threadData->directoryID, result ? "passed" : "failed");
@ -935,6 +939,11 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
// For some reason simulation is still passing when this fails... so assert for now
ASSERT(result);
if (self->clientId == 0 && SERVER_KNOBS->BG_ENABLE_MERGING && self->doMergeCheckAtEnd) {
CODE_PROBE(true, "BGCorrectness clearing database and awaiting merge");
wait(clearAndAwaitMerge(cx, threadData->directoryRange));
}
return result;
}

View File

@ -451,7 +451,8 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
if (BGV_DEBUG && startReadVersion != readVersion) {
fmt::print("Availability check updated read version from {0} to {1}\n", startReadVersion, readVersion);
}
bool result = availabilityPassed && self->mismatches == 0 && (checks > 0) && (self->timeTravelTooOld == 0);
state bool result =
availabilityPassed && self->mismatches == 0 && (checks > 0) && (self->timeTravelTooOld == 0);
fmt::print("Blob Granule Verifier {0} {1}:\n", self->clientId, result ? "passed" : "failed");
fmt::print(" {} successful final granule checks\n", checks);
fmt::print(" {} failed final granule checks\n", availabilityPassed ? 0 : 1);
@ -470,6 +471,11 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
// For some reason simulation is still passing when this fails... so assert for now
ASSERT(result);
if (self->clientId == 0 && SERVER_KNOBS->BG_ENABLE_MERGING && deterministicRandom()->random01() < 0.1) {
CODE_PROBE(true, "BGV clearing database and awaiting merge");
wait(clearAndAwaitMerge(cx, normalKeys));
}
return result;
}

View File

@ -294,8 +294,8 @@ struct ClientTransactionProfileCorrectnessWorkload : TestWorkload {
wait(runRYWTransaction(cx, [=](Reference<ReadYourWritesTransaction> tr) -> Future<Void> {
tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
Tuple rate = Tuple().appendDouble(sampleProbability);
Tuple size = Tuple().append(sizeLimit);
Tuple rate = Tuple::makeTuple(sampleProbability);
Tuple size = Tuple::makeTuple(sizeLimit);
tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSampleRate), rate.pack());
tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSizeLimit), size.pack());
return Void();

View File

@ -36,12 +36,7 @@ class ConfigIncrementWorkload : public TestWorkload {
PerfIntCounter transactions, retries, commitUnknownResult;
static Key getConfigKey() {
Tuple tuple;
tuple.appendNull(); // config class
tuple << testKnobName;
return tuple.pack();
}
static Key getConfigKey() { return Tuple::makeTuple(/* config class */ nullptr, testKnobName).pack(); }
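
One subtlety in the one-liner above: Tuple::makeTuple treats nullptr as a null tuple element, so it reproduces the appendNull() of the deleted version. A hedged sketch of the equivalence, using the FDB Tuple API (not standalone code):

Tuple byAppend;
byAppend.appendNull(); // the null config-class slot
byAppend << testKnobName;
ASSERT(byAppend.pack() == Tuple::makeTuple(nullptr, testKnobName).pack());
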
ACTOR static Future<int> get(Reference<ISingleThreadTransaction> tr) {
TraceEvent(SevDebug, "ConfigIncrementGet");

View File

@ -83,15 +83,11 @@ struct GetMappedRangeWorkload : ApiWorkload {
static Value dataOfRecord(int i) { return Key(format("data-of-record-%08d", i)); }
static Value dataOfRecord(int i, int split) { return Key(format("data-of-record-%08d-split-%08d", i, split)); }
static Key indexEntryKey(int i) {
return Tuple().append(prefix).append(INDEX).append(indexKey(i)).append(primaryKey(i)).pack();
}
static Key recordKey(int i) { return Tuple().append(prefix).append(RECORD).append(primaryKey(i)).pack(); }
static Key recordKey(int i, int split) {
return Tuple().append(prefix).append(RECORD).append(primaryKey(i)).append(split).pack();
}
static Value recordValue(int i) { return Tuple().append(dataOfRecord(i)).pack(); }
static Value recordValue(int i, int split) { return Tuple().append(dataOfRecord(i, split)).pack(); }
static Key indexEntryKey(int i) { return Tuple::makeTuple(prefix, INDEX, indexKey(i), primaryKey(i)).pack(); }
static Key recordKey(int i) { return Tuple::makeTuple(prefix, RECORD, primaryKey(i)).pack(); }
static Key recordKey(int i, int split) { return Tuple::makeTuple(prefix, RECORD, primaryKey(i), split).pack(); }
static Value recordValue(int i) { return Tuple::makeTuple(dataOfRecord(i)).pack(); }
static Value recordValue(int i, int split) { return Tuple::makeTuple(dataOfRecord(i, split)).pack(); }
ACTOR Future<Void> fillInRecords(Database cx, int n, GetMappedRangeWorkload* self) {
state Transaction tr(cx);
@ -270,9 +266,9 @@ struct GetMappedRangeWorkload : ApiWorkload {
GetMappedRangeWorkload* self,
int matchIndex,
bool allMissing = false) {
Key beginTuple = Tuple().append(prefix).append(INDEX).append(indexKey(beginId)).getDataAsStandalone();
Key beginTuple = Tuple::makeTuple(prefix, INDEX, indexKey(beginId)).getDataAsStandalone();
state KeySelector beginSelector = KeySelector(firstGreaterOrEqual(beginTuple));
Key endTuple = Tuple().append(prefix).append(INDEX).append(indexKey(endId)).getDataAsStandalone();
Key endTuple = Tuple::makeTuple(prefix, INDEX, indexKey(endId)).getDataAsStandalone();
state KeySelector endSelector = KeySelector(firstGreaterOrEqual(endTuple));
state int limit = 100;
state int expectedBeginId = beginId;
@ -322,9 +318,9 @@ struct GetMappedRangeWorkload : ApiWorkload {
Reference<TransactionWrapper>& tr,
GetMappedRangeWorkload* self) {
Key mapper = getMapper(self, false);
Key beginTuple = Tuple().append(prefix).append(INDEX).append(indexKey(beginId)).getDataAsStandalone();
Key beginTuple = Tuple::makeTuple(prefix, INDEX, indexKey(beginId)).getDataAsStandalone();
KeySelector beginSelector = KeySelector(firstGreaterOrEqual(beginTuple));
Key endTuple = Tuple().append(prefix).append(INDEX).append(indexKey(endId)).getDataAsStandalone();
Key endTuple = Tuple::makeTuple(prefix, INDEX, indexKey(endId)).getDataAsStandalone();
KeySelector endSelector = KeySelector(firstGreaterOrEqual(endTuple));
return tr->getMappedRange(beginSelector,
endSelector,

View File

@ -192,9 +192,16 @@ struct ReportConflictingKeysWorkload : TestWorkload {
LiteralStringRef("\xff\xff").withPrefix(conflictingKeysRange.begin));
// The getRange here using the special key prefix "\xff\xff/transaction/conflicting_keys/" happens
// locally. Thus, the error handling is not needed here
Future<RangeResult> conflictingKeyRangesFuture = tr2->getRange(ckr, CLIENT_KNOBS->TOO_MANY);
state Future<RangeResult> conflictingKeyRangesFuture = tr2->getRange(ckr, CLIENT_KNOBS->TOO_MANY);
ASSERT(conflictingKeyRangesFuture.isReady());
wait(validateSpecialSubrangeRead(tr2.getPtr(),
firstGreaterOrEqual(ckr.begin),
firstGreaterOrEqual(ckr.end),
GetRangeLimits(),
Reverse::False,
conflictingKeyRangesFuture.get()));
tr2 = makeReference<ReadYourWritesTransaction>(cx);
const RangeResult conflictingKeyRanges = conflictingKeyRangesFuture.get();

View File

@ -359,8 +359,7 @@ TEST_CASE("/fdbclient/TaskBucket/Subspace") {
print_subspace_key(subspace_test1, 1);
ASSERT(subspace_test1.key() == LiteralStringRef("abc"));
Tuple t;
t.append(LiteralStringRef("user"));
Tuple t = Tuple::makeTuple("user"_sr);
Subspace subspace_test2(t);
print_subspace_key(subspace_test2, 2);
ASSERT(subspace_test2.key() == LiteralStringRef("\x01user\x00"));
@ -369,8 +368,7 @@ TEST_CASE("/fdbclient/TaskBucket/Subspace") {
print_subspace_key(subspace_test3, 3);
ASSERT(subspace_test3.key() == LiteralStringRef("abc\x01user\x00"));
Tuple t1;
t1.append(1);
Tuple t1 = Tuple::makeTuple(1);
Subspace subspace_test4(t1);
print_subspace_key(subspace_test4, 4);
ASSERT(subspace_test4.key() == LiteralStringRef("\x15\x01"));
@ -400,8 +398,7 @@ TEST_CASE("/fdbclient/TaskBucket/Subspace") {
ASSERT(subspace_test8.key() == LiteralStringRef("\x01subitem\x00"));
// pack
Tuple t3;
t3.append(StringRef());
Tuple t3 = Tuple::makeTuple(""_sr);
printf("%d==========%s===%d\n", 10, printable(subspace_test5.pack(t3)).c_str(), subspace_test5.pack(t3).size());
ASSERT(subspace_test5.pack(t3) == subspace_test5.pack(StringRef()));
ASSERT(subspace_test5.pack(t3) == LiteralStringRef("abc\x01user\x00\x15\x7b\x01\x00"));

View File

@ -774,3 +774,45 @@ TEST_CASE("/flow/Arena/Size") {
return Void();
}
TEST_CASE("flow/StringRef/eat") {
StringRef str = "test/case"_sr;
StringRef first = str.eat("/");
ASSERT(first == "test"_sr);
ASSERT(str == "case"_sr);
str = "test/case"_sr;
first = str.eat("/"_sr);
ASSERT(first == "test"_sr);
ASSERT(str == "case"_sr);
str = "testcase"_sr;
first = str.eat("/"_sr);
ASSERT(first == "testcase"_sr);
ASSERT(str == ""_sr);
str = "testcase/"_sr;
first = str.eat("/"_sr);
ASSERT(first == "testcase"_sr);
ASSERT(str == ""_sr);
str = "test/case/extra"_sr;
first = str.eat("/"_sr);
ASSERT(first == "test"_sr);
ASSERT(str == "case/extra"_sr);
bool hasSep;
str = "test/case"_sr;
first = str.eat("/"_sr, &hasSep);
ASSERT(hasSep);
ASSERT(first == "test"_sr);
ASSERT(str == "case"_sr);
str = "testcase"_sr;
first = str.eat("/", &hasSep);
ASSERT(!hasSep);
ASSERT(first == "testcase"_sr);
ASSERT(str == ""_sr);
return Void();
}
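
Beyond single splits, eat composes naturally into a tokenizer: each call consumes everything up to and including the first separator and leaves the remainder behind. A standalone sketch of the same semantics over std::string_view (a hypothetical stand-in, since flow's StringRef isn't redefined here):

#include <cstdio>
#include <string_view>

// Mimics StringRef::eat(sep, &hasSep): returns everything before the first
// separator, removes the separator, and leaves the remainder in s.
static std::string_view eat(std::string_view& s, std::string_view sep, bool* hasSep = nullptr) {
    auto pos = s.find(sep);
    if (hasSep)
        *hasSep = (pos != std::string_view::npos);
    std::string_view head = (pos == std::string_view::npos) ? s : s.substr(0, pos);
    s = (pos == std::string_view::npos) ? std::string_view() : s.substr(pos + sep.size());
    return head;
}

int main() {
    std::string_view path = "test/case/extra";
    bool hasSep = true;
    while (hasSep) {
        std::string_view token = eat(path, "/", &hasSep);
        printf("token: %.*s\n", (int)token.size(), token.data()); // test, case, extra
    }
    return 0;
}
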

View File

@ -26,8 +26,8 @@ target_link_libraries(flowlinktest PRIVATE flow stacktrace)
find_package(ZLIB)
if(ZLIB_FOUND)
add_compile_definitions(ZLIB_LIB_SUPPORTED)
target_link_libraries(flow PRIVATE ZLIB::ZLIB)
target_compile_definitions(flow PUBLIC ZLIB_LIB_SUPPORTED)
target_link_libraries(flow PUBLIC ZLIB::ZLIB)
else()
message(STATUS "ZLIB package not found")
endif()
@ -66,11 +66,7 @@ foreach(ft flow flow_sampling flowlinktest)
target_include_directories(${ft} SYSTEM BEFORE PUBLIC ${WOLFSSL_INCLUDE_DIR}/wolfssl)
endif()
target_link_libraries(${ft} PUBLIC Threads::Threads ${CMAKE_DL_LIBS})
if(USE_SANITIZER)
target_link_libraries(${ft} PUBLIC boost_asan)
else()
target_link_libraries(${ft} PUBLIC boost_target)
endif()
target_link_libraries(${ft} PUBLIC boost_target)
if(USE_VALGRIND)
target_link_libraries(${ft} PUBLIC Valgrind)
endif()

View File

@ -7,6 +7,7 @@ PORT_PREFIX=1500
# default cluster settings, override with options
STATELESS_COUNT=4
REPLICATION_COUNT=1
LOGS_COUNT=8
STORAGE_COUNT=16
KNOBS=""
@ -25,6 +26,7 @@ function usage {
printf "\t--logs_taskset BITMASK\n\r\t\tbitmask of CPUs to pin logs to. Default is all CPUs.\n\r"
printf "\t--storage_count COUNT\n\r\t\tnumber of storage daemons to start. Default ${STORAGE_COUNT}\n\r"
printf "\t--storage_taskset BITMASK\n\r\t\tBitmask of CPUs to pin storage to. Default is all CPUs.\n\r"
printf "\t--replication_count COUNT\n\r\t\tReplication count may be 1,2 or 3. Default is 1.\n\r"
echo "Example"
printf "\t${0} . --knobs '--knob_proxy_use_resolver_private_mutations=1' --stateless_count 4 --stateless_taskset 0xf --logs_count 8 --logs_taskset 0xff0 --storage_taskset 0xffff000\n\r"
exit 1
@ -36,7 +38,8 @@ function start_servers {
DATA=${DIR}/${SERVER_COUNT}/data
mkdir -p ${LOG} ${DATA}
PORT=$(( $PORT_PREFIX + $SERVER_COUNT ))
$2 ${FDB} -p auto:${PORT} "$KNOBS" -c $3 -d $DATA -L $LOG -C $CLUSTER &
ZONE=$(( $j % $REPLICATION_COUNT ))
$2 ${FDB} -p auto:${PORT} "$KNOBS" -c $3 -d $DATA -L $LOG -C $CLUSTER --locality-zoneid Z-$ZONE --locality-machineid M-$SERVER_COUNT &
SERVER_COUNT=$(( $SERVER_COUNT + 1 ))
done
}
@ -76,6 +79,9 @@ while [[ $# -gt 0 ]]; do
--storage_count)
STORAGE_COUNT=$2
;;
--replication_count)
REPLICATION_COUNT=$2
;;
esac
shift; shift
done
@ -86,6 +92,15 @@ if [ ! -f ${FDB} ]; then
usage
fi
if [ $REPLICATION_COUNT -eq 1 ]; then
replication="single"
elif [ $REPLICATION_COUNT -eq 2 ]; then
replication="double"
elif [ $REPLICATION_COUNT -eq 3 ]; then
replication="triple"
else
usage
fi
DIR=./loopback-cluster
rm -rf $DIR
@ -102,7 +117,7 @@ start_servers $LOGS_COUNT "$LOGS_TASKSET" log
start_servers $STORAGE_COUNT "$STORAGE_TASKSET" storage
CLI="$BUILD/bin/fdbcli -C ${CLUSTER} --exec"
echo "configure new ssd single - stand by"
echo "configure new ssd $replication - stand by"
# sleep 2 seconds to wait for workers to join cluster, then configure database
( sleep 2 ; $CLI "configure new ssd single" )
# sleep 2 seconds to wait for workers to join cluster, then configure database and coordinators
( sleep 2 ; $CLI "configure new ssd $replication" ; $CLI "coordinators auto")