Merge branch 'feature-tree-broadcast' into feature-small-endpoint

Evan Tschannen 2020-04-17 17:16:04 -07:00
commit 0ee62badcd
74 changed files with 1443 additions and 525 deletions

View File

@ -27,7 +27,7 @@ package fdb
import "C"
import (
"runtime"
"sync"
)
// Database is a handle to a FoundationDB database. Database is a lightweight
@ -74,13 +74,14 @@ func (d Database) CreateTransaction() (Transaction, error) {
return Transaction{}, Error{int(err)}
}
t := &transaction{outt, d}
runtime.SetFinalizer(t, (*transaction).destroy)
t := &transaction{outt, d, sync.Once{}}
return Transaction{t}, nil
}
func retryable(wrapped func() (interface{}, error), onError func(Error) FutureNil) (ret interface{}, e error) {
func retryable(t Transaction, wrapped func() (interface{}, error), onError func(Error) FutureNil) (ret interface{}, e error) {
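// The retry loop now owns the transaction and must destroy its native
// resources on exit, since finalizers are no longer used.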
defer t.Close()
for {
ret, e = wrapped()
@ -140,7 +141,7 @@ func (d Database) Transact(f func(Transaction) (interface{}, error)) (interface{
return
}
return retryable(wrapped, tr.OnError)
return retryable(tr, wrapped, tr.OnError)
}
// ReadTransact runs a caller-provided function inside a retry loop, providing
@ -180,7 +181,7 @@ func (d Database) ReadTransact(f func(ReadTransaction) (interface{}, error)) (in
return
}
return retryable(wrapped, tr.OnError)
return retryable(tr, wrapped, tr.OnError)
}
// Options returns a DatabaseOptions instance suitable for setting options

View File

@ -417,6 +417,7 @@ func (dl directoryLayer) subdirNames(rtr fdb.ReadTransaction, node subspace.Subs
rr := rtr.GetRange(sd, fdb.RangeOptions{})
ri := rr.Iterator()
defer ri.Close()
var ret []string
@ -442,6 +443,7 @@ func (dl directoryLayer) subdirNodes(tr fdb.Transaction, node subspace.Subspace)
rr := tr.GetRange(sd, fdb.RangeOptions{})
ri := rr.Iterator()
defer ri.Close()
var ret []subspace.Subspace

View File

@ -246,6 +246,7 @@ func ExampleRangeIterator() {
rr := tr.GetRange(fdb.KeyRange{fdb.Key(""), fdb.Key{0xFF}}, fdb.RangeOptions{})
ri := rr.Iterator()
defer ri.Close()
// Advance will return true until the iterator is exhausted
for ri.Advance() {

View File

@ -39,7 +39,6 @@ package fdb
import "C"
import (
"runtime"
"sync"
"unsafe"
)
@ -75,9 +74,7 @@ type future struct {
}
func newFuture(ptr *C.FDBFuture) *future {
f := &future{ptr}
runtime.SetFinalizer(f, func(f *future) { C.fdb_future_destroy(f.ptr) })
return f
return &future{ptr}
}
// Note: This function guarantees the callback will be executed **at most once**.
@ -100,17 +97,14 @@ func fdb_future_block_until_ready(f *C.FDBFuture) {
}
func (f *future) BlockUntilReady() {
defer runtime.KeepAlive(f)
fdb_future_block_until_ready(f.ptr)
}
func (f *future) IsReady() bool {
defer runtime.KeepAlive(f)
return C.fdb_future_is_ready(f.ptr) != 0
}
func (f *future) Cancel() {
defer runtime.KeepAlive(f)
C.fdb_future_cancel(f.ptr)
}
@ -142,7 +136,7 @@ type futureByteSlice struct {
func (f *futureByteSlice) Get() ([]byte, error) {
f.o.Do(func() {
defer runtime.KeepAlive(f.future)
defer C.fdb_future_destroy(f.ptr)
var present C.fdb_bool_t
var value *C.uint8_t
@ -156,10 +150,14 @@ func (f *futureByteSlice) Get() ([]byte, error) {
}
if present != 0 {
f.v = C.GoBytes(unsafe.Pointer(value), length)
}
// Copy the native `value` into a Go byte slice so the underlying
// native Future can be freed. This avoids the need for finalizers.
valueDestination := make([]byte, length)
valueSource := C.GoBytes(unsafe.Pointer(value), length)
copy(valueDestination, valueSource)
C.fdb_future_release_memory(f.ptr)
f.v = valueDestination
}
})
return f.v, f.e
@ -199,7 +197,7 @@ type futureKey struct {
func (f *futureKey) Get() (Key, error) {
f.o.Do(func() {
defer runtime.KeepAlive(f.future)
defer C.fdb_future_destroy(f.ptr)
var value *C.uint8_t
var length C.int
@ -211,8 +209,11 @@ func (f *futureKey) Get() (Key, error) {
return
}
f.k = C.GoBytes(unsafe.Pointer(value), length)
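// As in futureByteSlice.Get, copy the key out of native memory so the
// underlying future's memory can be released without relying on finalizers.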
keySource := C.GoBytes(unsafe.Pointer(value), length)
keyDestination := make([]byte, length)
copy(keyDestination, keySource)
C.fdb_future_release_memory(f.ptr)
f.k = keyDestination
})
return f.k, f.e
@ -245,17 +246,21 @@ type FutureNil interface {
type futureNil struct {
*future
o sync.Once
e error
}
func (f *futureNil) Get() error {
defer runtime.KeepAlive(f.future)
f.o.Do(func() {
defer C.fdb_future_destroy(f.ptr)
f.BlockUntilReady()
if err := C.fdb_future_get_error(f.ptr); err != 0 {
return Error{int(err)}
}
f.BlockUntilReady()
if err := C.fdb_future_get_error(f.ptr); err != 0 {
f.e = Error{int(err)}
}
})
return nil
return f.e
}
func (f *futureNil) MustGet() {
@ -281,8 +286,6 @@ func stringRefToSlice(ptr unsafe.Pointer) []byte {
}
func (f *futureKeyValueArray) Get() ([]KeyValue, bool, error) {
defer runtime.KeepAlive(f.future)
f.BlockUntilReady()
var kvs *C.FDBKeyValue
@ -293,13 +296,42 @@ func (f *futureKeyValueArray) Get() ([]KeyValue, bool, error) {
return nil, false, Error{int(err)}
}
// To minimize the number of individual allocations, we first calculate the
// final size used by all keys and values returned from this iteration,
// then perform one larger allocation and slice within it.
poolSize := 0
for i := 0; i < int(count); i++ {
kvptr := unsafe.Pointer(uintptr(unsafe.Pointer(kvs)) + uintptr(i*24))
poolSize += len(stringRefToSlice(kvptr))
poolSize += len(stringRefToSlice(unsafe.Pointer(uintptr(kvptr) + 12)))
}
poolOffset := 0
pool := make([]byte, poolSize)
ret := make([]KeyValue, int(count))
for i := 0; i < int(count); i++ {
kvptr := unsafe.Pointer(uintptr(unsafe.Pointer(kvs)) + uintptr(i*24))
ret[i].Key = stringRefToSlice(kvptr)
ret[i].Value = stringRefToSlice(unsafe.Pointer(uintptr(kvptr) + 12))
keySource := stringRefToSlice(kvptr)
valueSource := stringRefToSlice(unsafe.Pointer(uintptr(kvptr) + 12))
keyDestination := pool[poolOffset : poolOffset+len(keySource)]
poolOffset += len(keySource)
valueDestination := pool[poolOffset : poolOffset+len(valueSource)]
poolOffset += len(valueSource)
copy(keyDestination, keySource)
copy(valueDestination, valueSource)
ret[i] = KeyValue{
Key: keyDestination,
Value: valueDestination,
}
}
return ret, (more != 0), nil
@ -324,19 +356,28 @@ type FutureInt64 interface {
type futureInt64 struct {
*future
o sync.Once
e error
v int64
}
func (f *futureInt64) Get() (int64, error) {
defer runtime.KeepAlive(f.future)
f.o.Do(func() {
defer C.fdb_future_destroy(f.ptr)
f.BlockUntilReady()
f.BlockUntilReady()
var ver C.int64_t
if err := C.fdb_future_get_int64(f.ptr, &ver); err != 0 {
return 0, Error{int(err)}
}
var ver C.int64_t
if err := C.fdb_future_get_int64(f.ptr, &ver); err != 0 {
f.v = 0
f.e = Error{int(err)}
return
}
return int64(ver), nil
f.v = int64(ver)
})
return f.v, f.e
}
func (f *futureInt64) MustGet() int64 {
@ -367,27 +408,40 @@ type FutureStringSlice interface {
type futureStringSlice struct {
*future
o sync.Once
e error
v []string
}
func (f *futureStringSlice) Get() ([]string, error) {
defer runtime.KeepAlive(f.future)
f.o.Do(func() {
defer C.fdb_future_destroy(f.ptr)
f.BlockUntilReady()
f.BlockUntilReady()
var strings **C.char
var count C.int
var strings **C.char
var count C.int
if err := C.fdb_future_get_string_array(f.ptr, (***C.char)(unsafe.Pointer(&strings)), &count); err != 0 {
return nil, Error{int(err)}
}
if err := C.fdb_future_get_string_array(f.ptr, (***C.char)(unsafe.Pointer(&strings)), &count); err != 0 {
f.e = Error{int(err)}
return
}
ret := make([]string, int(count))
ret := make([]string, int(count))
for i := 0; i < int(count); i++ {
ret[i] = C.GoString((*C.char)(*(**C.char)(unsafe.Pointer(uintptr(unsafe.Pointer(strings)) + uintptr(i*8)))))
}
for i := 0; i < int(count); i++ {
source := C.GoString((*C.char)(*(**C.char)(unsafe.Pointer(uintptr(unsafe.Pointer(strings)) + uintptr(i*8)))))
return ret, nil
destination := make([]byte, len(source))
copy(destination, source)
ret[i] = string(destination)
}
f.v = ret
})
return f.v, f.e
}
func (f *futureStringSlice) MustGet() []string {

View File

@ -28,6 +28,7 @@ import "C"
import (
"fmt"
"sync"
)
// KeyValue represents a single key-value pair in the database.
@ -140,6 +141,7 @@ func (rr RangeResult) GetSliceWithError() ([]KeyValue, error) {
var ret []KeyValue
ri := rr.Iterator()
defer ri.Close()
if rr.options.Limit != 0 {
ri.options.Mode = StreamingModeExact
@ -207,6 +209,18 @@ type RangeIterator struct {
index int
err error
snapshot bool
o sync.Once
}
// Close releases the underlying native resources for all the `KeyValue`s
// ever returned by this iterator. The `KeyValue`s themselves are copied
// before they're returned, so they are still safe to use after calling
// this function. This is intended to be called with `defer` inside
// your transaction function.
func (ri *RangeIterator) Close() {
ri.o.Do(func() {
C.fdb_future_destroy(ri.f.ptr)
})
}
// Advance attempts to advance the iterator to the next key-value pair. Advance

View File

@ -25,6 +25,7 @@ package fdb
// #define FDB_API_VERSION 630
// #include <foundationdb/fdb_c.h>
import "C"
import "sync"
// A ReadTransaction can asynchronously read from a FoundationDB
// database. Transaction and Snapshot both satisfy the ReadTransaction
@ -70,6 +71,7 @@ type Transaction struct {
type transaction struct {
ptr *C.FDBTransaction
db Database
o sync.Once
}
// TransactionOptions is a handle with which to set options that affect a
@ -85,16 +87,18 @@ func (opt TransactionOptions) setOpt(code int, param []byte) error {
}, param)
}
func (t *transaction) destroy() {
C.fdb_transaction_destroy(t.ptr)
}
// GetDatabase returns a handle to the database with which this transaction is
// interacting.
func (t Transaction) GetDatabase() Database {
return t.transaction.db
}
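// Close destroys the native transaction exactly once. It is invoked by the
// retry loop in Database.Transact and is safe to call multiple times.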
func (t Transaction) Close() {
t.o.Do(func() {
C.fdb_transaction_destroy(t.ptr)
})
}
// Transact executes the caller-provided function, passing it the Transaction
// receiver object.
//

View File

@ -169,8 +169,6 @@ file(WRITE ${MANIFEST_FILE} ${MANIFEST_TEXT})
add_jar(fdb-java ${JAVA_BINDING_SRCS} ${GENERATED_JAVA_FILES} ${CMAKE_SOURCE_DIR}/LICENSE
OUTPUT_DIR ${PROJECT_BINARY_DIR}/lib VERSION ${CMAKE_PROJECT_VERSION} MANIFEST ${MANIFEST_FILE})
add_dependencies(fdb-java fdb_java_options fdb_java)
add_jar(foundationdb-tests SOURCES ${JAVA_TESTS_SRCS} INCLUDE_JARS fdb-java)
add_dependencies(foundationdb-tests fdb_java_options)
# TODO[mpilman]: The java RPM will require some more effort (mostly on debian). However,
# most people will use the fat-jar, so it is not clear how high this priority is.
@ -237,6 +235,16 @@ if(NOT OPEN_FOR_IDE)
WORKING_DIRECTORY ${unpack_dir}
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/lib_copied
COMMENT "Build ${target_jar}")
add_jar(foundationdb-tests SOURCES ${JAVA_TESTS_SRCS} INCLUDE_JARS fdb-java)
add_dependencies(foundationdb-tests fdb_java_options)
set(tests_jar ${jar_destination}/fdb-java-${CMAKE_PROJECT_VERSION}${prerelease_string}-tests.jar)
add_custom_command(OUTPUT ${tests_jar}
COMMAND ${CMAKE_COMMAND} -E copy foundationdb-tests.jar "${tests_jar}"
WORKING_DIRECTORY .
DEPENDS foundationdb-tests
COMMENT "Build ${tests_jar}")
add_custom_target(fdb-java-tests ALL DEPENDS ${tests_jar})
add_dependencies(fdb-java-tests foundationdb-tests)
add_custom_target(fat-jar ALL DEPENDS ${target_jar})
add_dependencies(fat-jar fdb-java)
add_dependencies(fat-jar copy_lib)

View File

@ -30,7 +30,7 @@ num_groups=${#gids[@]}
additional_groups="-G sudo"
for ((i=0;i<num_groups;i++))
do
echo "RUN groupadd -g ${gids[$i]} ${groups[$i]}" >> Dockerfile
echo "RUN groupadd -g ${gids[$i]} ${groups[$i]} || true" >> Dockerfile
if [ ${gids[i]} -ne ${gid} ]
then
additional_groups="${additional_groups},${gids[$i]}"
@ -72,9 +72,21 @@ sudo docker run --rm `# delete (temporary) image after return` \\
--security-opt seccomp=unconfined \\
-v "${HOME}:${HOME}" `# Mount home directory` \\
\${ccache_args} \\
${image}
${image} "\$@"
EOF
cat <<EOF > $HOME/bin/clangd
#!/usr/bin/bash
fdb-dev scl enable devtoolset-8 rh-python36 rh-ruby24 -- clangd
EOF
if [[ ":$PATH:" != *":$HOME/bin:"* ]]
then
echo "WARNING: $HOME/bin is not in your PATH!"
echo -e "\tThis can cause problems with some scripts (like fdb-clangd)"
fi
chmod +x $HOME/bin/fdb-dev
echo "To start the dev docker image run $HOME/bin/fdb-dev"
echo "You can edit this file but be aware that this script will overwrite your changes if you rerun it"
echo "$HOME/bin/clangd can be used for IDE integration"
echo "You can edit these files but be aware that this script will overwrite your changes if you rerun it"

View File

@ -189,19 +189,29 @@ function(create_test_package)
add_custom_command(
OUTPUT ${tar_file}
DEPENDS ${out_files}
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh ${CMAKE_BINARY_DIR}/packages/joshua_test
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh ${CMAKE_BINARY_DIR}/packages/joshua_timeout
${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe
${CMAKE_BINARY_DIR}/packages/bin/TraceLogHelper.dll
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
${external_files}
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
${CMAKE_BINARY_DIR}/packages/joshua_test
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
${CMAKE_BINARY_DIR}/packages/joshua_timeout
COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file} ${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe
${CMAKE_BINARY_DIR}/packages/bin/TraceLogHelper.dll
${CMAKE_BINARY_DIR}/packages/joshua_test
${CMAKE_BINARY_DIR}/packages/joshua_timeout
${out_files} ${external_files}
${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe
${CMAKE_BINARY_DIR}/packages/bin/TraceLogHelper.dll
${CMAKE_BINARY_DIR}/packages/joshua_test
${CMAKE_BINARY_DIR}/packages/joshua_timeout
${out_files}
${external_files}
COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_BINARY_DIR}/packages/joshua_test ${CMAKE_BINARY_DIR}/packages/joshua_timeout
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/packages
COMMENT "Package correctness archive"
)
add_custom_target(package_tests ALL DEPENDS ${tar_file})
# Make seems to need this dependency, while it does nothing with Ninja
add_dependencies(package_tests strip_only_fdbserver TestHarness)
endif()
@ -210,14 +220,24 @@ function(create_test_package)
add_custom_command(
OUTPUT ${tar_file}
DEPENDS ${out_files}
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/valgrindTest.sh ${CMAKE_BINARY_DIR}/packages/joshua_test
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/valgrindTimeout.sh ${CMAKE_BINARY_DIR}/packages/joshua_timeout
COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file} ${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe
${CMAKE_BINARY_DIR}/packages/bin/TraceLogHelper.dll
${CMAKE_BINARY_DIR}/packages/joshua_test
${CMAKE_BINARY_DIR}/packages/joshua_timeout
${out_files} ${external_files}
${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe
${CMAKE_BINARY_DIR}/packages/bin/TraceLogHelper.dll
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/valgrindTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/valgrindTimeout.sh
${external_files}
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/valgrindTest.sh
${CMAKE_BINARY_DIR}/packages/joshua_test
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/valgrindTimeout.sh
${CMAKE_BINARY_DIR}/packages/joshua_timeout
COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file}
${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe
${CMAKE_BINARY_DIR}/packages/bin/TraceLogHelper.dll
${CMAKE_BINARY_DIR}/packages/joshua_test
${CMAKE_BINARY_DIR}/packages/joshua_timeout
${out_files}
${external_files}
COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_BINARY_DIR}/packages/joshua_test ${CMAKE_BINARY_DIR}/packages/joshua_timeout
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/packages
COMMENT "Package correctness archive"

View File

@ -21,6 +21,10 @@ set(RELATIVE_DEBUG_PATHS OFF CACHE BOOL "Use relative file paths in debug info")
set(STATIC_LINK_LIBCXX ON CACHE BOOL "Statically link libstdcpp/libc++")
set(USE_WERROR OFF CACHE BOOL "Compile with -Werror. Recommended for local development and CI.")
if(USE_LIBCXX AND STATIC_LINK_LIBCXX AND NOT USE_LD STREQUAL "LLD")
message(FATAL_ERROR "Unsupported configuration: STATIC_LINK_LIBCXX with libc++ only works if USE_LD=LLD")
endif()
set(rel_debug_paths OFF)
if(RELATIVE_DEBUG_PATHS)
set(rel_debug_paths ON)
@ -189,13 +193,16 @@ else()
add_compile_options()
# Clang has link errors unless `atomic` is specifically requested.
if(NOT APPLE)
add_link_options(-latomic)
#add_link_options(-latomic)
endif()
if (APPLE OR USE_LIBCXX)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-stdlib=libc++>)
add_compile_definitions(WITH_LIBCXX)
if (NOT APPLE)
add_link_options(-lc++ -lc++abi -Wl,-build-id=sha1)
if (STATIC_LINK_LIBCXX)
add_link_options(-static-libgcc -nostdlib++ -Wl,-Bstatic -lc++ -lc++abi -Wl,-Bdynamic)
endif()
add_link_options(-stdlib=libc++ -Wl,-build-id=sha1)
endif()
endif()
if (OPEN_FOR_IDE)
@ -215,7 +222,7 @@ else()
if (USE_CCACHE)
add_compile_options(
-Wno-register
-Wno-error=unused-command-line-argument)
-Wno-unused-command-line-argument)
endif()
endif()
if (USE_WERROR)

View File

@ -135,11 +135,11 @@ function(strip_debug_symbols target)
add_custom_target(strip_only_${target} DEPENDS ${out_file})
if(is_exec AND NOT APPLE)
add_custom_command(OUTPUT "${out_file}.debug"
DEPENDS strip_only_${target}
COMMAND objcopy --verbose --only-keep-debug $<TARGET_FILE:${target}> "${out_file}.debug"
COMMAND objcopy --verbose --add-gnu-debuglink="${out_file}.debug" "${out_file}"
COMMENT "Copy debug symbols to ${out_name}.debug")
add_custom_target(strip_${target} DEPENDS "${out_file}.debug")
add_dependencies(strip_${target} ${target} strip_only_${target})
else()
add_custom_target(strip_${target})
add_dependencies(strip_${target} strip_only_${target})

View File

@ -10,8 +10,6 @@ set(out_file ${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe)
add_custom_command(OUTPUT ${out_file}
COMMAND ${MCS_EXECUTABLE} ARGS ${TEST_HARNESS_REFERENCES} ${SRCS} "-target:exe" "-out:${out_file}"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDS ${SRCS}
DEPENDS ${SRCS} TraceLogHelper
COMMENT "Compile TestHarness" VERBATIM)
add_custom_target(TestHarness DEPENDS ${out_file})
add_dependencies(TestHarness TraceLogHelper)
set(TestHarnesExe "${out_file}" PARENT_SCOPE)

View File

@ -1 +1,2 @@
add_executable(actor_flamegraph actor_flamegraph.cpp)
target_link_libraries(actor_flamegraph PRIVATE Threads::Threads)

View File

@ -0,0 +1,81 @@
# Special-Key-Space
This document discusses why we need the proposed special-key-space framework, what problems the framework aims to solve, and in what scenarios a developer should use it.
## Motivation
Currently, there are several client functions implemented as FDB calls that pass through special keys (prefixed with `\xff\xff`). Below are all existing features:
- **status/json**: `get("\xff\xff/status/json")`
- **cluster_file_path**: `get("\xff\xff/cluster_file_path")`
- **connection_string**: `get("\xff\xff/connection_string")`
- **worker_interfaces**: `getRange("\xff\xff/worker_interfaces", <any_key>)`
- **conflicting-keys**: `getRange("\xff\xff/transaction/conflicting_keys/", "\xff\xff/transaction/conflicting_keys/\xff")`
At present, implementations are hard-coded, and the pain points are obvious:
- **Maintainability**: As more features are added, the hard-coded snippets become hard to maintain
- **Granularity**: It is impossible to scale up or down. For example, you may want a cheap call like `get("\xff\xff/status/json/<certain_field>")` instead of fetching all of `status/json` and parsing the result. Conversely, you may sometimes want to aggregate results from several similar features, like `getRange("\xff\xff/transaction/", "\xff\xff/transaction/\xff")`, to get all transaction-related info. Neither is achievable at present.
- **Consistency**: With FDB calls like `get` or `getRange`, the result of `get("\xff\xff/B")` is not included in `getRange("\xff\xff/A", "\xff\xff/C")`, which is inconsistent with the behavior of general FDB calls.
Consequently, the special-key-space framework aims to integrate all client functions that use special keys (prefixed with `\xff\xff`) and to solve the pain points listed above.
## When
If your feature is exposing information to clients and the results are easily formatted as key-value pairs, then you can use special-key-space to implement your client function.
## How
If you choose to use it, you need to implement a function class that inherits from `SpecialKeyRangeBaseImpl`, which has an abstract method `Future<Standalone<RangeResultRef>> getRange(Reference<ReadYourWritesTransaction> ryw, KeyRangeRef kr)`.
This method can be treated as a callback, whose implementation details are determined by the developer.
Once you fill out the method, register the function class to the corresponding key range.
Below is a detailed example.
```c++
// Implement the function class,
// the corresponding key range is [\xff\xff/example/, \xff\xff/example/\xff)
class SKRExampleImpl : public SpecialKeyRangeBaseImpl {
public:
explicit SKRExampleImpl(KeyRangeRef kr): SpecialKeyRangeBaseImpl(kr) {
// Our implementation is quite simple here, the key-value pairs are formatted as:
// \xff\xff/example/<country_name> : <capital_city_name>
CountryToCapitalCity[LiteralStringRef("USA")] = LiteralStringRef("Washington, D.C.");
CountryToCapitalCity[LiteralStringRef("UK")] = LiteralStringRef("London");
CountryToCapitalCity[LiteralStringRef("Japan")] = LiteralStringRef("Tokyo");
CountryToCapitalCity[LiteralStringRef("China")] = LiteralStringRef("Beijing");
}
// Implement the getRange interface
Future<Standalone<RangeResultRef>> getRange(Reference<ReadYourWritesTransaction> ryw,
KeyRangeRef kr) const override {
Standalone<RangeResultRef> result;
for (auto const& country : CountryToCapitalCity) {
// the registered range here: [\xff\xff/example/, \xff\xff/example/\xff)
Key keyWithPrefix = country.first.withPrefix(range.begin);
// check if any valid keys are given in the range
if (kr.contains(keyWithPrefix)) {
result.push_back(result.arena(), KeyValueRef(keyWithPrefix, country.second));
result.arena().dependsOn(keyWithPrefix.arena());
}
}
return result;
}
private:
std::map<Key, Value> CountryToCapitalCity;
};
// Instantiate the function object
// In development, you should have a function object pointer in DatabaseContext (DatabaseContext.h) and initialize it in DatabaseContext's constructor (NativeAPI.actor.cpp)
const KeyRangeRef exampleRange(LiteralStringRef("\xff\xff/example/"), LiteralStringRef("\xff\xff/example/\xff"));
SKRExampleImpl exampleImpl(exampleRange);
// Assuming the database handle is `cx`, register the range with the special-key-space
// In development, you should register all function objects in the constructor of DatabaseContext (NativeAPI.actor.cpp)
cx->specialKeySpace->registerKeyRange(exampleRange, &exampleImpl);
// Now any ReadYourWritesTransaction associated with `cx` is able to query the info
state ReadYourWritesTransaction tr(cx);
// get
Optional<Value> res1 = wait(tr.get(LiteralStringRef("\xff\xff/example/Japan")));
ASSERT(res1.present() && res1.get() == LiteralStringRef("Tokyo"));
// getRange
// Note: for getRange(key1, key2), both key1 and key2 should be prefixed with \xff\xff
// something like getRange("normal_key", "\xff\xff/...") is not supported yet
Standalone<RangeResultRef> res2 = wait(tr.getRange(LiteralStringRef("\xff\xff/example/U"), LiteralStringRef("\xff\xff/example/U\xff")));
// res2 should contain USA and UK
ASSERT(
res2.size() == 2 &&
res2[0].value == LiteralStringRef("London") &&
res2[1].value == LiteralStringRef("Washington, D.C.")
);
```

View File

@ -0,0 +1,217 @@
<meta charset="utf-8">
# Forward Compatibility for Transaction Logs
## Background
A repeated concern with adopting FoundationDB has been that upgrades are one
way, with no supported rollback. If one were to upgrade a cluster running 6.0
to 6.1, then there's no way to roll back to 6.0 if the new version results in
worse client application performance or unavailability. In the interest of
increasing adoption, work has begun on supporting on-disk forward
compatibility, which allows for upgrades to be rolled back.
The traditional way of allowing roll backs is to have one version, `N`, that
introduces a feature, but is left as disabled. `N+1` enables the feature, and
then `N+2` removes whatever was deprecated in `N`. However, FDB currently has
a 6 month release cadence, and waiting 6 months to be able to use a new feature
in production is unacceptably long. Thus, the goal is to have a way to be able
to have a sane and user-friendly, rollback-supporting upgrade path, but still
allow features to be used immediately if desired.
This document also carries two specific restrictions to the scope of what it covers:
1. This document specifically is **not** a discussion of network protocol
compatibility nor supporting rolling upgrades. Rolling upgrades of FDB are
still discouraged, and minor versions are still protocol incompatible with
each other.
2. This only covers the proposed design of how forward compatibility for
transaction logs will be handled, and not forward compatibility for
FoundationDB as a whole. There are other parts of the system that durably
store data, the coordinators and storage servers, that will not be discussed.
## Overview
A new configuration option, `log_version`, will be introduced to allow a user
to control which on-disk format the transaction logs are allowed to use. Not
every release will affect the on-disk format of the transaction logs, so
`log_version` is an opaque integer that is incremented by one whenever the
on-disk format of the transaction log is changed.
`log_version` is set from `fdbcli`, with an invocation looking like
`$ fdbcli -C cluster.file --exec "configure log_version:=2"`. Note that `:=`
is used instead of `=`, to keep the convention in `fdbcli` that configuration
options that users aren't expected to need (or wish) to modify are set with
`:=`.
Right now, FDB releases and `log_version` values are as follows:
| Release | Log Version |
| ------- | ----------- |
| pre-5.2 | 1 |
| 5.2-6.0 | 2 |
| 6.1+ | 3 |
| 6.2 | 4 |
| 6.3 | 5 |
If a user does not specify any configuration for `log_version`, then
`log_version` will be set so that rolling back to the previous minor version of
FDB will be possible. FDB will always support loading files generated by
default from the next minor version. It will be possible to configure
`log_version` to a higher value on the release that introduces it, if the user
is willing to sacrifice the ability to roll back.
This means FDB's releases will work like the following:
| | 6.0 | 6.1 | 6.2 | 6.3 |
|--------------|-----|-----|-------|---------|
| Configurable | 2 | 2,3 | 3,4 | 4,5 |
| Default | 2 | 2 | 3 | 4 |
| Recoverable | 2 | 2,3 | 2,3,4 | 2,3,4,5 |
Where...
* "configurable" means values considered an acceptable configuration setting for `fdbcli> configure log_version:=N`.
* "default" means what `log_version` will be if you don't configure it.
* "recoverable" means that FDB can load files that were generated from the specified `log_version`.
Configuring a `log_version` will cause FDB to use the maximum of that
`log_version` and default `log_version`. The default `log_version` will always
be the minimum configurable log version. This is done so that manually setting
`log_version` once, and then upgrading FDB multiple times, will eventually
cause a low `log_version` left in the database configuration to act as a
request for the default. Configuring `log_version` to a very high number (e.g. 9999)
will cause FDB to always use the highest available log version.
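As a rough sketch of this selection rule (illustrative Go; the function name and its shape are assumptions, not the actual fdbserver implementation):
```go
// effectiveLogVersion sketches how a requested log_version is resolved
// against the default and the highest version the release supports.
func effectiveLogVersion(configured, defaultVersion, maxSupported int) int {
	v := configured
	if defaultVersion > v {
		v = defaultVersion // the default acts as the minimum configurable version
	}
	if v > maxSupported {
		v = maxSupported // e.g. configuring 9999 selects the highest available
	}
	return v
}
```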
As a concrete example, 6.1 will introduce a new transaction log feature with
on-disk format implications. If you wish to use it, you'll first have to
`configure log_version:=3`. Otherwise, after upgrading to FDB6.2, it will
become the default. If problems are discovered when upgrading to FDB6.2, then
roll back to FDB6.1. (Theoretically. See scope restrictions above.)
## Detailed Implementation
`fdbcli> configure log_version:=3` sets `\xff/conf/log_version` to `3`. This
version is also persisted as part of the `LogSystemConfig` and thus
`DBCoreState`, so that any code handling the log system will have access to the
`log_version` that was used to create it.
Changing `log_version` will result in a recovery, and FoundationDB will recover
into the requested transaction log implementation. This involves locking the
previous generation of transaction logs, and then recruiting a new generation
of transaction logs. FDB will load `\xff/conf/log_version` as the requested
`log_version`, and when sending an `InitializeTLogRequest` to recruit a new
transaction log, it uses the maximum of the requested log version and the
default `log_version`.
A worker, when receiving an `InitializeTLogRequest`, will initialize a
transaction log corresponding to the requested `log_version`. Transaction logs
can pack multiple generations of transaction logs into the same shared entity,
a `SharedTLog`. `SharedTLog` instances correspond to one set of files, and
will only contain transaction log generations of the same `log_version`.
This allows us to have multiple generations of transaction logs running within
one worker that have different `log_version`s, and if the worker crashes and
restarts, we need to be able to recreate those transaction log instances.
Transaction logs maintain two types of files: a pair of files prefixed with
`logqueue-` that are the DiskQueue, and the other is the metadata store, which
is normally a mini `ssd-2` storage engine running within the transaction log.
When a worker first starts, it scans its data directory for any files that were
instances of a transaction log. It then needs to construct a transaction log
instance that can read the format of the file to be able to reconnect the data
in the files back to the FDB cluster, so that it can be used in a recovery if
needed.
This presents a problem that the worker needs to know all the configuration
options that were used to decide the file format of the transaction log
*before* it can rejoin a cluster and get far enough through a recovery to find
out what that configuration was. To get around this, the relevant
configuration options have been added to the file name so that they're
available when scanning the list of files.
Currently, FDB identifies a transaction log instance by seeing a file that starts
with `log-`, which represents the metadata store. This filename has the format
of `log-<UUID>.<SUFFIX>` where UUID is the `logId`, and SUFFIX tells us if the
metadata store is a memory or ssd storage engine file.
This format is being changed to `log2-<KV PAIRS>-<UUID>.<SUFFIX>`, where KV
PAIRS is a small amount of information encoded into the file name to give us
the metadata *about* the file that is required. According to POSIX, the
characters allowed for "fully portable filenames" are `A-Z a-z 0-9 . _ -` and
the filename length should stay under 255 characters. This leaves `_` as the
only character not already used. Therefore, the KV pair encoding is
`K1_V1_K2_V2_...`: keys and values are separated by `_`, and KV pairs are
likewise separated by `_`.
The currently supported keys are:
V
: A copy of `log_version`
LS
: `log_spill`, a new configuration option in 6.1
and any unrecognized keys are ignored, which will likely help forward compatibility.
An example file name is `log2-V_3_LS_2-46a5f353ac18d787852d44c3a2e51527-0.fdq`.
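For illustration, here is a minimal sketch of decoding the KV pairs from such a file name (Go is used for brevity; the helper name is hypothetical, and the actual fdbserver code is C++):
```go
import "strings"

// parseLogFileKVPairs extracts the KV pairs from a name such as
// "log2-V_3_LS_2-46a5f353ac18d787852d44c3a2e51527-0.fdq".
func parseLogFileKVPairs(name string) map[string]string {
	kvs := map[string]string{}
	parts := strings.SplitN(name, "-", 3)
	if len(parts) < 3 || parts[0] != "log2" {
		return kvs // not a new-format metadata store file
	}
	tokens := strings.Split(parts[1], "_") // K1_V1_K2_V2_...
	for i := 0; i+1 < len(tokens); i += 2 {
		kvs[tokens[i]] = tokens[i+1] // unrecognized keys are simply ignored
	}
	return kvs
}
```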
### Testing
`SimulationConfig` has been changed to randomly set `log_version` according to
what is supported. This means that with restarting upgrade tests that simulate
upgrading from `N` to `N+1`, the `N+1` version will see files that came from an
FDB running with any `log_version` value that was previously supported. If
`N+1` can't handle the files correctly, then the simulation test will fail.
`ConfigureTest` tries randomly toggling `log_version` up and down in a live
database, along with all the other log related options. Some are valid, some
are invalid and should be rejected, or will cause ASSERTs in later parts of the
code.
I've added a new test, `ConfigureTestRestart` that tests changing
configurations and then upgrading FDB, to cover testing that upgrades still
happen correctly when `log_version` has been changed. This also verifies that
on-disk formats for those `log_version`s are still loadable by future FDB
versions.
There are no tests that mix the `ConfigureDatabase` and `Attrition` workloads.
It would be good to do so, to cover the case of `log_version` changes in the
presence of failures, but one cannot be added easily. The simulator calculates
what processes/machines are safe to kill by looking at the current
configuration. For `ConfigureTest`, this isn't good enough, because `triple`
could mean that there are three replicas, or that the FDB cluster just changed
from `single` to `triple` and only has one replica of data until data
distribution finishes. It would be good to add a `ConfigureKillTest` sometime
in the future.
For FDB to actually announce that rolling back from `N+1` to `N` is supported,
there will need to be downgrade tests from `N+1` to `N` also. The default in
`N+1` should always be recoverable within `N`. As FDB isn't promising forward
compatibility yet, these tests haven't been implemented.
# Transaction Log Forward Compatibility Operational Guide
## Notable Behavior Changes
When the release notes mention that a new `log_version` is available, it's worth
considering upgrading `log_version` after deploying that release. Doing so will
allow a controlled upgrade, and reduce the number of new changes that will
take effect when upgrading to the next release.
## Observability
* When running with a non-default `log_version`, the setting will appear in `fdbcli> status`.
## Monitoring and Alerting
If anything you run relies on the file names the transaction log uses, be aware that they are changing.
<!-- Force long-style table of contents -->
<script>window.markdeepOptions={}; window.markdeepOptions.tocStyle="long";</script>
<!-- When printed, top level section headers should force page breaks -->
<style>.md h1, .md .nonumberh1 {page-break-before:always}</style>
<!-- Markdeep: -->
<style class="fallback">body{visibility:hidden;white-space:pre;font-family:monospace}</style><script src="markdeep.min.js" charset="utf-8"></script><script src="https://casual-effects.com/markdeep/latest/markdeep.min.js" charset="utf-8"></script><script>window.alreadyProcessedMarkdeep||(document.body.style.visibility="visible")</script>

View File

@ -5,9 +5,16 @@ Release Notes
7.0.0
=====
Features
--------
* Improved the slow task profiler to also report backtraces for periods when the run loop is saturated. `(PR #2608) <https://github.com/apple/foundationdb/pull/2608>`_
Performance
-----------
* Improved GRV tail latencies, particularly as the transaction rate gets nearer the ratekeeper limit. `(PR #2735) <https://github.com/apple/foundationdb/pull/2735>`_
* The proxies are now more responsive to changes in workload when unthrottling lower priority transactions. `(PR #2735) <https://github.com/apple/foundationdb/pull/2735>`_
Fixes
-----
@ -20,6 +27,10 @@ Bindings
* API version updated to 630. See the :ref:`API version upgrade guide <api-version-upgrade-guide-630>` for upgrade details.
* Java: Introduced ``keyAfter`` utility function that can be used to create the immediate next key for a given byte array. `(PR #2458) <https://github.com/apple/foundationdb/pull/2458>`_
* C: The ``FDBKeyValue`` struct's ``key`` and ``value`` members have changed type from ``void*`` to ``uint8_t*``. `(PR #2622) <https://github.com/apple/foundationdb/pull/2622>`_
* Deprecated ``enable_slow_task_profiling`` transaction option and replaced it with ``enable_run_loop_profiling``. `(PR #2608) <https://github.com/apple/foundationdb/pull/2608>`_
* Go: Added a ``Close`` function to ``RangeIterator`` which **must** be called to free resources returned from ``Transaction.GetRange``. `(PR #1910) <https://github.com/apple/foundationdb/pull/1910>`_.
* Go: Finalizers are no longer used to clean up native resources. ``Future`` results are now copied from the native heap to the Go heap, and native resources are freed immediately. `(PR #1910) <https://github.com/apple/foundationdb/pull/1910>`_.
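  A minimal sketch of the resulting usage pattern (illustrative only; it mirrors the binding's ``ExampleRangeIterator``)::

      _, e := db.Transact(func(tr fdb.Transaction) (interface{}, error) {
          ri := tr.GetRange(fdb.KeyRange{fdb.Key(""), fdb.Key{0xFF}}, fdb.RangeOptions{}).Iterator()
          defer ri.Close() // releases native memory; returned KeyValues are copies and remain valid
          for ri.Advance() {
              kv, e := ri.Get()
              if e != nil {
                  return nil, e
              }
              _ = kv
          }
          return nil, nil
      })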
Other Changes
-------------

View File

@ -115,6 +115,7 @@ std::string BackupDescription::toString() const {
info.append(format("URL: %s\n", url.c_str()));
info.append(format("Restorable: %s\n", maxRestorableVersion.present() ? "true" : "false"));
info.append(format("Partitioned logs: %s\n", partitioned ? "true" : "false"));
auto formatVersion = [&](Version v) {
std::string s;
@ -169,6 +170,7 @@ std::string BackupDescription::toJSON() const {
doc.setKey("SchemaVersion", "1.0.0");
doc.setKey("URL", url.c_str());
doc.setKey("Restorable", maxRestorableVersion.present());
doc.setKey("Partitioned", partitioned);
auto formatVersion = [&](Version v) {
JsonBuilderObject doc;
@ -243,10 +245,10 @@ std::string BackupDescription::toJSON() const {
* /plogs/...log,startVersion,endVersion,UID,tagID-of-N,blocksize
* /logs/.../log,startVersion,endVersion,UID,blockSize
* where ... is a multi level path which sorts lexically into version order and results in approximately 1
* unique folder per day containing about 5,000 files. Logs after 7.0 are stored in "plogs"
* directory and are partitioned according to tagIDs (0, 1, 2, ...) and the total number
* partitions is N. Logs before 7.0 are
* stored in "logs" directory and are not partitioned.
* unique folder per day containing about 5,000 files. Logs after FDB 6.3 are stored in "plogs"
* directory and are partitioned according to tagIDs (0, 1, 2, ...) and the total number of partitions is N.
* Old backup logs FDB 6.2 and earlier are stored in "logs" directory and are not partitioned.
* After FDB 6.3, users can choose to use the new partitioned logs or old logs.
*
*
* BACKWARD COMPATIBILITY
@ -657,18 +659,6 @@ public:
return dumpFileList_impl(Reference<BackupContainerFileSystem>::addRef(this), begin, end);
}
ACTOR static Future<bool> isPartitionedBackup_impl(Reference<BackupContainerFileSystem> bc) {
BackupFileList list = wait(bc->dumpFileList(0, std::numeric_limits<Version>::max()));
for (const auto& file : list.logs) {
if (file.isPartitionedLog()) return true;
}
return false;
}
Future<bool> isPartitionedBackup() final {
return isPartitionedBackup_impl(Reference<BackupContainerFileSystem>::addRef(this));
}
static Version resolveRelativeVersion(Optional<Version> max, Version v, const char *name, Error e) {
if(v == invalidVersion) {
TraceEvent(SevError, "BackupExpireInvalidVersion").detail(name, v);
@ -704,7 +694,8 @@ public:
}
}
ACTOR static Future<BackupDescription> describeBackup_impl(Reference<BackupContainerFileSystem> bc, bool deepScan, Version logStartVersionOverride, bool partitioned) {
ACTOR static Future<BackupDescription> describeBackup_impl(Reference<BackupContainerFileSystem> bc, bool deepScan,
Version logStartVersionOverride) {
state BackupDescription desc;
desc.url = bc->getURL();
@ -722,8 +713,7 @@ public:
// from which to resolve the relative version.
// This could be handled more efficiently without recursion but it's tricky, this will do for now.
if(logStartVersionOverride != invalidVersion && logStartVersionOverride < 0) {
BackupDescription tmp = wait(partitioned ? bc->describePartitionedBackup(false, invalidVersion)
: bc->describeBackup(false, invalidVersion));
BackupDescription tmp = wait(bc->describeBackup(false, invalidVersion));
logStartVersionOverride = resolveRelativeVersion(tmp.maxLogEnd, logStartVersionOverride,
"LogStartVersionOverride", invalid_option_value());
}
@ -733,10 +723,12 @@ public:
state Optional<Version> metaLogEnd;
state Optional<Version> metaExpiredEnd;
state Optional<Version> metaUnreliableEnd;
state Optional<Version> metaLogType;
std::vector<Future<Void>> metaReads;
metaReads.push_back(store(metaExpiredEnd, bc->expiredEndVersion().get()));
metaReads.push_back(store(metaUnreliableEnd, bc->unreliableEndVersion().get()));
metaReads.push_back(store(metaLogType, bc->logType().get()));
// Only read log begin/end versions if not doing a deep scan, otherwise scan files and recalculate them.
if(!deepScan) {
@ -747,12 +739,13 @@ public:
wait(waitForAll(metaReads));
TraceEvent("BackupContainerDescribe2")
.detail("URL", bc->getURL())
.detail("LogStartVersionOverride", logStartVersionOverride)
.detail("ExpiredEndVersion", metaExpiredEnd.orDefault(invalidVersion))
.detail("UnreliableEndVersion", metaUnreliableEnd.orDefault(invalidVersion))
.detail("LogBeginVersion", metaLogBegin.orDefault(invalidVersion))
.detail("LogEndVersion", metaLogEnd.orDefault(invalidVersion));
.detail("URL", bc->getURL())
.detail("LogStartVersionOverride", logStartVersionOverride)
.detail("ExpiredEndVersion", metaExpiredEnd.orDefault(invalidVersion))
.detail("UnreliableEndVersion", metaUnreliableEnd.orDefault(invalidVersion))
.detail("LogBeginVersion", metaLogBegin.orDefault(invalidVersion))
.detail("LogEndVersion", metaLogEnd.orDefault(invalidVersion))
.detail("LogType", metaLogType.orDefault(-1));
// If the logStartVersionOverride is positive (not relative) then ensure that unreliableEndVersion is equal or greater
if(logStartVersionOverride != invalidVersion && metaUnreliableEnd.orDefault(invalidVersion) < logStartVersionOverride) {
@ -811,9 +804,18 @@ public:
}
state std::vector<LogFile> logs;
wait(store(logs, bc->listLogFiles(scanBegin, scanEnd, partitioned)) &&
state std::vector<LogFile> plogs;
wait(store(logs, bc->listLogFiles(scanBegin, scanEnd, false)) &&
store(plogs, bc->listLogFiles(scanBegin, scanEnd, true)) &&
store(desc.snapshots, bc->listKeyspaceSnapshots()));
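// Any partitioned ("plogs") files present imply a partitioned backup;
// otherwise fall back to the log type recorded in the backup's metadata.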
if (plogs.size() > 0) {
desc.partitioned = true;
logs.swap(plogs);
} else {
desc.partitioned = metaLogType.present() && metaLogType.get() == PARTITIONED_MUTATION_LOG;
}
// List logs in version order so log continuity can be analyzed
std::sort(logs.begin(), logs.end());
@ -823,7 +825,7 @@ public:
// If we didn't get log versions above then seed them using the first log file
if (!desc.contiguousLogEnd.present()) {
desc.minLogBegin = logs.begin()->beginVersion;
if (partitioned) {
if (desc.partitioned) {
// Cannot use the first file's end version, which may not be contiguous
// for other partitions. Set to its beginVersion to be safe.
desc.contiguousLogEnd = logs.begin()->beginVersion;
@ -832,7 +834,7 @@ public:
}
}
if (partitioned) {
if (desc.partitioned) {
updatePartitionedLogsContinuousEnd(&desc, logs, scanBegin, scanEnd);
} else {
Version& end = desc.contiguousLogEnd.get();
@ -858,6 +860,11 @@ public:
updates = updates && bc->logEndVersion().set(desc.contiguousLogEnd.get());
}
if (!metaLogType.present()) {
updates = updates && bc->logType().set(desc.partitioned ? PARTITIONED_MUTATION_LOG
: NON_PARTITIONED_MUTATION_LOG);
}
wait(updates);
} catch(Error &e) {
if(e.code() == error_code_actor_cancelled)
@ -906,11 +913,8 @@ public:
// Uses the virtual methods to describe the backup contents
Future<BackupDescription> describeBackup(bool deepScan, Version logStartVersionOverride) final {
return describeBackup_impl(Reference<BackupContainerFileSystem>::addRef(this), deepScan, logStartVersionOverride, false);
}
Future<BackupDescription> describePartitionedBackup(bool deepScan, Version logStartVersionOverride) final {
return describeBackup_impl(Reference<BackupContainerFileSystem>::addRef(this), deepScan, logStartVersionOverride, true);
return describeBackup_impl(Reference<BackupContainerFileSystem>::addRef(this), deepScan,
logStartVersionOverride);
}
ACTOR static Future<Void> expireData_impl(Reference<BackupContainerFileSystem> bc, Version expireEndVersion, bool force, ExpireProgress *progress, Version restorableBeginVersion) {
@ -1287,7 +1291,7 @@ public:
return end;
}
ACTOR static Future<Optional<RestorableFileSet>> getRestoreSet_impl(Reference<BackupContainerFileSystem> bc, Version targetVersion, bool partitioned) {
ACTOR static Future<Optional<RestorableFileSet>> getRestoreSet_impl(Reference<BackupContainerFileSystem> bc, Version targetVersion) {
// Find the most recent keyrange snapshot to end at or before targetVersion
state Optional<KeyspaceSnapshotFile> snapshot;
std::vector<KeyspaceSnapshotFile> snapshots = wait(bc->listKeyspaceSnapshots());
@ -1311,9 +1315,13 @@ public:
}
// FIXME: check if there are tagged logs. for each tag, there is no version gap.
state std::vector<LogFile> logs = wait(bc->listLogFiles(snapshot.get().beginVersion, targetVersion, partitioned));
state std::vector<LogFile> logs;
state std::vector<LogFile> plogs;
wait(store(logs, bc->listLogFiles(snapshot.get().beginVersion, targetVersion, false)) &&
store(plogs, bc->listLogFiles(snapshot.get().beginVersion, targetVersion, true)));
if (partitioned) {
if (plogs.size() > 0) {
logs.swap(plogs);
// sort by tag ID so that filterDuplicates works.
std::sort(logs.begin(), logs.end(), [](const LogFile& a, const LogFile& b) {
return std::tie(a.tagId, a.beginVersion, a.endVersion) <
@ -1349,11 +1357,7 @@ public:
}
Future<Optional<RestorableFileSet>> getRestoreSet(Version targetVersion) final {
return getRestoreSet_impl(Reference<BackupContainerFileSystem>::addRef(this), targetVersion, false);
}
Future<Optional<RestorableFileSet>> getPartitionedRestoreSet(Version targetVersion) final {
return getRestoreSet_impl(Reference<BackupContainerFileSystem>::addRef(this), targetVersion, true);
return getRestoreSet_impl(Reference<BackupContainerFileSystem>::addRef(this), targetVersion);
}
private:
@ -1388,6 +1392,11 @@ public:
VersionProperty expiredEndVersion() { return {Reference<BackupContainerFileSystem>::addRef(this), "expired_end_version"}; }
VersionProperty unreliableEndVersion() { return {Reference<BackupContainerFileSystem>::addRef(this), "unreliable_end_version"}; }
// Backup log types
const static Version NON_PARTITIONED_MUTATION_LOG = 0;
const static Version PARTITIONED_MUTATION_LOG = 1;
VersionProperty logType() { return { Reference<BackupContainerFileSystem>::addRef(this), "mutation_log_type" }; }
ACTOR static Future<Void> writeVersionProperty(Reference<BackupContainerFileSystem> bc, std::string path, Version v) {
try {
state Reference<IBackupFile> f = wait(bc->writeFile(path));

View File

@ -178,6 +178,7 @@ struct BackupDescription {
// The minimum version which this backup can be used to restore to
Optional<Version> minRestorableVersion;
std::string extendedDetail; // Freeform container-specific info.
bool partitioned; // If this backup contains partitioned mutation logs.
// Resolves the versions above to timestamps using a given database's TimeKeeper data.
// toString will use this information if present.
@ -260,23 +261,12 @@ public:
// be after deleting all data prior to logStartVersionOverride.
virtual Future<BackupDescription> describeBackup(bool deepScan = false, Version logStartVersionOverride = invalidVersion) = 0;
// The same as above, except using partitioned mutation logs.
virtual Future<BackupDescription> describePartitionedBackup(bool deepScan = false, Version logStartVersionOverride = invalidVersion) = 0;
virtual Future<BackupFileList> dumpFileList(Version begin = 0, Version end = std::numeric_limits<Version>::max()) = 0;
// If there are partitioned log files, then returns true; otherwise, returns false.
virtual Future<bool> isPartitionedBackup() = 0;
// Get exactly the files necessary to restore to targetVersion. Returns non-present if
// restore to given version is not possible.
virtual Future<Optional<RestorableFileSet>> getRestoreSet(Version targetVersion) = 0;
// Get exactly the files necessary to restore to targetVersion. Returns non-present if
// restore to given version is not possible. This is intended for parallel
// restore in FDB 7.0, which reads partitioned mutation logs.
virtual Future<Optional<RestorableFileSet>> getPartitionedRestoreSet(Version targetVersion) = 0;
// Get an IBackupContainer based on a container spec string
static Reference<IBackupContainer> openContainer(std::string url);
static std::vector<std::string> getURLFormats();

View File

@ -820,6 +820,11 @@ struct LogMessageVersion {
explicit LogMessageVersion(Version version) : version(version), sub(0) {}
LogMessageVersion() : version(0), sub(0) {}
bool empty() const { return (version == 0) && (sub == 0); }
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, version, sub);
}
};
struct AddressExclusion {

View File

@ -75,7 +75,7 @@ TLSConfig tlsConfig(TLSEndpointType::CLIENT);
NetworkOptions::NetworkOptions()
: localAddress(""), clusterFile(""), traceDirectory(Optional<std::string>()),
traceRollSize(TRACE_DEFAULT_ROLL_SIZE), traceMaxLogsSize(TRACE_DEFAULT_MAX_LOGS_SIZE), traceLogGroup("default"),
traceFormat("xml"), traceClockSource("now"), slowTaskProfilingEnabled(false) {
traceFormat("xml"), traceClockSource("now"), runLoopProfilingEnabled(false) {
Standalone<VectorRef<ClientVersionRef>> defaultSupportedVersions;
@ -1010,9 +1010,9 @@ void setNetworkOption(FDBNetworkOptions::Option option, Optional<StringRef> valu
break;
}
case FDBNetworkOptions::ENABLE_SLOW_TASK_PROFILING:
case FDBNetworkOptions::ENABLE_RUN_LOOP_PROFILING: // Same as ENABLE_SLOW_TASK_PROFILING
validateOptionValue(value, false);
networkOptions.slowTaskProfilingEnabled = true;
networkOptions.runLoopProfilingEnabled = true;
break;
default:
break;
@ -1035,8 +1035,8 @@ void runNetwork() {
if(!g_network)
throw network_not_setup();
if(networkOptions.traceDirectory.present() && networkOptions.slowTaskProfilingEnabled) {
setupSlowTaskProfiler();
if(networkOptions.traceDirectory.present() && networkOptions.runLoopProfilingEnabled) {
setupRunLoopProfiler();
}
g_network->run();
@ -2233,6 +2233,8 @@ ACTOR Future<Standalone<VectorRef<const char*>>> getAddressesForKeyActor(Key key
// If key >= allKeys.end, then getRange will return a kv-pair with an empty value. This will result in our serverInterfaces vector being empty, which will cause us to return an empty addresses list.
state Key ksKey = keyServersKey(key);
state Standalone<RangeResultRef> serverTagResult = wait( getRange(cx, ver, lastLessOrEqual(serverTagKeys.begin), firstGreaterThan(serverTagKeys.end), GetRangeLimits(CLIENT_KNOBS->TOO_MANY), false, info ) );
ASSERT( !serverTagResult.more && serverTagResult.size() < CLIENT_KNOBS->TOO_MANY );
Future<Standalone<RangeResultRef>> futureServerUids = getRange(cx, ver, lastLessOrEqual(ksKey), firstGreaterThan(ksKey), GetRangeLimits(1), false, info);
Standalone<RangeResultRef> serverUids = wait( futureServerUids );
@ -2240,7 +2242,7 @@ ACTOR Future<Standalone<VectorRef<const char*>>> getAddressesForKeyActor(Key key
vector<UID> src;
vector<UID> ignore; // 'ignore' is so named because it is the vector into which we decode the 'dest' servers in the case where this key is being relocated. But 'src' is the canonical location until the move is finished, because it could be cancelled at any time.
decodeKeyServersValue(serverUids[0].value, src, ignore);
decodeKeyServersValue(serverTagResult, serverUids[0].value, src, ignore);
Optional<vector<StorageServerInterface>> serverInterfaces = wait( transactionalGetServerInterfaces(ver, cx, info, src) );
ASSERT( serverInterfaces.present() ); // since this is happening transactionally, /FF/keyServers and /FF/serverList need to be consistent with one another

View File

@ -61,7 +61,7 @@ struct NetworkOptions {
std::string traceFileIdentifier;
Optional<bool> logClientInfo;
Reference<ReferencedObject<Standalone<VectorRef<ClientVersionRef>>>> supportedVersions;
bool slowTaskProfilingEnabled;
bool runLoopProfilingEnabled;
NetworkOptions();
};

View File

@ -460,31 +460,31 @@ struct RestoreSendVersionedMutationsRequest : TimedRequest {
int batchIndex; // version batch index
RestoreAsset asset; // Unique identifier for the current restore asset
Version prevVersion, version; // version is the commitVersion of the mutation vector.
Version msgIndex; // Monotonically increasing index of mutation messages
bool isRangeFile;
MutationsVec mutations; // All mutations at the same version parsed by one loader
SubSequenceVec subs; // Sub-sequence number for mutations
MutationsVec mutations; // Mutations that may be at different versions parsed by one loader
LogMessageVersionVec mVersions; // (version, subversion) of each mutation in mutations field
ReplyPromise<RestoreCommonReply> reply;
RestoreSendVersionedMutationsRequest() = default;
explicit RestoreSendVersionedMutationsRequest(int batchIndex, const RestoreAsset& asset, Version prevVersion,
Version version, bool isRangeFile, MutationsVec mutations,
SubSequenceVec subs)
: batchIndex(batchIndex), asset(asset), prevVersion(prevVersion), version(version), isRangeFile(isRangeFile),
mutations(mutations), subs(subs) {}
explicit RestoreSendVersionedMutationsRequest(int batchIndex, const RestoreAsset& asset, Version msgIndex,
bool isRangeFile, MutationsVec mutations,
LogMessageVersionVec mVersions)
: batchIndex(batchIndex), asset(asset), msgIndex(msgIndex), isRangeFile(isRangeFile), mutations(mutations),
mVersions(mVersions) {}
std::string toString() {
std::stringstream ss;
ss << "VersionBatchIndex:" << batchIndex << "RestoreAsset:" << asset.toString()
<< " prevVersion:" << prevVersion << " version:" << version << " isRangeFile:" << isRangeFile
<< " mutations.size:" << mutations.size() << " subs.size:" << subs.size();
ss << "VersionBatchIndex:" << batchIndex << "RestoreAsset:" << asset.toString() << " msgIndex:" << msgIndex
<< " isRangeFile:" << isRangeFile << " mutations.size:" << mutations.size()
<< " mVersions.size:" << mVersions.size();
return ss.str();
}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, batchIndex, asset, prevVersion, version, isRangeFile, mutations, subs, reply);
serializer(ar, batchIndex, asset, msgIndex, isRangeFile, mutations, mVersions, reply);
}
};

View File

@ -21,6 +21,8 @@
#include "fdbclient/SystemData.h"
#include "fdbclient/StorageServerInterface.h"
#include "flow/TDMetric.actor.h"
#include "fdbclient/NativeAPI.actor.h"
const KeyRef systemKeysPrefix = LiteralStringRef("\xff");
const KeyRangeRef normalKeys(KeyRef(), systemKeysPrefix);
@ -43,20 +45,67 @@ const Key keyServersKey( const KeyRef& k ) {
const KeyRef keyServersKey( const KeyRef& k, Arena& arena ) {
return k.withPrefix( keyServersPrefix, arena );
}
const Value keyServersValue( const vector<UID>& src, const vector<UID>& dest ) {
const Value keyServersValue( Standalone<RangeResultRef> result, const std::vector<UID>& src, const std::vector<UID>& dest ) {
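// Translate the src/dest server UIDs into Tags using the \xff/serverTag/
// range result, so the encoded value stores compact Tags instead of UIDs.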
std::vector<Tag> srcTag;
std::vector<Tag> destTag;
for (const KeyValueRef kv : result) {
UID uid = decodeServerTagKey(kv.key);
if (std::find(src.begin(), src.end(), uid) != src.end()) {
srcTag.push_back( decodeServerTagValue(kv.value) );
}
if (std::find(dest.begin(), dest.end(), uid) != dest.end()) {
destTag.push_back( decodeServerTagValue(kv.value) );
}
}
return keyServersValue(srcTag, destTag);
}
const Value keyServersValue( const std::vector<Tag>& srcTag, const std::vector<Tag>& destTag ) {
// src and dest are expected to be sorted
ASSERT( std::is_sorted(src.begin(), src.end()) && std::is_sorted(dest.begin(), dest.end()) );
BinaryWriter wr((IncludeVersion())); wr << src << dest;
BinaryWriter wr(IncludeVersion()); wr << srcTag << destTag;
return wr.toValue();
}
void decodeKeyServersValue( const ValueRef& value, vector<UID>& src, vector<UID>& dest ) {
if (value.size()) {
BinaryReader rd(value, IncludeVersion());
rd >> src >> dest;
} else {
void decodeKeyServersValue( Standalone<RangeResultRef> result, const ValueRef& value,
std::vector<UID>& src, std::vector<UID>& dest ) {
if (value.size() == 0) {
src.clear();
dest.clear();
return;
}
BinaryReader rd(value, IncludeVersion());
rd.checkpoint();
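// Probe the encoding: read the src length, skip that many Tags, then read the
// dest length, so we can check whether the value size matches the new Tag format.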
int srcLen, destLen;
rd >> srcLen;
rd.readBytes(srcLen * sizeof(Tag));
rd >> destLen;
rd.rewind();
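// A size mismatch means this is an old-format value storing the UID vectors directly.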
if (value.size() != sizeof(ProtocolVersion) + sizeof(int) + srcLen * sizeof(Tag) + sizeof(int) + destLen * sizeof(Tag)) {
rd >> src >> dest;
rd.assertEnd();
return;
}
std::vector<Tag> srcTag, destTag;
rd >> srcTag >> destTag;
src.clear();
dest.clear();
for (const KeyValueRef kv : result) {
Tag tag = decodeServerTagValue(kv.value);
if (std::find(srcTag.begin(), srcTag.end(), tag) != srcTag.end()) {
src.push_back( decodeServerTagKey(kv.key) );
}
if (std::find(destTag.begin(), destTag.end(), tag) != destTag.end()) {
dest.push_back( decodeServerTagKey(kv.key) );
}
}
std::sort(src.begin(), src.end());
std::sort(dest.begin(), dest.end());
}
const KeyRef conflictingKeysPrefix = LiteralStringRef("/transaction/conflicting_keys/");

View File

@ -28,6 +28,10 @@
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/RestoreWorkerInterface.actor.h"
// Don't warn on constants being defined in this file.
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-variable"
struct RestoreLoaderInterface;
struct RestoreApplierInterface;
struct RestoreMasterInterface;
@ -39,23 +43,28 @@ extern const KeyRangeRef allKeys; // '' to systemKeys.end
extern const KeyRangeRef specialKeys; // [FF][FF] to [FF][FF][FF][FF]
extern const KeyRef afterAllKeys;
// "\xff/keyServers/[[begin]]" := "[[vector<serverID>, vector<serverID>]]"
// "\xff/keyServers/[[begin]]" := "[[vector<serverID>, vector<serverID>]|[vector<Tag>, vector<Tag>]]"
extern const KeyRangeRef keyServersKeys, keyServersKeyServersKeys;
extern const KeyRef keyServersPrefix, keyServersEnd, keyServersKeyServersKey;
const Key keyServersKey( const KeyRef& k );
const KeyRef keyServersKey( const KeyRef& k, Arena& arena );
const Value keyServersValue(
const vector<UID>& src,
const vector<UID>& dest = vector<UID>() );
void decodeKeyServersValue( const ValueRef& value,
vector<UID>& src, vector<UID>& dest );
Standalone<RangeResultRef> result,
const std::vector<UID>& src,
const std::vector<UID>& dest = std::vector<UID>() );
const Value keyServersValue(
const std::vector<Tag>& srcTag,
const std::vector<Tag>& destTag = std::vector<Tag>());
// `result` must be the full result of getting serverTagKeys
void decodeKeyServersValue( Standalone<RangeResultRef> result, const ValueRef& value,
std::vector<UID>& src, std::vector<UID>& dest );
// "\xff/storageCache/[[begin]]" := "[[vector<uint16_t>]]"
extern const KeyRangeRef storageCacheKeys;
extern const KeyRef storageCachePrefix;
const Key storageCacheKey( const KeyRef& k );
const Value storageCacheValue( const vector<uint16_t>& serverIndices );
void decodeStorageCacheValue( const ValueRef& value, vector<uint16_t>& serverIndices );
const Value storageCacheValue( const std::vector<uint16_t>& serverIndices );
void decodeStorageCacheValue( const ValueRef& value, std::vector<uint16_t>& serverIndices );
// "\xff/serverKeys/[[serverID]]/[[begin]]" := "" | "1" | "2"
extern const KeyRef serverKeysPrefix;
@ -82,6 +91,7 @@ extern const KeyRef cacheChangePrefix;
const Key cacheChangeKeyFor( uint16_t idx );
uint16_t cacheChangeKeyDecodeIndex( const KeyRef& key );
// "\xff/serverTag/[[serverID]]" = "[[Tag]]"
extern const KeyRangeRef serverTagKeys;
extern const KeyRef serverTagPrefix;
extern const KeyRangeRef serverTagMaxKeys;
@ -366,4 +376,6 @@ std::pair<Key,Version> decodeHealthyZoneValue( ValueRef const& );
// Used to create artificially large txnStateStore instances in testing.
extern const KeyRangeRef testOnlyTxnStateStorePrefixRange;
#pragma clang diagnostic pop
#endif

View File

@ -112,7 +112,9 @@ description is not currently required but encouraged.
<Option name="disable_client_statistics_logging" code="70"
description="Disables logging of client statistics, such as sampled transaction activity." />
<Option name="enable_slow_task_profiling" code="71"
description="Enables debugging feature to perform slow task profiling. Requires trace logging to be enabled. WARNING: this feature is not recommended for use in production." />
description="Deprecated" />
<Option name="enable_run_loop_profiling" code="71"
description="Enables debugging feature to perform run loop profiling. Requires trace logging to be enabled. WARNING: this feature is not recommended for use in production." />
<Option name="client_buggify_enable" code="80"
description="Enable client buggify - will make requests randomly fail (intended for client testing)" />
<Option name="client_buggify_disable" code="81"

View File

@ -97,6 +97,7 @@ public:
#endif
static Future<Reference<IAsyncFile>> open( std::string filename, int flags, int mode, void* ignore ) {
ASSERT( !FLOW_KNOBS->DISABLE_POSIX_KERNEL_AIO );
ASSERT( flags & OPEN_UNBUFFERED );
if (flags & OPEN_LOCK)
@ -153,6 +154,7 @@ public:
}
static void init( Reference<IEventFD> ev, double ioTimeout ) {
ASSERT( !FLOW_KNOBS->DISABLE_POSIX_KERNEL_AIO );
if( !g_network->isSimulated() ) {
ctx.countAIOSubmit.init(LiteralStringRef("AsyncFile.CountAIOSubmit"));
ctx.countAIOCollect.init(LiteralStringRef("AsyncFile.CountAIOCollect"));
@ -578,7 +580,7 @@ private:
static Context ctx;
explicit AsyncFileKAIO(int fd, int flags, std::string const& filename) : fd(fd), flags(flags), filename(filename), failed(false) {
ASSERT( !FLOW_KNOBS->DISABLE_POSIX_KERNEL_AIO );
if( !g_network->isSimulated() ) {
countFileLogicalWrites.init(LiteralStringRef("AsyncFile.CountFileLogicalWrites"), filename);
countFileLogicalReads.init( LiteralStringRef("AsyncFile.CountFileLogicalReads"), filename);

View File

@ -1360,3 +1360,94 @@ TEST_CASE("/flow/DeterministicRandom/SignedOverflow") {
std::numeric_limits<int64_t>::max() - 1);
return Void();
}
struct Tracker {
int copied;
bool moved;
Tracker(int copied = 0) : moved(false), copied(copied) {}
Tracker(Tracker&& other) : Tracker(other.copied) {
ASSERT(!other.moved);
other.moved = true;
}
Tracker& operator=(Tracker&& other) {
ASSERT(!other.moved);
other.moved = true;
this->moved = false;
this->copied = other.copied;
return *this;
}
Tracker(const Tracker& other) : Tracker(other.copied + 1) { ASSERT(!other.moved); }
Tracker& operator=(const Tracker& other) {
ASSERT(!other.moved);
this->moved = false;
this->copied = other.copied + 1;
return *this;
}
ACTOR static Future<Void> listen(FutureStream<Tracker> stream) {
Tracker t = waitNext(stream);
ASSERT(!t.moved);
ASSERT(t.copied == 0);
return Void();
}
};
TEST_CASE("/flow/flow/PromiseStream/move") {
state PromiseStream<Tracker> stream;
{
// This tests the case when a callback is added before
// a movable value is sent
state Future<Void> listener = Tracker::listen(stream.getFuture());
stream.send(Tracker{});
wait(listener);
}
{
// This tests the case when a callback is added before
// an unmovable value is sent
listener = Tracker::listen(stream.getFuture());
Tracker namedTracker;
stream.send(namedTracker);
wait(listener);
}
{
// This tests the case when no callback is added until
// after a movable value is sent
stream.send(Tracker{});
stream.send(Tracker{});
{
Tracker t = waitNext(stream.getFuture());
ASSERT(!t.moved);
ASSERT(t.copied == 0);
}
choose {
when(Tracker t = waitNext(stream.getFuture())) {
ASSERT(!t.moved);
ASSERT(t.copied == 0);
}
}
}
{
// This tests the case when no callback is added until
// after an unmovable value is sent
Tracker namedTracker1;
Tracker namedTracker2;
stream.send(namedTracker1);
stream.send(namedTracker2);
{
Tracker t = waitNext(stream.getFuture());
ASSERT(!t.moved);
// must copy onto queue
ASSERT(t.copied == 1);
}
choose {
when(Tracker t = waitNext(stream.getFuture())) {
ASSERT(!t.moved);
// must copy onto queue
ASSERT(t.copied == 1);
}
}
}
return Void();
}

View File

@ -93,7 +93,8 @@ Net2FileSystem::Net2FileSystem(double ioTimeout, std::string fileSystemPath)
{
Net2AsyncFile::init();
#ifdef __linux__
AsyncFileKAIO::init( Reference<IEventFD>(N2::ASIOReactor::getEventFD()), ioTimeout );
if (!FLOW_KNOBS->DISABLE_POSIX_KERNEL_AIO)
AsyncFileKAIO::init( Reference<IEventFD>(N2::ASIOReactor::getEventFD()), ioTimeout );
if (fileSystemPath.empty()) {
checkFileSystem = false;

View File

@ -253,6 +253,15 @@ public:
else
queue->send(value);
}
void send(T&& value) const {
if (queue->isRemoteEndpoint()) {
FlowTransport::transport().sendUnreliable(SerializeSource<T>(std::move(value)), getEndpoint(), true);
}
else
queue->send(std::move(value));
}
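Providing both a const-reference and an rvalue-reference overload lets callers with temporaries avoid a copy, while named values are still copied and remain usable afterward. A minimal standalone sketch of the overload pair (plain std::queue standing in for the flow queue):

#include <iostream>
#include <queue>
#include <string>
#include <utility>

// Sketch of the lvalue/rvalue send pair used above: named values are copied
// onto the queue, temporaries and std::move'd values are moved.
struct LocalQueue {
	std::queue<std::string> q;
	void send(const std::string& value) { q.push(value); }       // copies
	void send(std::string&& value) { q.push(std::move(value)); } // moves
};

int main() {
	LocalQueue lq;
	std::string named = "still usable after send";
	lq.send(named);               // lvalue overload: 'named' is copied
	lq.send(std::string("temp")); // rvalue overload: payload is moved
	std::cout << lq.q.size() << "\n"; // prints 2
}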
/*void sendError(const Error& error) const {
ASSERT( !queue->isRemoteEndpoint() );
queue->sendError(error);

View File

@ -61,7 +61,10 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
KeyRef end = keyInfo->rangeContaining(k).end();
KeyRangeRef insertRange(k,end);
vector<UID> src, dest;
decodeKeyServersValue(m.param2, src, dest);
// txnStateStore is always an in-memory KVS, and must always be recovered before
// applyMetadataMutations is called, so a wait here should never be needed.
Future<Standalone<RangeResultRef>> fResult = txnStateStore->readRange(serverTagKeys);
decodeKeyServersValue(fResult.get(), m.param2, src, dest);
ASSERT(storageCache);
ServerCacheInfo info;

View File

@ -45,17 +45,21 @@ void BackupProgress::addBackupStatus(const WorkerBackupStatus& status) {
}
void BackupProgress::updateTagVersions(std::map<Tag, Version>* tagVersions, std::set<Tag>* tags,
const std::map<Tag, Version>& progress, Version endVersion, LogEpoch epoch) {
const std::map<Tag, Version>& progress, Version endVersion,
Version adjustedBeginVersion, LogEpoch epoch) {
for (const auto& [tag, savedVersion] : progress) {
// If tag is not in "tags", it means the old epoch has more tags than
// new epoch's tags. Just ignore the tag here.
auto n = tags->erase(tag);
if (n > 0 && savedVersion < endVersion - 1) {
tagVersions->insert({ tag, savedVersion + 1 });
const Version beginVersion =
(savedVersion + 1 > adjustedBeginVersion) ? (savedVersion + 1) : adjustedBeginVersion;
tagVersions->insert({ tag, beginVersion });
TraceEvent("BackupVersionRange", dbgid)
.detail("OldEpoch", epoch)
.detail("Tag", tag.toString())
.detail("BeginVersion", savedVersion + 1)
.detail("AdjustedBeginVersion", beginVersion)
.detail("EndVersion", endVersion);
}
}
@ -66,12 +70,20 @@ std::map<std::tuple<LogEpoch, Version, int>, std::map<Tag, Version>> BackupProgr
if (!backupStartedValue.present()) return toRecruit; // No active backups
Version lastEnd = invalidVersion;
for (const auto& [epoch, info] : epochInfos) {
std::set<Tag> tags = enumerateLogRouterTags(info.logRouterTags);
std::map<Tag, Version> tagVersions;
// Sometimes, an epoch's begin version is lower than the previous epoch's
// end version. In this case, adjust the epoch's begin version to be the
// same as the previous epoch's end version.
Version adjustedBeginVersion = lastEnd > info.epochBegin ? lastEnd : info.epochBegin;
lastEnd = info.epochEnd;
auto progressIt = progress.lower_bound(epoch);
if (progressIt != progress.end() && progressIt->first == epoch) {
updateTagVersions(&tagVersions, &tags, progressIt->second, info.epochEnd, epoch);
updateTagVersions(&tagVersions, &tags, progressIt->second, info.epochEnd, adjustedBeginVersion, epoch);
} else {
auto rit = std::find_if(
progress.rbegin(), progress.rend(),
@ -90,17 +102,18 @@ std::map<std::tuple<LogEpoch, Version, int>, std::map<Tag, Version>> BackupProgr
// The logRouterTags are the same
// ASSERT(info.logRouterTags == epochTags[rit->first]);
updateTagVersions(&tagVersions, &tags, rit->second, info.epochEnd, epoch);
updateTagVersions(&tagVersions, &tags, rit->second, info.epochEnd, adjustedBeginVersion, epoch);
}
}
}
for (const Tag tag : tags) { // tags without progress data
tagVersions.insert({ tag, info.epochBegin });
tagVersions.insert({ tag, adjustedBeginVersion });
TraceEvent("BackupVersionRange", dbgid)
.detail("OldEpoch", epoch)
.detail("Tag", tag.toString())
.detail("BeginVersion", info.epochBegin)
.detail("AdjustedBeginVersion", adjustedBeginVersion)
.detail("EndVersion", info.epochEnd);
}
if (!tagVersions.empty()) {

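The two adjustments above are both max operations: the epoch's begin version is clamped up to the previous epoch's end, and each tag's resume point is clamped up to that adjusted begin. A small worked sketch of the arithmetic (hypothetical helper, not the BackupProgress API):

#include <algorithm>
#include <cassert>
#include <cstdint>

using Version = int64_t;

// resumeVersionFor: where a log router tag should resume, given the previous
// epoch's end version, this epoch's nominal begin, and the tag's saved progress.
static Version resumeVersionFor(Version lastEnd, Version epochBegin, Version savedVersion) {
	Version adjustedBegin = std::max(lastEnd, epochBegin);
	return std::max(savedVersion + 1, adjustedBegin);
}

int main() {
	// The previous epoch ended at 500 but this one nominally begins at 400;
	// a tag saved through 420 resumes at 501 rather than 421.
	assert(resumeVersionFor(500, 400, 420) == 501);
	// Without overlap, the tag resumes right after its saved version.
	assert(resumeVersionFor(300, 400, 450) == 451);
	return 0;
}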
View File

@ -81,7 +81,8 @@ private:
// For each tag in progress whose saved version is smaller than endVersion - 1,
// add {tag, max(savedVersion+1, adjustedBeginVersion)} to tagVersions and remove the tag from "tags".
void updateTagVersions(std::map<Tag, Version>* tagVersions, std::set<Tag>* tags,
const std::map<Tag, Version>& progress, Version endVersion, LogEpoch epoch);
const std::map<Tag, Version>& progress, Version endVersion, Version adjustedBeginVersion,
LogEpoch epoch);
const UID dbgid;

View File

@ -179,7 +179,9 @@ struct BackupData {
config.startedBackupWorkers().set(tr, workers.get());
}
for (auto p : workers.get()) {
TraceEvent("BackupWorkerDebug", self->myId).detail("Epoch", p.first).detail("TagID", p.second);
TraceEvent("BackupWorkerDebugTag", self->myId)
.detail("Epoch", p.first)
.detail("TagID", p.second);
}
wait(tr->commit());

View File

@ -1326,7 +1326,7 @@ public:
Future<Void> outstandingRequestChecker;
Future<Void> outstandingRemoteRequestChecker;
AsyncTrigger updateDBInfo;
std::vector<Endpoint> updateDBInfoEndpoints;
std::set<Endpoint> updateDBInfoEndpoints;
std::set<Endpoint> removedDBInfoEndpoints;
DBInfo db;
@ -1732,7 +1732,7 @@ ACTOR Future<Void> workerAvailabilityWatch( WorkerInterface worker, ProcessClass
? Never()
: waitFailureClient(worker.waitFailure, SERVER_KNOBS->WORKER_FAILURE_TIME);
cluster->updateWorkerList.set( worker.locality.processId(), ProcessData(worker.locality, startingClass, worker.stableAddress()) );
cluster->updateDBInfoEndpoints.push_back(worker.updateServerDBInfo.getEndpoint());
cluster->updateDBInfoEndpoints.insert(worker.updateServerDBInfo.getEndpoint());
cluster->updateDBInfo.trigger();
// This switching avoids a race where the worker can be added to id_worker map after the workerAvailabilityWatch fails for the worker.
wait(delay(0));
@ -2395,12 +2395,12 @@ ACTOR Future<Void> statusServer(FutureStream< StatusRequest> requests,
// Get status but trap errors to send back to client.
vector<WorkerDetails> workers;
std::vector<std::pair<NetworkAddress, Standalone<VectorRef<StringRef>>>> workerIssues;
std::vector<ProcessIssues> workerIssues;
for(auto& it : self->id_worker) {
workers.push_back(it.second.details);
if(it.second.issues.size()) {
workerIssues.push_back(std::make_pair(it.second.details.interf.address(), it.second.issues));
workerIssues.push_back(ProcessIssues(it.second.details.interf.address(), it.second.issues));
}
}
@ -3032,30 +3032,26 @@ ACTOR Future<Void> dbInfoUpdater( ClusterControllerData* self ) {
when(wait(dbInfoChange)) {}
}
UpdateServerDBInfoRequest req;
if(dbInfoChange.isReady()) {
self->updateDBInfoEndpoints.clear();
for(auto &it : self->id_worker) {
self->updateDBInfoEndpoints.push_back(it.second.details.interf.updateServerDBInfo.getEndpoint());
req.broadcastInfo.push_back(it.second.details.interf.updateServerDBInfo.getEndpoint());
}
} else {
uniquify(self->updateDBInfoEndpoints);
for(int i = 0; i < self->updateDBInfoEndpoints.size(); i++) {
if(self->removedDBInfoEndpoints.count(self->updateDBInfoEndpoints[i])) {
self->updateDBInfoEndpoints[i] = self->updateDBInfoEndpoints.back();
self->updateDBInfoEndpoints.pop_back();
}
for(auto it : self->removedDBInfoEndpoints) {
self->updateDBInfoEndpoints.erase(it);
}
req.broadcastInfo = std::vector<Endpoint>(self->updateDBInfoEndpoints.begin(), self->updateDBInfoEndpoints.end());
}
self->updateDBInfoEndpoints.clear();
self->removedDBInfoEndpoints.clear();
dbInfoChange = self->db.serverInfo->onChange();
updateDBInfo = self->updateDBInfo.onTrigger();
UpdateServerDBInfoRequest req;
req.serializedDbInfo = BinaryWriter::toValue(self->db.serverInfo->get(), AssumeVersion(currentProtocolVersion));
req.broadcastInfo = self->updateDBInfoEndpoints;
self->updateDBInfoEndpoints.clear();
TraceEvent("DBInfoStartBroadcast", self->id);
choose {
when(std::vector<Endpoint> notUpdated = wait( broadcastDBInfoRequest(req, SERVER_KNOBS->DBINFO_SEND_AMOUNT, Optional<Endpoint>(), false) )) {
@ -3063,8 +3059,8 @@ ACTOR Future<Void> dbInfoUpdater( ClusterControllerData* self ) {
for(auto &it : notUpdated) {
TraceEvent("DBInfoNotUpdated", self->id).detail("Addr", it.getPrimaryAddress());
}
self->updateDBInfoEndpoints.insert(self->updateDBInfoEndpoints.end(), notUpdated.begin(), notUpdated.end());
if(notUpdated.size()) {
self->updateDBInfoEndpoints.insert(notUpdated.begin(), notUpdated.end());
self->updateDBInfo.trigger();
}
}

View File

@ -180,10 +180,10 @@ class WorkPool : public IThreadPool, public ReferenceCounted<WorkPool<Threadlike
ACTOR Future<Void> stopOnError( WorkPool* w ) {
try {
wait( w->getError() );
ASSERT(false);
} catch (Error& e) {
w->error = e;
w->stop(e);
}
w->stop();
return Void();
}
@ -230,12 +230,14 @@ public:
} else
pool->queueLock.leave();
}
virtual Future<Void> stop() {
if (error.code() == invalid_error_code) error = success();
virtual Future<Void> stop(Error const& e) {
if (error.code() == invalid_error_code) {
error = e;
}
pool->queueLock.enter();
TraceEvent("WorkPool_Stop").detail("Workers", pool->workers.size()).detail("Idle", pool->idle.size())
.detail("Work", pool->work.size());
.detail("Work", pool->work.size()).error(e, true);
for (uint32_t i=0; i<pool->work.size(); i++)
pool->work[i]->cancel(); // What if cancel() does something to this?

View File

@ -474,6 +474,8 @@ ACTOR Future<Reference<InitialDataDistribution>> getInitialDataDistribution( Dat
try {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
wait(checkMoveKeysLockReadOnly(&tr, moveKeysLock));
state Standalone<RangeResultRef> UIDtoTagMap = wait(tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT( !UIDtoTagMap.more && UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY );
Standalone<RangeResultRef> keyServers = wait(krmGetRanges(&tr, keyServersPrefix, KeyRangeRef(beginKey, allKeys.end), SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT, SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT_BYTES));
succeeded = true;
@ -482,7 +484,7 @@ ACTOR Future<Reference<InitialDataDistribution>> getInitialDataDistribution( Dat
// for each range
for(int i = 0; i < keyServers.size() - 1; i++) {
DDShardInfo info( keyServers[i].key );
decodeKeyServersValue( keyServers[i].value, src, dest );
decodeKeyServersValue( UIDtoTagMap, keyServers[i].value, src, dest );
if(remoteDcIds.size()) {
auto srcIter = team_cache.find(src);
if(srcIter == team_cache.end()) {
@ -3597,14 +3599,14 @@ ACTOR Future<Void> storageServerTracker(
if (worstStatus != DDTeamCollection::Status::NONE) {
TraceEvent(SevWarn, "UndesiredStorageServer", self->distributorId)
.detail("Server", server->id)
.detail("Excluded", worstAddr.toString());
.detail("Server", server->id)
.detail("Excluded", worstAddr.toString());
status.isUndesired = true;
status.isWrongConfiguration = true;
if (worstStatus == DDTeamCollection::Status::FAILED) {
TraceEvent(SevWarn, "FailedServerRemoveKeys", self->distributorId)
.detail("Server", server->id)
.detail("Excluded", worstAddr.toString());
.detail("Excluded", worstAddr.toString());
wait(removeKeysFromFailedServer(cx, server->id, self->lock));
if (BUGGIFY) wait(delay(5.0));
self->shardsAffectedByTeamFailure->eraseServer(server->id);

View File

@ -537,7 +537,7 @@ struct DDQueueData {
// FIXME: is the merge case needed
if( input.priority == SERVER_KNOBS->PRIORITY_MERGE_SHARD ) {
wait( delay( 0.5, decrementPriority(decrementPriority(TaskPriority::DataDistribution )) ) );
wait( delay( 0.5, TaskPriority::DataDistributionVeryLow ) );
} else {
wait( delay( 0.0001, TaskPriority::DataDistributionLaunch ) );
}
@ -546,6 +546,8 @@ struct DDQueueData {
servers.clear();
tr.setOption( FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE );
try {
state Standalone<RangeResultRef> UIDtoTagMap = wait( tr.getRange( serverTagKeys, CLIENT_KNOBS->TOO_MANY ) );
ASSERT( !UIDtoTagMap.more && UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY );
Standalone<RangeResultRef> keyServersEntries = wait(
tr.getRange( lastLessOrEqual( keyServersKey( input.keys.begin ) ),
firstGreaterOrEqual( keyServersKey( input.keys.end ) ), SERVER_KNOBS->DD_QUEUE_MAX_KEY_SERVERS ) );
@ -553,7 +555,7 @@ struct DDQueueData {
if(keyServersEntries.size() < SERVER_KNOBS->DD_QUEUE_MAX_KEY_SERVERS) {
for( int shard = 0; shard < keyServersEntries.size(); shard++ ) {
vector<UID> src, dest;
decodeKeyServersValue( keyServersEntries[shard].value, src, dest );
decodeKeyServersValue( UIDtoTagMap, keyServersEntries[shard].value, src, dest );
ASSERT( src.size() );
for( int i = 0; i < src.size(); i++ ) {
servers.insert( src[i] );

View File

@ -367,14 +367,14 @@ ACTOR Future<Void> changeSizes( DataDistributionTracker* self, KeyRange keys, in
struct HasBeenTrueFor : ReferenceCounted<HasBeenTrueFor> {
explicit HasBeenTrueFor( Optional<ShardMetrics> value ) {
if(value.present()) {
trigger = delayJittered(std::max(0.0, SERVER_KNOBS->DD_MERGE_COALESCE_DELAY + value.get().lastLowBandwidthStartTime - now()), decrementPriority(TaskPriority::DataDistribution) ) || cleared.getFuture();
trigger = delayJittered(std::max(0.0, SERVER_KNOBS->DD_MERGE_COALESCE_DELAY + value.get().lastLowBandwidthStartTime - now()), TaskPriority::DataDistributionLow ) || cleared.getFuture();
}
}
Future<Void> set() {
if( !trigger.isValid() ) {
cleared = Promise<Void>();
trigger = delayJittered( SERVER_KNOBS->DD_MERGE_COALESCE_DELAY, decrementPriority(TaskPriority::DataDistribution) ) || cleared.getFuture();
trigger = delayJittered( SERVER_KNOBS->DD_MERGE_COALESCE_DELAY, TaskPriority::DataDistributionLow ) || cleared.getFuture();
}
return trigger;
}

View File

@ -26,6 +26,7 @@
#include "fdbclient/Notified.h"
#include "fdbclient/SystemData.h"
#include "flow/actorcompiler.h" // This must be the last #include.
#include "fdbserver/DeltaTree.h"
#define OP_DISK_OVERHEAD (sizeof(OpHeader) + 1)
@ -268,7 +269,8 @@ private:
OpSnapshotEnd,
OpSnapshotAbort, // terminate an in progress snapshot in order to start a full snapshot
OpCommit, // only in log, not in queue
OpRollback // only in log, not in queue
OpRollback, // only in log, not in queue
OpSnapshotItemDelta
};
struct OpRef {
@ -344,8 +346,7 @@ private:
int64_t overheadWriteBytes;
NotifiedVersion notifiedCommittedWriteBytes;
Key recoveredSnapshotKey; // After recovery, the next key in the currently uncompleted snapshot
IDiskQueue::location
currentSnapshotEnd; // The end of the most recently completed snapshot (this snapshot cannot be discarded)
IDiskQueue::location currentSnapshotEnd; // The end of the most recently completed snapshot (this snapshot cannot be discarded)
IDiskQueue::location previousSnapshotEnd; // The end of the second most recently completed snapshot (on commit, this
// snapshot can be discarded)
PromiseStream<Future<Void>> addActor;
@ -443,6 +444,7 @@ private:
state OpQueue recoveryQueue;
state OpHeader h;
state Standalone<StringRef> lastSnapshotKey;
TraceEvent("KVSMemRecoveryStarted", self->id)
.detail("SnapshotEndLocation", uncommittedSnapshotEnd);
@ -485,7 +487,7 @@ private:
StringRef p1 = data.substr(0, h.len1);
StringRef p2 = data.substr(h.len1, h.len2);
if (h.op == OpSnapshotItem) { // snapshot data item
if (h.op == OpSnapshotItem || h.op == OpSnapshotItemDelta) { // snapshot data item
/*if (p1 < uncommittedNextKey) {
TraceEvent(SevError, "RecSnapshotBack", self->id)
.detail("NextKey", uncommittedNextKey)
@ -493,11 +495,27 @@ private:
.detail("Nextlocation", self->log->getNextReadLocation());
}
ASSERT( p1 >= uncommittedNextKey );*/
if(h.op == OpSnapshotItemDelta) {
ASSERT(p1.size() > 1);
// Get number of bytes borrowed from previous item key
int borrowed = *(uint8_t *)p1.begin();
ASSERT(borrowed <= lastSnapshotKey.size());
// Trim p1 to just the suffix
StringRef suffix = p1.substr(1);
// Allocate a new string in data arena to hold prefix + suffix
Arena &dataArena = *(Arena *)&data.arena();
p1 = makeString(borrowed + suffix.size(), dataArena);
// Copy the prefix into the new reconstituted key
memcpy(mutateString(p1), lastSnapshotKey.begin(), borrowed);
// Copy the suffix into the new reconstituted key
memcpy(mutateString(p1) + borrowed, suffix.begin(), suffix.size());
}
if( p1 >= uncommittedNextKey )
recoveryQueue.clear( KeyRangeRef(uncommittedNextKey, p1), &uncommittedNextKey.arena() ); //FIXME: Not sure what this line is for, is it necessary?
recoveryQueue.set( KeyValueRef(p1, p2), &data.arena() );
uncommittedNextKey = keyAfter(p1);
++dbgSnapshotItemCount;
lastSnapshotKey = Key(p1, data.arena());
} else if (h.op == OpSnapshotEnd || h.op == OpSnapshotAbort) { // snapshot complete
TraceEvent("RecSnapshotEnd", self->id)
.detail("NextKey", uncommittedNextKey)
@ -511,6 +529,7 @@ private:
}
uncommittedNextKey = Key();
lastSnapshotKey = Key();
++dbgSnapshotEndCount;
} else if (h.op == OpSet) { // set mutation
recoveryQueue.set( KeyValueRef(p1,p2), &data.arena() );
@ -629,6 +648,12 @@ private:
state int snapItems = 0;
state uint64_t snapshotBytes = 0;
// Snapshot keys will be alternately written to two preallocated buffers.
// This allows consecutive snapshot keys to be compared for delta compression while only copying each key's bytes once.
state Key lastSnapshotKeyA = makeString(CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT);
state Key lastSnapshotKeyB = makeString(CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT);
state bool lastSnapshotKeyUsingA = true;
TraceEvent("KVSMemStartingSnapshot", self->id).detail("StartKey", nextKey);
loop {
@ -652,40 +677,118 @@ private:
.detail("LastOperationWasASnapshot", nextKey == Key() && !nextKeyAfter);
lastDiff = diff;
if (next == self->data.end()) {
auto thisSnapshotEnd = self->log_op(OpSnapshotEnd, StringRef(), StringRef());
//TraceEvent("SnapshotEnd", self->id)
// .detail("LastKey", lastKey.present() ? lastKey.get() : LiteralStringRef("<none>"))
// .detail("CurrentSnapshotEndLoc", self->currentSnapshotEnd)
// .detail("PreviousSnapshotEndLoc", self->previousSnapshotEnd)
// .detail("ThisSnapshotEnd", thisSnapshotEnd)
// .detail("Items", snapItems)
// .detail("CommittedWrites", self->notifiedCommittedWriteBytes.get())
// .detail("SnapshotSize", snapshotBytes);
// Since notifiedCommittedWriteBytes is only set() once per commit, before the commit operation is logged,
// it is certain when this line is reached that there are no snapshot items in this commit yet. Since this
// commit could be the first thing read during recovery, we can't write a delta yet.
bool useDelta = false;
ASSERT(thisSnapshotEnd >= self->currentSnapshotEnd);
self->previousSnapshotEnd = self->currentSnapshotEnd;
self->currentSnapshotEnd = thisSnapshotEnd;
// Write snapshot items until the wait above would block because we've used up all of the byte budget
loop {
if (++self->snapshotCount == 2) {
self->replaceContent = false;
if (next == self->data.end()) {
// After a snapshot end is logged, recovery may not see the last snapshot item logged before it, so the
// next snapshot item logged cannot be a delta.
useDelta = false;
auto thisSnapshotEnd = self->log_op(OpSnapshotEnd, StringRef(), StringRef());
//TraceEvent("SnapshotEnd", self->id)
// .detail("LastKey", lastKey.present() ? lastKey.get() : LiteralStringRef("<none>"))
// .detail("CurrentSnapshotEndLoc", self->currentSnapshotEnd)
// .detail("PreviousSnapshotEndLoc", self->previousSnapshotEnd)
// .detail("ThisSnapshotEnd", thisSnapshotEnd)
// .detail("Items", snapItems)
// .detail("CommittedWrites", self->notifiedCommittedWriteBytes.get())
// .detail("SnapshotSize", snapshotBytes);
ASSERT(thisSnapshotEnd >= self->currentSnapshotEnd);
self->previousSnapshotEnd = self->currentSnapshotEnd;
self->currentSnapshotEnd = thisSnapshotEnd;
if (++self->snapshotCount == 2) {
self->replaceContent = false;
}
snapItems = 0;
snapshotBytes = 0;
snapshotTotalWrittenBytes += OP_DISK_OVERHEAD;
// If we're not stopping now, reset next
if(snapshotTotalWrittenBytes < self->notifiedCommittedWriteBytes.get()) {
next = self->data.begin();
}
else {
// Otherwise, save state for continuing after the next wait and stop
nextKey = Key();
nextKeyAfter = false;
break;
}
} else {
// destKey is whichever of the two last key buffers we should write to next.
Key &destKey = lastSnapshotKeyUsingA ? lastSnapshotKeyA : lastSnapshotKeyB;
// Get the key, using destKey as a temporary buffer if needed.
KeyRef tempKey = next.getKey(mutateString(destKey));
int opKeySize = tempKey.size();
// If tempKey did not use the start of destKey, then copy tempKey into destKey.
// It's technically possible for the source and dest to overlap, but with the current container implementations that will not happen.
if(tempKey.begin() != destKey.begin()) {
memcpy(mutateString(destKey), tempKey.begin(), tempKey.size());
}
// Now tempKey's bytes definitely exist in memory at destKey.begin(), so update destKey's contents to be a proper KeyRef of the key.
// This intentionally leaves the Arena alone and doesn't copy anything into it.
destKey.contents() = KeyRef(destKey.begin(), tempKey.size());
// Get the common prefix between this key and the previous one, or 0 if there was no previous one.
int commonPrefix;
if(useDelta) {
commonPrefix = commonPrefixLength(lastSnapshotKeyA, lastSnapshotKeyB);
}
else {
commonPrefix = 0;
useDelta = true;
}
// If the common prefix is greater than 1, write a delta item. It isn't worth doing for 0 or 1 bytes, as it would merely add decode overhead (string copying).
if(commonPrefix > 1) {
// Cap the common prefix length to 255. Sorry, ridiculously long keys!
commonPrefix = std::min<int>(commonPrefix, std::numeric_limits<uint8_t>::max());
// We're going to temporarily write a 1-byte integer just before the key suffix to create the log op key and log it, then restore that byte.
uint8_t &prefixLength = mutateString(destKey)[commonPrefix - 1];
uint8_t backupByte = prefixLength;
prefixLength = commonPrefix;
opKeySize = opKeySize - commonPrefix + 1;
KeyRef opKey(&prefixLength, opKeySize);
self->log_op(OpSnapshotItemDelta, opKey, next.getValue());
// Restore the overwritten byte
prefixLength = backupByte;
}
else {
self->log_op(OpSnapshotItem, tempKey, next.getValue());
}
snapItems++;
uint64_t opBytes = opKeySize + next.getValue().size() + OP_DISK_OVERHEAD;
snapshotBytes += opBytes;
snapshotTotalWrittenBytes += opBytes;
lastSnapshotKeyUsingA = !lastSnapshotKeyUsingA;
// If we're not stopping now, increment next
if(snapshotTotalWrittenBytes < self->notifiedCommittedWriteBytes.get()) {
++next;
}
else {
// Otherwise, save state for continuing after the next wait and stop
nextKey = destKey;
nextKeyAfter = true;
break;
}
}
nextKey = Key();
nextKeyAfter = false;
snapItems = 0;
snapshotBytes = 0;
snapshotTotalWrittenBytes += OP_DISK_OVERHEAD;
} else {
StringRef tempKey = next.getKey(self->reserved_buffer);
self->log_op(OpSnapshotItem, tempKey, next.getValue());
nextKey = tempKey;
nextKeyAfter = true;
snapItems++;
uint64_t opBytes = tempKey.size() + next.getValue().size() + OP_DISK_OVERHEAD;
snapshotBytes += opBytes;
snapshotTotalWrittenBytes += opBytes;
}
}
}
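On disk, an OpSnapshotItemDelta key is a one-byte shared-prefix length followed by the key's unique suffix, which is what the recovery path earlier in this file reverses. A standalone sketch of the encode/decode pair under those assumptions (std::string in place of KeyRef/Arena):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <string>

// Encode a snapshot key as [1-byte prefix length][suffix], borrowing the prefix
// from the previously written key. Deltas are only worthwhile when the shared
// prefix is longer than 1 byte; the writer falls back to OpSnapshotItem otherwise.
static std::string encodeDelta(const std::string& prev, const std::string& key) {
	size_t common = 0;
	while (common < prev.size() && common < key.size() && prev[common] == key[common]) common++;
	common = std::min<size_t>(common, 255); // prefix length must fit in one byte
	assert(common > 1);
	return std::string(1, (char)(uint8_t)common) + key.substr(common);
}

static std::string decodeDelta(const std::string& prev, const std::string& delta) {
	uint8_t borrowed = (uint8_t)delta[0];
	assert(borrowed <= prev.size());
	return prev.substr(0, borrowed) + delta.substr(1);
}

int main() {
	std::string a = "\xff/keyServers/alpha";
	std::string b = "\xff/keyServers/alps";
	assert(decodeDelta(a, encodeDelta(a, b)) == b);
	return 0;
}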

View File

@ -1858,13 +1858,15 @@ private:
ACTOR static Future<Void> stopOnError( KeyValueStoreSQLite* self ) {
try {
wait( self->readThreads->getError() || self->writeThread->getError() );
ASSERT(false);
} catch (Error& e) {
if (e.code() == error_code_actor_cancelled)
throw;
self->readThreads->stop(e);
self->writeThread->stop(e);
}
self->readThreads->stop();
self->writeThread->stop();
return Void();
}

View File

@ -201,7 +201,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( STORAGE_METRICS_POLLING_DELAY, 2.0 ); if( randomize && BUGGIFY ) STORAGE_METRICS_POLLING_DELAY = 15.0;
init( STORAGE_METRICS_RANDOM_DELAY, 0.2 );
init( AVAILABLE_SPACE_RATIO_CUTOFF, 0.05 );
init( DESIRED_TEAMS_PER_SERVER, 5 ); if( randomize && BUGGIFY ) DESIRED_TEAMS_PER_SERVER = deterministicRandom()->randomInt(1, 10);
init( DESIRED_TEAMS_PER_SERVER, 5 ); DESIRED_TEAMS_PER_SERVER = deterministicRandom()->randomInt(1, 10);
init( MAX_TEAMS_PER_SERVER, 5*DESIRED_TEAMS_PER_SERVER );
init( DD_SHARD_SIZE_GRANULARITY, 5000000 );
init( DD_SHARD_SIZE_GRANULARITY_SIM, 500000 ); if( randomize && BUGGIFY ) DD_SHARD_SIZE_GRANULARITY_SIM = 0;
@ -307,6 +307,8 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( START_TRANSACTION_BATCH_QUEUE_CHECK_INTERVAL, 0.001 );
init( START_TRANSACTION_MAX_TRANSACTIONS_TO_START, 100000 );
init( START_TRANSACTION_MAX_REQUESTS_TO_START, 10000 );
init( START_TRANSACTION_RATE_WINDOW, 2.0 );
init( START_TRANSACTION_MAX_EMPTY_QUEUE_BUDGET, 10.0 );
init( START_TRANSACTION_MAX_QUEUE_SIZE, 1e6 );
init( KEY_LOCATION_MAX_QUEUE_SIZE, 1e6 );
@ -326,7 +328,6 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( COMMIT_TRANSACTION_BATCH_BYTES_SCALE_BASE, 100000 );
init( COMMIT_TRANSACTION_BATCH_BYTES_SCALE_POWER, 0.0 );
init( TRANSACTION_BUDGET_TIME, 0.050 ); if( randomize && BUGGIFY ) TRANSACTION_BUDGET_TIME = 0.0;
init( RESOLVER_COALESCE_TIME, 1.0 );
init( BUGGIFIED_ROW_LIMIT, APPLY_MUTATION_BYTES ); if( randomize && BUGGIFY ) BUGGIFIED_ROW_LIMIT = deterministicRandom()->randomInt(3, 30);
init( PROXY_SPIN_DELAY, 0.01 );
@ -360,7 +361,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( PROVISIONAL_START_DELAY, 1.0 );
init( PROVISIONAL_MAX_DELAY, 60.0 );
init( PROVISIONAL_DELAY_GROWTH, 1.5 );
init( SECONDS_BEFORE_RECRUIT_BACKUP_WORKER, 4.0 );
init( SECONDS_BEFORE_RECRUIT_BACKUP_WORKER, 4.0 ); if( randomize && BUGGIFY ) SECONDS_BEFORE_RECRUIT_BACKUP_WORKER = deterministicRandom()->random01() * 8;
// Resolver
init( SAMPLE_OFFSET_PER_KEY, 100 );
@ -584,6 +585,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( FASTRESTORE_HEARTBEAT_DELAY, 10 ); if( randomize && BUGGIFY ) { FASTRESTORE_HEARTBEAT_DELAY = deterministicRandom()->random01() * 120 + 2; }
init( FASTRESTORE_HEARTBEAT_MAX_DELAY, 10 ); if( randomize && BUGGIFY ) { FASTRESTORE_HEARTBEAT_MAX_DELAY = FASTRESTORE_HEARTBEAT_DELAY * 10; }
init( FASTRESTORE_APPLIER_FETCH_KEYS_SIZE, 100 ); if( randomize && BUGGIFY ) { FASTRESTORE_APPLIER_FETCH_KEYS_SIZE = deterministicRandom()->random01() * 10240 + 1; }
init( FASTRESTORE_LOADER_SEND_MUTATION_MSG_BYTES, 1.0 * 1024.0 * 1024.0 ); if( randomize && BUGGIFY ) { FASTRESTORE_LOADER_SEND_MUTATION_MSG_BYTES = deterministicRandom()->random01() * 10.0 * 1024.0 * 1024.0 + 1; }
// clang-format on

View File

@ -248,6 +248,8 @@ public:
double START_TRANSACTION_BATCH_QUEUE_CHECK_INTERVAL;
double START_TRANSACTION_MAX_TRANSACTIONS_TO_START;
int START_TRANSACTION_MAX_REQUESTS_TO_START;
double START_TRANSACTION_RATE_WINDOW;
double START_TRANSACTION_MAX_EMPTY_QUEUE_BUDGET;
int START_TRANSACTION_MAX_QUEUE_SIZE;
int KEY_LOCATION_MAX_QUEUE_SIZE;
@ -265,7 +267,6 @@ public:
double COMMIT_BATCHES_MEM_FRACTION_OF_TOTAL;
double COMMIT_BATCHES_MEM_TO_TOTAL_MEM_SCALE_FACTOR;
double TRANSACTION_BUDGET_TIME;
double RESOLVER_COALESCE_TIME;
int BUGGIFIED_ROW_LIMIT;
double PROXY_SPIN_DELAY;
@ -522,6 +523,7 @@ public:
int64_t FASTRESTORE_HEARTBEAT_DELAY; // interval for master to ping loaders and appliers
int64_t FASTRESTORE_HEARTBEAT_MAX_DELAY; // master claim a node is down if no heart beat from the node for this delay
int64_t FASTRESTORE_APPLIER_FETCH_KEYS_SIZE; // number of keys to fetch in a txn on applier
int64_t FASTRESTORE_LOADER_SEND_MUTATION_MSG_BYTES; // desired size of mutation message sent from loader to appliers
ServerKnobs();
void initialize(bool randomize = false, ClientKnobs* clientKnobs = NULL, bool isSimulated = false);

View File

@ -120,8 +120,82 @@ struct ProxyStats {
}
};
ACTOR Future<Void> getRate(UID myID, Reference<AsyncVar<ServerDBInfo>> db, int64_t* inTransactionCount, int64_t* inBatchTransactionCount, double* outTransactionRate,
double* outBatchTransactionRate, GetHealthMetricsReply* healthMetricsReply, GetHealthMetricsReply* detailedHealthMetricsReply) {
struct TransactionRateInfo {
double rate;
double limit;
double budget;
bool disabled;
Smoother smoothRate;
Smoother smoothReleased;
TransactionRateInfo(double rate) : rate(rate), limit(0), budget(0), disabled(true), smoothRate(SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW),
smoothReleased(SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW) {}
void reset() {
// Determine the number of transactions that this proxy is allowed to release
// Roughly speaking, this is done by computing the number of transactions over some historical window that we could
// have started but didn't, and making that our limit. More precisely, we track a smoothed rate limit and release rate,
// the difference of which is the rate of additional transactions that we could have released based on that window.
// Then we multiply by the window size to get a number of transactions.
//
// Limit can be negative in the event that we are releasing more transactions than we are allowed (due to the use of
// our budget or because of higher priority transactions).
double releaseRate = smoothRate.smoothTotal() - smoothReleased.smoothRate();
limit = SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW * releaseRate;
}
bool canStart(int64_t numAlreadyStarted, int64_t count) {
return numAlreadyStarted + count <= std::min(limit + budget, SERVER_KNOBS->START_TRANSACTION_MAX_TRANSACTIONS_TO_START);
}
void updateBudget(int64_t numStartedAtPriority, bool queueEmptyAtPriority, double elapsed) {
// Update the budget to accumulate any extra capacity available or remove any excess that was used.
// The actual delta is the portion of the limit we didn't use multiplied by the fraction of the window that elapsed.
//
// We may have exceeded our limit due to the budget or because of higher priority transactions, in which case this
// delta will be negative. The delta can also be negative in the event that our limit was negative, which can happen
// if we had already started more transactions in our window than our rate would have allowed.
//
// This budget has the property that when the budget is required to start transactions (because batches are big),
// the sum limit+budget will increase linearly from 0 to the batch size over time and decrease by the batch size
// upon starting a batch. In other words, this works equivalently to a model where we linearly accumulate budget over
// time in the case that our batches are too big to take advantage of the window based limits.
budget = std::max(0.0, budget + elapsed * (limit - numStartedAtPriority) / SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW);
// If we are emptying out the queue of requests, then we don't need to carry much budget forward
// If we did keep accumulating budget, then our responsiveness to changes in workload could be compromised
if(queueEmptyAtPriority) {
budget = std::min(budget, SERVER_KNOBS->START_TRANSACTION_MAX_EMPTY_QUEUE_BUDGET);
}
smoothReleased.addDelta(numStartedAtPriority);
}
void disable() {
disabled = true;
rate = 0;
smoothRate.reset(0);
}
void setRate(double rate) {
ASSERT(rate >= 0 && rate != std::numeric_limits<double>::infinity() && !isnan(rate));
this->rate = rate;
if(disabled) {
smoothRate.reset(rate);
disabled = false;
}
else {
smoothRate.setTotal(rate);
}
}
};
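Concretely, with a 2-second START_TRANSACTION_RATE_WINDOW, a proxy granted 100 tx/s that has been releasing only 80 tx/s carries a limit of roughly (100 - 80) * 2 = 40 transactions it may start immediately. A toy rendition of that calculation with plain rates instead of the exponential Smoother, so the numbers are only illustrative:

#include <cstdio>

// Toy stand-in for the window-based limit above:
// limit = window * (granted rate - observed release rate).
struct ToyRateInfo {
	double window;       // seconds of history, like START_TRANSACTION_RATE_WINDOW
	double grantedRate;  // tx/s handed out by ratekeeper
	double releasedRate; // tx/s this proxy actually started
	double limit() const { return window * (grantedRate - releasedRate); }
};

int main() {
	ToyRateInfo info{2.0, 100.0, 80.0};
	// 40 transactions of headroom; a negative limit would mean the proxy has
	// been releasing faster than granted (via its budget or priority traffic).
	std::printf("limit = %.1f\n", info.limit()); // prints limit = 40.0
}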
ACTOR Future<Void> getRate(UID myID, Reference<AsyncVar<ServerDBInfo>> db, int64_t* inTransactionCount, int64_t* inBatchTransactionCount, TransactionRateInfo *transactionRateInfo,
TransactionRateInfo *batchTransactionRateInfo, GetHealthMetricsReply* healthMetricsReply, GetHealthMetricsReply* detailedHealthMetricsReply) {
state Future<Void> nextRequestTimer = Never();
state Future<Void> leaseTimeout = Never();
state Future<GetRateInfoReply> reply = Never();
@ -150,8 +224,9 @@ ACTOR Future<Void> getRate(UID myID, Reference<AsyncVar<ServerDBInfo>> db, int64
}
when ( GetRateInfoReply rep = wait(reply) ) {
reply = Never();
*outTransactionRate = rep.transactionRate;
*outBatchTransactionRate = rep.batchTransactionRate;
transactionRateInfo->setRate(rep.transactionRate);
batchTransactionRateInfo->setRate(rep.batchTransactionRate);
//TraceEvent("MasterProxyRate", myID).detail("Rate", rep.transactionRate).detail("BatchRate", rep.batchTransactionRate).detail("Lease", rep.leaseDuration).detail("ReleasedTransactions", *inTransactionCount - lastTC);
lastTC = *inTransactionCount;
leaseTimeout = delay(rep.leaseDuration);
@ -163,35 +238,15 @@ ACTOR Future<Void> getRate(UID myID, Reference<AsyncVar<ServerDBInfo>> db, int64
}
}
when ( wait( leaseTimeout ) ) {
*outTransactionRate = 0;
*outBatchTransactionRate = 0;
//TraceEvent("MasterProxyRate", myID).detail("Rate", 0.0).detail("BatchRate", 0.0).detail("Lease", "Expired");
transactionRateInfo->disable();
batchTransactionRateInfo->disable();
TraceEvent(SevWarn, "MasterProxyRateLeaseExpired", myID).suppressFor(5.0);
//TraceEvent("MasterProxyRate", myID).detail("Rate", 0.0).detail("BatchRate", 0.0).detail("Lease", 0);
leaseTimeout = Never();
}
}
}
struct TransactionRateInfo {
double rate;
double limit;
TransactionRateInfo(double rate) : rate(rate), limit(0) {}
void reset(double elapsed) {
limit = std::min(0.0, limit) + rate * elapsed; // Adjust the limit based on the full elapsed interval in order to properly erase a deficit
limit = std::min(limit, rate * SERVER_KNOBS->START_TRANSACTION_BATCH_INTERVAL_MAX); // Don't allow the rate to exceed what would be allowed in the maximum batch interval
limit = std::min(limit, SERVER_KNOBS->START_TRANSACTION_MAX_TRANSACTIONS_TO_START);
}
bool canStart(int64_t numAlreadyStarted) {
return numAlreadyStarted < limit;
}
void updateBudget(int64_t numStarted) {
limit -= numStarted;
}
};
ACTOR Future<Void> queueTransactionStartRequests(
Reference<AsyncVar<ServerDBInfo>> db,
Deque<GetReadVersionRequest> *systemQueue,
@ -531,7 +586,7 @@ ACTOR Future<Void> commitBatcher(ProxyCommitData *commitData, PromiseStream<std:
}
if((batchBytes + bytes > CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT || req.firstInBatch()) && batch.size()) {
out.send({ batch, batchBytes });
out.send({ std::move(batch), batchBytes });
lastBatch = now();
timeout = delayJittered(commitData->commitBatchInterval, TaskPriority::ProxyCommitBatcher);
batch = std::vector<CommitTransactionRequest>();
@ -1354,7 +1409,7 @@ ACTOR static Future<Void> transactionStarter(
state vector<MasterProxyInterface> otherProxies;
state PromiseStream<double> replyTimes;
addActor.send(getRate(proxy.id(), db, &transactionCount, &batchTransactionCount, &normalRateInfo.rate, &batchRateInfo.rate, healthMetricsReply, detailedHealthMetricsReply));
addActor.send(getRate(proxy.id(), db, &transactionCount, &batchTransactionCount, &normalRateInfo, &batchRateInfo, healthMetricsReply, detailedHealthMetricsReply));
addActor.send(queueTransactionStartRequests(db, &systemQueue, &defaultQueue, &batchQueue, proxy.getConsistentReadVersion.getFuture(),
GRVTimer, &lastGRVTime, &GRVBatchTime, replyTimes.getFuture(),
&commitData->stats, &batchRateInfo));
@ -1380,8 +1435,8 @@ ACTOR static Future<Void> transactionStarter(
if(elapsed == 0) elapsed = 1e-15; // resolve a possible indeterminant multiplication with infinite transaction rate
normalRateInfo.reset(elapsed);
batchRateInfo.reset(elapsed);
normalRateInfo.reset();
batchRateInfo.reset();
int transactionsStarted[2] = {0,0};
int systemTransactionsStarted[2] = {0,0};
@ -1408,13 +1463,12 @@ ACTOR static Future<Void> transactionStarter(
auto& req = transactionQueue->front();
int tc = req.transactionCount;
if (req.priority() < GetReadVersionRequest::PRIORITY_DEFAULT &&
!batchRateInfo.canStart(transactionsStarted[0] + transactionsStarted[1])) {
break;
} else if (req.priority() < GetReadVersionRequest::PRIORITY_SYSTEM_IMMEDIATE &&
!normalRateInfo.canStart(transactionsStarted[0] + transactionsStarted[1])) {
if(req.priority() < GetReadVersionRequest::PRIORITY_DEFAULT && !batchRateInfo.canStart(transactionsStarted[0] + transactionsStarted[1], tc)) {
break;
}
else if(req.priority() < GetReadVersionRequest::PRIORITY_SYSTEM_IMMEDIATE && !normalRateInfo.canStart(transactionsStarted[0] + transactionsStarted[1], tc)) {
break;
}
if (req.debugID.present()) {
if (!debugID.present()) debugID = nondeterministicRandom()->randomUniqueID();
@ -1450,11 +1504,15 @@ ACTOR static Future<Void> transactionStarter(
.detail("TransactionBudget", transactionBudget)
.detail("BatchTransactionBudget", batchTransactionBudget);*/
transactionCount += transactionsStarted[0] + transactionsStarted[1];
batchTransactionCount += batchPriTransactionsStarted[0] + batchPriTransactionsStarted[1];
int systemTotalStarted = systemTransactionsStarted[0] + systemTransactionsStarted[1];
int normalTotalStarted = defaultPriTransactionsStarted[0] + defaultPriTransactionsStarted[1];
int batchTotalStarted = batchPriTransactionsStarted[0] + batchPriTransactionsStarted[1];
normalRateInfo.updateBudget(transactionsStarted[0] + transactionsStarted[1]);
batchRateInfo.updateBudget(transactionsStarted[0] + transactionsStarted[1]);
transactionCount += transactionsStarted[0] + transactionsStarted[1];
batchTransactionCount += batchTotalStarted;
normalRateInfo.updateBudget(systemTotalStarted + normalTotalStarted, systemQueue.empty() && defaultQueue.empty(), elapsed);
batchRateInfo.updateBudget(systemTotalStarted + normalTotalStarted + batchTotalStarted, systemQueue.empty() && defaultQueue.empty() && batchQueue.empty(), elapsed);
if (debugID.present()) {
g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "MasterProxyServer.masterProxyServerCore.Broadcast");
@ -1935,6 +1993,7 @@ ACTOR Future<Void> masterProxyServerCore(
state KeyRange txnKeys = allKeys;
loop {
wait(yield());
Standalone<RangeResultRef> UIDtoTagMap = commitData.txnStateStore->readRange( serverTagKeys ).get();
Standalone<RangeResultRef> data = commitData.txnStateStore->readRange(txnKeys, SERVER_KNOBS->BUGGIFIED_ROW_LIMIT, SERVER_KNOBS->APPLY_MUTATION_BYTES).get();
if(!data.size()) break;
((KeyRangeRef&)txnKeys) = KeyRangeRef( keyAfter(data.back().key, txnKeys.arena()), txnKeys.end );
@ -1947,7 +2006,7 @@ ACTOR Future<Void> masterProxyServerCore(
if( kv.key.startsWith(keyServersPrefix) ) {
KeyRef k = kv.key.removePrefix(keyServersPrefix);
if(k != allKeys.end) {
decodeKeyServersValue(kv.value, src, dest);
decodeKeyServersValue(UIDtoTagMap, kv.value, src, dest);
info.tags.clear();
info.src_info.clear();
info.dest_info.clear();

View File

@ -211,13 +211,15 @@ ACTOR Future<vector<UID>> addReadWriteDestinations(KeyRangeRef shard, vector<Sto
}
ACTOR Future<vector<vector<UID>>> additionalSources(Standalone<RangeResultRef> shards, Transaction* tr, int desiredHealthy, int maxServers) {
state Standalone<RangeResultRef> UIDtoTagMap = wait( tr->getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY) );
ASSERT( !UIDtoTagMap.more && UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY );
vector<Future<Optional<Value>>> serverListEntries;
std::set<UID> fetching;
for(int i = 0; i < shards.size() - 1; ++i) {
vector<UID> src;
vector<UID> dest;
decodeKeyServersValue( shards[i].value, src, dest );
decodeKeyServersValue( UIDtoTagMap, shards[i].value, src, dest );
for(int s=0; s<src.size(); s++) {
if(!fetching.count(src[s])) {
@ -251,7 +253,7 @@ ACTOR Future<vector<vector<UID>>> additionalSources(Standalone<RangeResultRef> s
vector<StorageServerInterface> srcInterfs;
vector<StorageServerInterface> destInterfs;
decodeKeyServersValue( shards[i].value, src, dest );
decodeKeyServersValue( UIDtoTagMap, shards[i].value, src, dest );
for(int s=0; s<src.size(); s++) {
srcInterfs.push_back( ssiMap[src[s]] );
@ -356,6 +358,8 @@ ACTOR Future<Void> startMoveKeys( Database occ, KeyRange keys, vector<UID> serve
// printf("'%s': '%s'\n", old[i].key.toString().c_str(), old[i].value.toString().c_str());
//Check that enough servers for each shard are in the correct state
state Standalone<RangeResultRef> UIDtoTagMap = wait(tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT( !UIDtoTagMap.more && UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY );
vector<vector<UID>> addAsSource = wait(additionalSources(old, &tr, servers.size(), SERVER_KNOBS->MAX_ADDED_SOURCES_MULTIPLIER*servers.size()));
// For each intersecting range, update keyServers[range] dest to be servers and clear existing dest servers from serverKeys
@ -363,7 +367,7 @@ ACTOR Future<Void> startMoveKeys( Database occ, KeyRange keys, vector<UID> serve
KeyRangeRef rangeIntersectKeys( old[i].key, old[i+1].key );
vector<UID> src;
vector<UID> dest;
decodeKeyServersValue( old[i].value, src, dest );
decodeKeyServersValue( UIDtoTagMap, old[i].value, src, dest );
// TraceEvent("StartMoveKeysOldRange", relocationIntervalId)
// .detail("KeyBegin", rangeIntersectKeys.begin.toString())
@ -378,7 +382,7 @@ ACTOR Future<Void> startMoveKeys( Database occ, KeyRange keys, vector<UID> serve
uniquify(src);
//Update dest servers for this range to be equal to servers
krmSetPreviouslyEmptyRange( &tr, keyServersPrefix, rangeIntersectKeys, keyServersValue(src, servers), old[i+1].value );
krmSetPreviouslyEmptyRange( &tr, keyServersPrefix, rangeIntersectKeys, keyServersValue(UIDtoTagMap, src, servers), old[i+1].value );
//Track old destination servers. They may be removed from serverKeys soon, since they are about to be overwritten in keyServers
for(auto s = dest.begin(); s != dest.end(); ++s) {
@ -555,6 +559,8 @@ ACTOR Future<Void> finishMoveKeys( Database occ, KeyRange keys, vector<UID> dest
wait( checkMoveKeysLock(&tr, lock) );
state KeyRange currentKeys = KeyRangeRef(begin, keys.end);
state Standalone<RangeResultRef> UIDtoTagMap = wait( tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY) );
ASSERT( !UIDtoTagMap.more && UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY );
state Standalone<RangeResultRef> keyServers = wait( krmGetRanges( &tr, keyServersPrefix, currentKeys, SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT, SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT_BYTES ) );
//Determine the last processed key (which will be the beginning for the next iteration)
@ -575,7 +581,7 @@ ACTOR Future<Void> finishMoveKeys( Database occ, KeyRange keys, vector<UID> dest
//Iterate through the beginning of keyServers until we find one that hasn't already been processed
int currentIndex;
for(currentIndex = 0; currentIndex < keyServers.size() - 1 && alreadyMoved; currentIndex++) {
decodeKeyServersValue( keyServers[currentIndex].value, src, dest );
decodeKeyServersValue( UIDtoTagMap, keyServers[currentIndex].value, src, dest );
std::set<UID> srcSet;
for(int s = 0; s < src.size(); s++) {
@ -633,7 +639,7 @@ ACTOR Future<Void> finishMoveKeys( Database occ, KeyRange keys, vector<UID> dest
//Process the rest of the key servers
for(; currentIndex < keyServers.size() - 1; currentIndex++) {
vector<UID> src2, dest2;
decodeKeyServersValue( keyServers[currentIndex].value, src2, dest2 );
decodeKeyServersValue( UIDtoTagMap, keyServers[currentIndex].value, src2, dest2 );
std::set<UID> srcSet;
for(int s = 0; s < src2.size(); s++)
@ -718,7 +724,7 @@ ACTOR Future<Void> finishMoveKeys( Database occ, KeyRange keys, vector<UID> dest
if( count == dest.size() ) {
// update keyServers, serverKeys
// SOMEDAY: Doing these in parallel is safe because none of them overlap or touch (one per server)
wait( krmSetRangeCoalescing( &tr, keyServersPrefix, currentKeys, keys, keyServersValue( dest ) ) );
wait( krmSetRangeCoalescing( &tr, keyServersPrefix, currentKeys, keys, keyServersValue( UIDtoTagMap, dest ) ) );
std::set<UID>::iterator asi = allServers.begin();
std::vector<Future<Void>> actors;
@ -989,6 +995,8 @@ ACTOR Future<Void> removeKeysFromFailedServer(Database cx, UID serverID, MoveKey
// Get all values of keyServers and remove serverID from every occurrence
// Very inefficient: iterates over every entry in keyServers
// No shortcut because keyServers and serverKeys are not guaranteed to have the same shard boundaries
state Standalone<RangeResultRef> UIDtoTagMap = wait( tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY) );
ASSERT( !UIDtoTagMap.more && UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY );
state Standalone<RangeResultRef> keyServers =
wait(krmGetRanges(&tr, keyServersPrefix, KeyRangeRef(begin, allKeys.end),
SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT, SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT_BYTES));
@ -997,7 +1005,7 @@ ACTOR Future<Void> removeKeysFromFailedServer(Database cx, UID serverID, MoveKey
auto it = keyServers[i];
vector<UID> src;
vector<UID> dest;
decodeKeyServersValue(it.value, src, dest);
decodeKeyServersValue(UIDtoTagMap, it.value, src, dest);
// The failed server is not present
if (std::find(src.begin(), src.end(), serverID) == src.end() &&
@ -1013,7 +1021,7 @@ ACTOR Future<Void> removeKeysFromFailedServer(Database cx, UID serverID, MoveKey
.detail("Key", it.key)
.detail("ValueSrc", describe(src))
.detail("ValueDest", describe(dest));
tr.set(keyServersKey(it.key), keyServersValue(src, dest));
tr.set(keyServersKey(it.key), keyServersValue(UIDtoTagMap, src, dest));
}
// Set entire range for our serverID in serverKeys keyspace to false to signal erasure
@ -1095,13 +1103,13 @@ void seedShardServers(
tr.set(arena, serverListKeyFor(servers[s].id()), serverListValue(servers[s]));
}
std::vector<UID> serverIds;
std::vector<Tag> serverTags;
for(int i=0;i<servers.size();i++)
serverIds.push_back(servers[i].id());
serverTags.push_back(server_tag[servers[i].id()]);
// We have to set this range in two blocks, because the master tracking of "keyServersLocations" depends on a change to a specific
// key (keyServersKeyServersKey)
krmSetPreviouslyEmptyRange( tr, arena, keyServersPrefix, KeyRangeRef(KeyRef(), allKeys.end), keyServersValue( serverIds ), Value() );
krmSetPreviouslyEmptyRange( tr, arena, keyServersPrefix, KeyRangeRef(KeyRef(), allKeys.end), keyServersValue( serverTags ), Value() );
for(int s=0; s<servers.size(); s++)
krmSetPreviouslyEmptyRange( tr, arena, serverKeysPrefixFor( servers[s].id() ), allKeys, serverKeysTrue, serverKeysFalse );

View File

@ -110,36 +110,29 @@ ACTOR static Future<Void> handleSendMutationVectorRequest(RestoreSendVersionedMu
state Reference<ApplierBatchData> batchData = self->batch[req.batchIndex];
// Assume: processedFileState[req.asset] will not be erased while the actor is active.
// Note: Inserting new items into processedFileState will not invalidate the reference.
state NotifiedVersion& curFilePos = batchData->processedFileState[req.asset];
state NotifiedVersion& curMsgIndex = batchData->processedFileState[req.asset];
TraceEvent(SevDebug, "FastRestoreApplierPhaseReceiveMutations", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("RestoreAsset", req.asset.toString())
.detail("ProcessedFileVersion", curFilePos.get())
.detail("RestoreAssetMesssageIndex", curMsgIndex.get())
.detail("Request", req.toString())
.detail("CurrentMemory", getSystemStatistics().processMemory)
.detail("PreviousVersionBatchState", batchData->vbState.get());
wait(isSchedulable(self, req.batchIndex, __FUNCTION__));
wait(curFilePos.whenAtLeast(req.prevVersion));
wait(curMsgIndex.whenAtLeast(req.msgIndex - 1));
batchData->vbState = ApplierVersionBatchState::RECEIVE_MUTATIONS;
state bool isDuplicated = true;
if (curFilePos.get() == req.prevVersion) {
if (curMsgIndex.get() == req.msgIndex - 1) {
isDuplicated = false;
const Version commitVersion = req.version;
uint16_t numVersionStampedKV = 0;
// Sanity check: mutations in a range file are in [beginVersion, endVersion);
// mutations in a log file are in [beginVersion, endVersion], both endpoints inclusive.
ASSERT(commitVersion >= req.asset.beginVersion);
// Loader sends the endVersion to ensure all useful versions are sent
ASSERT(commitVersion <= req.asset.endVersion);
ASSERT(req.mutations.size() == req.subs.size());
ASSERT(req.mutations.size() == req.mVersions.size());
for (int mIndex = 0; mIndex < req.mutations.size(); mIndex++) {
const MutationRef& mutation = req.mutations[mIndex];
const LogMessageVersion mutationVersion(commitVersion, req.subs[mIndex]);
const LogMessageVersion mutationVersion(req.mVersions[mIndex]);
TraceEvent(SevFRMutationInfo, "FastRestoreApplierPhaseReceiveMutations", self->id())
.detail("RestoreAsset", req.asset.toString())
.detail("Version", mutationVersion.toString())
@ -150,26 +143,23 @@ ACTOR static Future<Void> handleSendMutationVectorRequest(RestoreSendVersionedMu
batchData->counters.receivedMutations += 1;
batchData->counters.receivedAtomicOps += isAtomicOp((MutationRef::Type)mutation.type) ? 1 : 0;
// Sanity check
ASSERT_WE_THINK(req.asset.isInVersionRange(mutationVersion.version));
ASSERT_WE_THINK(req.asset.isInKeyRange(mutation));
// Note: Log and range mutations may be delivered out of order. Can we handle it?
if (mutation.type == MutationRef::SetVersionstampedKey ||
mutation.type == MutationRef::SetVersionstampedValue) {
ASSERT(false); // No version stamp mutations in backup logs
batchData->addVersionStampedKV(mutation, mutationVersion, numVersionStampedKV);
numVersionStampedKV++;
} else {
batchData->addMutation(mutation, mutationVersion);
}
batchData->addMutation(mutation, mutationVersion);
ASSERT(mutation.type != MutationRef::SetVersionstampedKey &&
mutation.type != MutationRef::SetVersionstampedValue);
}
curFilePos.set(req.version);
curMsgIndex.set(req.msgIndex);
}
req.reply.send(RestoreCommonReply(self->id(), isDuplicated));
TraceEvent(SevDebug, "FastRestoreApplierPhaseReceiveMutationsDone", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("RestoreAsset", req.asset.toString())
.detail("ProcessedFileVersion", curFilePos.get())
.detail("ProcessedMessageIndex", curMsgIndex.get())
.detail("Request", req.toString());
return Void();
}
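The msgIndex handshake above is an ordering-and-dedup pattern: a handler waits until every lower-indexed message has been applied, applies its own payload exactly once, then advances the counter so the next message may proceed; redelivered messages see the counter already past msgIndex - 1 and are acknowledged as duplicates. A condensed single-threaded sketch of that contract (plain int64_t standing in for NotifiedVersion):

#include <cassert>
#include <cstdint>
#include <vector>

struct ApplierSketch {
	int64_t curMsgIndex = 0;      // stands in for the NotifiedVersion
	std::vector<int64_t> applied; // stands in for applying the mutations

	// Deliver message msgIndex, assuming the caller already waited for
	// curMsgIndex >= msgIndex - 1 (the whenAtLeast above). Returns isDuplicated.
	bool deliver(int64_t msgIndex) {
		if (curMsgIndex != msgIndex - 1) return true; // already applied earlier
		applied.push_back(msgIndex);                  // apply exactly once
		curMsgIndex = msgIndex;                       // unblock msgIndex + 1
		return false;
	}
};

int main() {
	ApplierSketch a;
	assert(!a.deliver(1)); // applied
	assert(a.deliver(1));  // retry of the same message: duplicate
	assert(!a.deliver(2)); // next index proceeds
	return 0;
}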

View File

@ -107,8 +107,9 @@ struct StagingKey {
// TODO: Add SevError here
TraceEvent("SameVersion")
.detail("Version", version.toString())
.detail("Mutation", m.toString())
.detail("NewVersion", newVersion.toString());
.detail("NewVersion", newVersion.toString())
.detail("OldMutation", it->second.toString())
.detail("NewMutation", m.toString());
ASSERT(it->second.type == m.type && it->second.param1 == m.param1 && it->second.param2 == m.param2);
}
}
@ -282,26 +283,6 @@ struct ApplierBatchData : public ReferenceCounted<ApplierBatchData> {
}
}
void addVersionStampedKV(MutationRef m, LogMessageVersion ver, uint16_t numVersionStampedKV) {
if (m.type == MutationRef::SetVersionstampedKey) {
// Assume transactionNumber = 0 does not affect result
TraceEvent(SevDebug, "FastRestoreApplierAddMutation")
.detail("MutationType", typeString[m.type])
.detail("FakedTransactionNumber", numVersionStampedKV);
transformVersionstampMutation(m, &MutationRef::param1, ver.version, numVersionStampedKV);
addMutation(m, ver);
} else if (m.type == MutationRef::SetVersionstampedValue) {
// Assume transactionNumber = 0 does not affect result
TraceEvent(SevDebug, "FastRestoreApplierAddMutation")
.detail("MutationType", typeString[m.type])
.detail("FakedTransactionNumber", numVersionStampedKV);
transformVersionstampMutation(m, &MutationRef::param2, ver.version, numVersionStampedKV);
addMutation(m, ver);
} else {
ASSERT(false);
}
}
// Return true if all staging keys have been precomputed
bool allKeysPrecomputed() {
for (auto& stagingKey : stagingKeys) {

View File

@ -343,7 +343,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
return results;
} catch (Error& e) {
TraceEvent(SevWarn, "FileRestoreCorruptRangeFileBlock")
TraceEvent(SevError, "FileRestoreCorruptRangeFileBlock")
.error(e)
.detail("Filename", file->getFilename())
.detail("BlockOffset", offset)
@ -388,7 +388,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeLogFileBlock(Reference<IA
return results;
} catch (Error& e) {
TraceEvent(SevWarn, "FileRestoreCorruptLogFileBlock")
TraceEvent(SevError, "FileRestoreCorruptLogFileBlock")
.error(e)
.detail("Filename", file->getFilename())
.detail("BlockOffset", offset)

View File

@ -413,8 +413,9 @@ ACTOR Future<Void> sendMutationsToApplier(VersionedMutationsMap* pkvOps, int bat
state int kvCount = 0;
state int splitMutationIndex = 0;
state std::vector<std::pair<UID, RestoreSendVersionedMutationsRequest>> requests;
state Version prevVersion = 0; // startVersion
state Version msgIndex = 1; // Monotonically increasing index for sent messages; must start at 1
state std::vector<UID> applierIDs = getApplierIDs(*pRangeToApplier);
state double msgSize = 0; // size of mutations in the message
TraceEvent("FastRestoreLoaderSendMutationToApplier")
.detail("IsRangeFile", isRangeFile)
@ -439,11 +440,11 @@ ACTOR Future<Void> sendMutationsToApplier(VersionedMutationsMap* pkvOps, int bat
// applierMutationsBuffer is the mutation vector to be sent to each applier
// applierMutationsSize is the buffered mutation vector size for each applier
state std::map<UID, MutationsVec> applierMutationsBuffer;
state std::map<UID, SubSequenceVec> applierSubsBuffer;
state std::map<UID, LogMessageVersionVec> applierVersionsBuffer;
state std::map<UID, double> applierMutationsSize;
for (auto& applierID : applierIDs) {
applierMutationsBuffer[applierID] = MutationsVec();
applierSubsBuffer[applierID] = SubSequenceVec();
applierVersionsBuffer[applierID] = LogMessageVersionVec();
applierMutationsSize[applierID] = 0.0;
}
for (kvOp = kvOps.begin(); kvOp != kvOps.end(); kvOp++) {
@ -458,7 +459,6 @@ ACTOR Future<Void> sendMutationsToApplier(VersionedMutationsMap* pkvOps, int bat
Standalone<VectorRef<UID>> nodeIDs;
// Because using a vector of mutations causes overhead, and range mutations should happen rarely,
// we handle range mutations and key mutations differently to avoid memory copies
// WARNING: The splitMutation() may have bugs
splitMutation(pRangeToApplier, kvm, mvector.arena(), mvector.contents(), nodeIDs.arena(),
nodeIDs.contents());
ASSERT(mvector.size() == nodeIDs.size());
@ -475,16 +475,15 @@ ACTOR Future<Void> sendMutationsToApplier(VersionedMutationsMap* pkvOps, int bat
for (splitMutationIndex = 0; splitMutationIndex < mvector.size(); splitMutationIndex++) {
MutationRef mutation = mvector[splitMutationIndex];
UID applierID = nodeIDs[splitMutationIndex];
// printf("SPLITTED MUTATION: %d: mutation:%s applierID:%s\n", splitMutationIndex,
// mutation.toString().c_str(), applierID.toString().c_str());
if (debugMutation("RestoreLoader", commitVersion.version, mutation)) {
TraceEvent("SplittedMutation")
.detail("Version", commitVersion.toString())
.detail("Mutation", mutation.toString());
}
applierMutationsBuffer[applierID].push_back_deep(applierMutationsBuffer[applierID].arena(), mutation);
applierSubsBuffer[applierID].push_back(applierSubsBuffer[applierID].arena(), commitVersion.sub);
applierVersionsBuffer[applierID].push_back(applierVersionsBuffer[applierID].arena(), commitVersion);
applierMutationsSize[applierID] += mutation.expectedSize();
msgSize += mutation.expectedSize();
kvCount++;
}
@ -502,8 +501,9 @@ ACTOR Future<Void> sendMutationsToApplier(VersionedMutationsMap* pkvOps, int bat
.detail("Mutation", kvm.toString());
}
applierMutationsBuffer[applierID].push_back_deep(applierMutationsBuffer[applierID].arena(), kvm);
applierSubsBuffer[applierID].push_back(applierSubsBuffer[applierID].arena(), commitVersion.sub);
applierVersionsBuffer[applierID].push_back(applierVersionsBuffer[applierID].arena(), commitVersion);
applierMutationsSize[applierID] += kvm.expectedSize();
msgSize += kvm.expectedSize();
}
} // Mutations at the same LogMessageVersion
@ -511,26 +511,27 @@ ACTOR Future<Void> sendMutationsToApplier(VersionedMutationsMap* pkvOps, int bat
// changing the version comparison below.
auto next = std::next(kvOp, 1);
if (next == kvOps.end() || commitVersion.version < next->first.version) {
// if (next == kvOps.end() || msgSize >= SERVER_KNOBS->FASTRESTORE_LOADER_SEND_MUTATION_MSG_BYTES) {
// TODO: Sanity check each asset has been received exactly once!
// Send the mutations to appliers for each version
for (const UID& applierID : applierIDs) {
requests.emplace_back(applierID, RestoreSendVersionedMutationsRequest(
batchIndex, asset, prevVersion, commitVersion.version, isRangeFile,
applierMutationsBuffer[applierID], applierSubsBuffer[applierID]));
requests.emplace_back(applierID,
RestoreSendVersionedMutationsRequest(batchIndex, asset, msgIndex, isRangeFile,
applierMutationsBuffer[applierID],
applierVersionsBuffer[applierID]));
}
TraceEvent(SevDebug, "FastRestoreLoaderSendMutationToApplier")
.detail("PrevVersion", prevVersion)
.detail("CommitVersion", commitVersion.toString())
.detail("MessageIndex", msgIndex)
.detail("RestoreAsset", asset.toString())
.detail("Requests", requests.size());
ASSERT(prevVersion < commitVersion.version);
prevVersion = commitVersion.version;
wait(sendBatchRequests(&RestoreApplierInterface::sendMutationVector, *pApplierInterfaces, requests,
TaskPriority::RestoreLoaderSendMutations));
msgIndex++;
msgSize = 0;
requests.clear();
for (auto& applierID : applierIDs) {
applierMutationsBuffer[applierID] = MutationsVec();
applierSubsBuffer[applierID] = SubSequenceVec();
applierVersionsBuffer[applierID] = LogMessageVersionVec();
applierMutationsSize[applierID] = 0.0;
}
}
@ -540,7 +541,6 @@ ACTOR Future<Void> sendMutationsToApplier(VersionedMutationsMap* pkvOps, int bat
return Void();
}
// TODO: Add a unit test for this function
void splitMutation(std::map<Key, UID>* pRangeToApplier, MutationRef m, Arena& mvector_arena,
VectorRef<MutationRef>& mvector, Arena& nodeIDs_arena, VectorRef<UID>& nodeIDs) {
TraceEvent(SevWarn, "FastRestoreSplitMutation").detail("Mutation", m.toString());

View File

@ -617,8 +617,7 @@ ACTOR static Future<Standalone<VectorRef<RestoreRequest>>> collectRestoreRequest
ACTOR static Future<Version> collectBackupFiles(Reference<IBackupContainer> bc, std::vector<RestoreFileFR>* rangeFiles,
std::vector<RestoreFileFR>* logFiles, Database cx,
RestoreRequest request) {
state bool partitioned = wait(bc->isPartitionedBackup());
state BackupDescription desc = wait(partitioned ? bc->describePartitionedBackup() : bc->describeBackup());
state BackupDescription desc = wait(bc->describeBackup());
// Convert versions to real time so operators can read the BackupDescription desc.
wait(desc.resolveVersionTimes(cx));
@ -634,8 +633,7 @@ ACTOR static Future<Version> collectBackupFiles(Reference<IBackupContainer> bc,
std::cout << "Restore to version: " << request.targetVersion << "\nBackupDesc: \n" << desc.toString() << "\n\n";
}
Optional<RestorableFileSet> restorable = wait(partitioned ? bc->getPartitionedRestoreSet(request.targetVersion)
: bc->getRestoreSet(request.targetVersion));
Optional<RestorableFileSet> restorable = wait(bc->getRestoreSet(request.targetVersion));
if (!restorable.present()) {
TraceEvent(SevWarn, "FastRestoreMasterPhaseCollectBackupFiles").detail("NotRestorable", request.targetVersion);

View File

@ -39,7 +39,7 @@
#define SevFRMutationInfo SevInfo
using MutationsVec = Standalone<VectorRef<MutationRef>>;
using SubSequenceVec = Standalone<VectorRef<uint32_t>>;
using LogMessageVersionVec = Standalone<VectorRef<LogMessageVersion>>;
enum class RestoreRole { Invalid = 0, Master = 1, Loader, Applier };
BINARY_SERIALIZABLE(RestoreRole);

View File

@ -1966,14 +1966,14 @@ static std::string getIssueDescription(std::string name) {
}
static std::map<std::string, std::vector<JsonBuilderObject>> getProcessIssuesAsMessages(
std::vector<std::pair<NetworkAddress, Standalone<VectorRef<StringRef>>>> const& issues) {
std::vector<ProcessIssues> const& issues) {
std::map<std::string, std::vector<JsonBuilderObject>> issuesMap;
try {
for (auto processIssues : issues) {
for (auto issue : processIssues.second) {
for (auto issue : processIssues.issues) {
std::string issueStr = issue.toString();
issuesMap[processIssues.first.toString()].push_back(
issuesMap[processIssues.address.toString()].push_back(
JsonString::makeMessage(issueStr.c_str(), getIssueDescription(issueStr).c_str()));
}
}
@ -2163,7 +2163,7 @@ ACTOR Future<StatusReply> clusterGetStatus(
Reference<AsyncVar<ServerDBInfo>> db,
Database cx,
vector<WorkerDetails> workers,
std::vector<std::pair<NetworkAddress, Standalone<VectorRef<StringRef>>>> workerIssues,
std::vector<ProcessIssues> workerIssues,
std::map<NetworkAddress, std::pair<double, OpenDatabaseRequest>>* clientStatus,
ServerCoordinators coordinators,
std::vector<NetworkAddress> incompatibleConnections,

View File

@ -27,7 +27,14 @@
#include "fdbserver/MasterInterface.h"
#include "fdbclient/ClusterInterface.h"
Future<StatusReply> clusterGetStatus( Reference<AsyncVar<struct ServerDBInfo>> const& db, Database const& cx, vector<WorkerDetails> const& workers, std::vector<std::pair<NetworkAddress, Standalone<VectorRef<StringRef>>>> const& workerIssues,
struct ProcessIssues {
NetworkAddress address;
Standalone<VectorRef<StringRef>> issues;
ProcessIssues(NetworkAddress address, Standalone<VectorRef<StringRef>> issues) : address(address), issues(issues) {}
};
Future<StatusReply> clusterGetStatus( Reference<AsyncVar<struct ServerDBInfo>> const& db, Database const& cx, vector<WorkerDetails> const& workers, std::vector<ProcessIssues> const& workerIssues,
std::map<NetworkAddress, std::pair<double, OpenDatabaseRequest>>* const& clientStatus, ServerCoordinators const& coordinators, std::vector<NetworkAddress> const& incompatibleConnections, Version const& datacenterVersionDifference );
#endif
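Replacing the old pair with the named ProcessIssues struct makes call sites self-describing, as the getProcessIssuesAsMessages diff above shows. A small hypothetical consumer, assuming only the NetworkAddress::toString and StringRef::toString calls already used there:

#include <cstdio>
#include <vector>

// Hypothetical helper; named fields replace the old pair's .first / .second.
void logIssues(const std::vector<ProcessIssues>& issues) {
    for (const auto& p : issues) {
        for (const auto& issue : p.issues) {
            printf("%s: %s\n", p.address.toString().c_str(), issue.toString().c_str());
        }
    }
}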

View File

@ -120,7 +120,7 @@ struct ClusterControllerFullInterface {
RequestStream< struct RegisterWorkerRequest > registerWorker;
RequestStream< struct GetWorkersRequest > getWorkers;
RequestStream< struct RegisterMasterRequest > registerMaster;
RequestStream< struct GetServerDBInfoRequest > getServerDBInfo;
RequestStream< struct GetServerDBInfoRequest > getServerDBInfo; // Only used by testers; the cluster controller will send the serverDBInfo to workers
UID id() const { return clientInterface.id(); }
bool operator == (ClusterControllerFullInterface const& r) const { return id() == r.id(); }

View File

@ -1872,7 +1872,7 @@ int main(int argc, char* argv[]) {
} else { // Call fdbd roles in conventional way
ASSERT(opts.connectionFile);
setupSlowTaskProfiler();
setupRunLoopProfiler();
auto dataFolder = opts.dataFolder;
if (!dataFolder.size())
@ -1898,7 +1898,7 @@ int main(int argc, char* argv[]) {
opts.localities));
g_network->run();
} else if (role == ConsistencyCheck) {
setupSlowTaskProfiler();
setupRunLoopProfiler();
auto m = startSystemMonitor(opts.dataFolder, opts.zoneId, opts.zoneId);
f = stopAfter(runTests(opts.connectionFile, TEST_TYPE_CONSISTENCY_CHECK, TEST_HERE, 1, opts.testFile,

View File

@ -1048,7 +1048,7 @@ ACTOR Future<Void> workerServer(
ServerDBInfo localInfo = BinaryReader::fromStringRef<ServerDBInfo>(req.serializedDbInfo, AssumeVersion(currentProtocolVersion));
localInfo.myLocality = locality;
if(ccInterface->get().present() && localInfo.infoGeneration < dbInfo->get().infoGeneration && dbInfo->get().clusterInterface == ccInterface->get().get()) {
if(localInfo.infoGeneration < dbInfo->get().infoGeneration && localInfo.clusterInterface == dbInfo->get().clusterInterface) {
std::vector<Endpoint> rep = req.broadcastInfo;
rep.push_back(interf.updateServerDBInfo.getEndpoint());
req.reply.send(rep);
@ -1057,7 +1057,7 @@ ACTOR Future<Void> workerServer(
if(!ccInterface->get().present() || localInfo.clusterInterface != ccInterface->get().get()) {
notUpdated = interf.updateServerDBInfo.getEndpoint();
}
if(ccInterface->get().present() && localInfo.clusterInterface == ccInterface->get().get() && (localInfo.infoGeneration > dbInfo->get().infoGeneration || dbInfo->get().clusterInterface != ccInterface->get().get())) {
else if(localInfo.infoGeneration > dbInfo->get().infoGeneration || dbInfo->get().clusterInterface != ccInterface->get().get()) {
TraceEvent("GotServerDBInfoChange").detail("ChangeID", localInfo.id).detail("MasterID", localInfo.master.id())
.detail("RatekeeperID", localInfo.ratekeeper.present() ? localInfo.ratekeeper.get().id() : UID())

View File

@ -21,6 +21,7 @@
#include "fdbrpc/simulator.h"
#include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/BackupContainer.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbserver/workloads/BulkSetup.actor.h"
#include "fdbclient/RestoreWorkerInterface.actor.h"
@ -213,9 +214,7 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
state bool restorable = false;
if (lastBackupContainer) {
state Future<BackupDescription> fdesc = self->usePartitionedLogs
? lastBackupContainer->describePartitionedBackup()
: lastBackupContainer->describeBackup();
state Future<BackupDescription> fdesc = lastBackupContainer->describeBackup();
wait(ready(fdesc));
if(!fdesc.isError()) {
@ -423,6 +422,11 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
// wait(attemptDirtyRestore(self, cx, &backupAgent, StringRef(lastBackupContainer->getURL()),
// randomID));
}
// We must ensure no backup workers are running; otherwise the DB clear
// below can be picked up by backup workers and applied during restore.
wait(success(changeConfig(cx, "backup_worker_enabled:=0", true)));
// Clear DB before restore
wait(runRYWTransaction(cx, [=](Reference<ReadYourWritesTransaction> tr) -> Future<Void> {
for (auto& kvrange : self->backupRanges) tr->clear(kvrange);
@ -436,14 +440,8 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
.detail("BackupTag", printable(self->backupTag));
auto container = IBackupContainer::openContainer(lastBackupContainer->getURL());
BackupDescription desc = wait(self->usePartitionedLogs ? container->describePartitionedBackup()
: container->describeBackup());
TraceEvent("BAFRW_Restore", randomID)
.detail("LastBackupContainer", lastBackupContainer->getURL())
.detail("MinRestorableVersion", desc.minRestorableVersion.get())
.detail("MaxRestorableVersion", desc.maxRestorableVersion.get())
.detail("ContiguousLogEnd", desc.contiguousLogEnd.get());
BackupDescription desc = wait(container->describeBackup());
ASSERT(self->usePartitionedLogs == desc.partitioned);
state Version targetVersion = -1;
if (desc.maxRestorableVersion.present()) {
@ -463,6 +461,13 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
}
}
TraceEvent("BAFRW_Restore", randomID)
.detail("LastBackupContainer", lastBackupContainer->getURL())
.detail("MinRestorableVersion", desc.minRestorableVersion.get())
.detail("MaxRestorableVersion", desc.maxRestorableVersion.get())
.detail("ContiguousLogEnd", desc.contiguousLogEnd.get())
.detail("TargetVersion", targetVersion);
state std::vector<Future<Version>> restores;
state std::vector<Standalone<StringRef>> restoreTags;

View File

@ -667,7 +667,9 @@ struct ConsistencyCheckWorkload : TestWorkload
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
state int bytesReadInRange = 0;
decodeKeyServersValue(keyLocations[shard].value, sourceStorageServers, destStorageServers);
Standalone<RangeResultRef> UIDtoTagMap = wait( tr.getRange( serverTagKeys, CLIENT_KNOBS->TOO_MANY ) );
ASSERT( !UIDtoTagMap.more && UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY );
decodeKeyServersValue(UIDtoTagMap, keyLocations[shard].value, sourceStorageServers, destStorageServers);
//If the destStorageServers is non-empty, then this shard is being relocated
state bool isRelocating = destStorageServers.size() > 0;

View File

@ -36,7 +36,7 @@ struct SlowTaskWorkload : TestWorkload {
}
virtual Future<Void> start(Database const& cx) {
setupSlowTaskProfiler();
setupRunLoopProfiler();
return go();
}

View File

@ -87,7 +87,6 @@ set(FLOW_SRCS
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/SourceVersion.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/SourceVersion.h)
add_flow_target(STATIC_LIBRARY NAME flow SRCS ${FLOW_SRCS})
target_include_directories(flow SYSTEM PUBLIC ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(flow PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
if (NOT APPLE AND NOT WIN32)
set (FLOW_LIBS ${FLOW_LIBS} rt)

View File

@ -78,7 +78,7 @@ class ThreadPool : public IThreadPool, public ReferenceCounted<ThreadPool> {
public:
ThreadPool() : dontstop(ios), mode(Run) {}
~ThreadPool() {}
Future<Void> stop() {
Future<Void> stop(Error const& e = success()) {
if (mode == Shutdown) return Void();
ReferenceCounted<ThreadPool>::addref();
ios.stop(); // doesn't work?

View File

@ -60,7 +60,7 @@ public:
virtual Future<Void> getError() = 0; // asynchronously throws an error if there is an internal error
virtual void addThread( IThreadPoolReceiver* userData ) = 0;
virtual void post( PThreadAction action ) = 0;
virtual Future<Void> stop() = 0;
virtual Future<Void> stop(Error const& e = success()) = 0;
virtual bool isCoro() const { return false; }
virtual void addref() = 0;
virtual void delref() = 0;

View File

@ -48,9 +48,13 @@ void FlowKnobs::initialize(bool randomize, bool isSimulated) {
init( DISABLE_ASSERTS, 0 );
init( QUEUE_MODEL_SMOOTHING_AMOUNT, 2.0 );
init( SLOWTASK_PROFILING_INTERVAL, 0.125 ); // A value of 0 disables SlowTask profiling
init( RUN_LOOP_PROFILING_INTERVAL, 0.125 ); // A value of 0 disables run loop profiling
init( SLOWTASK_PROFILING_LOG_INTERVAL, 0 ); // A value of 0 means use RUN_LOOP_PROFILING_INTERVAL
init( SLOWTASK_PROFILING_MAX_LOG_INTERVAL, 1.0 );
init( SLOWTASK_PROFILING_LOG_BACKOFF, 2.0 );
init( SATURATION_PROFILING_LOG_INTERVAL, 0.5 ); // A value of 0 means use RUN_LOOP_PROFILING_INTERVAL
init( SATURATION_PROFILING_MAX_LOG_INTERVAL, 5.0 );
init( SATURATION_PROFILING_LOG_BACKOFF, 2.0 );
init( RANDOMSEED_RETRY_LIMIT, 4 );
init( FAST_ALLOC_LOGGING_BYTES, 10e6 );
@ -129,6 +133,7 @@ void FlowKnobs::initialize(bool randomize, bool isSimulated) {
init( SLOW_LOOP_CUTOFF, 15.0 / 1000.0 );
init( SLOW_LOOP_SAMPLING_RATE, 0.1 );
init( TSC_YIELD_TIME, 1000000 );
init( MIN_LOGGED_PRIORITY_BUSY_FRACTION, 0.05 );
init( CERT_FILE_MAX_SIZE, 5 * 1024 * 1024 );
//Network

View File

@ -69,10 +69,14 @@ public:
double HUGE_ARENA_LOGGING_BYTES;
double HUGE_ARENA_LOGGING_INTERVAL;
//slow task profiling
double SLOWTASK_PROFILING_INTERVAL;
//run loop profiling
double RUN_LOOP_PROFILING_INTERVAL;
double SLOWTASK_PROFILING_LOG_INTERVAL;
double SLOWTASK_PROFILING_MAX_LOG_INTERVAL;
double SLOWTASK_PROFILING_LOG_BACKOFF;
double SATURATION_PROFILING_LOG_INTERVAL;
double SATURATION_PROFILING_MAX_LOG_INTERVAL;
double SATURATION_PROFILING_LOG_BACKOFF;
//connectionMonitor
double CONNECTION_MONITOR_LOOP_TIME;
@ -147,6 +151,7 @@ public:
double SLOW_LOOP_SAMPLING_RATE;
int64_t TSC_YIELD_TIME;
int64_t REACTOR_FLAGS;
double MIN_LOGGED_PRIORITY_BUSY_FRACTION;
int CERT_FILE_MAX_SIZE;
//Network

View File

@ -57,7 +57,8 @@ using namespace boost::asio::ip;
#if defined(__linux__)
#include <execinfo.h>
std::atomic<int64_t> net2liveness(0);
std::atomic<int64_t> net2RunLoopIterations(0);
std::atomic<int64_t> net2RunLoopSleeps(0);
volatile size_t net2backtraces_max = 10000;
volatile void** volatile net2backtraces = NULL;
@ -171,7 +172,7 @@ public:
INetworkConnections *network; // initially this, but can be changed
int64_t tsc_begin, tsc_end;
int64_t tscBegin, tscEnd;
double taskBegin;
TaskPriority currentTaskID;
uint64_t tasksIssued;
@ -182,7 +183,7 @@ public:
uint64_t numYields;
TaskPriority lastMinTaskID;
NetworkMetrics::PriorityStats* lastPriorityStats;
std::priority_queue<OrderedTask, std::vector<OrderedTask>> ready;
ThreadSafeQueue<OrderedTask> threadReady;
@ -195,9 +196,9 @@ public:
std::priority_queue<DelayedTask, std::vector<DelayedTask>> timers;
void checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, TaskPriority priority);
bool check_yield(TaskPriority taskId, bool isRunLoop);
bool check_yield(TaskPriority taskId, int64_t tscNow);
void processThreadReady();
void trackMinPriority( TaskPriority minTaskID, double now );
void trackAtPriority( TaskPriority priority, double now );
void stopImmediately() {
stopped=true; decltype(ready) _1; ready.swap(_1); decltype(timers) _2; timers.swap(_2);
}
@ -221,7 +222,7 @@ public:
Int64MetricHandle countYieldCalls;
Int64MetricHandle countYieldCallsTrue;
Int64MetricHandle countASIOEvents;
Int64MetricHandle countSlowTaskSignals;
Int64MetricHandle countRunLoopProfilingSignals;
Int64MetricHandle countTLSPolicyFailures;
Int64MetricHandle priorityMetric;
DoubleMetricHandle countLaunchTime;
@ -863,9 +864,9 @@ Net2::Net2(const TLSConfig& tlsConfig, bool useThreadPool, bool useMetrics)
stopped(false),
tasksIssued(0),
// Until run() is called, yield() will always yield
tsc_begin(0), tsc_end(0), taskBegin(0), currentTaskID(TaskPriority::DefaultYield),
lastMinTaskID(TaskPriority::Zero),
tscBegin(0), tscEnd(0), taskBegin(0), currentTaskID(TaskPriority::DefaultYield),
numYields(0),
lastPriorityStats(nullptr),
tlsInitialized(false),
tlsConfig(tlsConfig)
#ifndef TLS_DISABLED
@ -887,13 +888,7 @@ Net2::Net2(const TLSConfig& tlsConfig, bool useThreadPool, bool useMetrics)
setGlobal(INetwork::enEventFD, (flowGlobalType) N2::ASIOReactor::newEventFD(reactor));
#endif
int priBins[] = { 1, 2050, 3050, 4050, 4950, 5050, 7050, 8050, 10050 };
static_assert( sizeof(priBins) == sizeof(int)*NetworkMetrics::PRIORITY_BINS, "Fix priority bins");
for(int i=0; i<NetworkMetrics::PRIORITY_BINS; i++)
networkInfo.metrics.priorityBins[i] = static_cast<TaskPriority>(priBins[i]);
updateNow();
}
#ifndef TLS_DISABLED
@ -1009,7 +1004,7 @@ void Net2::initMetrics() {
countYieldCalls.init(LiteralStringRef("Net2.CountYieldCalls"));
countASIOEvents.init(LiteralStringRef("Net2.CountASIOEvents"));
countYieldCallsTrue.init(LiteralStringRef("Net2.CountYieldCallsTrue"));
countSlowTaskSignals.init(LiteralStringRef("Net2.CountSlowTaskSignals"));
countRunLoopProfilingSignals.init(LiteralStringRef("Net2.CountRunLoopProfilingSignals"));
countTLSPolicyFailures.init(LiteralStringRef("Net2.CountTLSPolicyFailures"));
priorityMetric.init(LiteralStringRef("Net2.Priority"));
awakeMetric.init(LiteralStringRef("Net2.Awake"));
@ -1047,13 +1042,14 @@ void Net2::run() {
++countRunLoop;
if (runFunc) {
tsc_begin = __rdtsc();
tscBegin = __rdtsc();
taskBegin = nnow;
trackMinPriority(TaskPriority::RunCycleFunction, taskBegin);
trackAtPriority(TaskPriority::RunCycleFunction, taskBegin);
runFunc();
double taskEnd = timer_monotonic();
trackAtPriority(TaskPriority::RunLoop, taskEnd);
countLaunchTime += taskEnd - taskBegin;
checkForSlowTask(tsc_begin, __rdtsc(), taskEnd - taskBegin, TaskPriority::RunCycleFunction);
checkForSlowTask(tscBegin, __rdtsc(), taskEnd - taskBegin, TaskPriority::RunCycleFunction);
}
double sleepTime = 0;
@ -1070,7 +1066,12 @@ void Net2::run() {
sleepTime = timers.top().at - sleepStart; // + 500e-6?
}
if (sleepTime > 0) {
trackMinPriority(TaskPriority::Zero, sleepStart);
#if defined(__linux__)
// notify the run loop monitoring thread that we have gone idle
net2RunLoopSleeps.fetch_add(1);
#endif
trackAtPriority(TaskPriority::Zero, sleepStart);
awakeMetric = false;
priorityMetric = 0;
reactor.sleep(sleepTime);
@ -1078,16 +1079,17 @@ void Net2::run() {
}
}
tsc_begin = __rdtsc();
tscBegin = __rdtsc();
taskBegin = timer_monotonic();
trackMinPriority(TaskPriority::ASIOReactor, taskBegin);
trackAtPriority(TaskPriority::ASIOReactor, taskBegin);
reactor.react();
updateNow();
double now = this->currentTime;
trackAtPriority(TaskPriority::RunLoop, now);
countReactTime += now - taskBegin;
checkForSlowTask(tsc_begin, __rdtsc(), now - taskBegin, TaskPriority::ASIOReactor);
checkForSlowTask(tscBegin, __rdtsc(), now - taskBegin, TaskPriority::ASIOReactor);
if ((now-nnow) > FLOW_KNOBS->SLOW_LOOP_CUTOFF && nondeterministicRandom()->random01() < (now-nnow)*FLOW_KNOBS->SLOW_LOOP_SAMPLING_RATE)
TraceEvent("SomewhatSlowRunLoopTop").detail("Elapsed", now - nnow);
@ -1104,8 +1106,8 @@ void Net2::run() {
processThreadReady();
tsc_begin = __rdtsc();
tsc_end = tsc_begin + FLOW_KNOBS->TSC_YIELD_TIME;
tscBegin = __rdtsc();
tscEnd = tscBegin + FLOW_KNOBS->TSC_YIELD_TIME;
taskBegin = timer_monotonic();
numYields = 0;
TaskPriority minTaskID = TaskPriority::Max;
@ -1115,8 +1117,11 @@ void Net2::run() {
while (!ready.empty()) {
++countTasks;
currentTaskID = ready.top().taskID;
if(currentTaskID < minTaskID) {
trackAtPriority(currentTaskID, taskBegin);
minTaskID = currentTaskID;
}
priorityMetric = static_cast<int64_t>(currentTaskID);
minTaskID = std::min(minTaskID, currentTaskID);
Task* task = ready.top().task;
ready.pop();
@ -1128,19 +1133,26 @@ void Net2::run() {
TraceEvent(SevError, "TaskError").error(unknown_error());
}
if (check_yield(TaskPriority::Max, true)) {
int64_t tscNow = __rdtsc();
double newTaskBegin = timer_monotonic();
if (check_yield(TaskPriority::Max, tscNow)) {
checkForSlowTask(tscBegin, tscNow, newTaskBegin - taskBegin, currentTaskID);
FDB_TRACE_PROBE(run_loop_yield);
++countYields;
break;
break;
}
taskBegin = newTaskBegin;
tscBegin = tscNow;
}
trackAtPriority(TaskPriority::RunLoop, taskBegin);
queueSize = ready.size();
FDB_TRACE_PROBE(run_loop_done, queueSize);
trackMinPriority(minTaskID, now);
#if defined(__linux__)
if(FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL > 0) {
if(FLOW_KNOBS->RUN_LOOP_PROFILING_INTERVAL > 0) {
sigset_t orig_set;
pthread_sigmask(SIG_BLOCK, &sigprof_set, &orig_set);
@ -1148,7 +1160,7 @@ void Net2::run() {
bool was_overflow = net2backtraces_overflow;
int signal_count = net2backtraces_count;
countSlowTaskSignals += signal_count;
countRunLoopProfilingSignals += signal_count;
if (other_offset) {
volatile void** _traces = net2backtraces;
@ -1164,7 +1176,7 @@ void Net2::run() {
pthread_sigmask(SIG_SETMASK, &orig_set, NULL);
if (was_overflow) {
TraceEvent("Net2SlowTaskOverflow")
TraceEvent("Net2RunLoopProfilerOverflow")
.detail("SignalsReceived", signal_count)
.detail("BackTraceHarvested", other_offset != 0);
}
@ -1172,13 +1184,13 @@ void Net2::run() {
size_t iter_offset = 0;
while (iter_offset < other_offset) {
ProfilingSample *ps = (ProfilingSample *)(other_backtraces + iter_offset);
TraceEvent(SevWarn, "Net2SlowTaskTrace").detailf("TraceTime", "%.6f", ps->timestamp).detail("Trace", platform::format_backtrace(ps->frames, ps->length));
TraceEvent(SevWarn, "Net2RunLoopTrace").detailf("TraceTime", "%.6f", ps->timestamp).detail("Trace", platform::format_backtrace(ps->frames, ps->length));
iter_offset += ps->length + 2;
}
}
// to keep the thread liveness check happy
net2liveness.fetch_add(1);
// notify the run loop monitoring thread that we are making progress
net2RunLoopIterations.fetch_add(1);
}
#endif
nnow = timer_monotonic();
@ -1192,24 +1204,43 @@ void Net2::run() {
#endif
}
void Net2::trackMinPriority( TaskPriority minTaskID, double now ) {
if (minTaskID != lastMinTaskID) {
for(int c=0; c<NetworkMetrics::PRIORITY_BINS; c++) {
TaskPriority pri = networkInfo.metrics.priorityBins[c];
if (pri > minTaskID && pri <= lastMinTaskID) { // busy -> idle
networkInfo.metrics.priorityBlocked[c] = false;
networkInfo.metrics.priorityBlockedDuration[c] += now - networkInfo.metrics.windowedPriorityTimer[c];
networkInfo.metrics.priorityMaxBlockedDuration[c] = std::max(networkInfo.metrics.priorityMaxBlockedDuration[c], now - networkInfo.metrics.priorityTimer[c]);
void Net2::trackAtPriority( TaskPriority priority, double now ) {
if (lastPriorityStats == nullptr || priority != lastPriorityStats->priority) {
// Start tracking current priority
auto activeStatsItr = networkInfo.metrics.activeTrackers.try_emplace(priority, priority);
activeStatsItr.first->second.active = true;
activeStatsItr.first->second.windowedTimer = now;
if(lastPriorityStats != nullptr) {
// Stop tracking previous priority
lastPriorityStats->active = false;
lastPriorityStats->duration += now - lastPriorityStats->windowedTimer;
}
// Update starvation trackers
TaskPriority lastPriority = (lastPriorityStats == nullptr) ? TaskPriority::Zero : lastPriorityStats->priority;
for(auto& binStats : networkInfo.metrics.starvationTrackers) {
if(binStats.priority > lastPriority && binStats.priority > priority) {
break;
}
if (pri <= minTaskID && pri > lastMinTaskID) { // idle -> busy
networkInfo.metrics.priorityBlocked[c] = true;
networkInfo.metrics.priorityTimer[c] = now;
networkInfo.metrics.windowedPriorityTimer[c] = now;
// Busy -> idle at binStats.priority
if(binStats.priority > priority && binStats.priority <= lastPriority) {
binStats.active = false;
binStats.duration += now - binStats.windowedTimer;
binStats.maxDuration = std::max(binStats.maxDuration, now - binStats.timer);
}
// Idle -> busy at binStats.priority
else if(binStats.priority <= priority && binStats.priority > lastPriority) {
binStats.active = true;
binStats.timer = now;
binStats.windowedTimer = now;
}
}
}
lastMinTaskID = minTaskID;
lastPriorityStats = &activeStatsItr.first->second;
}
}
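The starvation trackers above are kept sorted by ascending priority; a bin counts as busy while the run loop executes at or above its priority, so the time accumulated in a bin measures how long lower-priority work was starved. A compact standalone model of the transition logic (a sketch only, with plain ints standing in for TaskPriority and bins taken from the starvationBins list added later in this change):

#include <cstdio>
#include <vector>

struct Bin {
    int priority;
    bool active = false; // true while work runs at or above this priority
};

// Mirrors the busy->idle / idle->busy transitions in trackAtPriority.
void switchPriority(std::vector<Bin>& bins, int last, int current) {
    for (auto& b : bins) { // bins sorted ascending
        if (b.priority > last && b.priority > current) break;
        if (b.priority > current && b.priority <= last) b.active = false;      // busy -> idle
        else if (b.priority <= current && b.priority > last) b.active = true;  // idle -> busy
    }
}

int main() {
    std::vector<Bin> bins;
    for (int p : {1, 3500, 7000, 7500, 8500, 8900, 10500}) bins.push_back({p});
    switchPriority(bins, /*last=*/0, /*current=*/8500); // bins 1..8500 become active
    switchPriority(bins, 8500, 3500);                   // bins 7000..8500 go idle
    for (auto& b : bins) printf("%5d %s\n", b.priority, b.active ? "active" : "idle");
    return 0;
}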
void Net2::processThreadReady() {
@ -1241,7 +1272,8 @@ void Net2::checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, T
slowTaskMetric->log();
double sampleRate = std::min(1.0, (elapsed > warnThreshold) ? 1.0 : elapsed / 10e9);
if(FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL > 0 && duration > FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL) {
double slowTaskProfilingLogInterval = std::max(FLOW_KNOBS->RUN_LOOP_PROFILING_INTERVAL, FLOW_KNOBS->SLOWTASK_PROFILING_LOG_INTERVAL);
if(slowTaskProfilingLogInterval > 0 && duration > slowTaskProfilingLogInterval) {
sampleRate = 1; // Always include slow task events that could show up in our slow task profiling.
}
@ -1250,12 +1282,8 @@ void Net2::checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, T
}
}
bool Net2::check_yield( TaskPriority taskID, bool isRunLoop ) {
if(!isRunLoop && numYields > 0) {
++numYields;
return true;
}
bool Net2::check_yield( TaskPriority taskID, int64_t tscNow ) {
// SOMEDAY: Yield if there are lots of higher priority tasks queued?
if ((g_stackYieldLimit) && ( (intptr_t)&taskID < g_stackYieldLimit )) {
++countYieldBigStack;
return true;
@ -1268,35 +1296,31 @@ bool Net2::check_yield( TaskPriority taskID, bool isRunLoop ) {
return true;
}
// SOMEDAY: Yield if there are lots of higher priority tasks queued?
int64_t tsc_now = __rdtsc();
double newTaskBegin = timer_monotonic();
if (tsc_now < tsc_begin) {
if (tscNow < tscBegin) {
return true;
}
if(isRunLoop) {
checkForSlowTask(tsc_begin, tsc_now, newTaskBegin-taskBegin, currentTaskID);
}
if (tsc_now > tsc_end) {
if (tscNow > tscEnd) {
++numYields;
return true;
}
taskBegin = newTaskBegin;
tsc_begin = tsc_now;
return false;
}
bool Net2::check_yield( TaskPriority taskID ) {
return check_yield(taskID, false);
if(numYields > 0) {
++numYields;
return true;
}
return check_yield(taskID, __rdtsc());
}
Future<class Void> Net2::yield( TaskPriority taskID ) {
++countYieldCalls;
if (taskID == TaskPriority::DefaultYield) taskID = currentTaskID;
if (check_yield(taskID, false)) {
if (check_yield(taskID)) {
++countYieldCallsTrue;
return delay(0, taskID);
}

View File

@ -2867,7 +2867,8 @@ extern volatile size_t net2backtraces_offset;
extern volatile size_t net2backtraces_max;
extern volatile bool net2backtraces_overflow;
extern volatile int64_t net2backtraces_count;
extern std::atomic<int64_t> net2liveness;
extern std::atomic<int64_t> net2RunLoopIterations;
extern std::atomic<int64_t> net2RunLoopSleeps;
extern void initProfiling();
std::atomic<double> checkThreadTime;
@ -2953,28 +2954,64 @@ void* checkThread(void *arg) {
pthread_t mainThread = *(pthread_t*)arg;
free(arg);
int64_t lastValue = net2liveness.load();
double lastSignal = 0;
double logInterval = FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL;
int64_t lastRunLoopIterations = net2RunLoopIterations.load();
int64_t lastRunLoopSleeps = net2RunLoopSleeps.load();
double lastSlowTaskSignal = 0;
double lastSaturatedSignal = 0;
const double minSlowTaskLogInterval = std::max(FLOW_KNOBS->SLOWTASK_PROFILING_LOG_INTERVAL, FLOW_KNOBS->RUN_LOOP_PROFILING_INTERVAL);
const double minSaturationLogInterval = std::max(FLOW_KNOBS->SATURATION_PROFILING_LOG_INTERVAL, FLOW_KNOBS->RUN_LOOP_PROFILING_INTERVAL);
double slowTaskLogInterval = minSlowTaskLogInterval;
double saturatedLogInterval = minSaturationLogInterval;
while(true) {
threadSleep(FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL);
int64_t currentLiveness = net2liveness.load();
if(lastValue == currentLiveness) {
threadSleep(FLOW_KNOBS->RUN_LOOP_PROFILING_INTERVAL);
int64_t currentRunLoopIterations = net2RunLoopIterations.load();
int64_t currentRunLoopSleeps = net2RunLoopSleeps.load();
bool slowTask = lastRunLoopIterations == currentRunLoopIterations;
bool saturated = lastRunLoopSleeps == currentRunLoopSleeps;
if(slowTask) {
double t = timer();
if(lastSignal == 0 || t - lastSignal >= logInterval) {
if(lastSignal > 0) {
logInterval = std::min(FLOW_KNOBS->SLOWTASK_PROFILING_MAX_LOG_INTERVAL, FLOW_KNOBS->SLOWTASK_PROFILING_LOG_BACKOFF * logInterval);
if(lastSlowTaskSignal == 0 || t - lastSlowTaskSignal >= slowTaskLogInterval) {
if(lastSlowTaskSignal > 0) {
slowTaskLogInterval = std::min(FLOW_KNOBS->SLOWTASK_PROFILING_MAX_LOG_INTERVAL, FLOW_KNOBS->SLOWTASK_PROFILING_LOG_BACKOFF * slowTaskLogInterval);
}
lastSignal = t;
checkThreadTime.store(lastSignal);
lastSlowTaskSignal = t;
checkThreadTime.store(lastSlowTaskSignal);
pthread_kill(mainThread, SIGPROF);
}
}
else {
lastValue = currentLiveness;
lastSignal = 0;
logInterval = FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL;
lastSlowTaskSignal = 0;
lastRunLoopIterations = currentRunLoopIterations;
slowTaskLogInterval = minSlowTaskLogInterval;
}
if(saturated) {
double t = timer();
if(lastSaturatedSignal == 0 || t - lastSaturatedSignal >= saturatedLogInterval) {
if(lastSaturatedSignal > 0) {
saturatedLogInterval = std::min(FLOW_KNOBS->SATURATION_PROFILING_MAX_LOG_INTERVAL, FLOW_KNOBS->SATURATION_PROFILING_LOG_BACKOFF * saturatedLogInterval);
}
lastSaturatedSignal = t;
if(!slowTask) {
checkThreadTime.store(lastSaturatedSignal);
pthread_kill(mainThread, SIGPROF);
}
}
}
else {
lastSaturatedSignal = 0;
lastRunLoopSleeps = currentRunLoopSleeps;
saturatedLogInterval = minSaturationLogInterval;
}
}
return NULL;
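With the knob defaults initialized earlier (RUN_LOOP_PROFILING_INTERVAL 0.125, SLOWTASK_PROFILING_LOG_BACKOFF 2.0, SLOWTASK_PROFILING_MAX_LOG_INTERVAL 1.0), a persistently stuck run loop is signaled at roughly 0.125s, 0.25s, 0.5s, 1.0s, and then once per second. A minimal sketch of the same backoff schedule, mirroring the checkThread logic above under those assumed defaults:

#include <algorithm>
#include <cstdio>

int main() {
    const double minInterval = 0.125, backoff = 2.0, maxInterval = 1.0; // assumed knob defaults
    double slowTaskLogInterval = minInterval;
    double lastSignal = 0;
    // The monitor thread checks once per RUN_LOOP_PROFILING_INTERVAL.
    for (double t = minInterval; t < 5.0; t += minInterval) {
        if (lastSignal == 0 || t - lastSignal >= slowTaskLogInterval) {
            if (lastSignal > 0)
                slowTaskLogInterval = std::min(maxInterval, backoff * slowTaskLogInterval);
            lastSignal = t;
            printf("SIGPROF at t=%.3fs\n", t); // stands in for pthread_kill(mainThread, SIGPROF)
        }
    }
    return 0;
}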
@ -3000,10 +3037,10 @@ void fdb_probe_actor_exit(const char* name, unsigned long id, int index) {
#endif
void setupSlowTaskProfiler() {
void setupRunLoopProfiler() {
#ifdef __linux__
if (!profileThread && FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL > 0) {
TraceEvent("StartingSlowTaskProfilingThread").detail("Interval", FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL);
if (!profileThread && FLOW_KNOBS->RUN_LOOP_PROFILING_INTERVAL > 0) {
TraceEvent("StartingRunLoopProfilingThread").detail("Interval", FLOW_KNOBS->RUN_LOOP_PROFILING_INTERVAL);
initProfiling();
profileThread = true;

View File

@ -619,7 +619,7 @@ EXTERNC void flushAndExit(int exitCode);
void platformInit();
void registerCrashHandler();
void setupSlowTaskProfiler();
void setupRunLoopProfiler();
EXTERNC void setProfilingEnabled(int enabled);
// Use _exit() or criticalError(), not exit()

View File

@ -126,7 +126,7 @@ SystemStatistics customSystemMonitor(std::string eventName, StatisticsState *sta
.detail("Yields", netData.countYields - statState->networkState.countYields)
.detail("YieldCalls", netData.countYieldCalls - statState->networkState.countYieldCalls)
.detail("YieldCallsTrue", netData.countYieldCallsTrue - statState->networkState.countYieldCallsTrue)
.detail("SlowTaskSignals", netData.countSlowTaskSignals - statState->networkState.countSlowTaskSignals)
.detail("RunLoopProfilingSignals", netData.countRunLoopProfilingSignals - statState->networkState.countRunLoopProfilingSignals)
.detail("YieldBigStack", netData.countYieldBigStack - statState->networkState.countYieldBigStack)
.detail("RunLoopIterations", netData.countRunLoop - statState->networkState.countRunLoop)
.detail("TimersExecuted", netData.countTimers - statState->networkState.countTimers)
@ -148,17 +148,36 @@ SystemStatistics customSystemMonitor(std::string eventName, StatisticsState *sta
}
}
for (int i = 0; i < NetworkMetrics::PRIORITY_BINS && g_network->networkInfo.metrics.priorityBins[i] != TaskPriority::Zero; i++) {
if(g_network->networkInfo.metrics.priorityBlocked[i]) {
g_network->networkInfo.metrics.priorityBlockedDuration[i] += now() - g_network->networkInfo.metrics.windowedPriorityTimer[i];
g_network->networkInfo.metrics.priorityMaxBlockedDuration[i] = std::max(g_network->networkInfo.metrics.priorityMaxBlockedDuration[i], now() - g_network->networkInfo.metrics.priorityTimer[i]);
g_network->networkInfo.metrics.windowedPriorityTimer[i] = now();
std::map<TaskPriority, double> loggedDurations;
for (auto &itr : g_network->networkInfo.metrics.activeTrackers) {
if(itr.second.active) {
itr.second.duration += now() - itr.second.windowedTimer;
itr.second.windowedTimer = now();
}
n.detail(format("PriorityBusy%d", g_network->networkInfo.metrics.priorityBins[i]).c_str(), std::min(currentStats.elapsed, g_network->networkInfo.metrics.priorityBlockedDuration[i] - statState->networkMetricsState.priorityBlockedDuration[i]));
n.detail(format("PriorityMaxBusy%d", g_network->networkInfo.metrics.priorityBins[i]).c_str(), g_network->networkInfo.metrics.priorityMaxBlockedDuration[i]);
if(itr.second.duration / currentStats.elapsed >= FLOW_KNOBS->MIN_LOGGED_PRIORITY_BUSY_FRACTION) {
loggedDurations[itr.first] = std::min(currentStats.elapsed, itr.second.duration);
}
g_network->networkInfo.metrics.priorityMaxBlockedDuration[i] = 0;
itr.second.duration = 0;
}
for (auto const& itr : loggedDurations) {
n.detail(format("PriorityBusy%d", itr.first).c_str(), itr.second);
}
for (auto &itr : g_network->networkInfo.metrics.starvationTrackers) {
if(itr.active) {
itr.duration += now() - itr.windowedTimer;
itr.maxDuration = std::max(itr.maxDuration, now() - itr.timer);
itr.windowedTimer = now();
}
n.detail(format("PriorityStarvedBelow%d", itr.priority).c_str(), std::min(currentStats.elapsed, itr.duration));
n.detail(format("PriorityMaxStarvedBelow%d", itr.priority).c_str(), itr.maxDuration);
itr.duration = 0;
itr.maxDuration = 0;
}
n.trackLatest("NetworkMetrics");

View File

@ -62,7 +62,7 @@ struct NetworkData {
int64_t countYieldCalls;
int64_t countASIOEvents;
int64_t countYieldCallsTrue;
int64_t countSlowTaskSignals;
int64_t countRunLoopProfilingSignals;
int64_t countFileLogicalWrites;
int64_t countFileLogicalReads;
int64_t countAIOSubmit;
@ -104,7 +104,7 @@ struct NetworkData {
countYieldCalls = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountYieldCalls"));
countASIOEvents = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountASIOEvents"));
countYieldCallsTrue = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountYieldCallsTrue"));
countSlowTaskSignals = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountSlowTaskSignals"));
countRunLoopProfilingSignals = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountRunLoopProfilingSignals"));
countConnEstablished = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountConnEstablished"));
countConnClosedWithError = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountConnClosedWithError"));
countConnClosedWithoutError = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountConnClosedWithoutError"));

View File

@ -74,7 +74,7 @@ public:
errors.sendError( unknown_error() );
}
}
Future<Void> stop() {
Future<Void> stop(Error const& e) {
return Void();
}
void addref() {
@ -377,18 +377,21 @@ public:
eventBuffer.clear();
}
opened = true;
for(TraceEventFields &fields : eventBuffer) {
annotateEvent(fields);
}
opened = true;
if(preopenOverflowCount > 0) {
TraceEvent(SevWarn, "TraceLogPreopenOverflow").detail("OverflowEventCount", preopenOverflowCount);
preopenOverflowCount = 0;
}
}
void annotateEvent( TraceEventFields &fields ) {
void annotateEvent(TraceEventFields& fields) {
MutexHolder holder(mutex);
if (!opened || fields.isAnnotated())
return;
if(localAddress.present()) {
fields.addField("Machine", formatIpPort(localAddress.get().ip, localAddress.get().port));
}
@ -399,14 +402,13 @@ public:
if(r.rolesString.size() > 0) {
fields.addField("Roles", r.rolesString);
}
fields.setAnnotated();
}
void writeEvent( TraceEventFields fields, std::string trackLatestKey, bool trackError ) {
void writeEvent(TraceEventFields fields, std::string trackLatestKey, bool trackError) {
MutexHolder hold(mutex);
if(opened) {
annotateEvent(fields);
}
annotateEvent(fields);
if(!trackLatestKey.empty()) {
fields.addField("TrackLatestType", "Original");
@ -418,6 +420,7 @@ public:
}
// FIXME: What if we are using way too much memory for buffer?
ASSERT(!isOpen() || fields.isAnnotated());
eventBuffer.push_back(fields);
bufferLength += fields.sizeBytes();
@ -1189,25 +1192,35 @@ TraceInterval& TraceInterval::begin() {
return *this;
}
bool TraceBatch::dumpImmediately() {
return (g_network->isSimulated() || FLOW_KNOBS->AUTOMATIC_TRACE_DUMP);
}
void TraceBatch::addEvent( const char *name, uint64_t id, const char *location ) {
eventBatch.push_back( EventInfo(TraceEvent::getCurrentTime(), name, id, location));
if( g_network->isSimulated() || FLOW_KNOBS->AUTOMATIC_TRACE_DUMP )
auto& eventInfo = eventBatch.emplace_back(EventInfo(TraceEvent::getCurrentTime(), name, id, location));
if (dumpImmediately())
dump();
else
g_traceLog.annotateEvent(eventInfo.fields);
}
void TraceBatch::addAttach( const char *name, uint64_t id, uint64_t to ) {
attachBatch.push_back( AttachInfo(TraceEvent::getCurrentTime(), name, id, to));
if( g_network->isSimulated() || FLOW_KNOBS->AUTOMATIC_TRACE_DUMP )
auto& attachInfo = attachBatch.emplace_back(AttachInfo(TraceEvent::getCurrentTime(), name, id, to));
if (dumpImmediately())
dump();
else
g_traceLog.annotateEvent(attachInfo.fields);
}
void TraceBatch::addBuggify( int activated, int line, std::string file ) {
if( g_network ) {
buggifyBatch.push_back( BuggifyInfo(TraceEvent::getCurrentTime(), activated, line, file));
if( g_network->isSimulated() || FLOW_KNOBS->AUTOMATIC_TRACE_DUMP )
auto& buggifyInfo = buggifyBatch.emplace_back(BuggifyInfo(TraceEvent::getCurrentTime(), activated, line, file));
if (dumpImmediately())
dump();
else
g_traceLog.annotateEvent(buggifyInfo.fields);
} else {
buggifyBatch.push_back( BuggifyInfo(0, activated, line, file));
buggifyBatch.push_back(BuggifyInfo(0, activated, line, file));
}
}
@ -1272,7 +1285,7 @@ TraceBatch::BuggifyInfo::BuggifyInfo(double time, int activated, int line, std::
fields.addField("Line", format("%d", line));
}
TraceEventFields::TraceEventFields() : bytes(0) {}
TraceEventFields::TraceEventFields() : bytes(0), annotated(false) {}
void TraceEventFields::addField(const std::string& key, const std::string& value) {
bytes += key.size() + value.size();
@ -1300,6 +1313,14 @@ TraceEventFields::FieldIterator TraceEventFields::end() const {
return fields.cend();
}
bool TraceEventFields::isAnnotated() const {
return annotated;
}
void TraceEventFields::setAnnotated() {
annotated = true;
}
const TraceEventFields::Field &TraceEventFields::operator[] (int index) const {
ASSERT(index >= 0 && index < size());
return fields.at(index);

View File

@ -71,6 +71,8 @@ public:
size_t sizeBytes() const;
FieldIterator begin() const;
FieldIterator end() const;
bool isAnnotated() const;
void setAnnotated();
void addField(const std::string& key, const std::string& value);
void addField(std::string&& key, std::string&& value);
@ -95,6 +97,7 @@ public:
private:
FieldContainer fields;
size_t bytes;
bool annotated;
};
template <class Archive>
@ -144,6 +147,7 @@ private:
std::vector<EventInfo> eventBatch;
std::vector<AttachInfo> attachBatch;
std::vector<BuggifyInfo> buggifyBatch;
static bool dumpImmediately();
};
struct DynamicEventMetric;

View File

@ -813,7 +813,7 @@ namespace actorcompiler
returnType = "void",
formalParameters = new string[] {
ch.CallbackTypeInStateClass + "*",
ch.Stmt.wait.result.type + " value"
ch.Stmt.wait.result.type + " const& value"
},
endIsUnreachable = true
};

View File

@ -586,7 +586,7 @@ struct NotifiedQueue : private SingleCallback<T>, FastAllocated<NotifiedQueue<T>
if (error.isValid()) throw error;
throw internal_error();
}
auto copy = queue.front();
auto copy = std::move(queue.front());
queue.pop();
return copy;
}
@ -908,6 +908,9 @@ public:
void send(const T& value) const {
queue->send(value);
}
void send(T&& value) const {
queue->send(std::move(value));
}
void sendError(const Error& error) const {
queue->sendError(error);
}
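Together with the std::move in NotifiedQueue::pop above, the new rvalue send overload lets a caller hand a large payload to a PromiseStream without copying it. A hypothetical usage sketch (Payload is an invented movable type; PromiseStream is flow's):

#include <cstdint>
#include <vector>

struct Payload {
    std::vector<uint8_t> bytes;
};

void produce(PromiseStream<Payload> stream) {
    Payload p;
    p.bytes.resize(1 << 20); // ~1 MB buffer
    stream.send(std::move(p)); // binds to the new send(T&&); the queue takes ownership
}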

View File

@ -167,6 +167,8 @@ Future<Reference<IConnection>> INetworkConnections::connect( std::string host, s
});
}
const std::vector<int> NetworkMetrics::starvationBins = { 1, 3500, 7000, 7500, 8500, 8900, 10500 };
TEST_CASE("/flow/network/ipaddress") {
ASSERT(NetworkAddress::parse("[::1]:4800").toString() == "[::1]:4800");

View File

@ -35,6 +35,7 @@
enum class TaskPriority {
Max = 1000000,
RunLoop = 30000,
ASIOReactor = 20001,
RunCycleFunction = 20000,
FlushTrace = 10500,
@ -84,7 +85,9 @@ enum class TaskPriority {
MoveKeys = 3550,
DataDistributionLaunch = 3530,
Ratekeeper = 3510,
DataDistribution = 3500,
DataDistribution = 3502,
DataDistributionLow = 3501,
DataDistributionVeryLow = 3500,
DiskWrite = 3010,
UpdateStorage = 3000,
CompactCache = 2900,
@ -322,18 +325,31 @@ struct NetworkMetrics {
enum { SLOW_EVENT_BINS = 16 };
uint64_t countSlowEvents[SLOW_EVENT_BINS] = {};
enum { PRIORITY_BINS = 9 };
TaskPriority priorityBins[PRIORITY_BINS] = {};
bool priorityBlocked[PRIORITY_BINS] = {};
double priorityBlockedDuration[PRIORITY_BINS] = {};
double priorityMaxBlockedDuration[PRIORITY_BINS] = {};
double priorityTimer[PRIORITY_BINS] = {};
double windowedPriorityTimer[PRIORITY_BINS] = {};
double secSquaredSubmit = 0;
double secSquaredDiskStall = 0;
NetworkMetrics() {}
struct PriorityStats {
TaskPriority priority;
bool active = false;
double duration = 0;
double timer = 0;
double windowedTimer = 0;
double maxDuration = 0;
PriorityStats(TaskPriority priority) : priority(priority) {}
};
std::unordered_map<TaskPriority, struct PriorityStats> activeTrackers;
std::vector<struct PriorityStats> starvationTrackers;
static const std::vector<int> starvationBins;
NetworkMetrics() {
for(int priority : starvationBins) {
starvationTrackers.emplace_back(static_cast<TaskPriority>(priority));
}
}
};
struct BoundedFlowLock;