diff --git a/.gitignore b/.gitignore index b0ea7ba212..4ddab65ee1 100644 --- a/.gitignore +++ b/.gitignore @@ -84,3 +84,4 @@ compile_commands.json .envrc .DS_Store temp/ +/compile_commands.json diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000000..27b02f4791 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,214 @@ +# +# CMakeLists.txt +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +cmake_minimum_required(VERSION 3.12) +project(fdb + VERSION 6.1.0 + DESCRIPTION "FoundationDB is a scalable, fault-tolerant, ordered key-value store with full ACID transactions." + HOMEPAGE_URL "http://www.foundationdb.org/" + LANGUAGES C CXX ASM Java) + +set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH};${PROJECT_SOURCE_DIR}/cmake") +message (STATUS "${PROJECT_SOURCE_DIR} ${PROJECT_BINARY_DIR}") +if("${PROJECT_SOURCE_DIR}" STREQUAL "${PROJECT_BINARY_DIR}") + message(FATAL_ERROR "In-source builds are forbidden, unsupported, and stupid!!") +endif() + +if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + message(STATUS "Setting build type to 'Release' as none was specified") + set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build" FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" + "MinSizeRel" "RelWithDebInfo") +endif() + +set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) +set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) + +################################################################################ +# Packages used for bindings +################################################################################ + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake/") + +find_package(PythonInterp 3.4 REQUIRED) +set(Python_ADDITIONAL_VERSIONS 3.4 3.5 3.5) +find_package(PythonLibs 3.4 REQUIRED) + + +################################################################################ +# Compiler configuration +################################################################################ + +include(ConfigureCompiler) + +################################################################################ +# Get repository information +################################################################################ + +add_custom_target(branch_file ALL DEPENDS ${CURR_BRANCH_FILE}) +execute_process( + COMMAND git rev-parse HEAD + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + OUTPUT_VARIABLE CURRENT_GIT_VERSION_WNL) +string(STRIP "${CURRENT_GIT_VERSION_WNL}" CURRENT_GIT_VERSION) +message(STATUS "Current git version ${CURRENT_GIT_VERSION}") + +################################################################################ +# Version information +################################################################################ + +set(USE_VERSIONS_TARGET OFF CACHE BOOL "Use the deprecated versions.target file") +if(USE_VERSIONS_TARGET) + add_custom_target(version_file ALL DEPENDS 
${CMAKE_CURRENT_SOURCE_DIR}/versions.target) + execute_process( + COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/build/get_version.sh ${CMAKE_CURRENT_SOURCE_DIR}/versions.target + OUTPUT_VARIABLE FDB_VERSION_WNL) + execute_process( + COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/build/get_package_name.sh ${CMAKE_CURRENT_SOURCE_DIR}/versions.target + OUTPUT_VARIABLE FDB_PACKAGE_NAME_WNL) + string(STRIP "${FDB_VERSION_WNL}" FDB_VERSION) + string(STRIP "${FDB_PACKAGE_NAME_WNL}" FDB_PACKAGE_NAME) + set(FDB_VERSION_PLAIN ${FDB_VERSION}) + if(NOT FDB_RELEASE) + set(FDB_VERSION "${FDB_VERSION}-PRERELEASE") + endif() +else() + set(FDB_PACKAGE_NAME "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}") + set(FDB_VERSION ${PROJECT_VERSION}) + set(FDB_VERSION_PLAIN ${FDB_VERSION}) +endif() + +message(STATUS "FDB version is ${FDB_VERSION}") +message(STATUS "FDB package name is ${FDB_PACKAGE_NAME}") +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/fdbclient/versions.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/fdbclient/versions.h) + + +################################################################################ +# Flow +################################################################################ + +# The first thing we need is the actor compiler - and to compile and run the +# actor compiler, we need mono +include(CompileActorCompiler) + +# With the actor compiler, we can now make the flow commands available +include(FlowCommands) + +################################################################################ +# Vexillographer +################################################################################ + +include(CompileVexillographer) + +# This macro can be used to install symlinks, which turns out to be +# non-trivial due to CMake version differences and limitations on how +# files can be installed when building binary packages. +# +# The rule for binary packaging is that files (including symlinks) must +# be installed with the standard CMake install() macro. +# +# The rule for non-binary packaging is that CMake 2.6 cannot install() +# symlinks, but can create the symlink at install-time via scripting. +# Though, we assume that CMake 2.6 isn't going to be used to generate +# packages because versions later than 2.8.3 are superior for that purpose. 
+# +# _filepath: the absolute path to the file to symlink +# _sympath: absolute path of the installed symlink + +macro(InstallSymlink _filepath _sympath) + get_filename_component(_symname ${_sympath} NAME) + get_filename_component(_installdir ${_sympath} PATH) + + if (BINARY_PACKAGING_MODE) + execute_process(COMMAND "${CMAKE_COMMAND}" -E create_symlink + ${_filepath} + ${CMAKE_CURRENT_BINARY_DIR}/${_symname}) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${_symname} + DESTINATION ${_installdir} + COMPONENT clients) + else () + # scripting the symlink installation at install time should work + # for CMake 2.6.x and 2.8.x + install(CODE " + if (\"\$ENV{DESTDIR}\" STREQUAL \"\") + execute_process(COMMAND \"${CMAKE_COMMAND}\" -E create_symlink + ${_filepath} + ${_installdir}/${_symname}) + else () + execute_process(COMMAND \"${CMAKE_COMMAND}\" -E create_symlink + ${_filepath} + \$ENV{DESTDIR}/${_installdir}/${_symname}) + endif () + " + COMPONENT clients) + endif () +endmacro(InstallSymlink) + +################################################################################ +# Generate config file +################################################################################ + +string(RANDOM LENGTH 8 description1) +string(RANDOM LENGTH 8 description2) +set(CLUSTER_DESCRIPTION1 ${description1} CACHE STRING "Cluster description") +set(CLUSTER_DESCRIPTION2 ${description2} CACHE STRING "Cluster description") + +configure_file(fdb.cluster.cmake ${CMAKE_CURRENT_BINARY_DIR}/fdb.cluster) + + +################################################################################ +# testing +################################################################################ +enable_testing() + +################################################################################ +# Directory structure +################################################################################ + +include(cmake/InstallLayout.cmake) + +################################################################################ +# components +################################################################################ + +include(CompileBoost) +add_subdirectory(flow) +add_subdirectory(fdbrpc) +add_subdirectory(fdbclient) +add_subdirectory(fdbserver) +add_subdirectory(fdbcli) +add_subdirectory(fdbmonitor) +add_subdirectory(bindings) +add_subdirectory(fdbbackup) + +include(CPack) + +################################################################################ +# process compile commands for IDE +################################################################################ + +if (CMAKE_EXPORT_COMPILE_COMMANDS) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/compile_commands.json + COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/build/gen_compile_db.py + ARGS -b ${CMAKE_CURRENT_BINARY_DIR} -s ${CMAKE_CURRENT_SOURCE_DIR} -o ${CMAKE_CURRENT_SOURCE_DIR}/compile_commands.json ${CMAKE_CURRENT_BINARY_DIR}/compile_commands.json + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/build/gen_compile_db.py ${CMAKE_CURRENT_BINARY_DIR}/compile_commands.json + COMMENT "Build compile commands for IDE" + ) + add_custom_target(procossed_compile_commands ALL DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/compile_commands.json ${CMAKE_CURRENT_BINARY_DIR}/compile_commands.json) +endif() diff --git a/Makefile b/Makefile index b8a4fc243e..9c6ac54f08 100644 --- a/Makefile +++ b/Makefile @@ -15,13 +15,12 @@ ifeq ($(MONO),) MONO := /usr/bin/mono endif -DMCS := $(shell which dmcs) MCS := $(shell which mcs) -ifneq ($(DMCS),) - MCS := $(DMCS) +ifeq ($(MCS),) + MCS := $(shell which dmcs) 
endif ifeq ($(MCS),) - MCS := /usr/bin/dmcs + MCS := /usr/bin/mcs endif CFLAGS := -Werror -Wno-error=format -fPIC -DNO_INTELLISENSE -fvisibility=hidden -DNDEBUG=1 -Wreturn-type -fno-omit-frame-pointer diff --git a/README.md b/README.md index 0e4d683708..7d8fd3daa3 100755 --- a/README.md +++ b/README.md @@ -43,10 +43,23 @@ Developers on a OS for which there is no binary package, or who would like to st 1. Install [Docker](https://www.docker.com/). 1. Check out the foundationdb repo. 1. Build Linux docker image using the file `Dockerfile` located in the `build` source directory. + + ```shell + cd /dir/path/foundationdb + docker build ./build -t + ``` + 1. Run the docker image interactively [Docker Run](https://docs.docker.com/engine/reference/run/#general-form) with the directory containing the foundationdb repo mounted [Docker Mounts](https://docs.docker.com/storage/volumes/). -`docker run -it -v '/local/dir/path/foundationdb:/docker/dir/path/foundationdb' /bin/bash` -1. Navigate to the mounted directory containing the foundationdb repo. -`cd /docker/dir/path/foundationdb` + + ```shell + docker run -it -v '/local/dir/path/foundationdb:/docker/dir/path/foundationdb' /bin/bash + ``` + +1. Navigate to the container's mounted directory which contains the foundationdb repo. + + ```shell + cd /docker/dir/path/foundationdb + ``` 1. Run `make`. This will build the fdbserver binary and the python bindings. If you want to build our other bindings, you will need to install a runtime for the language whose binding you want to build. Each binding has an `.mk` file which provides specific targets for that binding. diff --git a/bindings/CMakeLists.txt b/bindings/CMakeLists.txt new file mode 100644 index 0000000000..c3c7cadf79 --- /dev/null +++ b/bindings/CMakeLists.txt @@ -0,0 +1,3 @@ +add_subdirectory(c) +add_subdirectory(python) +add_subdirectory(java) diff --git a/bindings/bindingtester/__init__.py b/bindings/bindingtester/__init__.py index 44cd5e5f84..8f93b26b09 100644 --- a/bindings/bindingtester/__init__.py +++ b/bindings/bindingtester/__init__.py @@ -18,6 +18,7 @@ # limitations under the License. 
# +import math import sys import os @@ -61,11 +62,31 @@ class Result: def key(self, specification): return self.key_tuple[specification.key_start_index:] + @staticmethod + def elements_equal(el1, el2): + if type(el1) != type(el2): + return False + + if isinstance(el1, tuple): + return Result.tuples_match(el1, el2) + + if isinstance(el1, float) and math.isnan(el1): + return math.isnan(el2) + + return el1 == el2 + + @staticmethod + def tuples_match(t1, t2): + if len(t1) != len(t2): + return False + + return all([Result.elements_equal(x,y) for x,y in zip(t1, t2)]) + def matches_key(self, rhs, specification): if not isinstance(rhs, Result): return False - return self.key(specification) == rhs.key(specification) + return Result.tuples_match(self.key(specification), rhs.key(specification)) def matches(self, rhs, specification): if not self.matches_key(rhs, specification): diff --git a/bindings/bindingtester/bindingtester.py b/bindings/bindingtester/bindingtester.py index b6f0aab590..5a60d1112a 100755 --- a/bindings/bindingtester/bindingtester.py +++ b/bindings/bindingtester/bindingtester.py @@ -98,7 +98,7 @@ class ResultSet(object): # If these results aren't using sequence numbers, then we match two results based on whether they share the same key else: min_key = min([r.key(self.specification) for r in results.values()]) - results = {i: r for i, r in results.items() if r.key(self.specification) == min_key} + results = {i: r for i, r in results.items() if Result.tuples_match(r.key(self.specification), min_key)} # Increment the indices for those testers which produced a result in this iteration for i in results.keys(): diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt new file mode 100644 index 0000000000..88c849f68d --- /dev/null +++ b/bindings/c/CMakeLists.txt @@ -0,0 +1,53 @@ +set(FDB_C_SRCS + fdb_c.cpp + foundationdb/fdb_c.h + ThreadCleanup.cpp) + +file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/foundationdb) + +set(platform) +if(APPLE) + set(platform "osx") +else() + set(platform "linux") +endif() + +add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/fdb_c.g.S + ${CMAKE_CURRENT_BINARY_DIR}/fdb_c_function_pointers.g.h + COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate_asm.py ${platform} + ${CMAKE_CURRENT_SOURCE_DIR}/fdb_c.cpp + ${CMAKE_CURRENT_BINARY_DIR}/fdb_c.g.S + ${CMAKE_CURRENT_BINARY_DIR}/fdb_c_function_pointers.g.h + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/generate_asm.py ${CMAKE_CURRENT_SOURCE_DIR}/fdb_c.cpp + COMMENT "Generate C bindings") +add_custom_target(fdb_c_generated DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/fdb_c.g.S + ${CMAKE_CURRENT_BINARY_DIR}/fdb_c_function_pointers.g.h) + +add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/fdb_c_options.g.h + COMMAND ${MONO_EXECUTABLE} ${VEXILLOGRAPHER_EXE} ${CMAKE_SOURCE_DIR}/fdbclient/vexillographer/fdb.options c ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/fdb_c_options.g.h + DEPENDS ${CMAKE_SOURCE_DIR}/fdbclient/vexillographer/fdb.options vexillographer + COMMENT "Generate C options") +add_custom_target(fdb_c_options DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/fdb_c_options.g.h) + +include(GenerateExportHeader) + +add_library(fdb_c SHARED ${FDB_C_SRCS} ${CMAKE_CURRENT_BINARY_DIR}/fdb_c.g.S) +add_dependencies(fdb_c fdb_c_generated fdb_c_options) +target_link_libraries(fdb_c PUBLIC fdbclient) +target_include_directories(fdb_c PUBLIC + ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_BINARY_DIR}/foundationdb) +# TODO: re-enable once the old vcxproj-based build system 
is removed. +#generate_export_header(fdb_c EXPORT_MACRO_NAME "DLLEXPORT" +# EXPORT_FILE_NAME ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/fdb_c_export.h) +install(TARGETS fdb_c + EXPORT fdbc + DESTINATION ${FDB_LIB_DIR} + COMPONENT clients) +install( + FILES foundationdb/fdb_c.h + ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/fdb_c_options.g.h + ${CMAKE_SOURCE_DIR}/fdbclient/vexillographer/fdb.options + DESTINATION ${FDB_INCLUDE_INSTALL_DIR}/foundationdb COMPONENT clients) +#install(EXPORT fdbc DESTINATION ${FDB_LIB_DIR}/foundationdb COMPONENT clients) diff --git a/bindings/c/fdb_c.cpp b/bindings/c/fdb_c.cpp index cb63feea70..52e055c7f2 100644 --- a/bindings/c/fdb_c.cpp +++ b/bindings/c/fdb_c.cpp @@ -297,11 +297,15 @@ fdb_error_t fdb_future_get_string_array( extern "C" DLLEXPORT FDBFuture* fdb_create_cluster_v609( const char* cluster_file_path ) { - char *path = NULL; + char *path; if(cluster_file_path) { path = new char[strlen(cluster_file_path) + 1]; strcpy(path, cluster_file_path); } + else { + path = new char[1]; + path[0] = '\0'; + } return (FDBFuture*)ThreadFuture(path).extractPtr(); } @@ -340,7 +344,7 @@ FDBFuture* fdb_cluster_create_database_v609( FDBCluster* c, uint8_t const* db_na extern "C" DLLEXPORT fdb_error_t fdb_create_database( const char* cluster_file_path, FDBDatabase** out_database ) { CATCH_AND_RETURN( - *out_database = (FDBDatabase*)API->createDatabase( cluster_file_path ? cluster_file_path : "" ).extractPtr(); + *out_database = (FDBDatabase*)API->createDatabase( cluster_file_path ).extractPtr(); ); } diff --git a/bindings/c/local.mk b/bindings/c/local.mk index 28418d408b..36e295ce0e 100644 --- a/bindings/c/local.mk +++ b/bindings/c/local.mk @@ -24,7 +24,7 @@ fdb_c_CFLAGS := $(fdbclient_CFLAGS) fdb_c_LDFLAGS := $(fdbrpc_LDFLAGS) fdb_c_LIBS := lib/libfdbclient.a lib/libfdbrpc.a lib/libflow.a $(FDB_TLS_LIB) fdb_c_STATIC_LIBS := $(TLS_LIBS) -fdb_c_tests_LIBS := -shared -Llib -lfdb_c +fdb_c_tests_LIBS := -Llib -lfdb_c fdb_c_tests_HEADERS := -Ibindings/c CLEAN_TARGETS += fdb_c_tests_clean @@ -84,11 +84,11 @@ bindings/c/foundationdb/fdb_c_options.g.h: bin/vexillographer.exe fdbclient/vexi bin/fdb_c_performance_test: bindings/c/test/performance_test.c bindings/c/test/test.h fdb_c @echo "Compiling fdb_c_performance_test" - @$(CC) $(CFLAGS) $(fdb_c_tests_LIBS) $(fdb_c_tests_HEADERS) -o $@ bindings/c/test/performance_test.c + @$(CC) $(CFLAGS) $(fdb_c_tests_HEADERS) -o $@ bindings/c/test/performance_test.c $(fdb_c_tests_LIBS) bin/fdb_c_ryw_benchmark: bindings/c/test/ryw_benchmark.c bindings/c/test/test.h fdb_c @echo "Compiling fdb_c_ryw_benchmark" - @$(CC) $(CFLAGS) $(fdb_c_tests_LIBS) $(fdb_c_tests_HEADERS) -o $@ bindings/c/test/ryw_benchmark.c + @$(CC) $(CFLAGS) $(fdb_c_tests_HEADERS) -o $@ bindings/c/test/ryw_benchmark.c $(fdb_c_tests_LIBS) packages/fdb-c-tests-$(VERSION)-$(PLATFORM).tar.gz: bin/fdb_c_performance_test bin/fdb_c_ryw_benchmark @echo "Packaging $@" diff --git a/bindings/flow/FDBLoanerTypes.h b/bindings/flow/FDBLoanerTypes.h index 4797f863c5..a5be056b20 100755 --- a/bindings/flow/FDBLoanerTypes.h +++ b/bindings/flow/FDBLoanerTypes.h @@ -87,7 +87,7 @@ namespace FDB { template void serialize( Ar& ar ) { - ar & key & orEqual & offset; + serializer(ar, key, orEqual, offset); } }; inline bool operator == (const KeySelectorRef& lhs, const KeySelectorRef& rhs) { return lhs.key == rhs.key && lhs.orEqual==rhs.orEqual && lhs.offset==rhs.offset; } @@ -123,7 +123,7 @@ namespace FDB { int expectedSize() const { return key.expectedSize() + value.expectedSize(); } template - force_inline 
void serialize(Ar& ar) { ar & key & value; } + force_inline void serialize(Ar& ar) { serializer(ar, key, value); } struct OrderByKey { bool operator()(KeyValueRef const& a, KeyValueRef const& b) const { @@ -171,7 +171,7 @@ namespace FDB { template void serialize( Ar& ar ) { - ar & ((VectorRef&)*this) & more & readThrough & readToBegin & readThroughEnd; + serializer(ar, ((VectorRef&)*this), more, readThrough, readToBegin, readThroughEnd); } }; @@ -234,7 +234,7 @@ namespace FDB { template force_inline void serialize(Ar& ar) { - ar & const_cast(begin) & const_cast(end); + serializer(ar, const_cast(begin), const_cast(end)); if( begin > end ) { throw inverted_range(); }; diff --git a/bindings/go/fdb-go-install.sh b/bindings/go/fdb-go-install.sh index e0ee36710d..897e694d5d 100755 --- a/bindings/go/fdb-go-install.sh +++ b/bindings/go/fdb-go-install.sh @@ -63,7 +63,7 @@ function printUsage() { echo echo "cmd: One of the commands to run. The options are:" echo " install Download the FDB go bindings and install them" - echo " localinstall Install a into the go path a local copy of the repo" + echo " localinstall Install into the go path a local copy of the repo" echo " download Download but do not prepare the FoundationDB bindings" echo " help Print this help message and then quit" echo diff --git a/bindings/go/src/fdb/database.go b/bindings/go/src/fdb/database.go index 0055380753..db888307b4 100644 --- a/bindings/go/src/fdb/database.go +++ b/bindings/go/src/fdb/database.go @@ -86,7 +86,7 @@ func retryable(wrapped func() (interface{}, error), onError func(Error) FutureNi for { ret, e = wrapped() - /* No error means success! */ + // No error means success! if e == nil { return } @@ -96,8 +96,8 @@ func retryable(wrapped func() (interface{}, error), onError func(Error) FutureNi e = onError(ep).Get() } - /* If OnError returns an error, then it's not - /* retryable; otherwise take another pass at things */ + // If OnError returns an error, then it's not + // retryable; otherwise take another pass at things if e != nil { return } @@ -125,7 +125,7 @@ func retryable(wrapped func() (interface{}, error), onError func(Error) FutureNi // Transaction and Database objects. func (d Database) Transact(f func(Transaction) (interface{}, error)) (interface{}, error) { tr, e := d.CreateTransaction() - /* Any error here is non-retryable */ + // Any error here is non-retryable if e != nil { return nil, e } @@ -165,7 +165,7 @@ func (d Database) Transact(f func(Transaction) (interface{}, error)) (interface{ // Transaction, Snapshot and Database objects. 
func (d Database) ReadTransact(f func(ReadTransaction) (interface{}, error)) (interface{}, error) { tr, e := d.CreateTransaction() - /* Any error here is non-retryable */ + // Any error here is non-retryable if e != nil { return nil, e } diff --git a/bindings/go/src/fdb/fdb.go b/bindings/go/src/fdb/fdb.go index c90dcad8ef..336d8a713d 100644 --- a/bindings/go/src/fdb/fdb.go +++ b/bindings/go/src/fdb/fdb.go @@ -38,9 +38,9 @@ import ( "unsafe" ) -/* Would put this in futures.go but for the documented issue with -/* exports and functions in preamble -/* (https://code.google.com/p/go-wiki/wiki/cgo#Global_functions) */ +// Would put this in futures.go but for the documented issue with +// exports and functions in preamble +// (https://code.google.com/p/go-wiki/wiki/cgo#Global_functions) //export unlockMutex func unlockMutex(p unsafe.Pointer) { m := (*sync.Mutex)(p) diff --git a/bindings/go/src/fdb/range.go b/bindings/go/src/fdb/range.go index 8f3bb0dc45..a4383e0770 100644 --- a/bindings/go/src/fdb/range.go +++ b/bindings/go/src/fdb/range.go @@ -90,7 +90,11 @@ type ExactRange interface { // that the default zero-value of KeyRange specifies an empty range before all // keys in the database. type KeyRange struct { - Begin, End KeyConvertible + // The (inclusive) beginning of the range + Begin KeyConvertible + + // The (exclusive) end of the range + End KeyConvertible } // FDBRangeKeys allows KeyRange to satisfy the ExactRange interface. diff --git a/bindings/java/CMakeLists.txt b/bindings/java/CMakeLists.txt new file mode 100644 index 0000000000..1d352b0302 --- /dev/null +++ b/bindings/java/CMakeLists.txt @@ -0,0 +1,144 @@ +include(UseJava) +find_package(JNI 1.8 REQUIRED) +find_package(Java 1.8 COMPONENTS Development REQUIRED) + +set(JAVA_BINDING_SRCS + src/main/com/apple/foundationdb/async/AsyncIterable.java + src/main/com/apple/foundationdb/async/AsyncIterator.java + src/main/com/apple/foundationdb/async/AsyncUtil.java + src/main/com/apple/foundationdb/async/Cancellable.java + src/main/com/apple/foundationdb/async/CloneableException.java + src/main/com/apple/foundationdb/async/CloseableAsyncIterator.java + src/main/com/apple/foundationdb/async/package-info.java + src/main/com/apple/foundationdb/Cluster.java + src/main/com/apple/foundationdb/Database.java + src/main/com/apple/foundationdb/directory/Directory.java + src/main/com/apple/foundationdb/directory/DirectoryAlreadyExistsException.java + src/main/com/apple/foundationdb/directory/DirectoryException.java + src/main/com/apple/foundationdb/directory/DirectoryLayer.java + src/main/com/apple/foundationdb/directory/DirectoryMoveException.java + src/main/com/apple/foundationdb/directory/DirectoryPartition.java + src/main/com/apple/foundationdb/directory/DirectorySubspace.java + src/main/com/apple/foundationdb/directory/DirectoryUtil.java + src/main/com/apple/foundationdb/directory/DirectoryVersionException.java + src/main/com/apple/foundationdb/directory/MismatchedLayerException.java + src/main/com/apple/foundationdb/directory/NoSuchDirectoryException.java + src/main/com/apple/foundationdb/directory/package-info.java + src/main/com/apple/foundationdb/directory/PathUtil.java + src/main/com/apple/foundationdb/FDB.java + src/main/com/apple/foundationdb/FDBDatabase.java + src/main/com/apple/foundationdb/FDBTransaction.java + src/main/com/apple/foundationdb/FutureCluster.java + src/main/com/apple/foundationdb/FutureDatabase.java + src/main/com/apple/foundationdb/FutureKey.java + src/main/com/apple/foundationdb/FutureResult.java + 
src/main/com/apple/foundationdb/FutureResults.java + src/main/com/apple/foundationdb/FutureStrings.java + src/main/com/apple/foundationdb/FutureVersion.java + src/main/com/apple/foundationdb/FutureVoid.java + src/main/com/apple/foundationdb/JNIUtil.java + src/main/com/apple/foundationdb/KeySelector.java + src/main/com/apple/foundationdb/KeyValue.java + src/main/com/apple/foundationdb/LocalityUtil.java + src/main/com/apple/foundationdb/NativeFuture.java + src/main/com/apple/foundationdb/NativeObjectWrapper.java + src/main/com/apple/foundationdb/OptionConsumer.java + src/main/com/apple/foundationdb/OptionsSet.java + src/main/com/apple/foundationdb/package-info.java + src/main/com/apple/foundationdb/Range.java + src/main/com/apple/foundationdb/RangeQuery.java + src/main/com/apple/foundationdb/RangeResult.java + src/main/com/apple/foundationdb/RangeResultInfo.java + src/main/com/apple/foundationdb/RangeResultSummary.java + src/main/com/apple/foundationdb/ReadTransaction.java + src/main/com/apple/foundationdb/ReadTransactionContext.java + src/main/com/apple/foundationdb/subspace/package-info.java + src/main/com/apple/foundationdb/subspace/Subspace.java + src/main/com/apple/foundationdb/Transaction.java + src/main/com/apple/foundationdb/TransactionContext.java + src/main/com/apple/foundationdb/tuple/ByteArrayUtil.java + src/main/com/apple/foundationdb/tuple/IterableComparator.java + src/main/com/apple/foundationdb/tuple/package-info.java + src/main/com/apple/foundationdb/tuple/Tuple.java + src/main/com/apple/foundationdb/tuple/TupleUtil.java + src/main/com/apple/foundationdb/tuple/Versionstamp.java) + +set(JAVA_TESTS_SRCS + src/test/com/apple/foundationdb/test/AbstractTester.java + src/test/com/apple/foundationdb/test/AsyncDirectoryExtension.java + src/test/com/apple/foundationdb/test/AsyncStackTester.java + src/test/com/apple/foundationdb/test/BlockingBenchmark.java + src/test/com/apple/foundationdb/test/ConcurrentGetSetGet.java + src/test/com/apple/foundationdb/test/Context.java + src/test/com/apple/foundationdb/test/ContinuousSample.java + src/test/com/apple/foundationdb/test/DirectoryExtension.java + src/test/com/apple/foundationdb/test/DirectoryOperation.java + src/test/com/apple/foundationdb/test/DirectoryTest.java + src/test/com/apple/foundationdb/test/DirectoryUtil.java + src/test/com/apple/foundationdb/test/Example.java + src/test/com/apple/foundationdb/test/Instruction.java + src/test/com/apple/foundationdb/test/IterableTest.java + src/test/com/apple/foundationdb/test/LocalityTests.java + src/test/com/apple/foundationdb/test/ParallelRandomScan.java + src/test/com/apple/foundationdb/test/PerformanceTester.java + src/test/com/apple/foundationdb/test/RangeTest.java + src/test/com/apple/foundationdb/test/RYWBenchmark.java + src/test/com/apple/foundationdb/test/SerialInsertion.java + src/test/com/apple/foundationdb/test/SerialIteration.java + src/test/com/apple/foundationdb/test/SerialTest.java + src/test/com/apple/foundationdb/test/Stack.java + src/test/com/apple/foundationdb/test/StackEntry.java + src/test/com/apple/foundationdb/test/StackOperation.java + src/test/com/apple/foundationdb/test/StackTester.java + src/test/com/apple/foundationdb/test/StackUtils.java + src/test/com/apple/foundationdb/test/TesterArgs.java + src/test/com/apple/foundationdb/test/TestResult.java + src/test/com/apple/foundationdb/test/TupleTest.java + src/test/com/apple/foundationdb/test/VersionstampSmokeTest.java + src/test/com/apple/foundationdb/test/WatchTest.java + 
src/test/com/apple/foundationdb/test/WhileTrueTest.java) + +set(GENERATED_JAVA_DIR ${CMAKE_CURRENT_BINARY_DIR}/src/main/com/foundationdb) +file(MAKE_DIRECTORY ${GENERATED_JAVA_DIR}) + +set(GENERATED_JAVA_FILES + ${GENERATED_JAVA_DIR}/ClusterOptions.java + ${GENERATED_JAVA_DIR}/ConflictRangeType.java + ${GENERATED_JAVA_DIR}/DatabaseOptions.java + ${GENERATED_JAVA_DIR}/MutationType.java + ${GENERATED_JAVA_DIR}/NetworkOptions.java + ${GENERATED_JAVA_DIR}/StreamingMode.java + ${GENERATED_JAVA_DIR}/TransactionOptions.java + ${GENERATED_JAVA_DIR}/FDBException.java) + +add_custom_command(OUTPUT ${GENERATED_JAVA_FILES} + COMMAND ${MONO_EXECUTABLE} ${VEXILLOGRAPHER_EXE} ${CMAKE_SOURCE_DIR}/fdbclient/vexillographer/fdb.options java ${GENERATED_JAVA_DIR} + DEPENDS ${CMAKE_SOURCE_DIR}/fdbclient/vexillographer/fdb.options vexillographer + COMMENT "Generate Java options") +add_custom_target(fdb_java_options DEPENDS ${GENERATED_JAVA_DIR}/StreamingMode.java) + +set(SYSTEM_NAME "linux") +if (APPLE) + set(SYSTEM_NAME "osx") +endif() + +add_library(fdb_java SHARED fdbJNI.cpp) +message(DEBUG ${JNI_INCLUDE_DIRS}) +message(DEBUG ${JNI_LIBRARIES}) +target_include_directories(fdb_java PRIVATE ${JNI_INCLUDE_DIRS}) +# libfdb_java.so is loaded by fdb-java.jar and doesn't need to depened on jvm shared libraries. +target_link_libraries(fdb_java PRIVATE fdb_c) +set_target_properties(fdb_java PROPERTIES + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib/${SYSTEM_NAME}/amd64/) + +set(CMAKE_JAVA_COMPILE_FLAGS "-source" "1.8" "-target" "1.8") +set(CMAKE_JNI_TARGET TRUE) +set(JAR_VERSION "${FDB_MAJOR}.${FDB_MINOR}.${FDB_REVISION}") +add_jar(fdb-java ${JAVA_BINDING_SRCS} ${GENERATED_JAVA_FILES} + OUTPUT_DIR ${PROJECT_BINARY_DIR}/lib) +add_dependencies(fdb-java fdb_java_options fdb_java) +add_jar(foundationdb-tests SOURCES ${JAVA_TESTS_SRCS} INCLUDE_JARS fdb-java) +add_dependencies(foundationdb-tests fdb_java_options) + +install_jar(fdb-java DESTINATION ${FDB_SHARE_DIR}/java COMPONENT clients) +install(TARGETS fdb_java DESTINATION ${FDB_LIB_DIR} COMPONENT clients) diff --git a/bindings/python/CMakeLists.txt b/bindings/python/CMakeLists.txt new file mode 100644 index 0000000000..73a0e4b13b --- /dev/null +++ b/bindings/python/CMakeLists.txt @@ -0,0 +1,44 @@ +set(SRCS + fdb/__init__.py + fdb/directory_impl.py + fdb/impl.py + fdb/locality.py + fdb/six.py + fdb/subspace_impl.py + fdb/tuple.py) + +if(APPLE) + list(APPEND SRCS fdb/libfdb_c.dylib.pth) +else() + list(APPEND SRCS fdb/libfdb_c.so.pth) +endif() + +set(out_files "") +foreach(src ${SRCS}) + get_filename_component(dirname ${src} DIRECTORY) + get_filename_component(extname ${src} EXT) + add_custom_command(OUTPUT ${PROJECT_BINARY_DIR}/bindings/python/${src} + COMMAND mkdir -p ${PROJECT_BINARY_DIR}/bindings/python/${dirname} + COMMAND cp ${src} ${PROJECT_BINARY_DIR}/bindings/python/${dirname}/ + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${src} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMMENT "copy ${src}") + set(out_files "${out_files};${PROJECT_BINARY_DIR}/bindings/python/${src}") +endforeach() +add_custom_target(python_binding ALL DEPENDS ${out_files}) + +file(MAKE_DIRECTORY ${PROJECT_BINARY_DIR}/bindings/python/fdb) +set(options_file ${PROJECT_BINARY_DIR}/bindings/python/fdb/fdboptions.py) +add_custom_command(OUTPUT ${options_file} + COMMAND ${MONO_EXECUTABLE} ${VEXILLOGRAPHER_EXE} ${CMAKE_SOURCE_DIR}/fdbclient/vexillographer/fdb.options python ${options_file} + DEPENDS ${PROJECT_SOURCE_DIR}/fdbclient/vexillographer/fdb.options vexillographer + COMMENT "Generate 
Python options") +add_custom_target(fdb_python_options DEPENDS + ${options_file} + ${PROJECT_SOURCE_DIR}/fdbclient/vexillographer/fdb.options + vexillographer) + +add_dependencies(python_binding fdb_python_options) + +set(out_files "${out_files};${options_file}") +install(FILES ${out_files} DESTINATION ${FDB_PYTHON_INSTALL_DIR} COMPONENT clients) diff --git a/bindings/ruby/tests/directory_extension.rb b/bindings/ruby/tests/directory_extension.rb index e985ab7e89..1f6c3e7bbc 100644 --- a/bindings/ruby/tests/directory_extension.rb +++ b/bindings/ruby/tests/directory_extension.rb @@ -157,10 +157,10 @@ module DirectoryExtension exists = directory.exists?(inst.tr) children = exists ? directory.list(inst.tr) : [] log_subspace = FDB::Subspace.new([@dir_index], inst.wait_and_pop) - inst.tr[log_subspace['path']] = FDB::Tuple.pack(directory.path) - inst.tr[log_subspace['layer']] = FDB::Tuple.pack([directory.layer]) - inst.tr[log_subspace['exists']] = FDB::Tuple.pack([exists ? 1 : 0]) - inst.tr[log_subspace['children']] = FDB::Tuple.pack(children) + inst.tr[log_subspace['path'.encode('utf-8')]] = FDB::Tuple.pack(directory.path) + inst.tr[log_subspace['layer'.encode('utf-8')]] = FDB::Tuple.pack([directory.layer]) + inst.tr[log_subspace['exists'.encode('utf-8')]] = FDB::Tuple.pack([exists ? 1 : 0]) + inst.tr[log_subspace['children'.encode('utf-8')]] = FDB::Tuple.pack(children) elsif inst.op == 'DIRECTORY_STRIP_PREFIX' str = inst.wait_and_pop throw "String #{str} does not start with raw prefix #{directory.key}" if !str.start_with?(directory.key) diff --git a/build/Dockerfile b/build/Dockerfile index 9b8e14b70e..e841ee338c 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -9,7 +9,13 @@ RUN adduser --disabled-password --gecos '' fdb && chown -R fdb /opt && chmod -R USER fdb -RUN cd /opt/ && wget http://downloads.sourceforge.net/project/boost/boost/1.52.0/boost_1_52_0.tar.bz2 -qO - | tar -xj +# wget of bintray without forcing UTF-8 encoding results in 403 Forbidden +RUN cd /opt/ && wget http://downloads.sourceforge.net/project/boost/boost/1.52.0/boost_1_52_0.tar.bz2 &&\ + wget --local-encoding=UTF-8 https://dl.bintray.com/boostorg/release/1.67.0/source/boost_1_67_0.tar.bz2 &&\ + echo '2684c972994ee57fc5632e03bf044746f6eb45d4920c343937a465fd67a5adba boost_1_67_0.tar.bz2' | sha256sum -c - &&\ + tar -xjf boost_1_52_0.tar.bz2 &&\ + tar -xjf boost_1_67_0.tar.bz2 &&\ + rm boost_1_52_0.tar.bz2 boost_1_67_0.tar.bz2 USER root diff --git a/build/gen_compile_db.py b/build/gen_compile_db.py new file mode 100755 index 0000000000..15c4a8a2ef --- /dev/null +++ b/build/gen_compile_db.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +from argparse import ArgumentParser +import os +import json +import re + +def actorFile(actor: str, build: str, src: str): + res = actor.replace(build, src, 1) + res = res.replace('actor.g.cpp', 'actor.cpp') + return res.replace('actor.g.h', 'actor.h') + +def rreplace(s, old, new, occurrence = 1): + li = s.rsplit(old, occurrence) + return new.join(li) + + +def actorCommand(cmd: str, build:str, src: str): + r1 = re.compile('-c (.+)(actor\.g\.cpp)') + m1 = r1.search(cmd) + if m1 is None: + return cmd + cmd1 = r1.sub('\\1actor.cpp', cmd) + return rreplace(cmd1, build, src) + + +parser = ArgumentParser(description="Generates a new compile_commands.json for rtags+flow") +parser.add_argument("-b", help="Build directory", dest="builddir", default=os.getcwd()) +parser.add_argument("-s", help="Build directory", dest="srcdir", default=os.getcwd()) +parser.add_argument("-o", help="Output file", 
dest="out", default="processed_compile_commands.json") +parser.add_argument("input", help="compile_commands.json", default="compile_commands.json", nargs="?") +args = parser.parse_args() + +print("transform {} with build directory {}".format(args.input, args.builddir)) + +with open(args.input) as f: + cmds = json.load(f) + +result = [] + +for cmd in cmds: + cmd['command'] = cmd['command'].replace(' -DNO_INTELLISENSE ', ' ') + if cmd['file'].endswith('actor.g.cpp'): + # here we need to rewrite the rule + cmd['command'] = actorCommand(cmd['command'], args.builddir, args.srcdir) + cmd['file'] = actorFile(cmd['file'], args.builddir, args.srcdir) + result.append(cmd) + else: + result.append(cmd) + +with open(args.out, 'w') as f: + json.dump(result, f, indent=4) diff --git a/build/get_package_name.sh b/build/get_package_name.sh new file mode 100755 index 0000000000..c2c94d126b --- /dev/null +++ b/build/get_package_name.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +cat $1 | grep '' | sed -e 's,^[^>]*>,,' -e 's,<.*,,' diff --git a/build/get_version.sh b/build/get_version.sh new file mode 100755 index 0000000000..a7a2a179f2 --- /dev/null +++ b/build/get_version.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +cat $1 | grep '' | sed -e 's,^[^>]*>,,' -e 's,<.*,,' + diff --git a/cmake/CompileActorCompiler.cmake b/cmake/CompileActorCompiler.cmake new file mode 100644 index 0000000000..aecabddfca --- /dev/null +++ b/cmake/CompileActorCompiler.cmake @@ -0,0 +1,36 @@ +find_program(MONO_EXECUTABLE mono) +find_program(MCS_EXECUTABLE dmcs) + +if (NOT MCS_EXECUTABLE) + find_program(MCS_EXECUTABLE mcs) +endif() + +set(MONO_FOUND FALSE CACHE INTERNAL "") + +if (NOT MCS_EXECUTABLE) + find_program(MCS_EXECUTABLE mcs) +endif() + +if (MONO_EXECUTABLE AND MCS_EXECUTABLE) + set(MONO_FOUND True CACHE INTERNAL "") +endif() + +if (NOT MONO_FOUND) + message(FATAL_ERROR "Could not find mono") +endif() + +set(ACTORCOMPILER_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/flow/actorcompiler/ActorCompiler.cs + ${CMAKE_CURRENT_SOURCE_DIR}/flow/actorcompiler/ActorParser.cs + ${CMAKE_CURRENT_SOURCE_DIR}/flow/actorcompiler/ParseTree.cs + ${CMAKE_CURRENT_SOURCE_DIR}/flow/actorcompiler/Program.cs + ${CMAKE_CURRENT_SOURCE_DIR}/flow/actorcompiler/Properties/AssemblyInfo.cs) +set(ACTOR_COMPILER_REFERENCES + "-r:System,System.Core,System.Xml.Linq,System.Data.DataSetExtensions,Microsoft.CSharp,System.Data,System.Xml") + +add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/actorcompiler.exe + COMMAND ${MCS_EXECUTABLE} ARGS ${ACTOR_COMPILER_REFERENCES} ${ACTORCOMPILER_SRCS} "-target:exe" "-out:actorcompiler.exe" + DEPENDS ${ACTORCOMPILER_SRCS} + COMMENT "Compile actor compiler" VERBATIM) +add_custom_target(actorcompiler DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/actorcompiler.exe) +set(actor_exe "${CMAKE_CURRENT_BINARY_DIR}/actorcompiler.exe") diff --git a/cmake/CompileBoost.cmake b/cmake/CompileBoost.cmake new file mode 100644 index 0000000000..ede9afd946 --- /dev/null +++ b/cmake/CompileBoost.cmake @@ -0,0 +1,26 @@ +find_package(Boost 1.67) + +if(Boost_FOUND) + add_library(boost_target INTERFACE) + target_link_libraries(boost_target INTERFACE Boost::boost) +else() + include(ExternalProject) + ExternalProject_add(boostProject + URL "https://dl.bintray.com/boostorg/release/1.67.0/source/boost_1_67_0.tar.bz2" + URL_HASH SHA256=2684c972994ee57fc5632e03bf044746f6eb45d4920c343937a465fd67a5adba + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + BUILD_IN_SOURCE ON + INSTALL_COMMAND "" + UPDATE_COMMAND "" + BUILD_BYPRODUCTS /boost/config.hpp) + + 
ExternalProject_Get_property(boostProject SOURCE_DIR) + + set(BOOST_INCLUDE_DIR ${SOURCE_DIR}) + message(STATUS "Boost include dir ${BOOST_INCLUDE_DIR}") + + add_library(boost_target INTERFACE) + add_dependencies(boost_target boostProject) + target_include_directories(boost_target INTERFACE ${BOOST_INCLUDE_DIR}) +endif() diff --git a/cmake/CompileVexillographer.cmake b/cmake/CompileVexillographer.cmake new file mode 100644 index 0000000000..80d0518b82 --- /dev/null +++ b/cmake/CompileVexillographer.cmake @@ -0,0 +1,25 @@ +set(VEXILLOGRAPHER_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/fdbclient/vexillographer/c.cs + ${CMAKE_CURRENT_SOURCE_DIR}/fdbclient/vexillographer/cpp.cs + ${CMAKE_CURRENT_SOURCE_DIR}/fdbclient/vexillographer/java.cs + ${CMAKE_CURRENT_SOURCE_DIR}/fdbclient/vexillographer/python.cs + ${CMAKE_CURRENT_SOURCE_DIR}/fdbclient/vexillographer/ruby.cs + ${CMAKE_CURRENT_SOURCE_DIR}/fdbclient/vexillographer/vexillographer.cs) + +set(VEXILLOGRAPHER_REFERENCES "-r:System,System.Core,System.Data,System.Xml,System.Xml.Linq") +set(VEXILLOGRAPHER_EXE "${CMAKE_CURRENT_BINARY_DIR}/vexillographer.exe") +add_custom_command(OUTPUT ${VEXILLOGRAPHER_EXE} + COMMAND ${MCS_EXECUTABLE} ARGS ${VEXILLOGRAPHER_REFERENCES} ${VEXILLOGRAPHER_SRCS} -target:exe -out:${VEXILLOGRAPHER_EXE} + DEPENDS ${VEXILLOGRAPHER_SRCS} + COMMENT "Compile Vexillographer") +add_custom_target(vexillographer DEPENDS ${VEXILLOGRAPHER_EXE}) + +set(ERROR_GEN_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/flow/error_gen.cs) +set(ERROR_GEN_REFERENCES "-r:System,System.Core,System.Data,System.Xml,System.Xml.Linq") +set(ERROR_GEN_EXE "${CMAKE_CURRENT_BINARY_DIR}/error_gen.exe") +add_custom_command (OUTPUT ${ERROR_GEN_EXE} + COMMAND ${MCS_EXECUTABLE} ARGS ${ERROR_GEN_REFERENCES} ${ERROR_GEN_SRCS} -target:exe -out:${ERROR_GEN_EXE} + DEPENDS ${ERROR_GEN_SRCS} + COMMENT "Compile error_gen") +add_custom_target(error_gen DEPENDS ${ERROR_GEN_EXE}) diff --git a/cmake/ConfigureCompiler.cmake b/cmake/ConfigureCompiler.cmake new file mode 100644 index 0000000000..7cdbd0e4ad --- /dev/null +++ b/cmake/ConfigureCompiler.cmake @@ -0,0 +1,128 @@ +set(USE_GPERFTOOLS OFF CACHE BOOL "Use gperfools for profiling") +set(PORTABLE_BINARY OFF CACHE BOOL "Create a binary that runs on older OS versions") +set(USE_VALGRIND OFF CACHE BOOL "Compile for valgrind usage") +set(USE_GOLD_LINKER OFF CACHE BOOL "Use gold linker") +set(ALLOC_INSTRUMENTATION OFF CACHE BOOL "Instrument alloc") +set(WITH_UNDODB OFF CACHE BOOL "Use rr or undodb") +set(OPEN_FOR_IDE OFF CACHE BOOL "Open this in an IDE (won't compile/link)") +set(FDB_RELEASE OFF CACHE BOOL "This is a building of a final release") + +find_package(Threads REQUIRED) +if(ALLOC_INSTRUMENTATION) + add_compile_options(-DALLOC_INSTRUMENTATION) +endif() +if(WITH_UNDODB) + add_compile_options(-DWITH_UNDODB) +endif() +if(DEBUG_TASKS) + add_compile_options(-DDEBUG_TASKS) +endif() + +if(NDEBUG) + add_compile_options(-DNDEBUG) +endif() + +if(FDB_RELEASE) + add_compile_options(-DFDB_RELEASE) +endif() + +include_directories(${CMAKE_SOURCE_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +if (NOT OPEN_FOR_IDE) + add_definitions(-DNO_INTELLISENSE) +endif() +add_definitions(-DUSE_UCONTEXT) +enable_language(ASM) + +include(CheckFunctionExists) +set(CMAKE_REQUIRED_INCLUDES stdlib.h malloc.h) +set(CMAKE_REQUIRED_LIBRARIES c) + + +if(WIN32) + add_compile_options(/W3 /EHsc) +else() + if(USE_GOLD_LINKER) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=gold -Wl,--disable-new-dtags") + set(CMAKE_SHARED_LINKER_FLAGS 
"${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=gold -Wl,--disable-new-dtags") + endif() + + set(GCC NO) + set(CLANG NO) + if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") + set(CLANG YES) + else() + # This is not a very good test. However, as we do not really support many architectures + # this is good enough for now + set(GCC YES) + endif() + + # we always compile with debug symbols. CPack will strip them out + # and create a debuginfo rpm + add_compile_options(-ggdb) + set(USE_ASAN OFF CACHE BOOL "Compile with address sanitizer") + if(USE_ASAN) + add_compile_options( + -fno-omit-frame-pointer -fsanitize=address + -DUSE_ASAN) + set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -fno-omit-frame-pointer -fsanitize=address") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fno-omit-frame-pointer -fsanitize=address") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fno-omit-frame-pointer -fsanitize=address ${CMAKE_THREAD_LIBS_INIT}") + endif() + + if(PORTABLE_BINARY) + message(STATUS "Create a more portable binary") + set(CMAKE_MODULE_LINKER_FLAGS "-static-libstdc++ -static-libgcc ${CMAKE_MODULE_LINKER_FLAGS}") + set(CMAKE_SHARED_LINKER_FLAGS "-static-libstdc++ -static-libgcc ${CMAKE_SHARED_LINKER_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "-static-libstdc++ -static-libgcc ${CMAKE_EXE_LINKER_FLAGS}") + endif() + # Instruction sets we require to be supported by the CPU + add_compile_options( + -maes + -mmmx + -mavx + -msse4.2) + add_compile_options($<$:-std=c++11>) + if (USE_VALGRIND) + add_compile_options(-DVALGRIND -DUSE_VALGRIND) + endif() + if (CLANG) + if (APPLE) + add_compile_options(-stdlib=libc++) + endif() + add_compile_options( + -Wno-unknown-warning-option + -Wno-dangling-else + -Wno-sign-compare + -Wno-comment + -Wno-unknown-pragmas + -Wno-delete-non-virtual-dtor + -Wno-undefined-var-template + -Wno-unused-value + -Wno-tautological-pointer-compare + -Wno-format) + endif() + if (CMAKE_GENERATOR STREQUAL Xcode) + else() + add_compile_options(-Werror) + endif() + add_compile_options($<$:-Wno-pragmas>) + add_compile_options(-Wno-error=format + -Wno-deprecated + -fvisibility=hidden + -Wreturn-type + -fdiagnostics-color=always + -fPIC) + + if(CMAKE_COMPILER_IS_GNUCXX) + set(USE_LTO OFF CACHE BOOL "Do link time optimization") + if (USE_LTO) + add_compile_options($<$:-flto>) + set(CMAKE_AR "gcc-ar") + set(CMAKE_C_ARCHIVE_CREATE " qcs ") + set(CMAKE_C_ARCHIVE_FINISH true) + set(CMAKE_CXX_ARCHIVE_CREATE " qcs ") + set(CMAKE_CXX_ARCHIVE_FINISH true) + endif() + endif() +endif() diff --git a/cmake/FindEditline.cmake b/cmake/FindEditline.cmake new file mode 100644 index 0000000000..09a0c9f39a --- /dev/null +++ b/cmake/FindEditline.cmake @@ -0,0 +1,16 @@ +find_package(Curses) +include(FindPackageHandleStandardArgs) + +if(CURSES_FOUND) + find_path(Editline_INCLUDE_DIR editline/readline.h) + find_library(Editline_LIBRARY edit) + find_package_handle_standard_args( + Editline DEFAULT_MSG Editline_LIBRARY Editline_INCLUDE_DIR) + if(Editline_FOUND) + set(Editline_LIBRARIES ${Editline_LIBRARY} ${CURSES_LIBRARIES}) + set(Editline_INCLUDE_DIRS ${Editline_INCLUDE_DIR} ${CURSES_INCLUDE_DIRS}) + mark_as_advanced(Editline_INCLUDE_DIR Editline_LIBRARY) + endif() +else() + set(Editline_FOUND False) +endif() diff --git a/cmake/FindGperftools.cmake b/cmake/FindGperftools.cmake new file mode 100644 index 0000000000..cd990662cf --- /dev/null +++ b/cmake/FindGperftools.cmake @@ -0,0 +1,51 @@ +# Tries to find Gperftools. 
+# +# Usage of this module as follows: +# +# find_package(Gperftools) +# +# Variables used by this module, they can change the default behaviour and need +# to be set before calling find_package: +# +# Gperftools_ROOT_DIR Set this variable to the root installation of +# Gperftools if the module has problems finding +# the proper installation path. +# +# Variables defined by this module: +# +# GPERFTOOLS_FOUND System has Gperftools libs/headers +# GPERFTOOLS_LIBRARIES The Gperftools libraries (tcmalloc & profiler) +# GPERFTOOLS_INCLUDE_DIR The location of Gperftools headers + +find_library(GPERFTOOLS_TCMALLOC + NAMES tcmalloc + HINTS ${Gperftools_ROOT_DIR}/lib) + +find_library(GPERFTOOLS_PROFILER + NAMES profiler + HINTS ${Gperftools_ROOT_DIR}/lib) + +find_library(GPERFTOOLS_TCMALLOC_AND_PROFILER + NAMES tcmalloc_and_profiler + HINTS ${Gperftools_ROOT_DIR}/lib) + +find_path(GPERFTOOLS_INCLUDE_DIR + NAMES gperftools/heap-profiler.h + HINTS ${Gperftools_ROOT_DIR}/include) + +set(GPERFTOOLS_LIBRARIES ${GPERFTOOLS_TCMALLOC_AND_PROFILER}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + Gperftools + DEFAULT_MSG + GPERFTOOLS_LIBRARIES + GPERFTOOLS_INCLUDE_DIR) + +mark_as_advanced( + Gperftools_ROOT_DIR + GPERFTOOLS_TCMALLOC + GPERFTOOLS_PROFILER + GPERFTOOLS_TCMALLOC_AND_PROFILER + GPERFTOOLS_LIBRARIES + GPERFTOOLS_INCLUDE_DIR) diff --git a/cmake/FlowCommands.cmake b/cmake/FlowCommands.cmake new file mode 100644 index 0000000000..8e009b9efb --- /dev/null +++ b/cmake/FlowCommands.cmake @@ -0,0 +1,46 @@ +macro(actor_set varname srcs) + set(${varname}) + foreach(src ${srcs}) + set(tmp "${src}") + if(${src} MATCHES ".*\\.h") + continue() + elseif(${src} MATCHES ".*\\.actor\\.cpp") + string(REPLACE ".actor.cpp" ".actor.g.cpp" tmp ${src}) + set(tmp "${CMAKE_CURRENT_BINARY_DIR}/${tmp}") + endif() + set(${varname} "${${varname}};${tmp}") + endforeach() +endmacro() + +set(ACTOR_TARGET_COUNTER "0") +macro(actor_compile target srcs) + set(options DISABLE_ACTOR_WITHOUT_WAIT) + set(oneValueArg) + set(multiValueArgs) + cmake_parse_arguments(ACTOR_COMPILE "${options}" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}") + set(_tmp_out "") + foreach(src ${srcs}) + set(tmp "") + if(${src} MATCHES ".*\\.actor\\.h") + string(REPLACE ".actor.h" ".actor.g.h" tmp ${src}) + elseif(${src} MATCHES ".*\\.actor\\.cpp") + string(REPLACE ".actor.cpp" ".actor.g.cpp" tmp ${src}) + endif() + set(actor_compiler_flags "") + if(ACTOR_COMPILE_DISABLE_ACTOR_WITHOUT_WAIT) + set(actor_compiler_flags "--disable-actor-without-wait-error") + endif() + if(tmp) + add_custom_command(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${tmp}" + COMMAND ${MONO_EXECUTABLE} ${actor_exe} "${CMAKE_CURRENT_SOURCE_DIR}/${src}" "${CMAKE_CURRENT_BINARY_DIR}/${tmp}" ${actor_compiler_flags} > /dev/null + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${src}" actorcompiler ${actor_exe} + COMMENT "Compile actor: ${src}") + set(_tmp_out "${_tmp_out};${CMAKE_CURRENT_BINARY_DIR}/${tmp}") + endif() + endforeach() + MATH(EXPR ACTOR_TARGET_COUNTER "${ACTOR_TARGET_COUNTER}+1") + add_custom_target(${target}_actors_${ACTOR_TARGET_COUNTER} DEPENDS ${_tmp_out}) + add_dependencies(${target} ${target}_actors_${ACTOR_TARGET_COUNTER}) + target_include_directories(${target} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + target_include_directories(${target} PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) +endmacro() diff --git a/cmake/InstallLayout.cmake b/cmake/InstallLayout.cmake new file mode 100644 index 0000000000..d161ef1735 --- /dev/null +++ b/cmake/InstallLayout.cmake @@ -0,0 +1,221 
@@ +if(NOT INSTALL_LAYOUT) + set(DEFAULT_INSTALL_LAYOUT "STANDALONE") +endif() +set(INSTALL_LAYOUT "${DEFAULT_INSTALL_LAYOUT}" + CACHE STRING "Installation directory layout. Options are: TARGZ (as in tar.gz installer), WIN, STANDALONE, RPM, DEB, OSX") + +set(DIR_LAYOUT ${INSTALL_LAYOUT}) +if(DIR_LAYOUT MATCHES "TARGZ") + set(DIR_LAYOUT "STANDALONE") +endif() + +if(UNIX) + get_property(LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS) + set(FDB_CONFIG_DIR "etc/foundationdb") + if("${LIB64}" STREQUAL "TRUE") + set(LIBSUFFIX 64) + else() + set(LIBSUFFIX "") + endif() + set(FDB_LIB_NOSUFFIX "lib") + if(DIR_LAYOUT MATCHES "STANDALONE") + set(FDB_LIB_DIR "lib${LIBSUFFIX}") + set(FDB_LIBEXEC_DIR "${FDB_LIB_DIR}") + set(FDB_BIN_DIR "bin") + set(FDB_SBIN_DIR "sbin") + set(FDB_INCLUDE_INSTALL_DIR "include") + set(FDB_PYTHON_INSTALL_DIR "${FDB_LIB_DIR}/python2.7/site-packages/fdb") + set(FDB_SHARE_DIR "share") + elseif(DIR_LAYOUT MATCHES "OSX") + set(CPACK_GENERATOR productbuild) + set(CPACK_PACKAGING_INSTALL_PREFIX "/") + set(FDB_LIB_DIR "usr/local/lib") + set(FDB_LIB_NOSUFFIX "usr/lib") + set(FDB_LIBEXEC_DIR "usr/local/libexec") + set(FDB_BIN_DIR "usr/local/bin") + set(FDB_SBIN_DIR "usr/local/sbin") + set(FDB_INCLUDE_INSTALL_DIR "usr/local/include") + set(FDB_PYTHON_INSTALL_DIR "Library/Python/2.7/site-packages/fdb") + set(FDB_SHARE_DIR "usr/local/share") + elseif(DIR_LAYOUT MATCHES "WIN") + # TODO + else() + # for deb and rpm + if(INSTALL_LAYOUT MATCHES "RPM") + set(CPACK_GENERATOR "RPM") + else() + # DEB + set(CPACK_GENERATOR "DEB") + endif() + set(CMAKE_INSTALL_PREFIX "/") + set(CPACK_PACKAGING_INSTALL_PREFIX "/") + set(FDB_LIB_DIR "usr/lib${LIBSUFFIX}") + set(FDB_LIB_NOSUFFIX "usr/lib") + set(FDB_LIBEXEC_DIR "${FDB_LIB_DIR}") + set(FDB_BIN_DIR "usr/bin") + set(FDB_SBIN_DIR "usr/sbin") + set(FDB_INCLUDE_INSTALL_DIR "usr/include") + set(FDB_PYTHON_INSTALL_DIR "${FDB_LIB_DIR}/python2.7/site-packages/fdb") + set(FDB_SHARE_DIR "usr/share") + endif() +endif() + +################################################################################ +# Version information +################################################################################ + +string(REPLACE "." 
";" FDB_VERSION_LIST ${FDB_VERSION_PLAIN}) +list(GET FDB_VERSION_LIST 0 FDB_MAJOR) +list(GET FDB_VERSION_LIST 1 FDB_MINOR) +list(GET FDB_VERSION_LIST 2 FDB_PATCH) + +################################################################################ +# General CPack configuration +################################################################################ + +include(InstallRequiredSystemLibraries) +set(CPACK_PACKAGE_NAME "foundationdb") +set(CPACK_PACKAGE_VENDOR "FoundationDB ") +set(CPACK_PACKAGE_VERSION_MAJOR ${FDB_MAJOR}) +set(CPACK_PACKAGE_VERSION_MINOR ${FDB_MINOR}) +set(CPACK_PACKAGE_VERSION_PATCH ${FDB_PATCH}) +set(CPACK_PACKAGE_DESCRIPTION_FILE ${CMAKE_SOURCE_DIR}/packaging/description) +set(CPACK_PACKAGE_DESCRIPTION_SUMMARY + "FoundationDB is a scalable, fault-tolerant, ordered key-value store with full ACID transactions.") +set(CPACK_PACKAGE_ICON ${CMAKE_SOURCE_DIR}/packaging/foundationdb.ico) +set(CPACK_PACKAGE_CONTACT "The FoundationDB Community") +set(CPACK_COMPONENT_server_DEPENDS clients) +if (INSTALL_LAYOUT MATCHES "OSX") + set(CPACK_RESOURCE_FILE_README ${CMAKE_SOURCE_DIR}/packaging/osx/resources/conclusion.rtf) + set(CPACK_PRODUCTBUILD_RESOURCES_DIR ${CMAKE_SOURCE_DIR}/packaging/osx/resources) +else() + set(CPACK_RESOURCE_FILE_LICENSE ${CMAKE_SOURCE_DIR}/LICENSE) + set(CPACK_RESOURCE_FILE_README ${CMAKE_SOURCE_DIR}/README.md) +endif() + +################################################################################ +# Configuration for RPM +################################################################################ + +if(INSTALL_LAYOUT MATCHES "RPM") + set(CPACK_RPM_server_USER_FILELIST + "%config(noreplace) /etc/foundationdb/foundationdb.conf" + "%attr(0700,foundationdb,foundationdb) /var/log/foundationdb" + "%attr(0700, foundationdb, foundationdb) /var/lib/foundationdb") + set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION + "/usr/sbin" + "/usr/share/java" + "/usr/lib64/python2.7" + "/usr/lib64/python2.7/site-packages" + "/var" + "/var/log" + "/var/lib" + "/lib" + "/lib/systemd" + "/lib/systemd/system" + "/etc/rc.d/init.d") + set(CPACK_RPM_DEBUGINFO_PACKAGE ON) + set(CPACK_RPM_BUILD_SOURCE_DIRS_PREFIX /usr/src) + set(CPACK_RPM_COMPONENT_INSTALL ON) + set(CPACK_RPM_clients_PRE_INSTALL_SCRIPT_FILE + ${CMAKE_SOURCE_DIR}/packaging/rpm/scripts/preclients.sh) + set(CPACK_RPM_clients_POST_INSTALL_SCRIPT_FILE + ${CMAKE_SOURCE_DIR}/packaging/rpm/scripts/postclients.sh) + set(CPACK_RPM_server_PRE_INSTALL_SCRIPT_FILE + ${CMAKE_SOURCE_DIR}/packaging/rpm/scripts/preserver.sh) + set(CPACK_RPM_server_POST_INSTALL_SCRIPT_FILE + ${CMAKE_SOURCE_DIR}/packaging/rpm/scripts/postserver.sh) + set(CPACK_RPM_server_PRE_UNINSTALL_SCRIPT_FILE + ${CMAKE_SOURCE_DIR}/packaging/rpm/scripts/preunserver.sh) + set(CPACK_RPM_server_PACKAGE_REQUIRES + "foundationdb-clients = ${FDB_MAJOR}.${FDB_MINOR}.${FDB_PATCH}") +endif() + +################################################################################ +# Configuration for DEB +################################################################################ + +if(INSTALL_LAYOUT MATCHES "DEB") + set(CPACK_DEB_COMPONENT_INSTALL ON) + set(CPACK_DEBIAN_PACKAGE_SECTION "database") + set(CPACK_DEBIAN_ENABLE_COMPONENT_DEPENDS ON) + + set(CPACK_DEBIAN_server_PACKAGE_DEPENDS "adduser, libc6 (>= 2.11), python (>= 2.6)") + set(CPACK_DEBIAN_clients_PACKAGE_DEPENDS "adduser, libc6 (>= 2.11)") + set(CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://www.foundationdb.org") + set(CPACK_DEBIAN_clients_PACKAGE_CONTROL_EXTRA + 
${CMAKE_SOURCE_DIR}/packaging/deb/DEBIAN-foundationdb-clients/postinst) + set(CPACK_DEBIAN_server_PACKAGE_CONTROL_EXTRA + ${CMAKE_SOURCE_DIR}/packaging/deb/DEBIAN-foundationdb-server/conffiles + ${CMAKE_SOURCE_DIR}/packaging/deb/DEBIAN-foundationdb-server/preinst + ${CMAKE_SOURCE_DIR}/packaging/deb/DEBIAN-foundationdb-server/postinst + ${CMAKE_SOURCE_DIR}/packaging/deb/DEBIAN-foundationdb-server/prerm + ${CMAKE_SOURCE_DIR}/packaging/deb/DEBIAN-foundationdb-server/postrm) +endif() + +################################################################################ +# Server configuration +################################################################################ + +string(RANDOM LENGTH 8 description1) +string(RANDOM LENGTH 8 description2) +set(CLUSTER_DESCRIPTION1 ${description1} CACHE STRING "Cluster description") +set(CLUSTER_DESCRIPTION2 ${description2} CACHE STRING "Cluster description") + +install(FILES ${CMAKE_SOURCE_DIR}/packaging/foundationdb.conf + DESTINATION ${FDB_CONFIG_DIR} + COMPONENT server) +install(FILES ${CMAKE_SOURCE_DIR}/packaging/argparse.py + DESTINATION "usr/lib/foundationdb" + COMPONENT server) +install(FILES ${CMAKE_SOURCE_DIR}/packaging/make_public.py + DESTINATION "usr/lib/foundationdb") +if((INSTALL_LAYOUT MATCHES "RPM") OR (INSTALL_LAYOUT MATCHES "DEB")) + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/packaging/foundationdb + ${CMAKE_BINARY_DIR}/packaging/rpm) + install( + DIRECTORY ${CMAKE_BINARY_DIR}/packaging/foundationdb + DESTINATION "var/log" + COMPONENT server) + install( + DIRECTORY ${CMAKE_BINARY_DIR}/packaging/foundationdb + DESTINATION "var/lib" + COMPONENT server) + execute_process( + COMMAND pidof systemd + RESULT_VARIABLE IS_SYSTEMD + OUTPUT_QUIET + ERROR_QUIET) + if(IS_SYSTEMD EQUAL "0") + configure_file(${CMAKE_SOURCE_DIR}/packaging/rpm/foundationdb.service + ${CMAKE_BINARY_DIR}/packaging/rpm/foundationdb.service) + install(FILES ${CMAKE_BINARY_DIR}/packaging/rpm/foundationdb.service + DESTINATION "lib/systemd/system" + COMPONENT server) + else() + if(INSTALL_LAYOUT MATCHES "RPM") + install(FILES ${CMAKE_SOURCE_DIR}/packaging/rpm/foundationdb-init + DESTINATION "etc/rc.d/init.d" + RENAME "foundationdb" + COMPONENT server) + else() + install(FILES ${CMAKE_SOURCE_DIR}/packaging/deb/foundationdb-init + DESTINATION "etc/init.d" + RENAME "foundationdb" + COMPONENT server) + endif() + endif() +endif() + +################################################################################ +# Helper Macros +################################################################################ + +macro(install_symlink filepath sympath compondent) + install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink ${filepath} ${sympath})" COMPONENT ${component}) + install(CODE "message(\"-- Created symlink: ${sympath} -> ${filepath}\")") +endmacro() +macro(install_mkdir dirname component) + install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory ${dirname})" COMPONENT ${component}) + install(CODE "message(\"-- Created directory: ${dirname}\")") +endmacro() diff --git a/cmake/user-config.jam.cmake b/cmake/user-config.jam.cmake new file mode 100644 index 0000000000..d719b5020d --- /dev/null +++ b/cmake/user-config.jam.cmake @@ -0,0 +1,2 @@ +using @BOOST_TOOLSET@ : : @CMAKE_CXX_COMPILER@ : @BOOST_ADDITIONAL_COMPILE_OPTIOINS@ ; +using python : @PYTHON_VERSION_MAJOR@.@PYTHON_VERSION_MINOR@ : @PYTHON_EXECUTABLE@ : @PYTHON_INCLUDE_DIRS@ ; diff --git a/documentation/sphinx/source/administration.rst b/documentation/sphinx/source/administration.rst 
index 6aae431def..8ddc88ac6f 100644 --- a/documentation/sphinx/source/administration.rst +++ b/documentation/sphinx/source/administration.rst @@ -9,15 +9,18 @@ Administration :hidden: :titlesonly: + configuration moving-a-cluster tls - + This document covers the administration of an existing FoundationDB cluster. We recommend you read this document before setting up a cluster for performance testing or production use. .. note:: In FoundationDB, a "cluster" refers to one or more FoundationDB processes spread across one or more physical machines that together host a FoundationDB database. To administer an externally accessible cluster, you need to understand basic system tasks. You should begin with how to :ref:`start and stop the database `. Next, you should review management of a cluster, including :ref:`adding ` and :ref:`removing ` machines, and monitoring :ref:`cluster status ` and the basic :ref:`server processes `. You should be familiar with :ref:`managing trace files ` and :ref:`other administrative concerns `. Finally, you should know how to :ref:`uninstall ` or :ref:`upgrade ` the database. +FoundationDB also provides a number of different :doc:`configuration ` options which you should know about when setting up a FoundationDB database. + .. _administration-running-foundationdb: Starting and stopping diff --git a/documentation/sphinx/source/api-error-codes.rst b/documentation/sphinx/source/api-error-codes.rst index f0d2554f02..4e0cad202b 100644 --- a/documentation/sphinx/source/api-error-codes.rst +++ b/documentation/sphinx/source/api-error-codes.rst @@ -100,7 +100,7 @@ FoundationDB may return the following error codes from API functions. If you nee +-----------------------------------------------+-----+--------------------------------------------------------------------------------+ | transaction_invalid_version | 2020| Transaction does not have a valid commit version | +-----------------------------------------------+-----+--------------------------------------------------------------------------------+ -| transaction_read_only | 2021| Transaction is read-only and therefore does not have a commit version | +| no_commit_version | 2021| Transaction is read-only and therefore does not have a commit version | +-----------------------------------------------+-----+--------------------------------------------------------------------------------+ | environment_variable_network_option_failed | 2022| Environment variable network option could not be set | +-----------------------------------------------+-----+--------------------------------------------------------------------------------+ diff --git a/documentation/sphinx/source/command-line-interface.rst b/documentation/sphinx/source/command-line-interface.rst index 605efe0c40..df365ea7da 100644 --- a/documentation/sphinx/source/command-line-interface.rst +++ b/documentation/sphinx/source/command-line-interface.rst @@ -101,6 +101,15 @@ Set the process using ``configure [proxies|resolvers|logs]=``, where ```` For recommendations on appropriate values for process types in large clusters, see :ref:`guidelines-process-class-config`. +fileconfigure +------------- + +The ``fileconfigure`` command is an alternative to the ``configure`` command that changes the configuration of the database based on a JSON document. The command loads a JSON document from the provided file and changes the database configuration to match the contents of that document.
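As a rough illustration (not part of the patch or of the official tooling), such a document could be generated and applied from Python by shelling out to ``fdbcli``. The key names below are examples only, and the sketch assumes ``fdbcli`` is on the PATH and uses the default cluster file::

    import json
    import subprocess

    # Illustrative configuration document; valid keys mirror the
    # "configuration" section of `status json` (see the format notes below).
    new_config = {
        "redundancy_mode": "triple",
        "storage_engine": "ssd-2",
        "logs": 8,
        "proxies": 5,
    }

    with open("new_config.json", "w") as f:
        json.dump(new_config, f)

    # Apply the document; fileconfigure reads the file on the fdbcli host.
    subprocess.check_call(["fdbcli", "--exec", "fileconfigure new_config.json"])
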
+ +The format should be the same as the value of the ``configuration`` entry in status JSON without ``excluded_servers`` or ``coordinators_count``. Its syntax is ``fileconfigure [new] ``. + +The ``new`` option, if present, initializes a new database with the given configuration rather than changing the configuration of an existing one. + coordinators ------------ diff --git a/documentation/sphinx/source/configuration.rst b/documentation/sphinx/source/configuration.rst index 96884ef19d..f5298b3b76 100644 --- a/documentation/sphinx/source/configuration.rst +++ b/documentation/sphinx/source/configuration.rst @@ -260,7 +260,7 @@ Contains default parameters for all fdbserver processes on this machine. These s * ``storage_memory``: Maximum memory used for data storage. This parameter is used *only* with memory storage engine, not the ssd storage engine. The default value is 1GiB. When specified without a unit, MB is assumed. Clusters will be restricted to using this amount of memory per process for purposes of data storage. Memory overhead associated with storing the data is counted against this total. If you increase the ``storage_memory``, you should also increase the ``memory`` parameter by the same amount. * ``locality_machineid``: Machine identifier key. All processes on a machine should share a unique id. By default, processes on a machine determine a unique id to share. This does not generally need to be set. * ``locality_zoneid``: Zone identifier key. Processes that share a zone id are considered non-unique for the purposes of data replication. If unset, defaults to machine id. -* ``locality_dcid``: Data center identifier key. All processes physically located in a data center should share the id. No default value. If you are depending on data center based replication this must be set on all processes. +* ``locality_dcid``: Datacenter identifier key. All processes physically located in a datacenter should share the id. No default value. If you are depending on datacenter based replication this must be set on all processes. * ``locality_data_hall``: Data hall identifier key. All processes physically located in a data hall should share the id. No default value. If you are depending on data hall based replication this must be set on all processes. * ``io_trust_seconds``: Time in seconds that a read or write operation is allowed to take before timing out with an error. If an operation times out, all future operations on that file will fail with an error as well. Only has an effect when using AsyncFileKAIO in Linux. If unset, defaults to 0 which means timeout is disabled. @@ -355,22 +355,21 @@ FoundationDB will never use processes on the same machine for the replication of FoundationDB replicates data to three machines, and at least three available machines are required to make progress. This is the recommended mode for a cluster of five or more machines in a single datacenter. ``three_data_hall`` mode - FoundationDB replicates data to three machines, and at least three available machines are required to make progress. Every piece of data that has been committed to storage servers - will be replicated onto three different data halls, and the cluster will - remain available after losing a single data hall and one machine in another - data hall. + FoundationDB stores data in triplicate, with one copy on a storage server in each of three data halls. The transaction logs are replicated four times, with two data halls containing two replicas apiece.
Four available machines (two in each of two data halls) are therefore required to make progress. This configuration enables the cluster to remain available after losing a single data hall and one machine in another data hall. Datacenter-aware mode --------------------- -In addition to the more commonly used modes listed above, this version of FoundationDB has support for redundancy across multiple datacenters. Although data will always be triple replicated in this mode, it may not be replicated across all datacenters. +In addition to the more commonly used modes listed above, this version of FoundationDB has support for redundancy across multiple datacenters. .. note:: When using the datacenter-aware mode, all ``fdbserver`` processes should be passed a valid datacenter identifier on the command line. ``three_datacenter`` mode *(for 5+ machines in 3 datacenters)* - FoundationDB attempts to replicate data across three datacenters and will stay up with only two available. Data is replicated 6 times. For maximum availability, you should use five coordination servers: two in two of the datacenters and one in the third datacenter. + FoundationDB attempts to replicate data across three datacenters and will stay up with only two available. Data is replicated 6 times. Transaction logs are stored in the same configuration as the ``three_data_hall`` mode, so commit latencies are tied to the latency between datacenters. For maximum availability, you should use five coordination servers: two in two of the datacenters and one in the third datacenter. + +.. warning:: ``three_datacenter`` mode is not compatible with region configuration. Changing redundancy mode ------------------------ @@ -478,7 +477,7 @@ FoundationDB recommends the ext4 filesystem. (However, see :ref:`Platform Issues * Copy-on-write type filesystems (such as Btrfs) will likely have poor performance with FoundationDB. -Ext4 filesystems should be mounted with mount options ``default,noatime,discard``. +Ext4 filesystems should be mounted with mount options ``defaults,noatime,discard``. .. note :: The ``noatime`` option disables updating of access times when reading files, an unneeded feature for FoundationDB that increases write activity on the disk. The discard option enables `TRIM `_ support, allowing the operating system to efficiently inform the SSD of erased blocks, maintaining high write speed and increasing drive lifetime. @@ -516,6 +515,236 @@ When creating a partition for use with FoundationDB using the standard Linux fdi For an SSD with a single partition, the partition should typically begin at sector 2048 (512 byte sectors yields 1024 KiB alignment). +.. _configuration-configuring-regions: + +Configuring regions +=================== + +.. note:: In the following text, the term ``datacenter`` is used to denote unique locations that are failure independent from one another. Cloud providers generally expose this property of failure independence with Availability Zones. + +Regions configuration enables automatic failover between two datacenters, without adding WAN latency to commits, while still maintaining all the consistency properties FoundationDB provides. + +This is made possible by combining two features. The first is asynchronous replication between two regions. Because a commit is reported as successful before it becomes durable in the remote region, the remote region will slightly lag behind the primary.
This is similar to ``fdbdr``, except that the asynchronous replication is done within a single cluster instead of between different FoundationDB clusters. + +The second feature is the ability to add one or more synchronous replicas of the mutation log in a different datacenter. Because this datacenter is only holding a transient copy of the mutations being committed to the database, only a few FoundationDB processes are required to fulfill this role. If the primary datacenter fails, the external mutation log replicas will still allow access to the most recent commits. This allows the lagging remote replica to catch up to the primary. Once the remote replica has applied all the mutations, it can start accepting new commits without suffering any data loss. + +An example configuration would be four total datacenters, two on the east coast, two on the west coast, with a preference for fast write latencies from the west coast. One datacenter on each coast would be sized to store a full copy of the data. The second datacenter on each coast would only have a few FoundationDB processes. + +While everything is healthy, writes need to be made durable in both west coast datacenters before a commit can succeed. The geographic proximity of the two datacenters minimizes the additional commit latency. Reads can be served from either region, and clients can get data from whichever region is closer. Getting a read version from the east coast region will still require communicating with a west coast datacenter. Clients can cache read versions to avoid this wait if they can tolerate reading stale data. + +If either west coast datacenter fails, the last few mutations will be propagated from the remaining west coast datacenter to the east coast. At this point, FoundationDB will start accepting commits on the east coast. Once the west coast comes back online, the system will automatically start copying all the data that was committed to the east coast back to the west coast replica. Once the west coast has caught up, the system will automatically switch back to accepting writes from the west coast again. + +The west coast mutation logs will maintain their copies of all committed mutations until they have been applied by the east coast datacenter. In the event that the east coast has failed for long enough that the west coast mutation logs no longer have enough disk space to continue storing the mutations, FoundationDB can be requested to drop the east coast replica completely. This decision is not automatic, and requires a manual change to the configuration. The west coast database will then act as a single datacenter database until the east coast comes back online. Because the east coast datacenter was completely dropped from the configuration, to bring the east coast back online FoundationDB will have to copy all the data between the regions. + +If a region failover occurs, clients will generally only see a latency spike of a few seconds. + +Specifying datacenters +---------------------- + +To use region configurations, all processes in the cluster need to specify in which datacenter they are located. This can be done on the command line with either ``--locality_dcid`` or ``--datacenter_id``. This datacenter identifier is case sensitive. + +Clients should also specify their datacenter with the database option ``datacenter_id``. If a client does not specify its datacenter, it will use latency estimates to balance traffic between the two regions.
This will result in about 5% of requests being served by the remote region, so reads will suffer from high tail latencies. + +Changing the region configuration +--------------------------------- + +To change the region configuration, use the ``fileconfigure`` command in ``fdbcli``. For example:: + + user@host$ fdbcli + Using cluster file `/etc/foundationdb/fdb.cluster'. + + The database is available. + + Welcome to the fdbcli. For help, type `help'. + fdb> fileconfigure regions.json + Configuration changed. + + +Regions are configured in FoundationDB as a JSON document. For example:: + + "regions":[{ + "datacenters":[{ + "id":"WC1", + "priority":1, + "satellite":1 + }], + "satellite_redundancy_mode":"one_satellite_double", + "satellite_logs":2 + }] + +The ``regions`` object in the JSON document should be an array. Each element of the array describes the configuration of an individual region. + +Each region is described using an object that contains an array of ``datacenters``. Each region may also optionally provide a ``satellite_redundancy_mode`` and ``satellite_logs``. + +Each datacenter is described with an object that contains the ``id`` and ``priority`` of that datacenter. An ``id`` may be any unique alphanumeric string. Datacenters which hold a full replica of the data are referred to as primary datacenters. Datacenters that only store transaction logs are referred to as satellite datacenters. To specify that a datacenter is a satellite, it needs to include ``"satellite" : 1``. The priorities of satellite datacenters are only compared to other satellite datacenters in the same region. The priorities of primary datacenters are only compared to other primary datacenters. + +.. warning:: In release 6.0, FoundationDB supports at most two regions. + +Each region can only have one primary datacenter. A negative priority for a datacenter denotes that the system should not recover the transaction subsystem in that datacenter. The region with the transaction subsystem is referred to as the active region. + +One primary datacenter must have a priority >= 0. The cluster will make the region with the highest priority the active region. If two datacenters have equal priority, the cluster will make one of them the active region arbitrarily. + +The ``satellite_redundancy_mode`` is configured per region, and specifies how many copies of each mutation should be replicated to the satellite datacenters. + +``one_satellite_single`` mode + + Keep one copy of the mutation log in the satellite datacenter with the highest priority. If the highest priority satellite is unavailable, it will put the transaction log in the satellite datacenter with the next highest priority. + +``one_satellite_double`` mode + + Keep two copies of the mutation log in the satellite datacenter with the highest priority. + +``one_satellite_triple`` mode + + Keep three copies of the mutation log in the satellite datacenter with the highest priority. + +``two_satellite_safe`` mode + + Keep two copies of the mutation log in each of the two satellite datacenters with the highest priorities, for a total of four copies of each mutation. This mode will protect against the simultaneous loss of both the primary and one of the satellite datacenters. If only one satellite is available, it will fall back to only storing two copies of the mutation log in the remaining datacenter.
+ +``two_satellite_fast`` mode + + Keep two copies of the mutation log in each of the two satellite datacenters with the highest priorities, for a total of four copies of each mutation. FoundationDB will only synchronously wait for one of the two satellite datacenters to make the mutations durable before considering a commit successful. This will reduce tail latencies caused by network issues between datacenters. If only one satellite is available, it will fall back to only storing two copies of the mutation log in the remaining datacenter. + +.. warning:: In release 6.0 this is implemented by waiting for all but 2 of the transaction logs. If ``satellite_logs`` is set to more than 4, FoundationDB will still need to wait for replies from both datacenters. + +The number of ``satellite_logs`` is also configured per region. It represents the desired number of transaction logs that should be recruited in the satellite datacenters. The satellite transaction logs do slightly less work than the primary datacenter transaction logs. So while the ratio of logs to replicas should be kept roughly equal in the primary datacenter and the satellites, slightly fewer satellite transaction logs may be the optimal balance for performance. + +The number of replicas in each region is controlled by the redundancy level. For example, ``double`` mode will put 2 replicas in each region, for a total of 4 replicas. + +Asymmetric configurations +------------------------- + +The fact that satellite policies are configured per region allows for asymmetric configurations. For example, FoundationDB can have a three datacenter setup where there are two datacenters on the west coast (WC1, WC2) and one datacenter on the east coast (EC1). The west coast region can be set as the preferred active region by setting the priority of its primary datacenter higher than that of the east coast datacenter. The west coast region should have a satellite policy configured, so that when it is active, FoundationDB is making mutations durable in both west coast datacenters. In the rare event that one of the west coast datacenters has failed, FoundationDB will fail over to the east coast datacenter. Because this region does not have a satellite datacenter, the mutations will only be made durable in one datacenter while the transaction subsystem is located there. However, this is justifiable because the region will only be active if a datacenter has already been lost. + +This is the region configuration that implements the example:: + + "regions":[{ + "datacenters":[{ + "id":"WC1", + "priority":1 + },{ + "id":"WC2", + "priority":0, + "satellite":1 + }], + "satellite_redundancy_mode":"one_satellite_double" + },{ + "datacenters":[{ + "id":"EC1", + "priority":0 + }] + }] + +Changing the usable_regions configuration +----------------------------------------- + +The ``usable_regions`` configuration option determines the number of regions which have a replica of the database. + +.. warning:: In release 6.0, ``usable_regions`` can only be configured to the values of ``1`` or ``2``, and a maximum of 2 regions can be defined in the ``regions`` JSON object. + +Increasing the ``usable_regions`` will start copying data from the active region to the remote region. Reducing the ``usable_regions`` will immediately drop the replicas in the remote region. During these changes, only one primary datacenter can have priority >= 0. This enforces exactly which region will lose its replica.
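As a sketch of what changing this option can look like in practice (illustrative only; it assumes ``fdbcli`` is on the PATH, that the default cluster file is used, and that the ``status json`` field names match what 6.0 reports)::

    import json
    import subprocess

    def fdbcli(command):
        # Run a single fdbcli command and return its raw output.
        return subprocess.check_output(["fdbcli", "--exec", command]).decode()

    # Inspect the current configuration via status json.
    status = json.loads(fdbcli("status json"))
    current = status["cluster"]["configuration"].get("usable_regions", 1)
    print("usable_regions is currently", current)

    # Begin replicating data to the remote region. Note that going back to
    # usable_regions=1 immediately drops the remote replica.
    if current == 1:
        fdbcli("configure usable_regions=2")
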
+ +Changing the log routers configuration +-------------------------------------- + +FoundationDB is architected to copy every mutation between regions exactly once. This copying is done by a new role called the log router. When a mutation is committed, it will be randomly assigned to one log router, which will be responsible for copying it across the WAN. + +This log router will pull the mutation from exactly one of the transaction logs. This means a single socket will be used to copy mutations across the WAN per log router. Because of this, if the latency between regions is large, the bandwidth-delay product means that the number of log routers could limit the throughput at which mutations can be copied across the WAN. This can be mitigated by either configuring more log routers, or increasing the TCP window scale option. + +To keep the work evenly distributed on the transaction logs, the number of log routers should be a multiple of the number of transaction logs. + +The ``log_routers`` configuration option determines the number of log routers recruited in the remote region. + +Migrating a database to use a region configuration +-------------------------------------------------- + +To configure an existing database to use regions, do the following steps: + + 1. Ensure all processes have their dcid locality set on the command line. All processes should exist in the same datacenter. If converting from a ``three_datacenter`` configuration, first configure down to using a single datacenter by changing the replication mode. Then exclude the machines in all datacenters but the one that will become the initial active region. + + 2. Configure the region configuration. The datacenter with all the existing processes should have a non-negative priority. The region which will eventually store the remote replica should be added with a negative priority. + + 3. Add processes to the cluster in the remote region. These processes will not take data yet, but need to be added to the cluster. If they are added before the region configuration is set, they will be assigned data like any other FoundationDB process, which will lead to high latencies. + + 4. Configure ``usable_regions=2``. This will cause the cluster to start copying data between the regions. + + 5. Watch ``status`` and wait until data movement is complete. This signals that the remote datacenter has a full replica of all of the data in the database. + + 6. Change the region configuration to have a non-negative priority for the primary datacenters in both regions. This will enable automatic failover between regions. + +Handling datacenter failures +---------------------------- + +When a primary datacenter fails, the cluster will go into a degraded state. It will recover to the other region and continue accepting commits; however, the mutations bound for the other side will build up on the transaction logs. Eventually, the disks on the primary's transaction logs will fill up, so the database cannot be left in this condition indefinitely. + +.. warning:: While a datacenter has failed, the maximum write throughput of the cluster will be roughly 1/3 of normal performance. This is because the transaction logs need to store all of the mutations being committed, so that once the other datacenter comes back online, it can replay history to catch back up. + +To drop the dead datacenter, do the following steps (a scripted sketch follows the list): + + 1. Configure the region configuration so that the dead datacenter has a negative priority. + + 2. Configure ``usable_regions=1``.
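A minimal sketch of those two steps, scripted from Python by shelling out to ``fdbcli`` (illustrative only; the datacenter ids reuse the WC1/EC1 example above, and the surviving region's entry must match your real configuration, including any satellite datacenters)::

    import json
    import subprocess

    DEAD_DC = "EC1"   # identifier of the failed datacenter (example value)

    # Step 1: give the dead datacenter a negative priority in the regions document.
    regions = {"regions": [
        {"datacenters": [{"id": "WC1", "priority": 1}]},
        {"datacenters": [{"id": DEAD_DC, "priority": -1}]},
    ]}
    with open("regions.json", "w") as f:
        json.dump(regions, f)
    subprocess.check_call(["fdbcli", "--exec", "fileconfigure regions.json"])

    # Step 2: drop the replica in the dead region.
    subprocess.check_call(["fdbcli", "--exec", "configure usable_regions=1"])
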
+ +If you are running in a configuration without a satellite datacenter, or you have lost all machines in a region simultaneously, the ``force_recovery_with_data_loss`` command from ``fdbcli`` allows you to force a recovery to the other region. This will discard the portion of the mutation log which did not make it across the WAN. Once the database has recovered, immediately follow the previous steps to drop the dead region the normal way. + +.. warning:: In 6.0 the ``force_recovery_with_data_loss`` command from ``fdbcli`` can cause data inconsistencies if it is used when processes from both non-satellite datacenters are still in the cluster. In general, this command has not been tested to the same degree as the rest of the codebase, and should only be used in extreme emergencies. + +Region change safety +-------------------- + +The steps described above for both adding and removing replicas are enforced by ``fdbcli``. The following are the specific conditions checked by ``fdbcli``: + + * You cannot change the ``regions`` configuration while also changing ``usable_regions``. + + * You can only change ``usable_regions`` when exactly one region has priority >= 0. + + * When ``usable_regions`` > 1, all regions with priority >= 0 must have a full replica of the data. + + * All storage servers must be in one of the regions specified by the region configuration. + +Monitoring +---------- + +It is important to ensure the remote replica does not fall too far behind the active replica. To fail over between regions, all of the mutations need to be flushed from the active replica to the remote replica. If the remote replica is too far behind, this can take a very long time. The version difference between the datacenters is available in ``status json`` as ``datacenter_version_difference``. This number should be less than 5 million. A large datacenter version difference could indicate that more log routers are needed. It could also be caused by network issues between the regions. If the difference becomes too large, the remote replica should be dropped, similar to a datacenter outage that goes on too long. + +Because of asymmetric write latencies in the two regions, it is important to route client traffic to the currently active region. The current active region is written in the system key space as the key ``\xff/primaryDatacenter``. Clients can read and watch this key after setting the ``read_system_keys`` transaction option. + +Choosing coordinators +--------------------- + +Choosing coordinators for a multi-region configuration provides its own set of challenges. A majority of coordinators need to be alive for the cluster to be available. There are two common coordinator setups that allow a cluster to survive the simultaneous loss of a datacenter and one additional machine. + +The first is five coordinators in five different datacenters. The second is nine total coordinators spread across three datacenters. There is some additional benefit to spreading the coordinators across regions rather than datacenters. This is because if an entire region fails, it is still possible to recover to the other region if you are willing to accept a small amount of data loss. However, if you have lost a majority of coordinators, this becomes much more difficult. + +Additionally, if a datacenter fails and then the second datacenter in the region fails 30 seconds later, we can generally survive this scenario. The second datacenter only needs to be alive long enough to copy the tail of the mutation log across the WAN.
However, if your coordinators are in this second datacenter, you will still experience an outage. + +These considerations mean that best practice is to put three coordinators in the main datacenters of each of the two regions, and then put three additional coordinators in a third region. + +Comparison to other multiple datacenter configurations +------------------------------------------------------ + +Region configuration provides very similar functionality to ``fdbdr``. + +If you are not using satellite datacenters, the main benefit of a region configuration compared to ``fdbdr`` is that each datacenter is able to restore replication even after losing all copies of a key range. If we simultaneously lose two storage servers in a double replicated cluster, with ``fdbdr`` we would be forced to fail over to the remote region. With a region configuration, the cluster will automatically copy the missing key range from the remote replica back to the primary datacenter. + +The main disadvantage of using a region configuration is that the total number of processes we can support in a single region is around half when compared against ``fdbdr``. This is because we have processes for both regions in the same cluster, and some singleton components like the failure monitor will have to do twice as much work. In ``fdbdr``, there are two separate clusters, one for each region, so the total number of processes can scale to about twice as large as using a region configuration. + +Region configuration is better in almost all ways than the ``three_datacenter`` replication mode. Region configuration gives the same ability to survive the loss of one datacenter; however, we only need to store two full replicas of the database instead of three. Region configuration is more efficient with how it sends mutations across the WAN. The only reason to use ``three_datacenter`` replication is if low latency reads from all three locations are required. + +Known limitations +----------------- + +The 6.0 release still has a number of rough edges related to region configuration. This is a collection of all the issues that have been pointed out in the sections above. These issues should be significantly improved in future releases of FoundationDB: + + * FoundationDB supports replicating data to at most two regions. + + * ``two_satellite_fast`` does not hide latency properly when configured with more than 4 satellite transaction logs. + + * While a datacenter has failed, the maximum write throughput of the cluster will be roughly 1/3 of normal performance. + + * ``force_recovery_with_data_loss`` can cause data inconsistencies if it is used when processes from both non-satellite datacenters are still in the cluster. + ..
_guidelines-process-class-config: Guidelines for setting process class diff --git a/documentation/sphinx/source/developer-guide.rst b/documentation/sphinx/source/developer-guide.rst index 30b8d2c542..55c8303b8d 100644 --- a/documentation/sphinx/source/developer-guide.rst +++ b/documentation/sphinx/source/developer-guide.rst @@ -685,7 +685,7 @@ For example, suppose you have a polling loop that checks keys for changes once a value = read_keys(db) for k in keys: if cache[k] != value[k]: - yield value[k] + yield (k, value[k]) cache[k] = value[k] time.sleep(1) @@ -706,7 +706,7 @@ With watches, you can eliminate the sleep and perform new reads only after a cha value, watches = watch_keys(db) for k in keys: if cache[k] != value[k]: - yield value[k] + yield (k, value[k]) cache[k] = value[k] fdb.Future.wait_for_any(*watches) diff --git a/documentation/sphinx/source/downloads.rst b/documentation/sphinx/source/downloads.rst index 988b78e7b5..f6c99427ea 100644 --- a/documentation/sphinx/source/downloads.rst +++ b/documentation/sphinx/source/downloads.rst @@ -10,38 +10,38 @@ macOS The macOS installation package is supported on macOS 10.7+. It includes the client and (optionally) the server. -* `FoundationDB-6.0.15.pkg `_ +* `FoundationDB-6.0.18.pkg `_ Ubuntu ------ The Ubuntu packages are supported on 64-bit Ubuntu 12.04+, but beware of the Linux kernel bug in Ubuntu 12.x. -* `foundationdb-clients-6.0.15-1_amd64.deb `_ -* `foundationdb-server-6.0.15-1_amd64.deb `_ (depends on the clients package) +* `foundationdb-clients-6.0.18-1_amd64.deb `_ +* `foundationdb-server-6.0.18-1_amd64.deb `_ (depends on the clients package) RHEL/CentOS EL6 --------------- The RHEL/CentOS EL6 packages are supported on 64-bit RHEL/CentOS 6.x. -* `foundationdb-clients-6.0.15-1.el6.x86_64.rpm `_ -* `foundationdb-server-6.0.15-1.el6.x86_64.rpm `_ (depends on the clients package) +* `foundationdb-clients-6.0.18-1.el6.x86_64.rpm `_ +* `foundationdb-server-6.0.18-1.el6.x86_64.rpm `_ (depends on the clients package) RHEL/CentOS EL7 --------------- The RHEL/CentOS EL7 packages are supported on 64-bit RHEL/CentOS 7.x. -* `foundationdb-clients-6.0.15-1.el7.x86_64.rpm `_ -* `foundationdb-server-6.0.15-1.el7.x86_64.rpm `_ (depends on the clients package) +* `foundationdb-clients-6.0.18-1.el7.x86_64.rpm `_ +* `foundationdb-server-6.0.18-1.el7.x86_64.rpm `_ (depends on the clients package) Windows ------- The Windows installer is supported on 64-bit Windows XP and later. It includes the client and (optionally) the server. 
-* `foundationdb-6.0.15-x64.msi `_ +* `foundationdb-6.0.18-x64.msi `_ API Language Bindings ===================== @@ -58,18 +58,18 @@ On macOS and Windows, the FoundationDB Python API bindings are installed as part If you need to use the FoundationDB Python API from other Python installations or paths, download the Python package: -* `foundationdb-6.0.15.tar.gz `_ +* `foundationdb-6.0.18.tar.gz `_ Ruby 1.9.3/2.0.0+ ----------------- -* `fdb-6.0.15.gem `_ +* `fdb-6.0.18.gem `_ Java 8+ ------- -* `fdb-java-6.0.15.jar `_ -* `fdb-java-6.0.15-javadoc.jar `_ +* `fdb-java-6.0.18.jar `_ +* `fdb-java-6.0.18-javadoc.jar `_ Go 1.1+ ------- diff --git a/documentation/sphinx/source/old-release-notes/release-notes-600.rst b/documentation/sphinx/source/old-release-notes/release-notes-600.rst index e1e2281bbb..03a770fdc5 100644 --- a/documentation/sphinx/source/old-release-notes/release-notes-600.rst +++ b/documentation/sphinx/source/old-release-notes/release-notes-600.rst @@ -2,6 +2,46 @@ Release Notes ############# +6.0.18 +====== + +Fixes +----- + +* Backup metadata could falsely indicate that a backup is not usable. `(PR #1007) `_ +* Blobstore request failures could cause backup expire and delete operations to skip some files. `(PR #1007) `_ +* Blobstore request failures could cause restore to fail to apply some files. `(PR #1007) `_ +* Storage servers with large amounts of data would pause for a short period of time after rebooting. `(PR #1001) `_ +* The client library could leak memory when a thread died. `(PR #1011) `_ + +Features +-------- + +* Added the ability to specify versions as version-days ago from latest log in backup. `(PR #1007) `_ + +6.0.17 +====== + +Fixes +----- + +* Existing backups did not make progress when upgraded to 6.0.16. `(PR #962) `_ + +6.0.16 +====== + +Performance +----------- + +* Added a new backup folder scheme which results in far fewer kv range folders. `(PR #939) `_ + +Fixes +----- + +* Blobstore REST client attempted to create buckets that already existed. `(PR #923) `_ +* DNS would fail if IPv6 responses were received. `(PR #945) `_ +* Backup expiration would occasionally fail due to an incorrect assert. `(PR #926) `_ + 6.0.15 ====== diff --git a/documentation/sphinx/source/technical-overview.rst b/documentation/sphinx/source/technical-overview.rst index 318afeda94..c5dbf29e37 100644 --- a/documentation/sphinx/source/technical-overview.rst +++ b/documentation/sphinx/source/technical-overview.rst @@ -26,6 +26,8 @@ These documents explain the engineering design of FoundationDB, with detailed in * :doc:`testing`: FoundationDB uses a combined regime of robust simulation, live performance testing, and hardware-based failure testing to meet exacting standards of correctness and performance. +* :doc:`kv-architecture` provides a description of every major role a process in FoundationDB can fulfill. + .. toctree:: :maxdepth: 1 :titlesonly: @@ -42,3 +44,4 @@ These documents explain the engineering design of FoundationDB, with detailed in fault-tolerance flow testing + kv-architecture diff --git a/documentation/sphinx/source/tls.rst b/documentation/sphinx/source/tls.rst index c761f4b939..c56004aa7d 100644 --- a/documentation/sphinx/source/tls.rst +++ b/documentation/sphinx/source/tls.rst @@ -70,6 +70,8 @@ The value for each setting can be specified in more than one way. The actual va 2. The value of the environment variable, if one has been set; 3. 
The default value +For the password, rather than using the command-line option, it is recommended to use the environment variable ``FDB_TLS_PASSWORD``, as command-line options are more visible to other processes running on the same host. + As with all other command-line options to ``fdbserver``, the TLS settings can be specified in the :ref:`[fdbserver] section of the configuration file `. The settings for certificate file, key file, peer verification, password and CA file are interpreted by the software. @@ -99,6 +101,17 @@ There is no default password. If no password is specified, it is assumed that th Parameters and client bindings ------------------------------ +Automatic TLS certificate refresh +--------------------------------- + +The TLS certificate will be automatically refreshed on a configurable cadence. The server will inspect the CA, certificate, and key files in the specified locations periodically, and will begin using the new versions if the following criteria are met: + + * They are changed, judging by the last modified time. + * They are valid certificates. + * The key file matches the certificate file. + +The refresh rate is controlled by ``--knob_tls_cert_refresh_delay_seconds``. Setting it to 0 will disable the refresh. + The default LibreSSL-based implementation ========================================= diff --git a/fdb.cluster.cmake b/fdb.cluster.cmake new file mode 100644 index 0000000000..632fa1a4d3 --- /dev/null +++ b/fdb.cluster.cmake @@ -0,0 +1 @@ +${CLUSTER_DESCRIPTION1}:${CLUSTER_DESCRIPTION2}@127.0.0.1:4500 diff --git a/fdbbackup/CMakeLists.txt b/fdbbackup/CMakeLists.txt new file mode 100644 index 0000000000..dd6f46fa5b --- /dev/null +++ b/fdbbackup/CMakeLists.txt @@ -0,0 +1,25 @@ +set(FDBBACKUP_SRCS + backup.actor.cpp) + +actor_set(FDBBACKUP_BUILD "${FDBBACKUP_SRCS}") +add_executable(fdbbackup "${FDBBACKUP_BUILD}") +actor_compile(fdbbackup "${FDBBACKUP_SRCS}") +target_link_libraries(fdbbackup PRIVATE fdbclient) + +install(TARGETS fdbbackup DESTINATION ${FDB_BIN_DIR} COMPONENT clients) +install(PROGRAMS $ + DESTINATION ${FDB_LIB_DIR}/foundationdb/backup_agent + RENAME backup_agent + COMPONENT clients) +install(PROGRAMS $ + DESTINATION ${FDB_BIN_DIR} + RENAME fdbrestore + COMPONENT clients) +install(PROGRAMS $ + DESTINATION ${FDB_BIN_DIR} + RENAME dr_agent + COMPONENT clients) +install(PROGRAMS $ + DESTINATION ${FDB_BIN_DIR} + RENAME fdbdr + COMPONENT clients) diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 727c747c71..0956d0c9c9 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -77,7 +77,7 @@ enum enumProgramExe { }; enum enumBackupType { - BACKUP_UNDEFINED=0, BACKUP_START, BACKUP_STATUS, BACKUP_ABORT, BACKUP_WAIT, BACKUP_DISCONTINUE, BACKUP_PAUSE, BACKUP_RESUME, BACKUP_EXPIRE, BACKUP_DELETE, BACKUP_DESCRIBE, BACKUP_LIST + BACKUP_UNDEFINED=0, BACKUP_START, BACKUP_STATUS, BACKUP_ABORT, BACKUP_WAIT, BACKUP_DISCONTINUE, BACKUP_PAUSE, BACKUP_RESUME, BACKUP_EXPIRE, BACKUP_DELETE, BACKUP_DESCRIBE, BACKUP_LIST, BACKUP_DUMP }; enum enumDBType { @@ -92,8 +92,10 @@ enum enumRestoreType { enum { // Backup constants OPT_DESTCONTAINER, OPT_SNAPSHOTINTERVAL, OPT_ERRORLIMIT, OPT_NOSTOPWHENDONE, - OPT_EXPIRE_BEFORE_VERSION, OPT_EXPIRE_BEFORE_DATETIME, OPT_EXPIRE_RESTORABLE_AFTER_VERSION, OPT_EXPIRE_RESTORABLE_AFTER_DATETIME, + OPT_EXPIRE_BEFORE_VERSION, OPT_EXPIRE_BEFORE_DATETIME, OPT_EXPIRE_DELETE_BEFORE_DAYS, + OPT_EXPIRE_RESTORABLE_AFTER_VERSION, OPT_EXPIRE_RESTORABLE_AFTER_DATETIME, OPT_EXPIRE_MIN_RESTORABLE_DAYS,
OPT_BASEURL, OPT_BLOB_CREDENTIALS, OPT_DESCRIBE_DEEP, OPT_DESCRIBE_TIMESTAMPS, + OPT_DUMP_BEGIN, OPT_DUMP_END, // Backup and Restore constants OPT_TAGNAME, OPT_BACKUPKEYS, OPT_WAITFORDONE, @@ -110,7 +112,9 @@ enum { //DB constants OPT_SOURCE_CLUSTER, OPT_DEST_CLUSTER, - OPT_CLEANUP + OPT_CLEANUP, + + OPT_TRACE_FORMAT }; CSimpleOpt::SOption g_rgAgentOptions[] = { @@ -119,7 +123,6 @@ CSimpleOpt::SOption g_rgAgentOptions[] = { #endif { OPT_CLUSTERFILE, "-C", SO_REQ_SEP }, { OPT_CLUSTERFILE, "--cluster_file", SO_REQ_SEP }, - { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, { OPT_KNOB, "--knob_", SO_REQ_SEP }, { OPT_VERSION, "--version", SO_NONE }, { OPT_VERSION, "-v", SO_NONE }, @@ -127,6 +130,8 @@ CSimpleOpt::SOption g_rgAgentOptions[] = { { OPT_QUIET, "--quiet", SO_NONE }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, + { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, + { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, { OPT_CRASHONERROR, "--crash", SO_NONE }, { OPT_LOCALITY, "--locality_", SO_REQ_SEP }, { OPT_MEMLIMIT, "-m", SO_REQ_SEP }, @@ -162,6 +167,8 @@ CSimpleOpt::SOption g_rgBackupStartOptions[] = { { OPT_DRYRUN, "--dryrun", SO_NONE }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, + { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, + { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, { OPT_QUIET, "-q", SO_NONE }, { OPT_QUIET, "--quiet", SO_NONE }, { OPT_VERSION, "--version", SO_NONE }, @@ -191,6 +198,8 @@ CSimpleOpt::SOption g_rgBackupStatusOptions[] = { { OPT_TAGNAME, "--tagname", SO_REQ_SEP }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, + { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, + { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, { OPT_VERSION, "--version", SO_NONE }, { OPT_VERSION, "-v", SO_NONE }, { OPT_QUIET, "-q", SO_NONE }, @@ -216,6 +225,8 @@ CSimpleOpt::SOption g_rgBackupAbortOptions[] = { { OPT_TAGNAME, "--tagname", SO_REQ_SEP }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, + { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, + { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, { OPT_QUIET, "-q", SO_NONE }, { OPT_QUIET, "--quiet", SO_NONE }, { OPT_VERSION, "--version", SO_NONE }, @@ -243,6 +254,8 @@ CSimpleOpt::SOption g_rgBackupDiscontinueOptions[] = { { OPT_WAITFORDONE, "--waitfordone", SO_NONE }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, + { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, + { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, { OPT_QUIET, "-q", SO_NONE }, { OPT_QUIET, "--quiet", SO_NONE }, { OPT_VERSION, "--version", SO_NONE }, @@ -270,6 +283,8 @@ CSimpleOpt::SOption g_rgBackupWaitOptions[] = { { OPT_NOSTOPWHENDONE, "--no-stop-when-done",SO_NONE }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, + { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, + { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, { OPT_QUIET, "-q", SO_NONE }, { OPT_QUIET, "--quiet", SO_NONE }, { OPT_VERSION, "--version", SO_NONE }, @@ -293,6 +308,8 @@ CSimpleOpt::SOption g_rgBackupPauseOptions[] = { { OPT_CLUSTERFILE, "--cluster_file", SO_REQ_SEP }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, + { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, + { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, { OPT_QUIET, "-q", SO_NONE }, { OPT_QUIET, "--quiet", SO_NONE }, { OPT_VERSION, "--version", SO_NONE }, @@ -318,6 +335,8 @@ CSimpleOpt::SOption 
g_rgBackupExpireOptions[] = { { OPT_DESTCONTAINER, "--destcontainer", SO_REQ_SEP }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, + { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, + { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, { OPT_QUIET, "-q", SO_NONE }, { OPT_QUIET, "--quiet", SO_NONE }, { OPT_VERSION, "-v", SO_NONE }, @@ -337,6 +356,8 @@ CSimpleOpt::SOption g_rgBackupExpireOptions[] = { { OPT_EXPIRE_RESTORABLE_AFTER_DATETIME, "--restorable_after_timestamp", SO_REQ_SEP }, { OPT_EXPIRE_BEFORE_VERSION, "--expire_before_version", SO_REQ_SEP }, { OPT_EXPIRE_BEFORE_DATETIME, "--expire_before_timestamp", SO_REQ_SEP }, + { OPT_EXPIRE_MIN_RESTORABLE_DAYS, "--min_restorable_days", SO_REQ_SEP }, + { OPT_EXPIRE_DELETE_BEFORE_DAYS, "--delete_before_days", SO_REQ_SEP }, SO_END_OF_OPTIONS }; @@ -349,6 +370,8 @@ CSimpleOpt::SOption g_rgBackupDeleteOptions[] = { { OPT_DESTCONTAINER, "--destcontainer", SO_REQ_SEP }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, + { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, + { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, { OPT_QUIET, "-q", SO_NONE }, { OPT_QUIET, "--quiet", SO_NONE }, { OPT_VERSION, "-v", SO_NONE }, @@ -376,6 +399,8 @@ CSimpleOpt::SOption g_rgBackupDescribeOptions[] = { { OPT_DESTCONTAINER, "--destcontainer", SO_REQ_SEP }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, + { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, + { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, { OPT_QUIET, "-q", SO_NONE }, { OPT_QUIET, "--quiet", SO_NONE }, { OPT_VERSION, "-v", SO_NONE }, @@ -395,6 +420,36 @@ CSimpleOpt::SOption g_rgBackupDescribeOptions[] = { SO_END_OF_OPTIONS }; +CSimpleOpt::SOption g_rgBackupDumpOptions[] = { +#ifdef _WIN32 + { OPT_PARENTPID, "--parentpid", SO_REQ_SEP }, +#endif + { OPT_CLUSTERFILE, "-C", SO_REQ_SEP }, + { OPT_CLUSTERFILE, "--cluster_file", SO_REQ_SEP }, + { OPT_DESTCONTAINER, "-d", SO_REQ_SEP }, + { OPT_DESTCONTAINER, "--destcontainer", SO_REQ_SEP }, + { OPT_TRACE, "--log", SO_NONE }, + { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, + { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, + { OPT_QUIET, "-q", SO_NONE }, + { OPT_QUIET, "--quiet", SO_NONE }, + { OPT_VERSION, "-v", SO_NONE }, + { OPT_VERSION, "--version", SO_NONE }, + { OPT_CRASHONERROR, "--crash", SO_NONE }, + { OPT_MEMLIMIT, "-m", SO_REQ_SEP }, + { OPT_MEMLIMIT, "--memory", SO_REQ_SEP }, + { OPT_HELP, "-?", SO_NONE }, + { OPT_HELP, "-h", SO_NONE }, + { OPT_HELP, "--help", SO_NONE }, + { OPT_DEVHELP, "--dev-help", SO_NONE }, + { OPT_BLOB_CREDENTIALS, "--blob_credentials", SO_REQ_SEP }, + { OPT_KNOB, "--knob_", SO_REQ_SEP }, + { OPT_DUMP_BEGIN, "--begin", SO_REQ_SEP }, + { OPT_DUMP_END, "--end", SO_REQ_SEP }, + + SO_END_OF_OPTIONS +}; + CSimpleOpt::SOption g_rgBackupListOptions[] = { #ifdef _WIN32 { OPT_PARENTPID, "--parentpid", SO_REQ_SEP }, @@ -403,6 +458,8 @@ CSimpleOpt::SOption g_rgBackupListOptions[] = { { OPT_BASEURL, "--base_url", SO_REQ_SEP }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, + { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, + { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, { OPT_QUIET, "-q", SO_NONE }, { OPT_QUIET, "--quiet", SO_NONE }, { OPT_VERSION, "-v", SO_NONE }, @@ -440,6 +497,8 @@ CSimpleOpt::SOption g_rgRestoreOptions[] = { { OPT_DBVERSION, "-v", SO_REQ_SEP }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, + { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, + { 
OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, { OPT_QUIET, "-q", SO_NONE }, { OPT_QUIET, "--quiet", SO_NONE }, { OPT_DRYRUN, "-n", SO_NONE }, @@ -473,6 +532,8 @@ CSimpleOpt::SOption g_rgDBAgentOptions[] = { { OPT_QUIET, "--quiet", SO_NONE }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, + { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, + { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, { OPT_CRASHONERROR, "--crash", SO_NONE }, { OPT_LOCALITY, "--locality_", SO_REQ_SEP }, { OPT_MEMLIMIT, "-m", SO_REQ_SEP }, @@ -499,6 +560,8 @@ CSimpleOpt::SOption g_rgDBStartOptions[] = { { OPT_BACKUPKEYS, "--keys", SO_REQ_SEP }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, + { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, + { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, { OPT_QUIET, "-q", SO_NONE }, { OPT_QUIET, "--quiet", SO_NONE }, { OPT_VERSION, "--version", SO_NONE }, @@ -528,6 +591,8 @@ CSimpleOpt::SOption g_rgDBStatusOptions[] = { { OPT_TAGNAME, "--tagname", SO_REQ_SEP }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, + { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, + { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, { OPT_VERSION, "--version", SO_NONE }, { OPT_VERSION, "-v", SO_NONE }, { OPT_QUIET, "-q", SO_NONE }, @@ -555,6 +620,8 @@ CSimpleOpt::SOption g_rgDBSwitchOptions[] = { { OPT_TAGNAME, "--tagname", SO_REQ_SEP }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, + { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, + { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, { OPT_QUIET, "-q", SO_NONE }, { OPT_QUIET, "--quiet", SO_NONE }, { OPT_VERSION, "--version", SO_NONE }, @@ -583,6 +650,8 @@ CSimpleOpt::SOption g_rgDBAbortOptions[] = { { OPT_TAGNAME, "--tagname", SO_REQ_SEP }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, + { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, + { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, { OPT_QUIET, "-q", SO_NONE }, { OPT_QUIET, "--quiet", SO_NONE }, { OPT_VERSION, "--version", SO_NONE }, @@ -608,6 +677,8 @@ CSimpleOpt::SOption g_rgDBPauseOptions[] = { { OPT_DEST_CLUSTER, "--destination", SO_REQ_SEP }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, + { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, + { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, { OPT_QUIET, "-q", SO_NONE }, { OPT_QUIET, "--quiet", SO_NONE }, { OPT_VERSION, "--version", SO_NONE }, @@ -650,7 +721,7 @@ static void printVersion() { printf("protocol %llx\n", (long long) currentProtocolVersion); } -const char *BlobCredentialInfo = +const char *BlobCredentialInfo = " BLOB CREDENTIALS\n" " Blob account secret keys can optionally be omitted from blobstore:// URLs, in which case they will be\n" " loaded, if possible, from 1 or more blob credentials definition files.\n\n" @@ -677,6 +748,9 @@ static void printAgentUsage(bool devhelp) { " --logdir PATH Specifes the output directory for trace files. If\n" " unspecified, defaults to the current directory. Has\n" " no effect unless --log is specified.\n"); + printf(" --trace_format FORMAT\n" + " Select the format of the trace files. xml (the default) and json are supported.\n" + " Has no effect unless --log is specified.\n"); printf(" -m SIZE, --memory SIZE\n" " Memory limit. The default value is 8GiB. 
When specified\n" " without a unit, MiB is assumed.\n"); @@ -725,10 +799,16 @@ static void printBackupUsage(bool devhelp) { " in the database to obtain a cutoff version very close to the timestamp given in YYYY-MM-DD.HH:MI:SS format (UTC).\n"); printf(" --expire_before_version VERSION\n" " Version cutoff for expire operations. Deletes data files containing no data at or after VERSION.\n"); + printf(" --delete_before_days NUM_DAYS\n" + " Another way to specify version cutoff for expire operations. Deletes data files containing no data at or after a\n" + " version approximately NUM_DAYS days worth of versions prior to the latest log version in the backup.\n"); printf(" --restorable_after_timestamp DATETIME\n" " For expire operations, set minimum acceptable restorability to the version equivalent of DATETIME and later.\n"); printf(" --restorable_after_version VERSION\n" " For expire operations, set minimum acceptable restorability to the VERSION and later.\n"); + printf(" --min_restorable_days NUM_DAYS\n" + " For expire operations, set minimum acceptable restorability to approximately NUM_DAYS days worth of versions\n" + " prior to the latest log version in the backup.\n"); printf(" --version_timestamps\n"); printf(" For describe operations, lookup versions in the database to obtain timestamps. A cluster file is required.\n"); printf(" -f, --force For expire operations, force expiration even if minimum restorability would be violated.\n"); @@ -737,7 +817,7 @@ static void printBackupUsage(bool devhelp) { printf(" -e ERRORLIMIT The maximum number of errors printed by status (default is 10).\n"); printf(" -k KEYS List of key ranges to backup.\n" " If not specified, the entire database will be backed up.\n"); - printf(" -n, --dry-run For start or restore operations, performs a trial run with no actual changes made.\n"); + printf(" -n, --dryrun For start or restore operations, performs a trial run with no actual changes made.\n"); printf(" -v, --version Print version information and exit.\n"); printf(" -w, --wait Wait for the backup to complete (allowed with `start' and `discontinue').\n"); printf(" -z, --no-stop-when-done\n" @@ -752,7 +832,7 @@ static void printBackupUsage(bool devhelp) { printf(" Specify a process after whose termination to exit.\n"); #endif printf(" --deep For describe operations, do not use cached metadata. Warning: Very slow\n"); - + } printf("\n" " KEYS FORMAT: \" \" [...]\n"); @@ -781,7 +861,7 @@ static void printRestoreUsage(bool devhelp ) { printf(" -k KEYS List of key ranges from the backup to restore\n"); printf(" --remove_prefix PREFIX prefix to remove from the restored keys\n"); printf(" --add_prefix PREFIX prefix to add to the restored keys\n"); - printf(" -n, --dry-run Perform a trial run with no changes made.\n"); + printf(" -n, --dryrun Perform a trial run with no changes made.\n"); printf(" -v DBVERSION The version at which the database will be restored.\n"); printf(" -h, --help Display this help and exit.\n"); @@ -812,6 +892,9 @@ static void printDBAgentUsage(bool devhelp) { " --logdir PATH Specifes the output directory for trace files. If\n" " unspecified, defaults to the current directory. Has\n" " no effect unless --log is specified.\n"); + printf(" --trace_format FORMAT\n" + " Select the format of the trace files. xml (the default) and json are supported.\n" + " Has no effect unless --log is specified.\n"); printf(" -m SIZE, --memory SIZE\n" " Memory limit. The default value is 8GiB. 
When specified\n" " without a unit, MiB is assumed.\n"); @@ -970,6 +1053,7 @@ enumBackupType getBackupType(std::string backupType) values["delete"] = BACKUP_DELETE; values["describe"] = BACKUP_DESCRIBE; values["list"] = BACKUP_LIST; + values["dump"] = BACKUP_DUMP; } auto i = values.find(backupType); @@ -1730,11 +1814,10 @@ ACTOR Future changeDBBackupResumed(Database src, Database dest, bool pause return Void(); } -ACTOR Future runRestore(Database db, std::string tagName, std::string container, Standalone> ranges, Version dbVersion, bool performRestore, bool verbose, bool waitForDone, std::string addPrefix, std::string removePrefix) { +ACTOR Future runRestore(Database db, std::string tagName, std::string container, Standalone> ranges, Version targetVersion, bool performRestore, bool verbose, bool waitForDone, std::string addPrefix, std::string removePrefix) { try { state FileBackupAgent backupAgent; - state int64_t restoreVersion = -1; if(ranges.size() > 1) { fprintf(stderr, "Currently only a single restore range is supported!\n"); @@ -1743,52 +1826,45 @@ ACTOR Future runRestore(Database db, std::string tagName, std::string cont state KeyRange range = (ranges.size() == 0) ? normalKeys : ranges.front(); - if (performRestore) { - if(dbVersion == invalidVersion) { - BackupDescription desc = wait(IBackupContainer::openContainer(container)->describeBackup()); - if(!desc.maxRestorableVersion.present()) { - fprintf(stderr, "The specified backup is not restorable to any version.\n"); - throw restore_error(); - } + state Reference bc = IBackupContainer::openContainer(container); - dbVersion = desc.maxRestorableVersion.get(); + // If targetVersion is unset then use the maximum restorable version from the backup description + if(targetVersion == invalidVersion) { + if(verbose) + printf("No restore target version given, will use maximum restorable version from backup description.\n"); + + BackupDescription desc = wait(bc->describeBackup()); + + if(!desc.maxRestorableVersion.present()) { + fprintf(stderr, "The specified backup is not restorable to any version.\n"); + throw restore_error(); + } + + targetVersion = desc.maxRestorableVersion.get(); + + if(verbose) + printf("Using target restore version %lld\n", targetVersion); + } + + if (performRestore) { + Version restoredVersion = wait(backupAgent.restore(db, KeyRef(tagName), KeyRef(container), waitForDone, targetVersion, verbose, range, KeyRef(addPrefix), KeyRef(removePrefix))); + + if(waitForDone && verbose) { + // If restore is now complete then report version restored + printf("Restored to version %lld\n", restoredVersion); } - Version _restoreVersion = wait(backupAgent.restore(db, KeyRef(tagName), KeyRef(container), waitForDone, dbVersion, verbose, range, KeyRef(addPrefix), KeyRef(removePrefix))); - restoreVersion = _restoreVersion; } else { - state Reference bc = IBackupContainer::openContainer(container); - state BackupDescription description = wait(bc->describeBackup()); + state Optional rset = wait(bc->getRestoreSet(targetVersion)); - if(dbVersion <= 0) { - wait(description.resolveVersionTimes(db)); - if(description.maxRestorableVersion.present()) - restoreVersion = description.maxRestorableVersion.get(); - else { - fprintf(stderr, "Backup is not restorable\n"); - throw restore_invalid_version(); - } - } - else - restoreVersion = dbVersion; - - state Optional rset = wait(bc->getRestoreSet(restoreVersion)); if(!rset.present()) { - fprintf(stderr, "Insufficient data to restore to version %lld\n", restoreVersion); + fprintf(stderr, "Insufficient 
data to restore to version %lld. Describe backup for more information.\n", targetVersion); throw restore_invalid_version(); } - // Display the restore information, if requested - if (verbose) { - printf("[DRY RUN] Restoring backup to version: %lld\n", (long long) restoreVersion); - printf("%s\n", description.toString().c_str()); - } + printf("Backup can be used to restore to version %lld\n", targetVersion); } - if(waitForDone && verbose) { - // If restore completed then report version restored - printf("Restored to version %lld%s\n", (long long) restoreVersion, (performRestore) ? "" : " (DRY RUN)"); - } } catch (Error& e) { if(e.code() == error_code_actor_cancelled) @@ -1824,6 +1900,33 @@ Reference openBackupContainer(const char *name, std::string de return c; } +ACTOR Future dumpBackupData(const char *name, std::string destinationContainer, Version beginVersion, Version endVersion) { + state Reference c = openBackupContainer(name, destinationContainer); + + if(beginVersion < 0 || endVersion < 0) { + BackupDescription desc = wait(c->describeBackup()); + + if(!desc.maxLogEnd.present()) { + fprintf(stderr, "ERROR: Backup must have log data in order to use relative begin/end versions.\n"); + throw backup_invalid_info(); + } + + if(beginVersion < 0) { + beginVersion += desc.maxLogEnd.get(); + } + + if(endVersion < 0) { + endVersion += desc.maxLogEnd.get(); + } + } + + printf("Scanning version range %lld to %lld\n", beginVersion, endVersion); + BackupFileList files = wait(c->dumpFileList(beginVersion, endVersion)); + files.toStream(stdout); + + return Void(); +} + ACTOR Future expireBackupData(const char *name, std::string destinationContainer, Version endVersion, std::string endDatetime, Database db, bool force, Version restorableAfterVersion, std::string restorableAfterDatetime) { if (!endDatetime.empty()) { Version v = wait( timeKeeperVersionFromDatetime(endDatetime, db) ); @@ -1843,8 +1946,35 @@ ACTOR Future expireBackupData(const char *name, std::string destinationCon try { Reference c = openBackupContainer(name, destinationContainer); - wait(c->expireData(endVersion, force, restorableAfterVersion)); - printf("All data before version %lld is deleted.\n", endVersion); + + state IBackupContainer::ExpireProgress progress; + state std::string lastProgress; + state Future expire = c->expireData(endVersion, force, &progress, restorableAfterVersion); + + loop { + choose { + when(wait(delay(5))) { + std::string p = progress.toString(); + if(p != lastProgress) { + int spaces = lastProgress.size() - p.size(); + printf("\r%s%s", p.c_str(), (spaces > 0 ? std::string(spaces, ' ').c_str() : "") ); + lastProgress = p; + } + } + when(wait(expire)) { + break; + } + } + } + + std::string p = progress.toString(); + int spaces = lastProgress.size() - p.size(); + printf("\r%s%s\n", p.c_str(), (spaces > 0 ? 
std::string(spaces, ' ').c_str() : "") ); + + if(endVersion < 0) + printf("All data before %lld versions (%lld days) prior to latest backup log has been deleted.\n", -endVersion, -endVersion / ((int64_t)24 * 3600 * CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); + else + printf("All data before version %lld has been deleted.\n", endVersion); } catch (Error& e) { if(e.code() == error_code_actor_cancelled) @@ -1865,17 +1995,24 @@ ACTOR Future deleteBackupContainer(const char *name, std::string destinati state int numDeleted = 0; state Future done = c->deleteContainer(&numDeleted); + state int lastUpdate = -1; + printf("Deleting %s...\n", destinationContainer.c_str()); + loop { choose { when ( wait(done) ) { - printf("The entire container has been deleted.\n"); break; } - when ( wait(delay(3)) ) { - printf("%d files have been deleted so far...\n", numDeleted); + when ( wait(delay(5)) ) { + if(numDeleted != lastUpdate) { + printf("\r%d...", numDeleted); + lastUpdate = numDeleted; + } } } } + printf("\r%d objects deleted\n", numDeleted); + printf("The entire container has been deleted.\n"); } catch (Error& e) { if(e.code() == error_code_actor_cancelled) @@ -2073,6 +2210,26 @@ static void addKeyRange(std::string optionValue, StandaloneCORE_VERSIONSPERSECOND * 24 * 3600 * -days; + } + + Version ver; + if(sscanf(str, "%lld", &ver) != 1) { + fprintf(stderr, "Could not parse version: %s\n", str); + flushAndExit(FDB_EXIT_ERROR); + } + return ver; +} + #ifdef ALLOC_INSTRUMENTATION extern uint8_t *g_extra_memory; #endif @@ -2151,6 +2308,9 @@ int main(int argc, char* argv[]) { case BACKUP_DESCRIBE: args = new CSimpleOpt(argc - 1, &argv[1], g_rgBackupDescribeOptions, SO_O_EXACT); break; + case BACKUP_DUMP: + args = new CSimpleOpt(argc - 1, &argv[1], g_rgBackupDumpOptions, SO_O_EXACT); + break; case BACKUP_LIST: args = new CSimpleOpt(argc - 1, &argv[1], g_rgBackupListOptions, SO_O_EXACT); break; @@ -2288,10 +2448,12 @@ int main(int argc, char* argv[]) { uint64_t memLimit = 8LL << 30; Optional ti; std::vector blobCredentials; + Version dumpBegin = 0; + Version dumpEnd = std::numeric_limits::max(); if( argc == 1 ) { printUsage(programExe, false); - return FDB_EXIT_ERROR; + return FDB_EXIT_ERROR; } #ifdef _WIN32 @@ -2375,6 +2537,11 @@ int main(int argc, char* argv[]) { trace = true; traceDir = args->OptionArg(); break; + case OPT_TRACE_FORMAT: + if (!selectTraceFormatter(args->OptionArg())) { + fprintf(stderr, "WARNING: Unrecognized trace format `%s'\n", args->OptionArg()); + } + break; case OPT_TRACE_LOG_GROUP: traceLogGroup = args->OptionArg(); break; @@ -2397,6 +2564,8 @@ int main(int argc, char* argv[]) { break; case OPT_EXPIRE_BEFORE_VERSION: case OPT_EXPIRE_RESTORABLE_AFTER_VERSION: + case OPT_EXPIRE_MIN_RESTORABLE_DAYS: + case OPT_EXPIRE_DELETE_BEFORE_DAYS: { const char* a = args->OptionArg(); long long ver = 0; @@ -2405,7 +2574,13 @@ int main(int argc, char* argv[]) { printHelpTeaser(argv[0]); return FDB_EXIT_ERROR; } - if(optId == OPT_EXPIRE_BEFORE_VERSION) + + // Interpret the value as days worth of versions relative to now (negative) + if(optId == OPT_EXPIRE_MIN_RESTORABLE_DAYS || optId == OPT_EXPIRE_DELETE_BEFORE_DAYS) { + ver = -ver * 24 * 60 * 60 * CLIENT_KNOBS->CORE_VERSIONSPERSECOND; + } + + if(optId == OPT_EXPIRE_BEFORE_VERSION || optId == OPT_EXPIRE_DELETE_BEFORE_DAYS) expireVersion = ver; else expireRestorableAfterVersion = ver; @@ -2537,6 +2712,12 @@ int main(int argc, char* argv[]) { case OPT_BLOB_CREDENTIALS: blobCredentials.push_back(args->OptionArg()); break; + case OPT_DUMP_BEGIN: + 
dumpBegin = parseVersion(args->OptionArg()); + break; + case OPT_DUMP_END: + dumpEnd = parseVersion(args->OptionArg()); + break; } } @@ -2632,7 +2813,7 @@ int main(int argc, char* argv[]) { for(auto k=knobs.begin(); k!=knobs.end(); ++k) { try { if (!flowKnobs->setKnob( k->first, k->second ) && - !clientKnobs->setKnob( k->first, k->second )) + !clientKnobs->setKnob( k->first, k->second )) { fprintf(stderr, "Unrecognized knob option '%s'\n", k->first.c_str()); return FDB_EXIT_ERROR; @@ -2700,7 +2881,7 @@ int main(int argc, char* argv[]) { .trackLatest("ProgramStart"); // Ordinarily, this is done when the network is run. However, network thread should be set before TraceEvents are logged. This thread will eventually run the network, so call it now. - TraceEvent::setNetworkThread(); + TraceEvent::setNetworkThread(); // Add blob credentials files from the environment to the list collected from the command line. const char *blobCredsFromENV = getenv("FDB_BLOB_CREDENTIALS"); @@ -2852,11 +3033,17 @@ int main(int argc, char* argv[]) { // Only pass database optionDatabase Describe will lookup version timestamps if a cluster file was given, but quietly skip them if not. f = stopAfter( describeBackup(argv[0], destinationContainer, describeDeep, describeTimestamps ? Optional(db) : Optional()) ); break; + case BACKUP_LIST: initTraceFile(); f = stopAfter( listBackup(baseUrl) ); break; + case BACKUP_DUMP: + initTraceFile(); + f = stopAfter( dumpBackupData(argv[0], destinationContainer, dumpBegin, dumpEnd) ); + break; + case BACKUP_UNDEFINED: default: fprintf(stderr, "ERROR: Unsupported backup action %s\n", argv[1]); @@ -2867,8 +3054,13 @@ int main(int argc, char* argv[]) { break; case EXE_RESTORE: - if(!initCluster()) + if(dryRun) { + initTraceFile(); + } + else if(!initCluster()) { return FDB_EXIT_ERROR; + } + switch(restoreType) { case RESTORE_START: f = stopAfter( runRestore(db, tagName, restoreContainer, backupKeys, dbVersion, !dryRun, !quietDisplay, waitForDone, addPrefix, removePrefix) ); @@ -2883,7 +3075,7 @@ int main(int argc, char* argv[]) { }) ); break; case RESTORE_STATUS: - + // If no tag is specifically provided then print all tag status, don't just use "default" if(tagProvided) tag = tagName; @@ -3004,5 +3196,5 @@ int main(int argc, char* argv[]) { status = FDB_EXIT_MAIN_EXCEPTION; } - return status; + flushAndExit(status); } diff --git a/fdbcli/CMakeLists.txt b/fdbcli/CMakeLists.txt new file mode 100644 index 0000000000..50f36cd5e7 --- /dev/null +++ b/fdbcli/CMakeLists.txt @@ -0,0 +1,13 @@ +set(FDBCLI_SRCS + fdbcli.actor.cpp + FlowLineNoise.actor.cpp + FlowLineNoise.h + linenoise/linenoise.c + linenoise/linenoise.h) + +actor_set(FDBCLI_BUILD "${FDBCLI_SRCS}") +add_executable(fdbcli "${FDBCLI_BUILD}") +actor_compile(fdbcli "${FDBCLI_SRCS}") +target_link_libraries(fdbcli PRIVATE fdbclient) + +install(TARGETS fdbcli DESTINATION ${FDB_BIN_DIR} COMPONENT clients) diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index cb2d4d81b0..97361a2c6f 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -57,7 +57,7 @@ extern const char* getHGVersion(); std::vector validOptions; -enum { OPT_CONNFILE, OPT_DATABASE, OPT_HELP, OPT_TRACE, OPT_TRACE_DIR, OPT_TIMEOUT, OPT_EXEC, OPT_NO_STATUS, OPT_STATUS_FROM_JSON, OPT_VERSION }; +enum { OPT_CONNFILE, OPT_DATABASE, OPT_HELP, OPT_TRACE, OPT_TRACE_DIR, OPT_TIMEOUT, OPT_EXEC, OPT_NO_STATUS, OPT_STATUS_FROM_JSON, OPT_VERSION, OPT_TRACE_FORMAT }; CSimpleOpt::SOption g_rgOptions[] = { { OPT_CONNFILE, "-C", SO_REQ_SEP }, @@ -74,6 +74,7 @@ 
CSimpleOpt::SOption g_rgOptions[] = { { OPT_STATUS_FROM_JSON, "--status-from-json", SO_REQ_SEP }, { OPT_VERSION, "--version", SO_NONE }, { OPT_VERSION, "-v", SO_NONE }, + { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, #ifndef TLS_DISABLED TLS_OPTION_FLAGS @@ -401,6 +402,9 @@ static void printProgramUsage(const char* name) { " --log-dir PATH Specifes the output directory for trace files. If\n" " unspecified, defaults to the current directory. Has\n" " no effect unless --log is specified.\n" + " --trace_format FORMAT\n" + " Select the format of the log files. xml (the default) and json\n" + " are supported. Has no effect unless --log is specified.\n" " --exec CMDS Immediately executes the semicolon separated CLI commands\n" " and then exits.\n" " --no-status Disables the initial status check done when starting\n" @@ -1618,6 +1622,11 @@ ACTOR Future configure( Database db, std::vector tokens, Refere printf("Type `configure FORCE *' to configure without this check\n"); ret=false; break; + case ConfigurationResult::NOT_ENOUGH_WORKERS: + printf("ERROR: Not enough processes exist to support the specified configuration\n"); + printf("Type `configure FORCE *' to configure without this check\n"); + ret=false; + break; case ConfigurationResult::SUCCESS: printf("Configuration changed\n"); ret=false; @@ -1724,7 +1733,12 @@ ACTOR Future fileConfigure(Database db, std::string filePath, bool isNewDa break; case ConfigurationResult::REGIONS_CHANGED: printf("ERROR: The region configuration cannot be changed while simultaneously changing usable_regions\n"); - printf("Type `fileconfigure FORCE *' to configure without this check\n"); + printf("Type `fileconfigure FORCE ' to configure without this check\n"); + ret=false; + break; + case ConfigurationResult::NOT_ENOUGH_WORKERS: + printf("ERROR: Not enough processes exist to support the specified configuration\n"); + printf("Type `fileconfigure FORCE ' to configure without this check\n"); ret=false; break; case ConfigurationResult::SUCCESS: @@ -2321,6 +2335,11 @@ struct CLIOptions { return 0; case OPT_STATUS_FROM_JSON: return printStatusFromJSON(args.OptionArg()); + case OPT_TRACE_FORMAT: + if (!selectTraceFormatter(args.OptionArg())) { + fprintf(stderr, "WARNING: Unrecognized trace format `%s'\n", args.OptionArg()); + } + break; case OPT_VERSION: printVersion(); return FDB_EXIT_SUCCESS; diff --git a/fdbclient/BackupAgent.h b/fdbclient/BackupAgent.h index 62ae3a1958..56ba96f077 100644 --- a/fdbclient/BackupAgent.h +++ b/fdbclient/BackupAgent.h @@ -276,7 +276,7 @@ public: // stopWhenDone will return when the backup is stopped, if enabled. Otherwise, it // will return when the backup directory is restorable. - Future waitBackup(Database cx, std::string tagName, bool stopWhenDone = true); + Future waitBackup(Database cx, std::string tagName, bool stopWhenDone = true, Reference *pContainer = nullptr, UID *pUID = nullptr); static const Key keyLastRestorable; @@ -415,7 +415,7 @@ struct RCGroup { template void serialize(Ar& ar) { - ar & items & version & groupKey; + serializer(ar, items, version, groupKey); } }; @@ -615,6 +615,15 @@ public: return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + // Number of kv range files that were both committed to persistent storage AND inserted into + // the snapshotRangeFileMap. Note that since insertions could replace 1 or more existing + // map entries this is not necessarily the number of entries currently in the map. 
+ // This value exists to help with sizing of kv range folders for BackupContainers that + // require it. + KeyBackedBinaryValue snapshotRangeFileCount() { + return configSpace.pack(LiteralStringRef(__FUNCTION__)); + } + // Coalesced set of ranges already dispatched for writing. typedef KeyBackedMap RangeDispatchMapT; RangeDispatchMapT snapshotRangeDispatchMap() { @@ -671,6 +680,7 @@ public: copy.snapshotBeginVersion().set(tr, beginVersion.get()); copy.snapshotTargetEndVersion().set(tr, endVersion); + copy.snapshotRangeFileCount().set(tr, 0); return Void(); }); diff --git a/fdbclient/BackupContainer.actor.cpp b/fdbclient/BackupContainer.actor.cpp index e8b3bbac6e..aa71731e9d 100644 --- a/fdbclient/BackupContainer.actor.cpp +++ b/fdbclient/BackupContainer.actor.cpp @@ -47,6 +47,26 @@ Future IBackupFile::appendStringRefWithLen(Standalone s) { return IBackupFile_impl::appendStringRefWithLen(Reference::addRef(this), s); } +std::string IBackupContainer::ExpireProgress::toString() const { + std::string s = step + "..."; + if(total > 0) { + s += format("%d/%d (%.2f%%)", done, total, double(done) / total * 100); + } + return s; +} + +void BackupFileList::toStream(FILE *fout) const { + for(const RangeFile &f : ranges) { + fprintf(fout, "range %lld %s\n", f.fileSize, f.fileName.c_str()); + } + for(const LogFile &f : logs) { + fprintf(fout, "log %lld %s\n", f.fileSize, f.fileName.c_str()); + } + for(const KeyspaceSnapshotFile &f : snapshots) { + fprintf(fout, "snapshotManifest %lld %s\n", f.totalSize, f.fileName.c_str()); + } +} + std::string formatTime(int64_t t) { time_t curTime = (time_t)t; char buffer[128]; @@ -110,6 +130,10 @@ std::string BackupDescription::toString() const { else s = format("%lld (unknown)", v); } + else if(maxLogEnd.present()) { + double days = double(maxLogEnd.get() - v) / (CLIENT_KNOBS->CORE_VERSIONSPERSECOND * 24 * 60 * 60); + s = format("%lld (maxLogEnd %s%.2f days)", v, days < 0 ? "+" : "-", days); + } else { s = format("%lld", v); } @@ -123,6 +147,10 @@ std::string BackupDescription::toString() const { info.append(format("SnapshotBytes: %lld\n", snapshotBytes)); + if(expiredEndVersion.present()) + info.append(format("ExpiredEndVersion: %s\n", formatVersion(expiredEndVersion.get()).c_str())); + if(unreliableEndVersion.present()) + info.append(format("UnreliableEndVersion: %s\n", formatVersion(unreliableEndVersion.get()).c_str())); if(minLogBegin.present()) info.append(format("MinLogBeginVersion: %s\n", formatVersion(minLogBegin.get()).c_str())); if(contiguousLogEnd.present()) @@ -143,16 +171,36 @@ std::string BackupDescription::toString() const { /* BackupContainerFileSystem implements a backup container which stores files in a nested folder structure. * Inheritors must only defined methods for writing, reading, deleting, sizing, and listing files. 
* - * BackupInfo is stored as a JSON document at - /info - Snapshots are stored as JSON at file paths like - /snapshots/snapshot,startVersion,endVersion,totalBytes - Log and Range data files at file paths like - /logs/.../log,startVersion,endVersion,blockSize - /ranges/.../range,version,uid,blockSize + * Snapshot manifests (a complete set of files constituting a database snapshot for the backup's target ranges) + * are stored as JSON files at paths like + * /snapshots/snapshot,minVersion,maxVersion,totalBytes + * + * Key range files for snapshots are stored at paths like + * /kvranges/snapshot,startVersion/N/range,version,uid,blockSize + * where startVersion is the version at which the backup snapshot execution began and N is a number + * that is increased as key range files are generated over time (at varying rates) such that there + * are around 5,000 key range files in each folder. * - * Where ... is a multi level path which sorts lexically into version order and targets 10,000 or less - * entries in each folder (though a full speed snapshot could exceed this count at the innermost folder level) + * Note that startVersion will NOT correspond to the minVersion of a snapshot manifest because + * snapshot manifest min/max versions are based on the actual contained data and the first data + * file written will be after the start version of the snapshot's execution. + * + * Log files are at file paths like + * /logs/.../log,startVersion,endVersion,blockSize + * where ... is a multi level path which sorts lexically into version order and results in approximately 1 + * unique folder per day containing about 5,000 files. + * + * BACKWARD COMPATIBILITY + * + * Prior to FDB version 6.0.16, key range files were stored using a different folder scheme. Newer versions + * still support this scheme for all restore and backup management operations but key range files generated + * by backup using version 6.0.16 or later use the scheme described above. + * + * The old format stored key range files at paths like + * /ranges/.../range,version,uid,blockSize + * where ... is a multi level path which sorts lexically into version order and results in up to approximately + * 900 unique folders per day. The number of files per folder depends on the configured snapshot rate and + * database size and will vary from 1 to around 5,000. */ class BackupContainerFileSystem : public IBackupContainer { public: @@ -164,10 +212,11 @@ public: // Create the container virtual Future create() = 0; + virtual Future exists() = 0; // Get a list of fileNames and their sizes in the container under the given path - // The implementation can (but does not have to) use the folder path filter to avoid traversing - // specific subpaths. + // Although not required, an implementation can avoid traversing unwanted subfolders + // by calling folderPathFilter(absoluteFolderPath) and checking for a false return value. typedef std::vector> FilesAndSizesT; virtual Future listFiles(std::string path = "", std::function folderPathFilter = nullptr) = 0; @@ -207,10 +256,24 @@ public: } // The innermost folder covers 100 seconds (1e8 versions) During a full speed backup it is possible though very unlikely write about 10,000 snapshot range files during that time.
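// A minimal standalone sketch of how the folder scheme described above maps concrete values to
// file paths. buildKvRangePath and the sample values are hypothetical and used only for
// illustration; the real path construction is in snapshotFolderString/writeRangeFile below,
// which uses the snapshot.%018lld folder form and advances the numbered sub-folder once per
// roughly 5,000 key range files.
#include <cstdio>
#include <string>

static std::string buildKvRangePath(long long snapshotBeginVersion, int snapshotFileCount,
                                    long long fileVersion, const std::string &uid, int blockSize) {
	char buf[256];
	// kvranges/snapshot.<18-digit start version>/<N>/range,<version>,<uid>,<blockSize>
	snprintf(buf, sizeof(buf), "kvranges/snapshot.%018lld/%d/range,%lld,%s,%d",
	         snapshotBeginVersion, snapshotFileCount / 5000, fileVersion, uid.c_str(), blockSize);
	return buf;
}

int main() {
	// A snapshot whose execution began at version 7,000,000,000, writing its 12,345th range file:
	printf("%s\n", buildKvRangePath(7000000000LL, 12345, 7000123456LL, "a1b2c3d4", 1048576).c_str());
	// Prints: kvranges/snapshot.000000007000000000/2/range,7000123456,a1b2c3d4,1048576
	return 0;
}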
- static std::string rangeVersionFolderString(Version v) { + static std::string old_rangeVersionFolderString(Version v) { return format("ranges/%s/", versionFolderString(v, 8).c_str()); } + // Get the root folder for a snapshot's data based on its begin version + static std::string snapshotFolderString(Version snapshotBeginVersion) { + return format("kvranges/snapshot.%018lld", snapshotBeginVersion); + } + + // Extract the snapshot begin version from a path + static Version extractSnapshotBeginVersion(std::string path) { + Version snapshotBeginVersion; + if(sscanf(path.c_str(), "kvranges/snapshot.%018lld", &snapshotBeginVersion) == 1) { + return snapshotBeginVersion; + } + return invalidVersion; + } + // The innermost folder covers 100,000 seconds (1e11 versions) which is 5,000 mutation log files at current settings. static std::string logVersionFolderString(Version v) { return format("logs/%s/", versionFolderString(v, 11).c_str()); @@ -220,8 +283,15 @@ public: return writeFile(logVersionFolderString(beginVersion) + format("log,%lld,%lld,%s,%d", beginVersion, endVersion, g_random->randomUniqueID().toString().c_str(), blockSize)); } - Future> writeRangeFile(Version version, int blockSize) { - return writeFile(rangeVersionFolderString(version) + format("range,%lld,%s,%d", version, g_random->randomUniqueID().toString().c_str(), blockSize)); + Future> writeRangeFile(Version snapshotBeginVersion, int snapshotFileCount, Version fileVersion, int blockSize) { + std::string fileName = format("range,%lld,%s,%d", fileVersion, g_random->randomUniqueID().toString().c_str(), blockSize); + + // In order to test backward compatibility in simulation, sometimes write to the old path format + if(g_network->isSimulated() && g_random->coinflip()) { + return writeFile(old_rangeVersionFolderString(fileVersion) + fileName); + } + + return writeFile(snapshotFolderString(snapshotBeginVersion) + format("/%d/", snapshotFileCount / (BUGGIFY ? 1 : 5000)) + fileName); } static bool pathToRangeFile(RangeFile &out, std::string path, int64_t size) { @@ -265,6 +335,7 @@ public: // TODO: Do this more efficiently, as the range file list for a snapshot could potentially be hundreds of megabytes. ACTOR static Future> readKeyspaceSnapshot_impl(Reference bc, KeyspaceSnapshotFile snapshot) { // Read the range file list for the specified version range, and then index them by fileName. + // This is so we can verify that each of the files listed in the manifest file are also in the container at this time. std::vector files = wait(bc->listRangeFiles(snapshot.beginVersion, snapshot.endVersion)); state std::map rangeIndex; for(auto &f : files) @@ -290,14 +361,34 @@ public: throw restore_corrupted_data(); std::vector results; + int missing = 0; + for(auto const &fileValue : filesArray.get_array()) { if(fileValue.type() != json_spirit::str_type) throw restore_corrupted_data(); - auto i = rangeIndex.find(fileValue.get_str()); - if(i == rangeIndex.end()) - throw restore_corrupted_data(); - results.push_back(i->second); + // If the file is not in the index then log the error but don't throw yet, keep checking the whole list. + auto i = rangeIndex.find(fileValue.get_str()); + if(i == rangeIndex.end()) { + TraceEvent(SevError, "FileRestoreMissingRangeFile") + .detail("URL", bc->getURL()) + .detail("File", fileValue.get_str()); + + ++missing; + } + + // No point in using more memory once data is missing since an error will be thrown instead. 
+ if(missing == 0) { + results.push_back(i->second); + } + } + + if(missing > 0) { + TraceEvent(SevError, "FileRestoreMissingRangeFileSummary") + .detail("URL", bc->getURL()) + .detail("Count", missing); + + throw restore_missing_data(); } return results; @@ -354,8 +445,7 @@ public: return writeKeyspaceSnapshotFile_impl(Reference::addRef(this), fileNames, totalBytes); }; - // List log files which contain data at any version >= beginVersion and <= targetVersion - // Lists files in sorted order by begin version. Does not check that results are non overlapping or contiguous. + // List log files, unsorted, which contain data at any version >= beginVersion and <= targetVersion Future> listLogFiles(Version beginVersion = 0, Version targetVersion = std::numeric_limits::max()) { // The first relevant log file could have a begin version less than beginVersion based on the knobs which determine log file range size, // so start at an earlier version adjusted by how many versions a file could contain. @@ -381,16 +471,16 @@ public: if(pathToLogFile(lf, f.first, f.second) && lf.endVersion > beginVersion && lf.beginVersion <= targetVersion) results.push_back(lf); } - std::sort(results.begin(), results.end()); return results; }); } - // List range files, in sorted version order, which contain data at or between beginVersion and endVersion - Future> listRangeFiles(Version beginVersion = 0, Version endVersion = std::numeric_limits::max()) { + // List range files, unsorted, which contain data at or between beginVersion and endVersion + // NOTE: This reads the range file folder schema from FDB 6.0.15 and earlier and is provided for backward compatibility + Future> old_listRangeFiles(Version beginVersion, Version endVersion) { // Get the cleaned (without slashes) first and last folders that could contain relevant results. - std::string firstPath = cleanFolderString(rangeVersionFolderString(beginVersion)); - std::string lastPath = cleanFolderString(rangeVersionFolderString(endVersion)); + std::string firstPath = cleanFolderString(old_rangeVersionFolderString(beginVersion)); + std::string lastPath = cleanFolderString(old_rangeVersionFolderString(endVersion)); std::function pathFilter = [=](const std::string &folderPath) { // Remove slashes in the given folder path so that the '/' positions in the version folder string do not matter @@ -407,18 +497,50 @@ public: if(pathToRangeFile(rf, f.first, f.second) && rf.version >= beginVersion && rf.version <= endVersion) results.push_back(rf); } - std::sort(results.begin(), results.end()); return results; }); } - // List snapshots which have been fully written, in sorted beginVersion order. - Future> listKeyspaceSnapshots() { + // List range files, unsorted, which contain data at or between beginVersion and endVersion + // Note: The contents of each top level snapshot.N folder do not necessarily constitute a valid snapshot + // and therefore listing files is not how RestoreSets are obtained. 
+ // Note: Snapshots partially written using FDB versions prior to 6.0.16 will have some range files stored + // using the old folder scheme read by old_listRangeFiles + Future> listRangeFiles(Version beginVersion, Version endVersion) { + // Until the old folder scheme is no longer supported, read files stored using old folder scheme + Future> oldFiles = old_listRangeFiles(beginVersion, endVersion); + + // Define filter function (for listFiles() implementations that use it) to reject any folder + // starting after endVersion + std::function pathFilter = [=](std::string const &path) { + return extractSnapshotBeginVersion(path) <= endVersion; + }; + + Future> newFiles = map(listFiles("kvranges/", pathFilter), [=](const FilesAndSizesT &files) { + std::vector results; + RangeFile rf; + for(auto &f : files) { + if(pathToRangeFile(rf, f.first, f.second) && rf.version >= beginVersion && rf.version <= endVersion) + results.push_back(rf); + } + return results; + }); + + return map(success(oldFiles) && success(newFiles), [=](Void _) { + std::vector results = std::move(newFiles.get()); + std::vector oldResults = std::move(oldFiles.get()); + results.insert(results.end(), std::make_move_iterator(oldResults.begin()), std::make_move_iterator(oldResults.end())); + return results; + }); + } + + // List snapshots which have been fully written, in sorted beginVersion order, which start before end and finish on or after begin + Future> listKeyspaceSnapshots(Version begin = 0, Version end = std::numeric_limits::max()) { return map(listFiles("snapshots/"), [=](const FilesAndSizesT &files) { std::vector results; KeyspaceSnapshotFile sf; for(auto &f : files) { - if(pathToKeyspaceSnapshotFile(sf, f.first)) + if(pathToKeyspaceSnapshotFile(sf, f.first) && sf.beginVersion < end && sf.endVersion >= begin) results.push_back(sf); } std::sort(results.begin(), results.end()); @@ -426,50 +548,144 @@ public: }); } - ACTOR static Future dumpFileList_impl(Reference bc) { - state Future> fRanges = bc->listRangeFiles(0, std::numeric_limits::max()); - state Future> fSnapshots = bc->listKeyspaceSnapshots(); - state Future> fLogs = bc->listLogFiles(0, std::numeric_limits::max()); + ACTOR static Future dumpFileList_impl(Reference bc, Version begin, Version end) { + state Future> fRanges = bc->listRangeFiles(begin, end); + state Future> fSnapshots = bc->listKeyspaceSnapshots(begin, end); + state Future> fLogs = bc->listLogFiles(begin, end); + wait(success(fRanges) && success(fSnapshots) && success(fLogs)); - return FullBackupListing({fRanges.get(), fLogs.get(), fSnapshots.get()}); + + return BackupFileList({fRanges.get(), fLogs.get(), fSnapshots.get()}); } - Future dumpFileList() { - return dumpFileList_impl(Reference::addRef(this)); + Future dumpFileList(Version begin, Version end) { + return dumpFileList_impl(Reference::addRef(this), begin, end); } - ACTOR static Future describeBackup_impl(Reference bc, bool deepScan) { + static Version resolveRelativeVersion(Optional max, Version v, const char *name, Error e) { + if(v == invalidVersion) { + TraceEvent(SevError, "BackupExpireInvalidVersion").detail(name, v); + throw e; + } + if(v < 0) { + if(!max.present()) { + TraceEvent(SevError, "BackupExpireCannotResolveRelativeVersion").detail(name, v); + throw e; + } + v += max.get(); + } + return v; + } + + ACTOR static Future describeBackup_impl(Reference bc, bool deepScan, Version logStartVersionOverride) { state BackupDescription desc; desc.url = bc->getURL(); - // This is the range of logs we'll have to list to determine log continuity 
- state Version scanBegin = 0; - state Version scanEnd = std::numeric_limits::max(); + TraceEvent("BackupContainerDescribe1") + .detail("URL", bc->getURL()) + .detail("LogStartVersionOverride", logStartVersionOverride); - // Get range for which we know there are logs, if available - state Optional begin; - state Optional end; - - if(!deepScan) { - wait(store(bc->logBeginVersion().get(), begin) && store(bc->logEndVersion().get(), end)); + bool e = wait(bc->exists()); + if(!e) { + TraceEvent(SevWarnAlways, "BackupContainerDoesNotExist").detail("URL", bc->getURL()); + throw backup_does_not_exist(); } + // If logStartVersion is relative, then first do a recursive call without it to find the max log version + // from which to resolve the relative version. + // This could be handled more efficiently without recursion but it's tricky, this will do for now. + if(logStartVersionOverride != invalidVersion && logStartVersionOverride < 0) { + BackupDescription tmp = wait(bc->describeBackup(false, invalidVersion)); + logStartVersionOverride = resolveRelativeVersion(tmp.maxLogEnd, logStartVersionOverride, "LogStartVersionOverride", invalid_option_value()); + } + + // Get metadata versions + state Optional metaLogBegin; + state Optional metaLogEnd; + state Optional metaExpiredEnd; + state Optional metaUnreliableEnd; + + std::vector> metaReads; + metaReads.push_back(store(bc->expiredEndVersion().get(), metaExpiredEnd)); + metaReads.push_back(store(bc->unreliableEndVersion().get(), metaUnreliableEnd)); + + // Only read log begin/end versions if not doing a deep scan, otherwise scan files and recalculate them. + if(!deepScan) { + metaReads.push_back(store(bc->logBeginVersion().get(), metaLogBegin)); + metaReads.push_back(store(bc->logEndVersion().get(), metaLogEnd)); + } + + wait(waitForAll(metaReads)); + + TraceEvent("BackupContainerDescribe2") + .detail("URL", bc->getURL()) + .detail("LogStartVersionOverride", logStartVersionOverride) + .detail("ExpiredEndVersion", metaExpiredEnd.orDefault(invalidVersion)) + .detail("UnreliableEndVersion", metaUnreliableEnd.orDefault(invalidVersion)) + .detail("LogBeginVersion", metaLogBegin.orDefault(invalidVersion)) + .detail("LogEndVersion", metaLogEnd.orDefault(invalidVersion)); + + // If the logStartVersionOverride is positive (not relative) then ensure that unreliableEndVersion is equal or greater + if(logStartVersionOverride != invalidVersion && metaUnreliableEnd.orDefault(invalidVersion) < logStartVersionOverride) { + metaUnreliableEnd = logStartVersionOverride; + } + + // Don't use metaLogBegin or metaLogEnd if any of the following are true, the safest + // thing to do is rescan to verify log continuity and get exact begin/end versions + // - either are missing + // - metaLogEnd <= metaLogBegin (invalid range) + // - metaLogEnd < metaExpiredEnd (log continuity exists in missing data range) + // - metaLogEnd < metaUnreliableEnd (log continuity exists in incomplete data range) + if(!metaLogBegin.present() || !metaLogEnd.present() + || metaLogEnd.get() <= metaLogBegin.get() + || metaLogEnd.get() < metaExpiredEnd.orDefault(invalidVersion) + || metaLogEnd.get() < metaUnreliableEnd.orDefault(invalidVersion) + ) { + TraceEvent(SevWarnAlways, "BackupContainerMetadataInvalid") + .detail("URL", bc->getURL()) + .detail("ExpiredEndVersion", metaExpiredEnd.orDefault(invalidVersion)) + .detail("UnreliableEndVersion", metaUnreliableEnd.orDefault(invalidVersion)) + .detail("LogBeginVersion", metaLogBegin.orDefault(invalidVersion)) + .detail("LogEndVersion", 
metaLogEnd.orDefault(invalidVersion)); + + metaLogBegin = Optional(); + metaLogEnd = Optional(); + } + + // If the unreliable end version is not set or is < expiredEndVersion then increase it to expiredEndVersion. + // Describe does not update unreliableEnd in the backup metadata for safety reasons as there is no + // compare-and-set operation to atomically change it and an expire process could be advancing it simultaneously. + if(!metaUnreliableEnd.present() || metaUnreliableEnd.get() < metaExpiredEnd.orDefault(0)) + metaUnreliableEnd = metaExpiredEnd; + + desc.unreliableEndVersion = metaUnreliableEnd; + desc.expiredEndVersion = metaExpiredEnd; + + // Start scanning at the end of the unreliable version range, which is the version before which data is likely + // missing because an expire process has operated on that range. + state Version scanBegin = desc.unreliableEndVersion.orDefault(0); + state Version scanEnd = std::numeric_limits::max(); + // Use the known log range if present - if(begin.present() && end.present()) { - // Logs are assumed to be contiguious between begin and max(begin, end), so initalize desc accordingly - // The use of max() is to allow for a stale end version that has been exceeded by begin version - desc.minLogBegin = begin.get(); - desc.maxLogEnd = std::max(begin.get(), end.get()); + // Logs are assumed to be contiguious between metaLogBegin and metaLogEnd, so initalize desc accordingly + if(metaLogBegin.present() && metaLogEnd.present()) { + // minLogBegin is the greater of the log begin metadata OR the unreliable end version since we can't count + // on log file presence before that version. + desc.minLogBegin = std::max(metaLogBegin.get(), desc.unreliableEndVersion.orDefault(0)); + + // Set the maximum known end version of a log file, so far, which is also the assumed contiguous log file end version + desc.maxLogEnd = metaLogEnd.get(); desc.contiguousLogEnd = desc.maxLogEnd; - // Begin file scan at the contiguous log end version + // Advance scanBegin to the contiguous log end version scanBegin = desc.contiguousLogEnd.get(); } - std::vector snapshots = wait(bc->listKeyspaceSnapshots()); - desc.snapshots = snapshots; + state std::vector logs; + wait(store(bc->listLogFiles(scanBegin, scanEnd), logs) && store(bc->listKeyspaceSnapshots(), desc.snapshots)); - std::vector logs = wait(bc->listLogFiles(scanBegin, scanEnd)); + // List logs in version order so log continuity can be analyzed + std::sort(logs.begin(), logs.end()); if(!logs.empty()) { desc.maxLogEnd = logs.rbegin()->endVersion; @@ -494,20 +710,32 @@ public: } } - // Try to update the saved log versions if they are not set and we have values for them, - // but ignore errors in the update attempt in case the container is not writeable - // Also update logEndVersion if it has a value but it is less than contiguousLogEnd - try { - state Future updates = Void(); - if(desc.minLogBegin.present() && !begin.present()) - updates = updates && bc->logBeginVersion().set(desc.minLogBegin.get()); - if(desc.contiguousLogEnd.present() && (!end.present() || end.get() < desc.contiguousLogEnd.get()) ) - updates = updates && bc->logEndVersion().set(desc.contiguousLogEnd.get()); - wait(updates); - } catch(Error &e) { - if(e.code() == error_code_actor_cancelled) - throw; - TraceEvent(SevWarn, "BackupContainerSafeVersionUpdateFailure").detail("URL", bc->getURL()); + // Only update stored contiguous log begin and end versions if we did NOT use a log start override. 
+ // Otherwise, a series of describe operations can result in a version range which is actually missing data. + if(logStartVersionOverride == invalidVersion) { + // If the log metadata begin/end versions are missing (or treated as missing due to invalidity) or + // differ from the newly calculated values for minLogBegin and contiguousLogEnd, respectively, + // then attempt to update the metadata in the backup container but ignore errors in case the + // container is not writeable. + try { + state Future updates = Void(); + + if(desc.minLogBegin.present() && metaLogBegin != desc.minLogBegin) { + updates = updates && bc->logBeginVersion().set(desc.minLogBegin.get()); + } + + if(desc.contiguousLogEnd.present() && metaLogEnd != desc.contiguousLogEnd) { + updates = updates && bc->logEndVersion().set(desc.contiguousLogEnd.get()); + } + + wait(updates); + } catch(Error &e) { + if(e.code() == error_code_actor_cancelled) + throw; + TraceEvent(SevWarn, "BackupContainerMetadataUpdateFailure") + .error(e) + .detail("URL", bc->getURL()); + } } for(auto &s : desc.snapshots) { @@ -547,18 +775,37 @@ public: } // Uses the virtual methods to describe the backup contents - Future describeBackup(bool deepScan = false) { - return describeBackup_impl(Reference::addRef(this), deepScan); + Future describeBackup(bool deepScan, Version logStartVersionOverride) { + return describeBackup_impl(Reference::addRef(this), deepScan, logStartVersionOverride); } - ACTOR static Future expireData_impl(Reference bc, Version expireEndVersion, bool force, Version restorableBeginVersion) { + ACTOR static Future expireData_impl(Reference bc, Version expireEndVersion, bool force, ExpireProgress *progress, Version restorableBeginVersion) { + if(progress != nullptr) { + progress->step = "Describing backup"; + progress->total = 0; + } + + TraceEvent("BackupContainerFileSystemExpire1") + .detail("URL", bc->getURL()) + .detail("ExpireEndVersion", expireEndVersion) + .detail("RestorableBeginVersion", restorableBeginVersion); + + // Get the backup description. + state BackupDescription desc = wait(bc->describeBackup(false, expireEndVersion)); + + // Resolve relative versions using max log version + expireEndVersion = resolveRelativeVersion(desc.maxLogEnd, expireEndVersion, "ExpireEndVersion", invalid_option_value()); + restorableBeginVersion = resolveRelativeVersion(desc.maxLogEnd, restorableBeginVersion, "RestorableBeginVersion", invalid_option_value()); + + // It would be impossible to have restorability to any version < expireEndVersion after expiring to that version if(restorableBeginVersion < expireEndVersion) throw backup_cannot_expire(); - state Version scanBegin = 0; - - // Get the backup description. - state BackupDescription desc = wait(bc->describeBackup()); + // If the expire request is to a version at or before the previous version to which data was already deleted + // then do nothing and just return + if(expireEndVersion <= desc.expiredEndVersion.orDefault(invalidVersion)) { + return Void(); + } // Assume force is needed, then try to prove otherwise. // Force is required if there is not a restorable snapshot which both @@ -572,40 +819,50 @@ public: } } - // Get metadata - state Optional expiredEnd; - state Optional logBegin; - state Optional logEnd; - wait(store(bc->expiredEndVersion().get(), expiredEnd) && store(bc->logBeginVersion().get(), logBegin) && store(bc->logEndVersion().get(), logEnd)); + // If force is needed but not passed then refuse to expire anything. 
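// Illustrative standalone sketch (hypothetical helper names, not part of the patch) of how the
// negative "relative" versions produced by --delete_before_days / --min_restorable_days are
// resolved against the backup's maximum log version, mirroring resolveRelativeVersion above.
// Assumes CLIENT_KNOBS->CORE_VERSIONSPERSECOND, commonly 1,000,000 versions per second.
#include <cstdint>
#include <cstdio>

static const int64_t kVersionsPerSecond = 1000000; // stand-in for CLIENT_KNOBS->CORE_VERSIONSPERSECOND

static int64_t daysBeforeLatestLog(int days) {
	// Same arithmetic the fdbbackup option parser uses: a negative value means
	// "this many versions before the latest log version in the backup".
	return -int64_t(days) * 24 * 60 * 60 * kVersionsPerSecond;
}

static int64_t resolveRelative(int64_t maxLogEnd, int64_t v) {
	// Negative (relative) versions are offsets from the backup's maximum log version.
	return v < 0 ? v + maxLogEnd : v;
}

int main() {
	int64_t maxLogEnd = 500000000000LL;   // example latest log version in the backup
	int64_t rel = daysBeforeLatestLog(3); // e.g. --delete_before_days 3
	printf("relative=%lld resolved=%lld\n", (long long)rel, (long long)resolveRelative(maxLogEnd, rel));
	// With 1,000,000 versions/second: relative = -259200000000, resolved = 240800000000
	return 0;
}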
+ // Note that it is possible for there to be no actual files in the backup prior to expireEndVersion, + // if they were externally deleted or an expire operation deleted them but was terminated before + // updating expireEndVersion + if(forceNeeded && !force) + throw backup_cannot_expire(); - // Update scan range if expiredEnd is present - if(expiredEnd.present()) { - if(expireEndVersion <= expiredEnd.get()) { - // If the expire request is to the version already expired to then there is no work to do so return true - return Void(); - } - scanBegin = expiredEnd.get(); + // Start scan for files to delete at the last completed expire operation's end or 0. + state Version scanBegin = desc.expiredEndVersion.orDefault(0); + + TraceEvent("BackupContainerFileSystemExpire2") + .detail("URL", bc->getURL()) + .detail("ExpireEndVersion", expireEndVersion) + .detail("RestorableBeginVersion", restorableBeginVersion) + .detail("ScanBeginVersion", scanBegin); + + state std::vector logs; + state std::vector ranges; + + if(progress != nullptr) { + progress->step = "Listing files"; } - - // Get log files that contain any data at or before expireEndVersion - state std::vector logs = wait(bc->listLogFiles(scanBegin, expireEndVersion - 1)); - // Get range files up to and including expireEndVersion - state std::vector ranges = wait(bc->listRangeFiles(scanBegin, expireEndVersion - 1)); + // Get log files or range files that contain any data at or before expireEndVersion + wait(store(bc->listLogFiles(scanBegin, expireEndVersion - 1), logs) && store(bc->listRangeFiles(scanBegin, expireEndVersion - 1), ranges)); // The new logBeginVersion will be taken from the last log file, if there is one state Optional newLogBeginVersion; if(!logs.empty()) { - LogFile &last = logs.back(); + // Linear scan the unsorted logs to find the latest one in sorted order + LogFile &last = *std::max_element(logs.begin(), logs.end()); + // If the last log ends at expireEndVersion then that will be the next log begin if(last.endVersion == expireEndVersion) { newLogBeginVersion = expireEndVersion; } else { // If the last log overlaps the expiredEnd then use the log's begin version and move the expiredEnd - // back to match it. 
+ // back to match it and keep the last log file if(last.endVersion > expireEndVersion) { newLogBeginVersion = last.beginVersion; - logs.pop_back(); + + // Instead of modifying this potentially very large vector, just clear LogFile + last = LogFile(); + expireEndVersion = newLogBeginVersion.get(); } } @@ -616,14 +873,21 @@ public: // Move filenames out of vector then destroy it to save memory for(auto const &f : logs) { - toDelete.push_back(std::move(f.fileName)); + // We may have cleared the last log file earlier so skip any empty filenames + if(!f.fileName.empty()) { + toDelete.push_back(std::move(f.fileName)); + } } logs.clear(); // Move filenames out of vector then destroy it to save memory for(auto const &f : ranges) { - ASSERT(f.version < expireEndVersion); - toDelete.push_back(std::move(f.fileName)); + // The file version must be checked here again because it is likely that expireEndVersion is in the middle of a log file, in which case + // after the log and range file listings are done (using the original expireEndVersion) the expireEndVersion will be moved back slightly + // to the begin version of the last log file found (which is also the first log to not be deleted) + if(f.version < expireEndVersion) { + toDelete.push_back(std::move(f.fileName)); + } } ranges.clear(); @@ -633,37 +897,21 @@ public: } desc = BackupDescription(); - // If some files to delete were found AND force is needed AND the force option is NOT set, then fail - if(!toDelete.empty() && forceNeeded && !force) - throw backup_cannot_expire(); - - // We are about to start deleting files, at which point no data prior to the expire end version can be - // safely assumed to exist. The [logBegin, logEnd) range from the container's metadata describes - // a range of log versions which can be assumed to exist, so if the range of data being deleted overlaps - // that range then the metadata range must be updated. - - // If we're expiring the entire log range described by the metadata then clear both metadata values - if(logEnd.present() && logEnd.get() < expireEndVersion) { - if(logBegin.present()) - wait(bc->logBeginVersion().clear()); - if(logEnd.present()) - wait(bc->logEndVersion().clear()); + // We are about to start deleting files, at which point all data prior to expireEndVersion is considered + // 'unreliable' as some or all of it will be missing. So before deleting anything, read unreliableEndVersion + // (don't use cached value in desc) and update its value if it is missing or < expireEndVersion + if(progress != nullptr) { + progress->step = "Initial metadata update"; } - else { - // If we are expiring to a point within the metadata range then update the begin if we have a new - // log begin version (which we should!) 
or clear the metadata range if we do not (which would be - // repairing the metadata from an incorrect state) - if(logBegin.present() && logBegin.get() < expireEndVersion) { - if(newLogBeginVersion.present()) { - wait(bc->logBeginVersion().set(newLogBeginVersion.get())); - } - else { - if(logBegin.present()) - wait(bc->logBeginVersion().clear()); - if(logEnd.present()) - wait(bc->logEndVersion().clear()); - } - } + Optional metaUnreliableEnd = wait(bc->unreliableEndVersion().get()); + if(metaUnreliableEnd.orDefault(0) < expireEndVersion) { + wait(bc->unreliableEndVersion().set(expireEndVersion)); + } + + if(progress != nullptr) { + progress->step = "Deleting files"; + progress->total = toDelete.size(); + progress->done = 0; } // Delete files, but limit parallelism because the file list could use a lot of memory and the corresponding @@ -685,19 +933,30 @@ public: while(deleteFutures.size() > targetFuturesSize) { wait(deleteFutures.front()); + if(progress != nullptr) { + ++progress->done; + } deleteFutures.pop_front(); } } - // Update the expiredEndVersion property. - wait(bc->expiredEndVersion().set(expireEndVersion)); + if(progress != nullptr) { + progress->step = "Final metadata update"; + progress->total = 0; + } + // Update the expiredEndVersion metadata to indicate that everything prior to that version has been + // successfully deleted if the current version is lower or missing + Optional metaExpiredEnd = wait(bc->expiredEndVersion().get()); + if(metaExpiredEnd.orDefault(0) < expireEndVersion) { + wait(bc->expiredEndVersion().set(expireEndVersion)); + } return Void(); } // Delete all data up to (but not including endVersion) - Future expireData(Version expireEndVersion, bool force, Version restorableBeginVersion) { - return expireData_impl(Reference::addRef(this), expireEndVersion, force, restorableBeginVersion); + Future expireData(Version expireEndVersion, bool force, ExpireProgress *progress, Version restorableBeginVersion) { + return expireData_impl(Reference::addRef(this), expireEndVersion, force, progress, restorableBeginVersion); } ACTOR static Future> getRestoreSet_impl(Reference bc, Version targetVersion) { @@ -721,7 +980,10 @@ public: if(snapshot.get().beginVersion == snapshot.get().endVersion && snapshot.get().endVersion == targetVersion) return Optional(restorable); - std::vector logs = wait(bc->listLogFiles(snapshot.get().beginVersion, targetVersion)); + state std::vector logs = wait(bc->listLogFiles(snapshot.get().beginVersion, targetVersion)); + + // List logs in version order so log continuity can be analyzed + std::sort(logs.begin(), logs.end()); // If there are logs and the first one starts at or before the snapshot begin version then proceed if(!logs.empty() && logs.front().beginVersion <= snapshot.get().beginVersion) { @@ -771,18 +1033,19 @@ private: public: // To avoid the need to scan the underyling filesystem in many cases, some important version boundaries are stored in named files. - // These files can be deleted from the filesystem if they appear to be wrong or corrupt, and full scans will done - // when needed. + // These versions also indicate what version ranges are known to be deleted or partially deleted. // - // The three versions below, when present, describe 4 version ranges which collectively cover the entire version timeline. - // 0 - expiredEndVersion: All files in this range have been deleted - // expiredEndVersion - presentBeginVersion: Files in this range *may* have been deleted so their presence must not be assumed. 
- // presentBeginVersion - presentEndVersion: Files in this range have NOT been deleted by any FDB backup operations. - // presentEndVersion - infinity: Files in this range may or may not exist yet. Scan to find what is there. + // The values below describe version ranges as follows: + // 0 - expiredEndVersion All files in this range have been deleted + // expiredEndVersion - unreliableEndVersion Some files in this range may have been deleted. + // + // logBeginVersion - logEnd Log files are contiguous in this range and have NOT been deleted by fdbbackup + // logEnd - infinity Files in this range may or may not exist yet // VersionProperty logBeginVersion() { return {Reference::addRef(this), "log_begin_version"}; } VersionProperty logEndVersion() { return {Reference::addRef(this), "log_end_version"}; } VersionProperty expiredEndVersion() { return {Reference::addRef(this), "expired_end_version"}; } + VersionProperty unreliableEndVersion() { return {Reference::addRef(this), "unreliable_end_version"}; } ACTOR static Future writeVersionProperty(Reference bc, std::string path, Version v) { try { @@ -792,7 +1055,10 @@ public: wait(f->finish()); return Void(); } catch(Error &e) { - TraceEvent(SevWarn, "BackupContainerWritePropertyFailed").error(e).detail("Path", path); + TraceEvent(SevWarn, "BackupContainerWritePropertyFailed") + .error(e) + .detail("URL", bc->getURL()) + .detail("Path", path); throw; } } @@ -809,12 +1075,20 @@ public: if(rs == size && sscanf(s.c_str(), "%lld%n", &v, &len) == 1 && len == size) return v; - TraceEvent(SevWarn, "BackupContainerInvalidProperty"); + TraceEvent(SevWarn, "BackupContainerInvalidProperty") + .detail("URL", bc->getURL()) + .detail("Path", path); + throw backup_invalid_info(); } catch(Error &e) { if(e.code() == error_code_file_not_found) return Optional(); - TraceEvent(SevWarn, "BackupContainerReadPropertyFailed").error(e).detail("Path", path); + + TraceEvent(SevWarn, "BackupContainerReadPropertyFailed") + .error(e) + .detail("URL", bc->getURL()) + .detail("Path", path); + throw; } } @@ -881,6 +1155,11 @@ public: return Void(); } + // The container exists if the folder it resides in exists + Future exists() { + return directoryExists(m_path); + } + Future> readFile(std::string path) { int flags = IAsyncFile::OPEN_NO_AIO | IAsyncFile::OPEN_READONLY | IAsyncFile::OPEN_UNCACHED; // Simulation does not properly handle opening the same file from multiple machines using a shared filesystem, @@ -973,7 +1252,7 @@ public: Future deleteContainer(int *pNumDeleted) { // In order to avoid deleting some random directory due to user error, first describe the backup // and make sure it has something in it. 
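// Illustrative standalone sketch (hypothetical helpers) of the strict encode/parse round trip used
// by the version property files above (log_begin_version, expired_end_version,
// unreliable_end_version, ...): the file body is just the decimal version, and a read is accepted
// only if sscanf consumes every byte, as in the property read shown above.
#include <cstdio>
#include <string>

static std::string encodeVersionProperty(long long v) {
	char buf[32];
	snprintf(buf, sizeof(buf), "%lld", v);
	return buf;
}

// Returns true and sets v only if the whole string is a valid version with no trailing bytes.
static bool decodeVersionProperty(const std::string &s, long long &v) {
	int len = 0;
	return sscanf(s.c_str(), "%lld%n", &v, &len) == 1 && len == (int)s.size();
}

int main() {
	std::string body = encodeVersionProperty(123456789012LL);
	long long v = 0;
	printf("ok=%d v=%lld\n", (int)decodeVersionProperty(body, v), v);   // ok=1 v=123456789012
	printf("ok=%d\n", (int)decodeVersionProperty("12345 trailing", v)); // ok=0, trailing bytes rejected
	return 0;
}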
- return map(describeBackup(), [=](BackupDescription const &desc) { + return map(describeBackup(false, invalidVersion), [=](BackupDescription const &desc) { // If the backup has no snapshots and no logs then it's probably not a valid backup if(desc.snapshots.size() == 0 && !desc.minLogBegin.present()) throw backup_invalid_url(); @@ -1127,7 +1406,18 @@ public: return create_impl(Reference::addRef(this)); } + // The container exists if the index entry in the blob bucket exists + Future exists() { + return m_bstore->objectExists(m_bucket, indexEntry()); + } + ACTOR static Future deleteContainer_impl(Reference bc, int *pNumDeleted) { + bool e = wait(bc->exists()); + if(!e) { + TraceEvent(SevWarnAlways, "BackupContainerDoesNotExist").detail("URL", bc->getURL()); + throw backup_does_not_exist(); + } + // First delete everything under the data prefix in the bucket wait(bc->m_bstore->deleteRecursively(bc->m_bucket, bc->dataPath(""), pNumDeleted)); @@ -1197,10 +1487,12 @@ Reference IBackupContainer::openContainer(std::string url) throw; TraceEvent m(SevWarn, "BackupContainer"); - m.detail("Description", "Invalid container specification. See help.").detail("URL", url); - + m.detail("Description", "Invalid container specification. See help."); + m.detail("URL", url); + m.error(e); if(e.code() == error_code_backup_invalid_url) m.detail("LastOpenError", lastOpenError); + throw; } } @@ -1241,10 +1533,13 @@ ACTOR Future> listContainers_impl(std::string baseURL) throw; TraceEvent m(SevWarn, "BackupContainer"); - m.detail("Description", "Invalid backup container URL prefix. See help.").detail("URL", baseURL); - + + m.detail("Description", "Invalid backup container URL prefix. See help."); + m.detail("URL", baseURL); + m.error(e); if(e.code() == error_code_backup_invalid_url) m.detail("LastOpenError", IBackupContainer::lastOpenError); + throw; } } @@ -1351,6 +1646,15 @@ ACTOR Future> timeKeeperEpochsFromVersion(Version v, Reference return found.first + (v - found.second) / CLIENT_KNOBS->CORE_VERSIONSPERSECOND; } +int chooseFileSize(std::vector &sizes) { + int size = 1000; + if(!sizes.empty()) { + size = sizes.back(); + sizes.pop_back(); + } + return size; +} + ACTOR Future writeAndVerifyFile(Reference c, Reference f, int size) { state Standalone content; if(size > 0) { @@ -1373,6 +1677,12 @@ ACTOR Future writeAndVerifyFile(Reference c, ReferencerandomInt64(1, CLIENT_KNOBS->CORE_VERSIONSPERSECOND); + return v + increment; +} + ACTOR Future testBackupContainer(std::string url) { printf("BackupContainerTest URL %s\n", url.c_str()); @@ -1382,91 +1692,119 @@ ACTOR Future testBackupContainer(std::string url) { try { wait(c->deleteContainer()); } catch(Error &e) { - if(e.code() != error_code_backup_invalid_url) + if(e.code() != error_code_backup_invalid_url && e.code() != error_code_backup_does_not_exist) throw; } wait(c->create()); - state int64_t versionShift = g_random->randomInt64(0, std::numeric_limits::max() - 500); + state std::vector> writes; + state std::map> snapshots; + state std::map snapshotSizes; + state int nRangeFiles = 0; + state std::map logs; + state Version v = g_random->randomInt64(0, std::numeric_limits::max() / 2); - state Reference log1 = wait(c->writeLogFile(100 + versionShift, 150 + versionShift, 10)); - state Reference log2 = wait(c->writeLogFile(150 + versionShift, 300 + versionShift, 10)); - state Reference range1 = wait(c->writeRangeFile(160 + versionShift, 10)); - state Reference range2 = wait(c->writeRangeFile(300 + versionShift, 10)); - state Reference range3 = 
wait(c->writeRangeFile(310 + versionShift, 10)); + // List of sizes to use to test edge cases on underlying file implementations + state std::vector fileSizes = {0, 10000000, 5000005}; - wait( - writeAndVerifyFile(c, log1, 0) - && writeAndVerifyFile(c, log2, g_random->randomInt(0, 10000000)) - && writeAndVerifyFile(c, range1, g_random->randomInt(0, 1000)) - && writeAndVerifyFile(c, range2, g_random->randomInt(0, 100000)) - && writeAndVerifyFile(c, range3, g_random->randomInt(0, 3000000)) - ); + loop { + state Version logStart = v; + state int kvfiles = g_random->randomInt(0, 3); - wait( - c->writeKeyspaceSnapshotFile({range1->getFileName(), range2->getFileName()}, range1->size() + range2->size()) - && c->writeKeyspaceSnapshotFile({range3->getFileName()}, range3->size()) - ); + while(kvfiles > 0) { + if(snapshots.empty()) { + snapshots[v] = {}; + snapshotSizes[v] = 0; + if(g_random->coinflip()) { + v = nextVersion(v); + } + } + Reference range = wait(c->writeRangeFile(snapshots.rbegin()->first, 0, v, 10)); + ++nRangeFiles; + v = nextVersion(v); + snapshots.rbegin()->second.push_back(range->getFileName()); - printf("Checking file list dump\n"); - FullBackupListing listing = wait(c->dumpFileList()); - ASSERT(listing.logs.size() == 2); - ASSERT(listing.ranges.size() == 3); - ASSERT(listing.snapshots.size() == 2); + int size = chooseFileSize(fileSizes); + snapshotSizes.rbegin()->second += size; + writes.push_back(writeAndVerifyFile(c, range, size)); + + if(g_random->random01() < .2) { + writes.push_back(c->writeKeyspaceSnapshotFile(snapshots.rbegin()->second, snapshotSizes.rbegin()->second)); + snapshots[v] = {}; + snapshotSizes[v] = 0; + break; + } + + --kvfiles; + } + + if(logStart == v || g_random->coinflip()) { + v = nextVersion(v); + } + state Reference log = wait(c->writeLogFile(logStart, v, 10)); + logs[logStart] = log->getFileName(); + int size = chooseFileSize(fileSizes); + writes.push_back(writeAndVerifyFile(c, log, size)); + + // Randomly stop after a snapshot has finished and all manually seeded file sizes have been used. 
+ if(fileSizes.empty() && !snapshots.empty() && snapshots.rbegin()->second.empty() && g_random->random01() < .2) { + snapshots.erase(snapshots.rbegin()->first); + break; + } + } + + wait(waitForAll(writes)); + + state BackupFileList listing = wait(c->dumpFileList()); + ASSERT(listing.ranges.size() == nRangeFiles); + ASSERT(listing.logs.size() == logs.size()); + ASSERT(listing.snapshots.size() == snapshots.size()); state BackupDescription desc = wait(c->describeBackup()); - printf("Backup Description 1\n%s", desc.toString().c_str()); + printf("\n%s\n", desc.toString().c_str()); - ASSERT(desc.maxRestorableVersion.present()); - Optional rest = wait(c->getRestoreSet(desc.maxRestorableVersion.get())); - ASSERT(rest.present()); - ASSERT(rest.get().logs.size() == 0); - ASSERT(rest.get().ranges.size() == 1); + // Do a series of expirations and verify resulting state + state int i = 0; + for(; i < listing.snapshots.size(); ++i) { + // Ensure we can still restore to the latest version + Optional rest = wait(c->getRestoreSet(desc.maxRestorableVersion.get())); + ASSERT(rest.present()); - Optional rest = wait(c->getRestoreSet(150 + versionShift)); - ASSERT(!rest.present()); + // Ensure we can restore to the end version of snapshot i + Optional rest = wait(c->getRestoreSet(listing.snapshots[i].endVersion)); + ASSERT(rest.present()); - Optional rest = wait(c->getRestoreSet(300 + versionShift)); - ASSERT(rest.present()); - ASSERT(rest.get().logs.size() == 1); - ASSERT(rest.get().ranges.size() == 2); + // Test expiring to the end of this snapshot + state Version expireVersion = listing.snapshots[i].endVersion; - printf("Expire 1\n"); - wait(c->expireData(100 + versionShift)); - BackupDescription d = wait(c->describeBackup()); - printf("Backup Description 2\n%s", d.toString().c_str()); - ASSERT(d.minLogBegin == 100 + versionShift); - ASSERT(d.maxRestorableVersion == desc.maxRestorableVersion); + // Expire everything up to but not including the snapshot end version + printf("EXPIRE TO %lld\n", expireVersion); + state Future f = c->expireData(expireVersion); + wait(ready(f)); - printf("Expire 2\n"); - wait(c->expireData(101 + versionShift)); - BackupDescription d = wait(c->describeBackup()); - printf("Backup Description 3\n%s", d.toString().c_str()); - ASSERT(d.minLogBegin == 100 + versionShift); - ASSERT(d.maxRestorableVersion == desc.maxRestorableVersion); + // If there is an error, it must be backup_cannot_expire and we have to be on the last snapshot + if(f.isError()) { + ASSERT(f.getError().code() == error_code_backup_cannot_expire); + ASSERT(i == listing.snapshots.size() - 1); + wait(c->expireData(expireVersion, true)); + } - printf("Expire 3\n"); - wait(c->expireData(300 + versionShift)); - BackupDescription d = wait(c->describeBackup()); - printf("Backup Description 4\n%s", d.toString().c_str()); - ASSERT(d.minLogBegin.present()); - ASSERT(d.snapshots.size() == desc.snapshots.size()); - ASSERT(d.maxRestorableVersion == desc.maxRestorableVersion); - - printf("Expire 4\n"); - wait(c->expireData(301 + versionShift, true)); - BackupDescription d = wait(c->describeBackup()); - printf("Backup Description 4\n%s", d.toString().c_str()); - ASSERT(d.snapshots.size() == 1); - ASSERT(!d.minLogBegin.present()); + BackupDescription d = wait(c->describeBackup()); + printf("\n%s\n", d.toString().c_str()); + } + printf("DELETING\n"); wait(c->deleteContainer()); - BackupDescription d = wait(c->describeBackup()); - printf("Backup Description 5\n%s", d.toString().c_str()); - ASSERT(d.snapshots.size() == 0); - 
ASSERT(!d.minLogBegin.present()); + state Future d = c->describeBackup(); + wait(ready(d)); + ASSERT(d.isError() && d.getError().code() == error_code_backup_does_not_exist); + + BackupFileList empty = wait(c->dumpFileList()); + ASSERT(empty.ranges.size() == 0); + ASSERT(empty.logs.size() == 0); + ASSERT(empty.snapshots.size() == 0); printf("BackupContainerTest URL=%s PASSED.\n", url.c_str()); diff --git a/fdbclient/BackupContainer.h b/fdbclient/BackupContainer.h index 7bbb2a92c8..d3dfafde82 100644 --- a/fdbclient/BackupContainer.h +++ b/fdbclient/BackupContainer.h @@ -96,10 +96,12 @@ struct KeyspaceSnapshotFile { } }; -struct FullBackupListing { +struct BackupFileList { std::vector ranges; std::vector logs; std::vector snapshots; + + void toStream(FILE *fout) const; }; // The byte counts here only include usable log files and byte counts from kvrange manifests @@ -108,10 +110,19 @@ struct BackupDescription { std::string url; std::vector snapshots; int64_t snapshotBytes; + // The version before which everything has been deleted by an expire + Optional expiredEndVersion; + // The latest version before which at least some data has been deleted by an expire + Optional unreliableEndVersion; + // The minimum log version in the backup Optional minLogBegin; + // The maximum log version in the backup Optional maxLogEnd; + // The maximum log version for which there is contiguous log version coverage extending back to minLogBegin Optional contiguousLogEnd; + // The maximum version which this backup can be used to restore to Optional maxRestorableVersion; + // The minimum version which this backup can be used to restore to Optional minRestorableVersion; std::string extendedDetail; // Freeform container-specific info. @@ -153,10 +164,11 @@ public: // Create the container virtual Future create() = 0; + virtual Future exists() = 0; // Open a log file or range file for writing virtual Future> writeLogFile(Version beginVersion, Version endVersion, int blockSize) = 0; - virtual Future> writeRangeFile(Version version, int blockSize) = 0; + virtual Future> writeRangeFile(Version snapshotBeginVersion, int snapshotFileCount, Version fileVersion, int blockSize) = 0; // Write a KeyspaceSnapshotFile of range file names representing a full non overlapping // snapshot of the key ranges this backup is targeting. @@ -165,23 +177,32 @@ public: // Open a file for read by name virtual Future> readFile(std::string name) = 0; + struct ExpireProgress { + std::string step; + int total; + int done; + std::string toString() const; + }; // Delete backup files which do not contain any data at or after (more recent than) expireEndVersion. // If force is false, then nothing will be deleted unless there is a restorable snapshot which // - begins at or after expireEndVersion // - ends at or before restorableBeginVersion // If force is true, data is deleted unconditionally which could leave the backup in an unusable state. This is not recommended. // Returns true if expiration was done. - virtual Future expireData(Version expireEndVersion, bool force = false, Version restorableBeginVersion = std::numeric_limits::max()) = 0; + virtual Future expireData(Version expireEndVersion, bool force = false, ExpireProgress *progress = nullptr, Version restorableBeginVersion = std::numeric_limits::max()) = 0; // Delete entire container. During the process, if pNumDeleted is not null it will be // updated with the count of deleted files so that progress can be seen. 
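The ExpireProgress struct above gives long-running expire operations a way to report where they are. A minimal sketch of a caller polling it from inside a Flow actor (the variable names and the Void-returning future are assumptions, not taken from the patch):

// Start the expire and print progress until the future is ready.
state IBackupContainer::ExpireProgress progress;
state Future<Void> expire = container->expireData(expireEndVersion, false, &progress);
while(!expire.isReady()) {
    printf("%s\n", progress.toString().c_str());
    wait(delay(2.0));
}
wait(expire); // rethrows, e.g. backup_cannot_expire, if the expire failed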
virtual Future deleteContainer(int *pNumDeleted = nullptr) = 0; - // Return key details about a backup's contents, possibly using cached or stored metadata - // unless deepScan is true. - virtual Future describeBackup(bool deepScan = false) = 0; + // Return key details about a backup's contents. + // Unless deepScan is true, use cached metadata, if present, as initial contiguous available log range. + // If logStartVersionOverride is given, log data prior to that version will be ignored for the purposes + // of this describe operation. This can be used to calculate what the restorability of a backup would + // be after deleting all data prior to logStartVersionOverride. + virtual Future describeBackup(bool deepScan = false, Version logStartVersionOverride = invalidVersion) = 0; - virtual Future dumpFileList() = 0; + virtual Future dumpFileList(Version begin = 0, Version end = std::numeric_limits::max()) = 0; // Get exactly the files necessary to restore to targetVersion. Returns non-present if // restore to given version is not possible. diff --git a/fdbclient/BlobStore.actor.cpp b/fdbclient/BlobStore.actor.cpp index 40c8de4060..a17510528b 100644 --- a/fdbclient/BlobStore.actor.cpp +++ b/fdbclient/BlobStore.actor.cpp @@ -225,6 +225,20 @@ std::string BlobStoreEndpoint::getResourceURL(std::string resource) { return r; } +ACTOR Future bucketExists_impl(Reference b, std::string bucket) { + wait(b->requestRateRead->getAllowance(1)); + + std::string resource = std::string("/") + bucket; + HTTP::Headers headers; + + Reference r = wait(b->doRequest("HEAD", resource, headers, NULL, 0, {200, 404})); + return r->code == 200; +} + +Future BlobStoreEndpoint::bucketExists(std::string const &bucket) { + return bucketExists_impl(Reference::addRef(this), bucket); +} + ACTOR Future objectExists_impl(Reference b, std::string bucket, std::string object) { wait(b->requestRateRead->getAllowance(1)); @@ -244,8 +258,17 @@ ACTOR Future deleteObject_impl(Reference b, std::string std::string resource = std::string("/") + bucket + "/" + object; HTTP::Headers headers; + // 200 or 204 means object successfully deleted, 404 means it already doesn't exist, so any of those are considered successful Reference r = wait(b->doRequest("DELETE", resource, headers, NULL, 0, {200, 204, 404})); - // 200 means object deleted, 404 means it doesn't exist already, so either success code passed above is fine. + + // But if the object already did not exist then the 'delete' is assumed to be successful but a warning is logged. 
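The logStartVersionOverride parameter described above makes it possible to ask what a backup's restorability would be if older data were expired, before actually deleting anything. A sketch of that pattern inside a Flow actor (container and candidateExpireVersion are assumed inputs):

// Describe the backup as if everything before candidateExpireVersion were gone.
BackupDescription whatIf = wait(container->describeBackup(false, candidateExpireVersion));
// Only expire if the backup would still be restorable afterwards.
if(whatIf.maxRestorableVersion.present()) {
    wait(container->expireData(candidateExpireVersion));
}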
+ if(r->code == 404) { + TraceEvent(SevWarnAlways, "BlobStoreEndpointDeleteObjectMissing") + .detail("Host", b->host) + .detail("Bucket", bucket) + .detail("Object", object); + } + return Void(); } @@ -310,9 +333,12 @@ Future BlobStoreEndpoint::deleteRecursively(std::string const &bucket, std ACTOR Future createBucket_impl(Reference b, std::string bucket) { wait(b->requestRateWrite->getAllowance(1)); - std::string resource = std::string("/") + bucket; - HTTP::Headers headers; - Reference r = wait(b->doRequest("PUT", resource, headers, NULL, 0, {200, 409})); + bool exists = wait(b->bucketExists(bucket)); + if(!exists) { + std::string resource = std::string("/") + bucket; + HTTP::Headers headers; + Reference r = wait(b->doRequest("PUT", resource, headers, NULL, 0, {200, 409})); + } return Void(); } @@ -485,8 +511,8 @@ ACTOR Future> doRequest_impl(Reference frconn = bstore->connect(); // Make a shallow copy of the queue by calling addref() on each buffer in the chain and then prepending that chain to contentCopy + contentCopy.discardAll(); if(pContent != nullptr) { - contentCopy.discardAll(); PacketBuffer *pFirst = pContent->getUnsent(); PacketBuffer *pLast = nullptr; for(PacketBuffer *p = pFirst; p != nullptr; p = p->nextPacketBuffer()) { diff --git a/fdbclient/BlobStore.h b/fdbclient/BlobStore.h index 34cb5cd394..7f0d02a0a3 100644 --- a/fdbclient/BlobStore.h +++ b/fdbclient/BlobStore.h @@ -36,7 +36,7 @@ public: struct Stats { Stats() : requests_successful(0), requests_failed(0), bytes_sent(0) {} Stats operator-(const Stats &rhs); - void clear() { memset(this, sizeof(*this), 0); } + void clear() { memset(this, 0, sizeof(*this)); } json_spirit::mObject getJSON(); int64_t requests_successful; @@ -197,6 +197,9 @@ public: // Get a list of the files in a bucket, see listBucketStream for more argument detail. 
Future listBucket(std::string const &bucket, Optional prefix = {}, Optional delimiter = {}, int maxDepth = 0, std::function recurseFilter = nullptr); + // Check if a bucket exists + Future bucketExists(std::string const &bucket); + // Check if an object exists in a bucket Future objectExists(std::string const &bucket, std::string const &object); diff --git a/fdbclient/CMakeLists.txt b/fdbclient/CMakeLists.txt new file mode 100644 index 0000000000..5637501c8f --- /dev/null +++ b/fdbclient/CMakeLists.txt @@ -0,0 +1,97 @@ +set(FDBCLIENT_SRCS + AsyncFileBlobStore.actor.cpp + AsyncFileBlobStore.actor.h + Atomic.h + AutoPublicAddress.cpp + BackupAgent.h + BackupAgentBase.actor.cpp + BackupContainer.actor.cpp + BackupContainer.h + BlobStore.actor.cpp + ClientDBInfo.h + ClientLogEvents.h + ClientWorkerInterface.h + ClusterInterface.h + CommitTransaction.h + CoordinationInterface.h + DatabaseBackupAgent.actor.cpp + DatabaseConfiguration.cpp + DatabaseConfiguration.h + DatabaseContext.h + EventTypes.actor.h + FailureMonitorClient.actor.cpp + FailureMonitorClient.h + FDBOptions.h + FDBTypes.h + FileBackupAgent.actor.cpp + HTTP.actor.cpp + IClientApi.h + JsonBuilder.cpp + JsonBuilder.h + KeyBackedTypes.h + KeyRangeMap.actor.cpp + KeyRangeMap.h + Knobs.cpp + Knobs.h + ManagementAPI.actor.cpp + ManagementAPI.h + MasterProxyInterface.h + MetricLogger.actor.cpp + MetricLogger.h + MonitorLeader.actor.cpp + MonitorLeader.h + MultiVersionAssignmentVars.h + MultiVersionTransaction.actor.cpp + MultiVersionTransaction.h + MutationList.h + NativeAPI.actor.cpp + NativeAPI.h + Notified.h + ReadYourWrites.actor.cpp + ReadYourWrites.h + RunTransaction.actor.h + RYWIterator.cpp + RYWIterator.h + Schemas.cpp + Schemas.h + SnapshotCache.h + Status.h + StatusClient.actor.cpp + StatusClient.h + StorageServerInterface.h + Subspace.cpp + Subspace.h + SystemData.cpp + SystemData.h + TaskBucket.actor.cpp + TaskBucket.h + ThreadSafeTransaction.actor.cpp + ThreadSafeTransaction.h + Tuple.cpp + Tuple.h + VersionedMap.actor.h + VersionedMap.h + WriteMap.h + json_spirit/json_spirit_error_position.h + json_spirit/json_spirit_reader_template.h + json_spirit/json_spirit_value.h + json_spirit/json_spirit_writer_options.h + json_spirit/json_spirit_writer_template.h + libb64/cdecode.c + libb64/cencode.c + md5/md5.c + sha1/SHA1.cpp + ${CMAKE_CURRENT_BINARY_DIR}/FDBOptions.g.cpp) + + +add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/FDBOptions.g.h ${CMAKE_CURRENT_BINARY_DIR}/FDBOptions.g.cpp + COMMAND ${MONO_EXECUTABLE} ${VEXILLOGRAPHER_EXE} ${CMAKE_CURRENT_SOURCE_DIR}/vexillographer/fdb.options cpp ${CMAKE_CURRENT_BINARY_DIR}/FDBOptions.g + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/vexillographer/fdb.options vexillographer + COMMENT "Generate FDBOptions c++ files") +add_custom_target(fdboptions DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/FDBOptions.g.h ${CMAKE_CURRENT_BINARY_DIR}/FDBOptions.g.cpp) + +actor_set(FDBCLIENT_BUILD "${FDBCLIENT_SRCS}") +add_library(fdbclient STATIC ${FDBCLIENT_BUILD}) +add_dependencies(fdbclient fdboptions) +actor_compile(fdbclient "${FDBCLIENT_SRCS}") +target_link_libraries(fdbclient PUBLIC fdbrpc) diff --git a/fdbclient/ClientDBInfo.h b/fdbclient/ClientDBInfo.h index a6bc7d20bb..7b6aa047f8 100644 --- a/fdbclient/ClientDBInfo.h +++ b/fdbclient/ClientDBInfo.h @@ -39,8 +39,8 @@ struct ClientDBInfo { template void serialize(Archive& ar) { ASSERT( ar.protocolVersion() >= 0x0FDB00A200040001LL ); - ar & proxies & id & clientTxnInfoSampleRate & clientTxnInfoSizeLimit; + serializer(ar, proxies, id, clientTxnInfoSampleRate, 
clientTxnInfoSizeLimit); } }; -#endif \ No newline at end of file +#endif diff --git a/fdbclient/ClientLogEvents.h b/fdbclient/ClientLogEvents.h index c54ead5cfe..09d1685e52 100644 --- a/fdbclient/ClientLogEvents.h +++ b/fdbclient/ClientLogEvents.h @@ -39,7 +39,7 @@ namespace FdbClientLogEvents { Event(EventType t, double ts) : type(t), startTs(ts) { } Event() { } - template Ar& serialize(Ar &ar) { return ar & type & startTs; } + template Ar& serialize(Ar &ar) { return serializer(ar, type, startTs); } EventType type{ EVENTTYPEEND }; double startTs{ 0 }; @@ -53,9 +53,9 @@ namespace FdbClientLogEvents { template Ar& serialize(Ar &ar) { if (!ar.isDeserializing) - return Event::serialize(ar) & latency; + return serializer(Event::serialize(ar), latency); else - return ar & latency; + return serializer(ar, latency); } double latency; @@ -71,9 +71,9 @@ namespace FdbClientLogEvents { template Ar& serialize(Ar &ar) { if (!ar.isDeserializing) - return Event::serialize(ar) & latency & valueSize & key; + return serializer(Event::serialize(ar), latency, valueSize, key); else - return ar & latency & valueSize & key; + return serializer(ar, latency, valueSize, key); } double latency; @@ -91,9 +91,9 @@ namespace FdbClientLogEvents { template Ar& serialize(Ar &ar) { if (!ar.isDeserializing) - return Event::serialize(ar) & latency & rangeSize & startKey & endKey; + return serializer(Event::serialize(ar), latency, rangeSize, startKey, endKey); else - return ar & latency & rangeSize & startKey & endKey; + return serializer(ar, latency, rangeSize, startKey, endKey); } double latency; @@ -112,9 +112,9 @@ namespace FdbClientLogEvents { template Ar& serialize(Ar &ar) { if (!ar.isDeserializing) - return Event::serialize(ar) & latency & numMutations & commitBytes & req.transaction & req.arena; + return serializer(Event::serialize(ar), latency, numMutations, commitBytes, req.transaction, req.arena); else - return ar & latency & numMutations & commitBytes & req.transaction & req.arena; + return serializer(ar, latency, numMutations, commitBytes, req.transaction, req.arena); } double latency; @@ -145,9 +145,9 @@ namespace FdbClientLogEvents { template Ar& serialize(Ar &ar) { if (!ar.isDeserializing) - return Event::serialize(ar) & errCode & key; + return serializer(Event::serialize(ar), errCode, key); else - return ar & errCode & key; + return serializer(ar, errCode, key); } int errCode; @@ -164,9 +164,9 @@ namespace FdbClientLogEvents { template Ar& serialize(Ar &ar) { if (!ar.isDeserializing) - return Event::serialize(ar) & errCode & startKey & endKey; + return serializer(Event::serialize(ar), errCode, startKey, endKey); else - return ar & errCode & startKey & endKey; + return serializer(ar, errCode, startKey, endKey); } int errCode; @@ -184,9 +184,9 @@ namespace FdbClientLogEvents { template Ar& serialize(Ar &ar) { if (!ar.isDeserializing) - return Event::serialize(ar) & errCode & req.transaction & req.arena; + return serializer(Event::serialize(ar), errCode, req.transaction, req.arena); else - return ar & errCode & req.transaction & req.arena; + return serializer(ar, errCode, req.transaction, req.arena); } int errCode; diff --git a/fdbclient/ClientWorkerInterface.h b/fdbclient/ClientWorkerInterface.h index 9b221c4755..9c75c31b24 100644 --- a/fdbclient/ClientWorkerInterface.h +++ b/fdbclient/ClientWorkerInterface.h @@ -40,7 +40,7 @@ struct ClientWorkerInterface { template void serialize( Ar& ar ) { - ar & reboot & profiler; + serializer(ar, reboot, profiler); } }; @@ -52,7 +52,7 @@ struct RebootRequest { template 
void serialize(Ar& ar) { - ar & deleteData & checkData; + serializer(ar, deleteData, checkData); } }; @@ -77,7 +77,7 @@ struct ProfilerRequest { template void serialize( Ar& ar ) { - ar & reply & type & action & duration & outputFile; + serializer(ar, reply, type, action, duration, outputFile); } }; BINARY_SERIALIZABLE( ProfilerRequest::Type ); diff --git a/fdbclient/ClusterInterface.h b/fdbclient/ClusterInterface.h index c55e4c08fb..f0a71f4bfc 100755 --- a/fdbclient/ClusterInterface.h +++ b/fdbclient/ClusterInterface.h @@ -52,7 +52,7 @@ struct ClusterInterface { template void serialize( Ar& ar ) { - ar & openDatabase & failureMonitoring & databaseStatus & ping & getClientWorkers & forceRecovery; + serializer(ar, openDatabase, failureMonitoring, databaseStatus, ping, getClientWorkers, forceRecovery); } }; @@ -93,7 +93,7 @@ struct ClientVersionRef { template void serialize(Ar& ar) { - ar & clientVersion & sourceVersion & protocolVersion; + serializer(ar, clientVersion, sourceVersion, protocolVersion); } size_t expectedSize() const { return clientVersion.size() + sourceVersion.size() + protocolVersion.size(); } @@ -125,7 +125,7 @@ struct OpenDatabaseRequest { template void serialize(Ar& ar) { ASSERT( ar.protocolVersion() >= 0x0FDB00A400040001LL ); - ar & issues & supportedVersions & traceLogGroup & knownClientInfoID & reply & arena; + serializer(ar, issues, supportedVersions, traceLogGroup, knownClientInfoID, reply, arena); } }; @@ -138,7 +138,7 @@ struct SystemFailureStatus { template void serialize(Ar& ar) { - ar & address & status; + serializer(ar, address, status); } }; @@ -159,7 +159,7 @@ struct FailureMonitoringRequest { template void serialize(Ar& ar) { - ar & senderStatus & failureInformationVersion & reply; + serializer(ar, senderStatus, failureInformationVersion, reply); } }; @@ -173,7 +173,7 @@ struct FailureMonitoringReply { template void serialize(Ar& ar) { - ar & changes & failureInformationVersion & allOthersFailed & clientRequestIntervalMS & considerServerFailedTimeoutMS & arena; + serializer(ar, changes, failureInformationVersion, allOthersFailed, clientRequestIntervalMS, considerServerFailedTimeoutMS, arena); } }; @@ -182,7 +182,7 @@ struct StatusRequest { template void serialize(Ar& ar) { - ar & reply; + serializer(ar, reply); } }; @@ -196,7 +196,7 @@ struct StatusReply { template void serialize(Ar& ar) { - ar & statusStr; + serializer(ar, statusStr); if( ar.isDeserializing ) { json_spirit::mValue mv; if(g_network->isSimulated()) { @@ -218,7 +218,7 @@ struct GetClientWorkersRequest { template void serialize(Ar& ar) { - ar & reply; + serializer(ar, reply); } }; @@ -229,7 +229,7 @@ struct ForceRecoveryRequest { template void serialize(Ar& ar) { - ar & reply; + serializer(ar, reply); } }; diff --git a/fdbclient/CommitTransaction.h b/fdbclient/CommitTransaction.h index 444078e180..440e0aa909 100644 --- a/fdbclient/CommitTransaction.h +++ b/fdbclient/CommitTransaction.h @@ -50,7 +50,7 @@ struct MutationRef { template void serialize( Ar& ar ) { - ar & type & param1 & param2; + serializer(ar, type, param1, param2); } // These masks define which mutation types have particular properties (they are used to implement isSingleKeyMutation() etc) @@ -101,7 +101,7 @@ struct CommitTransactionRef { template force_inline void serialize( Ar& ar ) { - ar & read_conflict_ranges & write_conflict_ranges & mutations & read_snapshot; + serializer(ar, read_conflict_ranges, write_conflict_ranges, mutations, read_snapshot); } // Convenience for internal code required to manipulate these without the 
Native API diff --git a/fdbclient/CoordinationInterface.h b/fdbclient/CoordinationInterface.h index f45eb37f00..dd1c73ef6e 100644 --- a/fdbclient/CoordinationInterface.h +++ b/fdbclient/CoordinationInterface.h @@ -122,7 +122,7 @@ struct LeaderInfo { template void serialize(Ar& ar) { - ar & changeID & serializedInfo & forward; + serializer(ar, changeID, serializedInfo, forward); } }; @@ -136,7 +136,7 @@ struct GetLeaderRequest { template void serialize(Ar& ar) { - ar & key & knownLeader & reply; + serializer(ar, key, knownLeader, reply); } }; diff --git a/fdbclient/DatabaseConfiguration.h b/fdbclient/DatabaseConfiguration.h index dc0b3d5832..740cd376f0 100644 --- a/fdbclient/DatabaseConfiguration.h +++ b/fdbclient/DatabaseConfiguration.h @@ -41,7 +41,7 @@ struct SatelliteInfo { template void serialize(Ar& ar) { - ar & dcId & priority; + serializer(ar, dcId, priority); } }; @@ -71,8 +71,8 @@ struct RegionInfo { template void serialize(Ar& ar) { - ar & dcId & priority & satelliteTLogPolicy & satelliteDesiredTLogCount & satelliteTLogReplicationFactor & satelliteTLogWriteAntiQuorum & satelliteTLogUsableDcs & - satelliteTLogPolicyFallback & satelliteTLogReplicationFactorFallback & satelliteTLogWriteAntiQuorumFallback & satelliteTLogUsableDcsFallback & satellites; + serializer(ar, dcId, priority, satelliteTLogPolicy, satelliteDesiredTLogCount, satelliteTLogReplicationFactor, satelliteTLogWriteAntiQuorum, satelliteTLogUsableDcs, + satelliteTLogPolicyFallback, satelliteTLogReplicationFactorFallback, satelliteTLogWriteAntiQuorumFallback, satelliteTLogUsableDcsFallback, satellites); } }; @@ -203,7 +203,7 @@ struct DatabaseConfiguration { template void serialize(Ar& ar) { if (!ar.isDeserializing) makeConfigurationImmutable(); - ar & rawConfiguration; + serializer(ar, rawConfiguration); if (ar.isDeserializing) { for(auto c=rawConfiguration.begin(); c!=rawConfiguration.end(); ++c) setInternal(c->key, c->value); diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index e468ee2fbb..5ac49fd58e 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -53,7 +53,7 @@ struct Tag { template force_inline void serialize_unversioned(Ar& ar) { - ar & locality & id; + serializer(ar, locality, id); } }; #pragma pack(pop) @@ -193,7 +193,7 @@ struct KeyRangeRef { template force_inline void serialize(Ar& ar) { - ar & const_cast(begin) & const_cast(end); + serializer(ar, const_cast(begin), const_cast(end)); if( begin > end ) { throw inverted_range(); }; @@ -227,7 +227,7 @@ struct KeyValueRef { int expectedSize() const { return key.expectedSize() + value.expectedSize(); } template - force_inline void serialize(Ar& ar) { ar & key & value; } + force_inline void serialize(Ar& ar) { serializer(ar, key, value); } struct OrderByKey { bool operator()(KeyValueRef const& a, KeyValueRef const& b) const { @@ -385,7 +385,7 @@ public: template void serialize( Ar& ar ) { - ar & key & orEqual & offset; + serializer(ar, key, orEqual, offset); } }; @@ -418,7 +418,7 @@ struct KeyRangeWith : KeyRange { template void serialize( Ar& ar ) { - ar & ((KeyRange&)*this) & value; + serializer(ar, ((KeyRange&)*this), value); } }; template @@ -470,7 +470,7 @@ struct RangeResultRef : VectorRef { template void serialize( Ar& ar ) { - ar & ((VectorRef&)*this) & more & readThrough & readToBegin & readThroughEnd; + serializer(ar, ((VectorRef&)*this), more, readThrough, readToBegin, readThroughEnd); } }; @@ -492,7 +492,7 @@ struct KeyValueStoreType { operator StoreType() const { return StoreType(type); } template - void serialize(Ar& ar) { ar & 
type; } + void serialize(Ar& ar) { serializer(ar, type); } std::string toString() const { switch( type ) { @@ -520,7 +520,7 @@ struct StorageBytes { template void serialize(Ar& ar) { - ar & free & total & used & available; + serializer(ar, free, total, used, available); } }; @@ -639,7 +639,7 @@ struct ClusterControllerPriorityInfo { template void serialize(Ar& ar) { - ar & processClassFitness & isExcluded & dcFitness; + serializer(ar, processClassFitness, isExcluded, dcFitness); } }; diff --git a/fdbclient/FileBackupAgent.actor.cpp b/fdbclient/FileBackupAgent.actor.cpp old mode 100644 new mode 100755 index 7b41f5f280..a4fe1d969b --- a/fdbclient/FileBackupAgent.actor.cpp +++ b/fdbclient/FileBackupAgent.actor.cpp @@ -1004,6 +1004,7 @@ namespace fileBackup { // Update the range bytes written in the backup config backup.rangeBytesWritten().atomicOp(tr, file->size(), MutationRef::AddValue); + backup.snapshotRangeFileCount().atomicOp(tr, 1, MutationRef::AddValue); // See if there is already a file for this key which has an earlier begin, update the map if not. Optional s = wait(backup.snapshotRangeFileMap().get(tr, range.end)); @@ -1129,11 +1130,31 @@ namespace fileBackup { if(done) return Void(); - // Start writing a new file + // Start writing a new file after verifying this task should keep running as of a new read version (which must be >= outVersion) outVersion = values.second; // block size must be at least large enough for 3 max size keys and 2 max size values + overhead so 250k conservatively. state int blockSize = BUGGIFY ? g_random->randomInt(250e3, 4e6) : CLIENT_KNOBS->BACKUP_RANGEFILE_BLOCK_SIZE; - Reference f = wait(bc->writeRangeFile(outVersion, blockSize)); + state Version snapshotBeginVersion; + state int64_t snapshotRangeFileCount; + + state Reference tr(new ReadYourWritesTransaction(cx)); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + wait(taskBucket->keepRunning(tr, task) + && storeOrThrow(backup.snapshotBeginVersion().get(tr), snapshotBeginVersion) + && store(backup.snapshotRangeFileCount().getD(tr), snapshotRangeFileCount) + ); + + break; + } catch(Error &e) { + wait(tr->onError(e)); + } + } + + Reference f = wait(bc->writeRangeFile(snapshotBeginVersion, snapshotRangeFileCount, outVersion, blockSize)); outFile = f; // Initialize range file writer and write begin key @@ -3360,8 +3381,9 @@ class FileBackupAgentImpl { public: static const int MAX_RESTORABLE_FILE_METASECTION_BYTES = 1024 * 8; - // This method will return the final status of the backup - ACTOR static Future waitBackup(FileBackupAgent* backupAgent, Database cx, std::string tagName, bool stopWhenDone) { + // This method will return the final status of the backup at tag, and return the URL that was used on the tag + // when that status value was read. 
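A sketch of how a caller might consume the new waitBackup out-parameters (assumed usage; the status is assumed to be returned as the EBackupState in an int):

// Inside a Flow actor: wait until the backup on tagName is restorable, then
// grab the container and UID that were current when the status was read.
state Reference<IBackupContainer> container;
state UID uid;
state int finalStatus = wait(backupAgent.waitBackup(cx, tagName, false, &container, &uid));
// container can now be used for describeBackup(), expireData(), etc.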
+ ACTOR static Future waitBackup(FileBackupAgent* backupAgent, Database cx, std::string tagName, bool stopWhenDone, Reference *pContainer = nullptr, UID *pUID = nullptr) { state std::string backTrace; state KeyBackedTag tag = makeBackupTag(tagName); @@ -3379,13 +3401,20 @@ public: state BackupConfig config(oldUidAndAborted.get().first); state EBackupState status = wait(config.stateEnum().getD(tr, false, EBackupState::STATE_NEVERRAN)); - // Break, if no longer runnable - if (!FileBackupAgent::isRunnable(status)) { - return status; - } + // Break, if one of the following is true + // - no longer runnable + // - in differential mode (restorable) and stopWhenDone is not enabled + if( !FileBackupAgent::isRunnable(status) || (!stopWhenDone) && (BackupAgentBase::STATE_DIFFERENTIAL == status) ) { + + if(pContainer != nullptr) { + Reference c = wait(config.backupContainer().getOrThrow(tr, false, backup_invalid_info())); + *pContainer = c; + } + + if(pUID != nullptr) { + *pUID = oldUidAndAborted.get().first; + } - // Break, if in differential mode (restorable) and stopWhenDone is not enabled - if ((!stopWhenDone) && (BackupAgentBase::STATE_DIFFERENTIAL == status)) { return status; } @@ -4061,7 +4090,7 @@ void FileBackupAgent::setLastRestorable(Reference tr, tr->set(lastRestorable.pack(tagName), BinaryWriter::toValue(version, Unversioned())); } -Future FileBackupAgent::waitBackup(Database cx, std::string tagName, bool stopWhenDone) { - return FileBackupAgentImpl::waitBackup(this, cx, tagName, stopWhenDone); +Future FileBackupAgent::waitBackup(Database cx, std::string tagName, bool stopWhenDone, Reference *pContainer, UID *pUID) { + return FileBackupAgentImpl::waitBackup(this, cx, tagName, stopWhenDone, pContainer, pUID); } diff --git a/fdbclient/HTTP.actor.cpp b/fdbclient/HTTP.actor.cpp index 16c99be1f6..0517e33b1d 100644 --- a/fdbclient/HTTP.actor.cpp +++ b/fdbclient/HTTP.actor.cpp @@ -30,7 +30,7 @@ namespace HTTP { o.reserve(s.size() * 3); char buf[4]; for(auto c : s) - if(std::isalnum(c)) + if(std::isalnum(c) || c == '?' || c == '/' || c == '-' || c == '_' || c == '.') o.append(&c, 1); else { sprintf(buf, "%%%.02X", c); @@ -292,15 +292,41 @@ namespace HTTP { // Request content is provided as UnsentPacketQueue *pContent which will be depleted as bytes are sent but the queue itself must live for the life of this actor // and be destroyed by the caller // TODO: pSent is very hackish, do something better. 
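With the widened whitelist in urlEncode above, path-like resource strings now pass through unchanged instead of having every separator percent-escaped, while other bytes are still encoded. A small illustrative check (assuming the function is otherwise unchanged):

// '/', '-', '_' and '.' are now emitted as-is; a space is still escaped.
ASSERT(HTTP::urlEncode("backup-1/range_0.sst") == "backup-1/range_0.sst");
ASSERT(HTTP::urlEncode("a b") == "a%20b");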
- ACTOR Future> doRequest(Reference conn, std::string verb, std::string resource, HTTP::Headers headers, UnsentPacketQueue *pContent, int contentLen, Reference sendRate, int64_t *pSent, Reference recvRate) { + ACTOR Future> doRequest(Reference conn, std::string verb, std::string resource, HTTP::Headers headers, UnsentPacketQueue *pContent, int contentLen, Reference sendRate, int64_t *pSent, Reference recvRate, std::string requestIDHeader) { + state TraceEvent event(SevDebug, "HTTPRequest"); + state UnsentPacketQueue empty; if(pContent == NULL) pContent = &empty; + // There is no standard http request id header field, so either a global default can be set via a knob + // or it can be set per-request with the requestIDHeader argument (which overrides the default) + if(requestIDHeader.empty()) { + requestIDHeader = CLIENT_KNOBS->HTTP_REQUEST_ID_HEADER; + } + state bool earlyResponse = false; state int total_sent = 0; + event.detail("DebugID", conn->getDebugID()); + event.detail("RemoteAddress", conn->getPeerAddress()); + event.detail("Verb", verb); + event.detail("Resource", resource); + event.detail("RequestContentLen", contentLen); + try { + state std::string requestID; + if(!requestIDHeader.empty()) { + requestID = g_random->randomUniqueID().toString(); + requestID = requestID.insert(20, "-"); + requestID = requestID.insert(16, "-"); + requestID = requestID.insert(12, "-"); + requestID = requestID.insert(8, "-"); + + headers[requestIDHeader] = requestID; + event.detail("RequestIDSent", requestID); + } + // Write headers to a packet buffer chain PacketBuffer *pFirst = new PacketBuffer(); PacketBuffer *pLast = writeRequestHeader(verb, resource, headers, pFirst); @@ -346,19 +372,59 @@ namespace HTTP { } wait(responseReading); - double elapsed = timer() - send_start; - if(CLIENT_KNOBS->HTTP_VERBOSE_LEVEL > 0) - printf("[%s] HTTP code=%d early=%d, time=%fs %s %s contentLen=%d [%d out, response content len %d]\n", - conn->getDebugID().toString().c_str(), r->code, earlyResponse, elapsed, verb.c_str(), resource.c_str(), contentLen, total_sent, (int)r->contentLen); - if(CLIENT_KNOBS->HTTP_VERBOSE_LEVEL > 2) + + event.detail("ResponseCode", r->code); + event.detail("ResponseContentLen", r->contentLen); + event.detail("Elapsed", elapsed); + + Optional err; + if(!requestIDHeader.empty()) { + std::string responseID; + auto iid = r->headers.find(requestIDHeader); + if(iid != r->headers.end()) { + responseID = iid->second; + } + event.detail("RequestIDReceived", responseID); + if(requestID != responseID) { + err = http_bad_request_id(); + // Log a non-debug error + TraceEvent(SevError, "HTTPRequestFailedIDMismatch") + .detail("DebugID", conn->getDebugID()) + .detail("RemoteAddress", conn->getPeerAddress()) + .detail("Verb", verb) + .detail("Resource", resource) + .detail("RequestContentLen", contentLen) + .detail("ResponseCode", r->code) + .detail("ResponseContentLen", r->contentLen) + .detail("RequestIDSent", requestID) + .detail("RequestIDReceived", responseID) + .error(err.get()); + } + } + + if(CLIENT_KNOBS->HTTP_VERBOSE_LEVEL > 0) { + printf("[%s] HTTP %scode=%d early=%d, time=%fs %s %s contentLen=%d [%d out, response content len %d]\n", + conn->getDebugID().toString().c_str(), + (err.present() ?
format("*ERROR*=%s ", err.get().name()).c_str() : ""), + r->code, earlyResponse, elapsed, verb.c_str(), resource.c_str(), contentLen, total_sent, (int)r->contentLen); + } + if(CLIENT_KNOBS->HTTP_VERBOSE_LEVEL > 2) { printf("[%s] HTTP RESPONSE: %s %s\n%s\n", conn->getDebugID().toString().c_str(), verb.c_str(), resource.c_str(), r->toString().c_str()); + } + + if(err.present()) { + throw err.get(); + } + return r; } catch(Error &e) { double elapsed = timer() - send_start; - if(CLIENT_KNOBS->HTTP_VERBOSE_LEVEL > 0) + if(CLIENT_KNOBS->HTTP_VERBOSE_LEVEL > 0 && e.code() != error_code_http_bad_request_id) { printf("[%s] HTTP *ERROR*=%s early=%d, time=%fs %s %s contentLen=%d [%d out]\n", conn->getDebugID().toString().c_str(), e.name(), earlyResponse, elapsed, verb.c_str(), resource.c_str(), contentLen, total_sent); + } + event.error(e); throw; } } diff --git a/fdbclient/HTTP.h b/fdbclient/HTTP.h index 7d481a5eb1..476f6240d3 100644 --- a/fdbclient/HTTP.h +++ b/fdbclient/HTTP.h @@ -51,5 +51,5 @@ namespace HTTP { PacketBuffer * writeRequestHeader(std::string const &verb, std::string const &resource, HTTP::Headers const &headers, PacketBuffer *dest); // Do an HTTP request to the blob store, parse the response. - Future> doRequest(Reference const &conn, std::string const &verb, std::string const &resource, HTTP::Headers const &headers, UnsentPacketQueue * const &pContent, int const &contentLen, Reference const &sendRate, int64_t * const &pSent, Reference const &recvRate); + Future> doRequest(Reference const &conn, std::string const &verb, std::string const &resource, HTTP::Headers const &headers, UnsentPacketQueue * const &pContent, int const &contentLen, Reference const &sendRate, int64_t * const &pSent, Reference const &recvRate, const std::string &requestHeader = std::string()); } diff --git a/fdbclient/Knobs.cpp b/fdbclient/Knobs.cpp index e3161d5321..d7f81dab09 100644 --- a/fdbclient/Knobs.cpp +++ b/fdbclient/Knobs.cpp @@ -148,6 +148,7 @@ ClientKnobs::ClientKnobs(bool randomize) { init( HTTP_READ_SIZE, 128*1024 ); init( HTTP_SEND_SIZE, 32*1024 ); init( HTTP_VERBOSE_LEVEL, 0 ); + init( HTTP_REQUEST_ID_HEADER, "" ); init( BLOBSTORE_CONNECT_TRIES, 10 ); init( BLOBSTORE_CONNECT_TIMEOUT, 10 ); init( BLOBSTORE_MAX_CONNECTION_LIFE, 120 ); diff --git a/fdbclient/Knobs.h b/fdbclient/Knobs.h index 6bd3875641..19067a0b27 100644 --- a/fdbclient/Knobs.h +++ b/fdbclient/Knobs.h @@ -152,6 +152,7 @@ public: int HTTP_SEND_SIZE; int HTTP_READ_SIZE; int HTTP_VERBOSE_LEVEL; + std::string HTTP_REQUEST_ID_HEADER; int BLOBSTORE_CONNECT_TRIES; int BLOBSTORE_CONNECT_TIMEOUT; int BLOBSTORE_MAX_CONNECTION_LIFE; diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index ddef4150bc..7c29c53136 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -294,6 +294,7 @@ ACTOR Future changeConfig( Database cx, std::map> fConfig = tr.getRange(configKeys, CLIENT_KNOBS->TOO_MANY); + state Future> fWorkers = getWorkers(&tr); wait( success(fConfig) || tooLong ); if(!fConfig.isReady()) { @@ -378,6 +379,44 @@ ACTOR Future changeConfig( Database cx, std::map, std::set>> dcId_zoneIds; + for(auto& it : fWorkers.get()) { + if( it.processClass.machineClassFitness(ProcessClass::Storage) <= ProcessClass::WorstFit ) { + dcId_zoneIds[it.locality.dcId()].insert(it.locality.zoneId()); + } + } + for(auto& region : newConfig.regions) { + if(dcId_zoneIds[region.dcId].size() < std::max(newConfig.storageTeamSize, newConfig.tLogReplicationFactor)) { + return 
ConfigurationResult::NOT_ENOUGH_WORKERS; + } + if(region.satelliteTLogReplicationFactor > 0 && region.priority >= 0) { + int totalSatelliteProcesses = 0; + for(auto& sat : region.satellites) { + totalSatelliteProcesses += dcId_zoneIds[sat.dcId].size(); + } + if(totalSatelliteProcesses < region.satelliteTLogReplicationFactor) { + return ConfigurationResult::NOT_ENOUGH_WORKERS; + } + } + } + } else { + std::set> zoneIds; + for(auto& it : fWorkers.get()) { + if( it.processClass.machineClassFitness(ProcessClass::Storage) <= ProcessClass::WorstFit ) { + zoneIds.insert(it.locality.zoneId()); + } + } + if(zoneIds.size() < std::max(newConfig.storageTeamSize, newConfig.tLogReplicationFactor)) { + return ConfigurationResult::NOT_ENOUGH_WORKERS; + } + } } } diff --git a/fdbclient/ManagementAPI.h b/fdbclient/ManagementAPI.h index db71dbdf51..a4b9702f74 100644 --- a/fdbclient/ManagementAPI.h +++ b/fdbclient/ManagementAPI.h @@ -54,6 +54,7 @@ public: REGION_NOT_FULLY_REPLICATED, MULTIPLE_ACTIVE_REGIONS, REGIONS_CHANGED, + NOT_ENOUGH_WORKERS, SUCCESS }; }; diff --git a/fdbclient/MasterProxyInterface.h b/fdbclient/MasterProxyInterface.h index 3208ac0661..4da1389881 100644 --- a/fdbclient/MasterProxyInterface.h +++ b/fdbclient/MasterProxyInterface.h @@ -50,7 +50,7 @@ struct MasterProxyInterface { template void serialize(Archive& ar) { - ar & locality & commit & getConsistentReadVersion & getKeyServersLocations & waitFailure & getStorageServerRejoinInfo & getRawCommittedVersion & txnState; + serializer(ar, locality, commit, getConsistentReadVersion, getKeyServersLocations, waitFailure, getStorageServerRejoinInfo, getRawCommittedVersion, txnState); } void initEndpoints() { @@ -67,7 +67,7 @@ struct CommitID { template void serialize(Ar& ar) { - ar & version & txnBatchId; + serializer(ar, version, txnBatchId); } CommitID() : version(invalidVersion), txnBatchId(0) {} @@ -93,7 +93,7 @@ struct CommitTransactionRequest { template void serialize(Ar& ar) { - ar & transaction & reply & arena & flags & debugID; + serializer(ar, transaction, reply, arena, flags, debugID); } }; @@ -116,7 +116,7 @@ struct GetReadVersionReply { template void serialize(Ar& ar) { - ar & version & locked; + serializer(ar, version, locked); } }; @@ -144,7 +144,7 @@ struct GetReadVersionRequest { template void serialize(Ar& ar) { - ar & transactionCount & flags & debugID & reply; + serializer(ar, transactionCount, flags, debugID, reply); } }; @@ -154,7 +154,7 @@ struct GetKeyServerLocationsReply { template void serialize(Ar& ar) { - ar & results & arena; + serializer(ar, results, arena); } }; @@ -171,7 +171,7 @@ struct GetKeyServerLocationsRequest { template void serialize(Ar& ar) { - ar & begin & end & limit & reverse & reply & arena; + serializer(ar, begin, end, limit, reverse, reply, arena); } }; @@ -183,7 +183,7 @@ struct GetRawCommittedVersionRequest { template void serialize( Ar& ar ) { - ar & debugID & reply; + serializer(ar, debugID, reply); } }; @@ -196,7 +196,7 @@ struct GetStorageServerRejoinInfoReply { template void serialize(Ar& ar) { - ar & version & tag & newTag & newLocality & history; + serializer(ar, version, tag, newTag, newLocality, history); } }; @@ -210,7 +210,7 @@ struct GetStorageServerRejoinInfoRequest { template void serialize( Ar& ar ) { - ar & id & dcId & reply; + serializer(ar, id, dcId, reply); } }; @@ -223,7 +223,7 @@ struct TxnStateRequest { template void serialize(Ar& ar) { - ar & data & sequence & last & reply & arena; + serializer(ar, data, sequence, last, reply, arena); } }; diff --git 
a/fdbclient/MutationList.h b/fdbclient/MutationList.h index 0caec14b18..47a564846f 100644 --- a/fdbclient/MutationList.h +++ b/fdbclient/MutationList.h @@ -132,7 +132,7 @@ public: template void serialize_load( Ar& ar ) { - ar & totalBytes; + serializer(ar, totalBytes); if(totalBytes > 0) { blob_begin = blob_end = new (ar.arena()) Blob; @@ -142,7 +142,7 @@ public: } template void serialize_save( Ar& ar ) const { - ar & totalBytes; + serializer(ar, totalBytes); for(auto b = blob_begin; b; b=b->next) ar.serializeBytes(b->data); } diff --git a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h index a3d655644f..c0da36a6b6 100644 --- a/fdbclient/StorageServerInterface.h +++ b/fdbclient/StorageServerInterface.h @@ -68,11 +68,11 @@ struct StorageServerInterface { void serialize( Ar& ar ) { // StorageServerInterface is persisted in the database and in the tLog's data structures, so changes here have to be // versioned carefully! - ar & uniqueID & locality & getVersion & getValue & getKey & getKeyValues & getShardState & waitMetrics - & splitMetrics & getPhysicalMetrics & waitFailure & getQueuingMetrics & getKeyValueStoreType; + serializer(ar, uniqueID, locality, getVersion, getValue, getKey, getKeyValues, getShardState, waitMetrics, + splitMetrics, getPhysicalMetrics, waitFailure, getQueuingMetrics, getKeyValueStoreType); if( ar.protocolVersion() >= 0x0FDB00A200090001LL ) - ar & watchValue; + serializer(ar, watchValue); } bool operator == (StorageServerInterface const& s) const { return uniqueID == s.uniqueID; } bool operator < (StorageServerInterface const& s) const { return uniqueID < s.uniqueID; } @@ -103,7 +103,7 @@ struct GetValueReply : public LoadBalancedReply { template void serialize( Ar& ar ) { - ar & *(LoadBalancedReply*)this & value; + serializer(ar, *(LoadBalancedReply*)this, value); } }; @@ -118,7 +118,7 @@ struct GetValueRequest { template void serialize( Ar& ar ) { - ar & key & version & debugID & reply; + serializer(ar, key, version, debugID, reply); } }; @@ -134,7 +134,7 @@ struct WatchValueRequest { template void serialize( Ar& ar ) { - ar & key & value & version & debugID & reply; + serializer(ar, key, value, version, debugID, reply); } }; @@ -146,7 +146,7 @@ struct GetKeyValuesReply : public LoadBalancedReply { template void serialize( Ar& ar ) { - ar & *(LoadBalancedReply*)this & data & version & more & arena; + serializer(ar, *(LoadBalancedReply*)this, data, version, more, arena); } }; @@ -162,7 +162,7 @@ struct GetKeyValuesRequest { // GetKeyValuesRequest(const KeySelectorRef& begin, const KeySelectorRef& end, Version version, int limit, int limitBytes, Optional debugID) : begin(begin), end(end), version(version), limit(limit), limitBytes(limitBytes) {} template void serialize( Ar& ar ) { - ar & begin & end & version & limit & limitBytes & debugID & reply & arena; + serializer(ar, begin, end, version, limit, limitBytes, debugID, reply, arena); } }; @@ -174,7 +174,7 @@ struct GetKeyReply : public LoadBalancedReply { template void serialize( Ar& ar ) { - ar & *(LoadBalancedReply*)this & sel; + serializer(ar, *(LoadBalancedReply*)this, sel); } }; @@ -189,7 +189,7 @@ struct GetKeyRequest { template void serialize( Ar& ar ) { - ar & sel & version & reply & arena; + serializer(ar, sel, version, reply, arena); } }; @@ -208,7 +208,7 @@ struct GetShardStateRequest { template void serialize( Ar& ar ) { - ar & keys & mode & reply; + serializer(ar, keys, mode, reply); } }; @@ -244,7 +244,7 @@ struct StorageMetrics { template void serialize( Ar& ar ) { - ar & bytes 
& bytesPerKSecond & iosPerKSecond; + serializer(ar, bytes, bytesPerKSecond, iosPerKSecond); } void negate() { operator*=(-1.0); } @@ -278,7 +278,7 @@ struct WaitMetricsRequest { template void serialize( Ar& ar ) { - ar & keys & min & max & reply & arena; + serializer(ar, keys, min, max, reply, arena); } }; @@ -288,7 +288,7 @@ struct SplitMetricsReply { template void serialize( Ar& ar ) { - ar & splits & used; + serializer(ar, splits, used); } }; @@ -306,7 +306,7 @@ struct SplitMetricsRequest { template void serialize(Ar& ar) { - ar & keys & limits & used & estimated & isLastShard & reply & arena; + serializer(ar, keys, limits, used, estimated, isLastShard, reply, arena); } }; @@ -317,7 +317,7 @@ struct GetPhysicalMetricsReply { template void serialize(Ar& ar) { - ar & load & free & capacity; + serializer(ar, load, free, capacity); } }; @@ -326,7 +326,7 @@ struct GetPhysicalMetricsRequest { template void serialize(Ar& ar) { - ar & reply; + serializer(ar, reply); } }; @@ -336,7 +336,7 @@ struct StorageQueuingMetricsRequest { template void serialize(Ar& ar) { - ar & reply; + serializer(ar, reply); } }; @@ -349,7 +349,7 @@ struct StorageQueuingMetricsReply { template void serialize(Ar& ar) { - ar & localTime & instanceID & bytesDurable & bytesInput & v & storageBytes; + serializer(ar, localTime, instanceID, bytesDurable, bytesInput, v, storageBytes); } }; diff --git a/fdbclient/versions.h.cmake b/fdbclient/versions.h.cmake new file mode 100644 index 0000000000..3e6c450496 --- /dev/null +++ b/fdbclient/versions.h.cmake @@ -0,0 +1,3 @@ +#pragma once +#define FDB_VT_VERSION "${FDB_VERSION}" +#define FDB_VT_PACKAGE_NAME "${FDB_PACKAGE_NAME}" diff --git a/fdbmonitor/CMakeLists.txt b/fdbmonitor/CMakeLists.txt new file mode 100644 index 0000000000..7d035f77d0 --- /dev/null +++ b/fdbmonitor/CMakeLists.txt @@ -0,0 +1,6 @@ +set(FDBMONITOR_SRCS ConvertUTF.h SimpleIni.h fdbmonitor.cpp) + +add_executable(fdbmonitor ${FDBMONITOR_SRCS}) +target_link_libraries(fdbmonitor flow) + +install(TARGETS fdbmonitor DESTINATION "${FDB_LIB_DIR}/foundationdb" COMPONENT server) diff --git a/fdbrpc/AsyncFileCached.actor.h b/fdbrpc/AsyncFileCached.actor.h index b10e8d45ad..1523565d17 100644 --- a/fdbrpc/AsyncFileCached.actor.h +++ b/fdbrpc/AsyncFileCached.actor.h @@ -257,9 +257,9 @@ private: try { TraceEvent("AFCUnderlyingOpenBegin").detail("Filename", filename); if(flags & IAsyncFile::OPEN_CACHED_READ_ONLY) - flags = flags & ~IAsyncFile::OPEN_READWRITE | IAsyncFile::OPEN_READONLY; + flags = (flags & ~IAsyncFile::OPEN_READWRITE) | IAsyncFile::OPEN_READONLY; else - flags = flags & ~IAsyncFile::OPEN_READONLY | IAsyncFile::OPEN_READWRITE; + flags = (flags & ~IAsyncFile::OPEN_READONLY) | IAsyncFile::OPEN_READWRITE; state Reference f = wait( IAsyncFileSystem::filesystem()->open(filename, flags | IAsyncFile::OPEN_UNCACHED | IAsyncFile::OPEN_UNBUFFERED, mode) ); TraceEvent("AFCUnderlyingOpenEnd").detail("Filename", filename); int64_t l = wait( f->size() ); diff --git a/fdbrpc/CMakeLists.txt b/fdbrpc/CMakeLists.txt new file mode 100644 index 0000000000..dcea3cea4f --- /dev/null +++ b/fdbrpc/CMakeLists.txt @@ -0,0 +1,61 @@ +set(FDBRPC_SRCS + ActorFuzz.actor.cpp + AsyncFileCached.actor.h + AsyncFileEIO.actor.h + AsyncFileKAIO.actor.h + AsyncFileNonDurable.actor.h + AsyncFileReadAhead.actor.h + AsyncFileWinASIO.actor.h + AsyncFileCached.actor.cpp + AsyncFileNonDurable.actor.cpp + AsyncFileWriteChecker.cpp + batcher.actor.h + crc32c.cpp + dsltest.actor.cpp + FailureMonitor.actor.cpp + FlowTests.actor.cpp + FlowTransport.actor.cpp + 
genericactors.actor.h + genericactors.actor.cpp + IAsyncFile.actor.cpp + LoadBalance.actor.h + Locality.cpp + Net2FileSystem.cpp + networksender.actor.h + Platform.cpp + QueueModel.cpp + ReplicationPolicy.cpp + ReplicationTypes.cpp + ReplicationUtils.cpp + sim2.actor.cpp + sim_validation.cpp + TLSConnection.actor.cpp + TraceFileIO.cpp + # C files + libcoroutine/Common.c + libcoroutine/context.c + libcoroutine/Coro.c + libeio/eio.c + zlib/adler32.c + zlib/crc32.c + zlib/deflate.c + zlib/gzclose.c + zlib/gzlib.c + zlib/gzread.c + zlib/gzwrite.c + zlib/infback.c + zlib/inffast.c + zlib/inflate.c + zlib/inftrees.c + zlib/trees.c + zlib/zutil.c) + +if(APPLE) + list(APPEND FDBRPC_SRCS libcoroutine/asm.S libcoroutine/context.c) +endif() + +actor_set(FDBRPC_BUILD "${FDBRPC_SRCS}") +add_library(fdbrpc STATIC ${FDBRPC_BUILD}) +actor_compile(fdbrpc "${FDBRPC_SRCS}") +target_include_directories(fdbrpc PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/libeio) +target_link_libraries(fdbrpc PUBLIC flow) diff --git a/fdbrpc/ContinuousSample.h b/fdbrpc/ContinuousSample.h index 54ff1b1094..577c228ae7 100644 --- a/fdbrpc/ContinuousSample.h +++ b/fdbrpc/ContinuousSample.h @@ -26,6 +26,7 @@ #include "flow/IRandom.h" #include #include +#include template class ContinuousSample { diff --git a/fdbrpc/EndpointGroup.h b/fdbrpc/EndpointGroup.h deleted file mode 100644 index 306aa35382..0000000000 --- a/fdbrpc/EndpointGroup.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * EndpointGroup.h - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef FLOW_ENDPOINT_GROUP_H -#define FLOW_ENDPOINT_GROUP_H -#pragma once - -#include "fdbrpc/flow.h" - -// EndpointGroup makes it easier to implement backward compatibility for interface serialization -// It also provides a central place to implement more compact serialization for a group of related endpoints in the future. - -/* Typical usage: - -template -void serialize(Ar& ar) { - auto endpoints = endpointGroup(ar); - endpoints.require( ar.protocolVersion() <= currentProtocolVersion ); - endpoints & apple & banana; - endpoints.require( ar.protocolVersion() >= 0xabc ); // Following endpoints added in this version - endpoints & cherry; - endpoints.require( ar.protocolVersion() >= 0xdef ); // .. 
and then some more were added - endpoints & date; -} - -*/ - - -template -struct EndpointGroup : NonCopyable { - Ar& ar; - bool enabled; - - explicit EndpointGroup( Ar& ar ) : ar(ar), enabled(true) { - ASSERT( ar.protocolVersion() != 0 ); - } - EndpointGroup( EndpointGroup&& g ) : ar(g.ar), enabled(g.enabled) {} - - EndpointGroup& require( bool condition ) { - enabled = enabled && condition; - return *this; - } - - template - EndpointGroup& operator & (PromiseStream& stream) { - if (enabled) - ar & stream; - else if (Ar::isDeserializing) - stream.sendError( incompatible_protocol_version() ); - return *this; - } -}; - -template -EndpointGroup endpointGroup( Ar& ar ) { return EndpointGroup(ar); } - -#endif \ No newline at end of file diff --git a/fdbrpc/FailureMonitor.h b/fdbrpc/FailureMonitor.h index 47ad74fffd..975e60762b 100644 --- a/fdbrpc/FailureMonitor.h +++ b/fdbrpc/FailureMonitor.h @@ -74,7 +74,7 @@ struct FailureStatus { bool operator != (FailureStatus const& r) const { return failed != r.failed; } template void serialize(Ar& ar) { - ar & failed; + serializer(ar, failed); } }; @@ -141,4 +141,4 @@ private: friend class OnStateChangedActorActor; }; -#endif \ No newline at end of file +#endif diff --git a/fdbrpc/LoadBalance.actor.h b/fdbrpc/LoadBalance.actor.h index 0096c3675b..32bb7ee908 100644 --- a/fdbrpc/LoadBalance.actor.h +++ b/fdbrpc/LoadBalance.actor.h @@ -71,7 +71,7 @@ struct LoadBalancedReply { template void serialize(Ar &ar) { - ar & penalty; + serializer(ar, penalty); } }; diff --git a/fdbrpc/Locality.h b/fdbrpc/Locality.h index ed15c93942..bf5aabb9c4 100644 --- a/fdbrpc/Locality.h +++ b/fdbrpc/Locality.h @@ -110,7 +110,7 @@ public: template void serialize(Ar& ar) { - ar & _class & _source; + serializer(ar, _class, _source); } }; @@ -160,10 +160,13 @@ public: std::string describeZone() const { return describeValue(keyZoneId); } std::string describeDataHall() const { return describeValue(keyDataHallId); } + std::string describeDcId() const { return describeValue(keyDcId); } + std::string describeMachineId() const { return describeValue(keyMachineId); } + std::string describeProcessId() const { return describeValue(keyProcessId); } Optional> processId() const { return get(keyProcessId); } Optional> zoneId() const { return get(keyZoneId); } - Optional> machineId() const { return get(keyMachineId); } + Optional> machineId() const { return get(keyMachineId); } // default is "" Optional> dcId() const { return get(keyDcId); } Optional> dataHallId() const { return get(keyDataHallId); } @@ -185,10 +188,10 @@ public: Standalone key; Optional> value; uint64_t mapSize = (uint64_t)_data.size(); - ar & mapSize; + serializer(ar, mapSize); if (ar.isDeserializing) { for (size_t i = 0; i < mapSize; i++) { - ar & key & value; + serializer(ar, key, value); _data[key] = value; } } @@ -196,24 +199,24 @@ public: for (auto it = _data.begin(); it != _data.end(); it++) { key = it->first; value = it->second; - ar & key & value; + serializer(ar, key, value); } } } else { ASSERT(ar.isDeserializing); UID zoneId, dcId, processId; - ar & zoneId & dcId; + serializer(ar, zoneId, dcId); set(keyZoneId, Standalone(zoneId.toString())); set(keyDcId, Standalone(dcId.toString())); if (ar.protocolVersion() >= 0x0FDB00A340000001LL) { - ar & processId; + serializer(ar, processId); set(keyProcessId, Standalone(processId.toString())); } else { int _machineClass = ProcessClass::UnsetClass; - ar & _machineClass; + serializer(ar, _machineClass); } } } @@ -255,7 +258,7 @@ struct ProcessData { template void serialize(Ar& 
ar) { - ar & locality & processClass & address; + serializer(ar, locality, processClass, address); } struct sort_by_address { diff --git a/fdbrpc/PerfMetric.h b/fdbrpc/PerfMetric.h index 8b342ecd7f..8beea3e1fd 100644 --- a/fdbrpc/PerfMetric.h +++ b/fdbrpc/PerfMetric.h @@ -43,7 +43,7 @@ struct PerfMetric { template void serialize( Ar& ar ) { - ar & m_name & m_format_code & m_value & m_averaged; + serializer(ar, m_name, m_format_code, m_value, m_averaged); } private: @@ -106,4 +106,4 @@ struct GlobalCounters { extern GlobalCounters g_counters; -#endif \ No newline at end of file +#endif diff --git a/fdbrpc/Platform.cpp b/fdbrpc/Platform.cpp index 4dd378141e..bbfde86f30 100644 --- a/fdbrpc/Platform.cpp +++ b/fdbrpc/Platform.cpp @@ -101,8 +101,9 @@ int eraseDirectoryRecursive(std::string const& dir) { the directory we're deleting doesn't exist in the first place */ if (error && errno != ENOENT) { - TraceEvent(SevError, "EraseDirectoryRecursiveError").detail("Directory", dir).GetLastError(); - throw platform_error(); + Error e = systemErrorCodeToError(); + TraceEvent(SevError, "EraseDirectoryRecursiveError").detail("Directory", dir).GetLastError().error(e); + throw e; } #else #error Port me! @@ -132,15 +133,15 @@ std::string getDefaultConfigPath() { bool isSse42Supported() { #if defined(_WIN32) - int info[4]; - __cpuid(info, 1); - return (info[2] & (1 << 20)) != 0; + int info[4]; + __cpuid(info, 1); + return (info[2] & (1 << 20)) != 0; #elif defined(__unixish__) - uint32_t eax, ebx, ecx, edx, level = 1, count = 0; - __cpuid_count(level, count, eax, ebx, ecx, edx); - return ((ecx >> 20) & 1) != 0; + uint32_t eax, ebx, ecx, edx, level = 1, count = 0; + __cpuid_count(level, count, eax, ebx, ecx, edx); + return ((ecx >> 20) & 1) != 0; #else - #error Port me! + #error Port me! 
#endif } diff --git a/fdbrpc/Replication.h b/fdbrpc/Replication.h index 557a2fc0d3..828ca1fd42 100644 --- a/fdbrpc/Replication.h +++ b/fdbrpc/Replication.h @@ -102,6 +102,11 @@ public: return _localitygroup->getRecord(getEntry(localIndex)._id); } + // Return record array to help debug the locality information for servers + virtual std::vector> const& getRecordArray() const { + return _localitygroup->getRecordArray(); + } + Reference const& getRecordViaEntry(LocalityEntry localEntry) const { return _localitygroup->getRecord(localEntry._id); } @@ -167,6 +172,8 @@ public: // This function is used to create an subset containing all of the entries within // the specified value for the given key + // The returned LocalitySet contains the LocalityRecords that have the same value as + // the indexValue under the same indexKey (e.g., zoneid) LocalitySetRef restrict(AttribKey indexKey, AttribValue indexValue ) { LocalitySetRef localitySet; LocalityCacheRecord searchRecord(AttribRecord(indexKey, indexValue), localitySet); @@ -497,6 +504,7 @@ struct LocalityGroup : public LocalitySet { virtual ~LocalityGroup() { } LocalityEntry const& add(LocalityData const& data) { + // _recordArray.size() is the new entry index for the new data Reference record(new LocalityRecord(convertToAttribMap(data), _recordArray.size())); _recordArray.push_back(record); return LocalitySet::add(record, *this); @@ -527,6 +535,9 @@ struct LocalityGroup : public LocalitySet { return _recordArray[recordIndex]; } + // Get the locality info for debug purpose + virtual std::vector> const& getRecordArray() const { return _recordArray; } + virtual int getMemoryUsed() const { int memorySize = sizeof(_recordArray) + _keymap->getMemoryUsed(); for (auto& record : _recordArray) { diff --git a/fdbrpc/ReplicationPolicy.cpp b/fdbrpc/ReplicationPolicy.cpp index a101c411b5..070b8dd767 100644 --- a/fdbrpc/ReplicationPolicy.cpp +++ b/fdbrpc/ReplicationPolicy.cpp @@ -147,15 +147,53 @@ PolicyAcross::~PolicyAcross() return; } +// Debug purpose only +// Trace all record entries to help debug +// fromServers is the servers locality to be printed out. 
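The record-array accessor and trace helpers above are debugging aids for placement problems. A plausible way to use them (illustrative only; policy, team, and fromServers are assumed to be a Reference<IReplicationPolicy>, a std::vector<LocalityEntry>, and a LocalitySetRef respectively):

// If a team fails policy validation, dump every candidate server's locality
// attributes to the trace log to see which key/value (e.g. zoneid) falls short.
if(!policy->validate(team, fromServers)) {
    policy->traceLocalityRecords(fromServers);
}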
+void IReplicationPolicy::traceLocalityRecords(LocalitySetRef const& fromServers) { + std::vector> const& recordArray = fromServers->getRecordArray(); + TraceEvent("LocalityRecordArray").detail("Size", recordArray.size()); + for (auto& record : recordArray) { + traceOneLocalityRecord(record, fromServers); + } +} + +void IReplicationPolicy::traceOneLocalityRecord(Reference record, LocalitySetRef const& fromServers) { + int localityEntryIndex = record->_entryIndex._id; + Reference const& dataMap = record->_dataMap; + std::vector const& keyValueArray = dataMap->_keyvaluearray; + + TraceEvent("LocalityRecordInfo") + .detail("EntryIndex", localityEntryIndex) + .detail("KeyValueArraySize", keyValueArray.size()); + for (int i = 0; i < keyValueArray.size(); ++i) { + AttribRecord attribRecord = keyValueArray[i]; // first is key, second is value + TraceEvent("LocalityRecordInfo") + .detail("EntryIndex", localityEntryIndex) + .detail("ArrayIndex", i) + .detail("Key", attribRecord.first._id) + .detail("Value", attribRecord.second._id) + .detail("KeyName", fromServers->keyText(attribRecord.first)) + .detail("ValueName", fromServers->valueText(attribRecord.second)); + } +} + +// Validate if the team satisfies the replication policy +// LocalitySet is the base class about the locality information +// solutionSet is the team to be validated +// fromServers is the location information of all servers +// return true if the team satisfies the policy; false otherwise bool PolicyAcross::validate( std::vector const& solutionSet, LocalitySetRef const& fromServers ) const { bool valid = true; int count = 0; - AttribKey indexKey = fromServers->keyIndex(_attribKey); + // Get the indexKey from the policy name (e.g., zoneid) in _attribKey + AttribKey indexKey = fromServers->keyIndex(_attribKey); auto groupIndexKey = fromServers->getGroupKeyIndex(indexKey); std::map> validMap; + for (auto& item : solutionSet) { auto value = fromServers->getValueViaGroupKey(item, groupIndexKey); if (value.present()) { @@ -182,9 +220,14 @@ bool PolicyAcross::validate( } } for (auto& itValid : validMap) { + // itValid.second is the vector of LocalityEntries that belong to the same locality if (_policy->validate(itValid.second, fromServers)) { if (g_replicationdebug > 4) { - printf("Across valid solution: %6lu key: %-7s count:%3d of%3d value: (%3d) %-10s policy: %-10s => %s\n", itValid.second.size(), _attribKey.c_str(), count+1, _count, itValid.first._id, fromServers->valueText(itValid.first).c_str(), _policy->name().c_str(), _policy->info().c_str()); + printf("Across valid solution: %6lu key: %-7s count:%3d of%3d value: (%3d) %-10s policy: %-10s => " + "%s\n", + itValid.second.size(), _attribKey.c_str(), count + 1, _count, itValid.first._id, + fromServers->valueText(itValid.first).c_str(), _policy->name().c_str(), + _policy->info().c_str()); if (g_replicationdebug > 5) { for (auto& entry : itValid.second) { printf(" entry: %s\n", fromServers->getEntryInfo(entry).c_str()); @@ -192,8 +235,7 @@ bool PolicyAcross::validate( } } count ++; - } - else if (g_replicationdebug > 4) { + } else if (g_replicationdebug > 4) { printf("Across invalid solution:%5lu key: %-7s value: (%3d) %-10s policy: %-10s => %s\n", itValid.second.size(), _attribKey.c_str(), itValid.first._id, fromServers->valueText(itValid.first).c_str(), _policy->name().c_str(), _policy->info().c_str()); if (g_replicationdebug > 5) { for (auto& entry : itValid.second) { @@ -215,6 +257,10 @@ bool PolicyAcross::validate( return valid; } +// Choose new servers from "least utilized" 
alsoServers and append the new servers to results +// fromserverse are the servers that have already been chosen and +// that should be excluded from being selected as replicas. +// FIXME: Simplify this function, such as removing unnecessary printf bool PolicyAcross::selectReplicas( LocalitySetRef & fromServers, std::vector const& alsoServers, @@ -239,11 +285,15 @@ bool PolicyAcross::selectReplicas( if (value.present()) { auto lowerBound = std::lower_bound(_usedValues.begin(), _usedValues.end(), value.get()); if ((lowerBound == _usedValues.end()) || (*lowerBound != value.get())) { + //_selected is a set of processes that have the same indexKey and indexValue (value) _selected = fromServers->restrict(indexKey, value.get()); if (_selected->size()) { // Pass only the also array item which are valid for the value if (g_replicationdebug > 5) { - printf("Across !select key: %-7s value: (%3d) %-10s entry: %s\n", _attribKey.c_str(), value.get()._id, fromServers->valueText(value.get()).c_str(), fromServers->getEntryInfo(alsoServer).c_str()); + // entry is the locality entry info (entryValue) from the to-be-selected team member alsoServer + printf("Across !select key: %-7s value: (%3d) %-10s entry: %s\n", _attribKey.c_str(), + value.get()._id, fromServers->valueText(value.get()).c_str(), + fromServers->getEntryInfo(alsoServer).c_str()); } resultsSize = _newResults.size(); if (_policy->selectReplicas(_selected, alsoServers, _newResults)) @@ -256,7 +306,10 @@ bool PolicyAcross::selectReplicas( _addedResults.push_back(_arena, std::pair(resultsAdded, resultsSize)); } if (g_replicationdebug > 5) { - printf("Across !added:%3d key: %-7s count:%3d of%3d value: (%3d) %-10s entry: %s\n", resultsAdded, _attribKey.c_str(), count, _count, value.get()._id, fromServers->valueText(value.get()).c_str(), fromServers->getEntryInfo(alsoServer).c_str()); + printf("Across !added:%3d key: %-7s count:%3d of%3d value: (%3d) %-10s entry: %s\n", + resultsAdded, _attribKey.c_str(), count, _count, value.get()._id, + fromServers->valueText(value.get()).c_str(), + fromServers->getEntryInfo(alsoServer).c_str()); } if (count >= _count) break; _usedValues.insert(lowerBound, value.get()); @@ -308,6 +361,7 @@ bool PolicyAcross::selectReplicas( } } + // Cannot find replica from the least used alsoServers, now try to find replicas from all servers // Process the remaining values if (count < _count) { if (g_replicationdebug > 3) { @@ -329,12 +383,18 @@ bool PolicyAcross::selectReplicas( _selected = fromServers->restrict(indexKey, value.get()); if (_selected->size()) { if (g_replicationdebug > 5) { - printf("Across select:%3d key: %-7s value: (%3d) %-10s entry: %s index:%4d\n", fromServers->size()-checksLeft+1, _attribKey.c_str(), value.get()._id, fromServers->valueText(value.get()).c_str(), fromServers->getEntryInfo(entry).c_str(), recordIndex); + printf("Across select:%3d key: %-7s value: (%3d) %-10s entry: %s index:%4d\n", + fromServers->size() - checksLeft + 1, _attribKey.c_str(), value.get()._id, + fromServers->valueText(value.get()).c_str(), + fromServers->getEntryInfo(entry).c_str(), recordIndex); } if (_policy->selectReplicas(_selected, emptyEntryArray, results)) { if (g_replicationdebug > 5) { - printf("Across added:%4d key: %-7s value: (%3d) %-10s policy: %-10s => %s needed:%3d\n", count+1, _attribKey.c_str(), value.get()._id, fromServers->valueText(value.get()).c_str(), _policy->name().c_str(), _policy->info().c_str(), _count); + printf("Across added:%4d key: %-7s value: (%3d) %-10s policy: %-10s => %s needed:%3d\n", + count + 
1, _attribKey.c_str(), value.get()._id, + fromServers->valueText(value.get()).c_str(), _policy->name().c_str(), + _policy->info().c_str(), _count); } count ++; if (count >= _count) break; diff --git a/fdbrpc/ReplicationPolicy.h b/fdbrpc/ReplicationPolicy.h index 945f7f6931..74bc0baa80 100644 --- a/fdbrpc/ReplicationPolicy.h +++ b/fdbrpc/ReplicationPolicy.h @@ -43,7 +43,9 @@ struct IReplicationPolicy : public ReferenceCounted { LocalitySetRef & fromServers, std::vector const& alsoServers, std::vector & results ) = 0; - virtual bool validate( + virtual void traceLocalityRecords(LocalitySetRef const& fromServers); + virtual void traceOneLocalityRecord(Reference record, LocalitySetRef const& fromServers); + virtual bool validate( std::vector const& solutionSet, LocalitySetRef const& fromServers ) const = 0; @@ -134,7 +136,7 @@ struct PolicyAcross : IReplicationPolicy, public ReferenceCounted template void serialize(Ar& ar) { - ar & _attribKey & _count; + serializer(ar, _attribKey, _count); serializeReplicationPolicy(ar, _policy); } @@ -205,7 +207,7 @@ struct PolicyAnd : IReplicationPolicy, public ReferenceCounted { template void serialize(Ar& ar) { int count = _policies.size(); - ar & count; + serializer(ar, count); _policies.resize(count); for(int i = 0; i < count; i++) { serializeReplicationPolicy(ar, _policies[i]); @@ -231,7 +233,7 @@ template void serializeReplicationPolicy(Ar& ar, IRepPolicyRef& policy) { if(Ar::isDeserializing) { StringRef name; - ar & name; + serializer(ar, name); if(name == LiteralStringRef("One")) { PolicyOne* pointer = new PolicyOne(); @@ -259,7 +261,7 @@ void serializeReplicationPolicy(Ar& ar, IRepPolicyRef& policy) { else { std::string name = policy ? policy->name() : "None"; Standalone nameRef = StringRef(name); - ar & nameRef; + serializer(ar, nameRef); if(name == "One") { ((PolicyOne*)policy.getPtr())->serialize(ar); } diff --git a/fdbrpc/ReplicationTypes.h b/fdbrpc/ReplicationTypes.h index d30925e8b4..ef5463f54b 100644 --- a/fdbrpc/ReplicationTypes.h +++ b/fdbrpc/ReplicationTypes.h @@ -22,6 +22,7 @@ #define FLOW_REPLICATION_TYPES_H #pragma once +#include #include "flow/flow.h" #include "fdbrpc/Locality.h" @@ -140,6 +141,18 @@ struct LocalityRecord : public ReferenceCounted { int getMemoryUsed() const { return sizeof(_entryIndex) + sizeof(_dataMap) + _dataMap->getMemoryUsed(); } + + std::string toString() { + std::stringstream ss; + ss << "KeyValueArraySize:" << _dataMap->_keyvaluearray.size(); + for (int i = 0; i < _dataMap->size(); ++i) { + AttribRecord attribRecord = _dataMap->_keyvaluearray[i]; // first is key, second is value + ss << " KeyValueArrayIndex:" << i << " Key:" << attribRecord.first._id << + " Value:" << attribRecord.second._id; + } + + return ss.str(); + } }; // This class stores the information for string to integer map for keys and values diff --git a/fdbrpc/Smoother.h b/fdbrpc/Smoother.h index 3ed8e6e983..fb4694750f 100644 --- a/fdbrpc/Smoother.h +++ b/fdbrpc/Smoother.h @@ -23,6 +23,7 @@ #pragma once #include "flow/flow.h" +#include struct Smoother { // Times (t) are expected to be nondecreasing @@ -90,4 +91,4 @@ struct TimerSmoother { double time, total, estimate; }; -#endif \ No newline at end of file +#endif diff --git a/fdbrpc/TLSConnection.actor.cpp b/fdbrpc/TLSConnection.actor.cpp index 0b737c302c..c2a275fb43 100644 --- a/fdbrpc/TLSConnection.actor.cpp +++ b/fdbrpc/TLSConnection.actor.cpp @@ -46,10 +46,10 @@ static int send_func(void* ctx, const uint8_t* buf, int len) { int w = conn->conn->write( &sb ); return w; } catch ( Error& e ) { 
- TraceEvent("TLSConnectionSendError", conn->getDebugID()).error(e); + TraceEvent("TLSConnectionSendError", conn->getDebugID()).error(e).suppressFor(1.0); return -1; } catch ( ... ) { - TraceEvent("TLSConnectionSendError", conn->getDebugID()).error( unknown_error() ); + TraceEvent("TLSConnectionSendError", conn->getDebugID()).error( unknown_error() ).suppressFor(1.0); return -1; } } @@ -62,10 +62,10 @@ static int recv_func(void* ctx, uint8_t* buf, int len) { int r = conn->conn->read( buf, buf + len ); return r; } catch ( Error& e ) { - TraceEvent("TLSConnectionRecvError", conn->getDebugID()).error(e); + TraceEvent("TLSConnectionRecvError", conn->getDebugID()).error(e).suppressFor(1.0); return -1; } catch ( ... ) { - TraceEvent("TLSConnectionRecvError", conn->getDebugID()).error( unknown_error() ); + TraceEvent("TLSConnectionRecvError", conn->getDebugID()).error( unknown_error() ).suppressFor(1.0); return -1; } } diff --git a/fdbrpc/dsltest.actor.cpp b/fdbrpc/dsltest.actor.cpp index 804caa1449..bfde5ccee9 100644 --- a/fdbrpc/dsltest.actor.cpp +++ b/fdbrpc/dsltest.actor.cpp @@ -925,7 +925,7 @@ struct AddReply { template void serialize(Ar& ar) { - ar & sum; + serializer(ar, sum); } }; @@ -938,7 +938,7 @@ struct AddRequest { template void serialize(Ar& ar) { - ar & a & b & reply; + serializer(ar, a, b, reply); } }; diff --git a/fdbrpc/fdbrpc.vcxproj b/fdbrpc/fdbrpc.vcxproj index 67a7eb9ff7..3231afcfbe 100644 --- a/fdbrpc/fdbrpc.vcxproj +++ b/fdbrpc/fdbrpc.vcxproj @@ -74,7 +74,6 @@ - false diff --git a/fdbrpc/fdbrpc.vcxproj.filters b/fdbrpc/fdbrpc.vcxproj.filters index 5590cd39c8..0aed89c29f 100644 --- a/fdbrpc/fdbrpc.vcxproj.filters +++ b/fdbrpc/fdbrpc.vcxproj.filters @@ -137,7 +137,6 @@ - diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt new file mode 100644 index 0000000000..37adfa2366 --- /dev/null +++ b/fdbserver/CMakeLists.txt @@ -0,0 +1,185 @@ +set(FDBSERVER_SRCS + ApplyMetadataMutation.h + ClusterController.actor.cpp + ClusterRecruitmentInterface.h + ConflictSet.h + CoordinatedState.actor.cpp + CoordinatedState.h + Coordination.actor.cpp + CoordinationInterface.h + CoroFlow.actor.cpp + CoroFlow.h + DataDistribution.actor.cpp + DataDistribution.h + DataDistributionQueue.actor.cpp + DataDistributionTracker.actor.cpp + DBCoreState.h + DiskQueue.actor.cpp + fdbserver.actor.cpp + IDiskQueue.h + IKeyValueStore.h + IPager.h + IVersionedStore.h + IndirectShadowPager.actor.cpp + IndirectShadowPager.h + KeyValueStoreCompressTestData.actor.cpp + KeyValueStoreMemory.actor.cpp + KeyValueStoreSQLite.actor.cpp + Knobs.cpp + Knobs.h + LeaderElection.actor.cpp + LeaderElection.h + LogProtocolMessage.h + LogRouter.actor.cpp + LogSystem.h + LogSystemConfig.h + LogSystemDiskQueueAdapter.actor.cpp + LogSystemDiskQueueAdapter.h + LogSystemPeekCursor.actor.cpp + MasterInterface.h + MasterProxyServer.actor.cpp + masterserver.actor.cpp + MemoryPager.actor.cpp + MemoryPager.h + MoveKeys.actor.cpp + MoveKeys.h + networktest.actor.cpp + NetworkTest.h + OldTLogServer.actor.cpp + Orderer.actor.h + pubsub.actor.cpp + pubsub.h + QuietDatabase.actor.cpp + QuietDatabase.h + Ratekeeper.actor.cpp + Ratekeeper.h + RecoveryState.h + Restore.actor.cpp + RestoreInterface.h + Resolver.actor.cpp + ResolverInterface.h + ServerDBInfo.h + SimulatedCluster.actor.cpp + SimulatedCluster.h + SkipList.cpp + sqlite/btree.h + sqlite/hash.h + sqlite/sqlite3.h + sqlite/sqlite3ext.h + sqlite/sqliteInt.h + sqlite/sqliteLimit.h + sqlite/sqlite3.amalgamation.c + Status.actor.cpp + Status.h + StorageMetrics.actor.h + 
StorageMetrics.h + storageserver.actor.cpp + TagPartitionedLogSystem.actor.cpp + template_fdb.h + tester.actor.cpp + TesterInterface.h + TLogInterface.h + TLogServer.actor.cpp + VersionedBTree.actor.cpp + VFSAsync.cpp + WaitFailure.actor.cpp + WaitFailure.h + worker.actor.cpp + WorkerInterface.h + workloads/ApiCorrectness.actor.cpp + workloads/ApiWorkload.actor.cpp + workloads/ApiWorkload.h + workloads/AsyncFile.actor.h + workloads/AsyncFile.cpp + workloads/AsyncFileCorrectness.actor.cpp + workloads/AsyncFileRead.actor.cpp + workloads/AsyncFileWrite.actor.cpp + workloads/AtomicOps.actor.cpp + workloads/AtomicOpsApiCorrectness.actor.cpp + workloads/AtomicRestore.actor.cpp + workloads/AtomicSwitchover.actor.cpp + workloads/BackgroundSelectors.actor.cpp + workloads/BackupCorrectness.actor.cpp + workloads/BackupToDBAbort.actor.cpp + workloads/BackupToDBCorrectness.actor.cpp + workloads/BackupToDBUpgrade.actor.cpp + workloads/BulkLoad.actor.cpp + workloads/BulkSetup.actor.h + workloads/ChangeConfig.actor.cpp + workloads/ClientTransactionProfileCorrectness.actor.cpp + workloads/CommitBugCheck.actor.cpp + workloads/ConfigureDatabase.actor.cpp + workloads/ConflictRange.actor.cpp + workloads/ConsistencyCheck.actor.cpp + workloads/CpuProfiler.actor.cpp + workloads/Cycle.actor.cpp + workloads/DDBalance.actor.cpp + workloads/DDMetrics.actor.cpp + workloads/DiskDurability.actor.cpp + workloads/DiskDurabilityTest.actor.cpp + workloads/DummyWorkload.actor.cpp + workloads/FastTriggeredWatches.actor.cpp + workloads/FileSystem.actor.cpp + workloads/Fuzz.cpp + workloads/FuzzApiCorrectness.actor.cpp + workloads/Increment.actor.cpp + workloads/IndexScan.actor.cpp + workloads/Inventory.actor.cpp + workloads/KVStoreTest.actor.cpp + workloads/LockDatabase.actor.cpp + workloads/LogMetrics.actor.cpp + workloads/LowLatency.actor.cpp + workloads/MachineAttrition.actor.cpp + workloads/MemoryKeyValueStore.cpp + workloads/MemoryKeyValueStore.h + workloads/MemoryLifetime.actor.cpp + workloads/MetricLogging.actor.cpp + workloads/Performance.actor.cpp + workloads/Ping.actor.cpp + workloads/PubSubMultiples.actor.cpp + workloads/QueuePush.actor.cpp + workloads/RandomClogging.actor.cpp + workloads/RandomMoveKeys.actor.cpp + workloads/RandomSelector.actor.cpp + workloads/ReadWrite.actor.cpp + workloads/RemoveServersSafely.actor.cpp + workloads/Rollback.actor.cpp + workloads/RyowCorrectness.actor.cpp + workloads/RYWDisable.actor.cpp + workloads/RYWPerformance.actor.cpp + workloads/SaveAndKill.actor.cpp + workloads/SelectorCorrectness.actor.cpp + workloads/Serializability.actor.cpp + workloads/Sideband.actor.cpp + workloads/SlowTaskWorkload.actor.cpp + workloads/StatusWorkload.actor.cpp + workloads/Storefront.actor.cpp + workloads/StreamingRead.actor.cpp + workloads/TargetedKill.actor.cpp + workloads/TaskBucketCorrectness.actor.cpp + workloads/ThreadSafety.actor.cpp + workloads/Throughput.actor.cpp + workloads/TimeKeeperCorrectness.actor.cpp + workloads/UnitPerf.actor.cpp + workloads/UnitTests.actor.cpp + workloads/Unreadable.actor.cpp + workloads/VersionStamp.actor.cpp + workloads/WatchAndWait.actor.cpp + workloads/Watches.actor.cpp + workloads/WorkerErrors.actor.cpp + workloads/workloads.h + workloads/WriteBandwidth.actor.cpp + workloads/WriteDuringRead.actor.cpp) + +file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/workloads) + +actor_set(FDBSERVER_BUILD "${FDBSERVER_SRCS}") +add_executable(fdbserver ${FDBSERVER_BUILD}) +actor_compile(fdbserver "${FDBSERVER_SRCS}") +target_include_directories(fdbserver PRIVATE + 
${CMAKE_CURRENT_BINARY_DIR}/workloads + ${CMAKE_CURRENT_SOURCE_DIR}/workloads) +target_link_libraries(fdbserver PRIVATE fdbclient) + +install(TARGETS fdbserver DESTINATION ${FDB_SBIN_DIR} COMPONENT server) + diff --git a/fdbserver/ClusterRecruitmentInterface.h b/fdbserver/ClusterRecruitmentInterface.h index 3f891a19ec..f3d0860c29 100644 --- a/fdbserver/ClusterRecruitmentInterface.h +++ b/fdbserver/ClusterRecruitmentInterface.h @@ -61,7 +61,7 @@ struct ClusterControllerFullInterface { template void serialize( Ar& ar ) { ASSERT( ar.protocolVersion() >= 0x0FDB00A200040001LL ); - ar & clientInterface & recruitFromConfiguration & recruitRemoteFromConfiguration & recruitStorage & registerWorker & getWorkers & registerMaster & getServerDBInfo; + serializer(ar, clientInterface, recruitFromConfiguration, recruitRemoteFromConfiguration, recruitStorage, registerWorker, getWorkers, registerMaster, getServerDBInfo); } }; @@ -77,7 +77,7 @@ struct RecruitFromConfigurationRequest { template void serialize( Ar& ar ) { - ar & configuration & recruitSeedServers & maxOldLogRouters & reply; + serializer(ar, configuration, recruitSeedServers, maxOldLogRouters, reply); } }; @@ -95,7 +95,7 @@ struct RecruitFromConfigurationReply { template void serialize( Ar& ar ) { - ar & tLogs & satelliteTLogs & proxies & resolvers & storageServers & oldLogRouters & dcId & satelliteFallback; + serializer(ar, tLogs, satelliteTLogs, proxies, resolvers, storageServers, oldLogRouters, dcId, satelliteFallback); } }; @@ -110,7 +110,7 @@ struct RecruitRemoteFromConfigurationRequest { template void serialize( Ar& ar ) { - ar & configuration & dcId & logRouterCount & reply; + serializer(ar, configuration, dcId, logRouterCount, reply); } }; @@ -120,7 +120,7 @@ struct RecruitRemoteFromConfigurationReply { template void serialize( Ar& ar ) { - ar & remoteTLogs & logRouters; + serializer(ar, remoteTLogs, logRouters); } }; @@ -130,7 +130,7 @@ struct RecruitStorageReply { template void serialize( Ar& ar ) { - ar & worker & processClass; + serializer(ar, worker, processClass); } }; @@ -143,7 +143,7 @@ struct RecruitStorageRequest { template void serialize( Ar& ar ) { - ar & excludeMachines & excludeAddresses & includeDCs & criticalRecruitment & reply; + serializer(ar, excludeMachines, excludeAddresses, includeDCs, criticalRecruitment, reply); } }; @@ -156,7 +156,7 @@ struct RegisterWorkerReply { template void serialize( Ar& ar ) { - ar & processClass & priorityInfo; + serializer(ar, processClass, priorityInfo); } }; @@ -174,7 +174,7 @@ struct RegisterWorkerRequest { template void serialize( Ar& ar ) { - ar & wi & initialClass & processClass & priorityInfo & generation & reply; + serializer(ar, wi, initialClass, processClass, priorityInfo, generation, reply); } }; @@ -189,7 +189,7 @@ struct GetWorkersRequest { template void serialize(Ar& ar) { - ar & flags & reply; + serializer(ar, flags, reply); } }; @@ -213,7 +213,7 @@ struct RegisterMasterRequest { template void serialize( Ar& ar ) { ASSERT( ar.protocolVersion() >= 0x0FDB00A200040001LL ); - ar & id & mi & logSystemConfig & proxies & resolvers & recoveryCount & registrationCount & configuration & priorCommittedLogServers & recoveryState & recoveryStalled & reply; + serializer(ar, id, mi, logSystemConfig, proxies, resolvers, recoveryCount, registrationCount, configuration, priorCommittedLogServers, recoveryState, recoveryStalled, reply); } }; @@ -225,7 +225,7 @@ struct GetServerDBInfoRequest { template void serialize(Ar& ar) { - ar & knownServerInfoID & issues & incompatiblePeers & reply; + 
serializer(ar, knownServerInfoID, issues, incompatiblePeers, reply); } }; diff --git a/fdbserver/CoordinatedState.actor.cpp b/fdbserver/CoordinatedState.actor.cpp index 0bc7b8495a..018bd97c8d 100644 --- a/fdbserver/CoordinatedState.actor.cpp +++ b/fdbserver/CoordinatedState.actor.cpp @@ -208,7 +208,7 @@ struct MovableValue { template void serialize(Ar& ar) { ASSERT( ar.protocolVersion() >= 0x0FDB00A2000D0001LL ); - ar & value & mode & other; + serializer(ar, value, mode, other); } }; @@ -316,4 +316,4 @@ MovableCoordinatedState::~MovableCoordinatedState() { Future MovableCoordinatedState::read() { return MovableCoordinatedStateImpl::read(impl); } Future MovableCoordinatedState::onConflict() { return impl->onConflict(); } Future MovableCoordinatedState::setExclusive(Value v) { return impl->setExclusive(v); } -Future MovableCoordinatedState::move( ClusterConnectionString const& nc ) { return MovableCoordinatedStateImpl::move(impl, nc); } \ No newline at end of file +Future MovableCoordinatedState::move( ClusterConnectionString const& nc ) { return MovableCoordinatedStateImpl::move(impl, nc); } diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index a8d878b717..c111baf3fb 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -33,7 +33,7 @@ struct GenerationRegVal { Optional val; template void serialize(Ar& ar) { - ar & readGen & writeGen & val; + serializer(ar, readGen, writeGen, val); } }; diff --git a/fdbserver/CoordinationInterface.h b/fdbserver/CoordinationInterface.h index b7044f5c9c..931fec5500 100644 --- a/fdbserver/CoordinationInterface.h +++ b/fdbserver/CoordinationInterface.h @@ -66,7 +66,7 @@ struct UniqueGeneration { } template void serialize(Ar& ar) { - ar & generation & uid; + serializer(ar, generation, uid); } }; @@ -78,7 +78,7 @@ struct GenerationRegReadRequest { GenerationRegReadRequest( Key key, UniqueGeneration gen ) : key(key), gen(gen) {} template void serialize(Ar& ar) { - ar & key & gen & reply; + serializer(ar, key, gen, reply); } }; @@ -89,7 +89,7 @@ struct GenerationRegReadReply { GenerationRegReadReply( Optional value, UniqueGeneration gen, UniqueGeneration rgen ) : value(value), gen(gen), rgen(rgen) {} template void serialize(Ar& ar) { - ar & value & gen & rgen; + serializer(ar, value, gen, rgen); } }; @@ -101,7 +101,7 @@ struct GenerationRegWriteRequest { GenerationRegWriteRequest(KeyValue kv, UniqueGeneration gen) : kv(kv), gen(gen) {} template void serialize(Ar& ar) { - ar & kv & gen & reply; + serializer(ar, kv, gen, reply); } }; @@ -126,7 +126,7 @@ struct CandidacyRequest { template void serialize(Ar& ar) { - ar & key & myInfo & knownLeader & prevChangeID & reply; + serializer(ar, key, myInfo, knownLeader, prevChangeID, reply); } }; @@ -141,7 +141,7 @@ struct LeaderHeartbeatRequest { template void serialize(Ar& ar) { - ar & key & myInfo & prevChangeID & reply; + serializer(ar, key, myInfo, prevChangeID, reply); } }; @@ -155,7 +155,7 @@ struct ForwardRequest { template void serialize(Ar& ar) { - ar & key & conn & reply; + serializer(ar, key, conn, reply); } }; @@ -169,4 +169,4 @@ public: Future coordinationServer( std::string const& dataFolder ); -#endif \ No newline at end of file +#endif diff --git a/fdbserver/DBCoreState.h b/fdbserver/DBCoreState.h index ef2814b539..56e8503ba6 100644 --- a/fdbserver/DBCoreState.h +++ b/fdbserver/DBCoreState.h @@ -54,7 +54,7 @@ struct CoreTLogSet { template void serialize(Archive& ar) { - ar & tLogs & tLogWriteAntiQuorum & tLogReplicationFactor & tLogPolicy & 
tLogLocalities & isLocal & locality & startVersion & satelliteTagLocations; + serializer(ar, tLogs, tLogWriteAntiQuorum, tLogReplicationFactor, tLogPolicy, tLogLocalities, isLocal, locality, startVersion, satelliteTagLocations); } }; @@ -72,11 +72,11 @@ struct OldTLogCoreData { template void serialize(Archive& ar) { if( ar.protocolVersion() >= 0x0FDB00A560010001LL) { - ar & tLogs & logRouterTags & epochEnd; + serializer(ar, tLogs, logRouterTags, epochEnd); } else if(ar.isDeserializing) { tLogs.push_back(CoreTLogSet()); - ar & tLogs[0].tLogs & tLogs[0].tLogWriteAntiQuorum & tLogs[0].tLogReplicationFactor & tLogs[0].tLogPolicy & epochEnd & tLogs[0].tLogLocalities; + serializer(ar, tLogs[0].tLogs, tLogs[0].tLogWriteAntiQuorum, tLogs[0].tLogReplicationFactor, tLogs[0].tLogPolicy, epochEnd, tLogs[0].tLogLocalities); } } }; @@ -122,18 +122,18 @@ struct DBCoreState { ASSERT(ar.protocolVersion() >= 0x0FDB00A460010001LL); if(ar.protocolVersion() >= 0x0FDB00A560010001LL) { - ar & tLogs & logRouterTags & oldTLogData & recoveryCount & logSystemType; + serializer(ar, tLogs, logRouterTags, oldTLogData, recoveryCount, logSystemType); } else if(ar.isDeserializing) { tLogs.push_back(CoreTLogSet()); - ar & tLogs[0].tLogs & tLogs[0].tLogWriteAntiQuorum & recoveryCount & tLogs[0].tLogReplicationFactor & logSystemType; + serializer(ar, tLogs[0].tLogs, tLogs[0].tLogWriteAntiQuorum, recoveryCount, tLogs[0].tLogReplicationFactor, logSystemType); uint64_t tLocalitySize = (uint64_t)tLogs[0].tLogLocalities.size(); - ar & oldTLogData & tLogs[0].tLogPolicy & tLocalitySize; + serializer(ar, oldTLogData, tLogs[0].tLogPolicy, tLocalitySize); if (ar.isDeserializing) { tLogs[0].tLogLocalities.reserve(tLocalitySize); for (size_t i = 0; i < tLocalitySize; i++) { LocalityData locality; - ar & locality; + serializer(ar, locality); tLogs[0].tLogLocalities.push_back(locality); } diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index d3c89cdd8c..c320dd3afc 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -25,21 +25,26 @@ #include "fdbserver/MoveKeys.h" #include "fdbserver/Knobs.h" #include +#include #include "fdbserver/WaitFailure.h" #include "fdbserver/ServerDBInfo.h" #include "fdbserver/IKeyValueStore.h" #include "fdbclient/ManagementAPI.h" #include "fdbrpc/Replication.h" #include "flow/UnitTest.h" +#include "flow/Trace.h" #include "flow/actorcompiler.h" // This must be the last #include. class TCTeamInfo; +struct TCMachineInfo; +class TCMachineTeamInfo; struct TCServerInfo : public ReferenceCounted { UID id; StorageServerInterface lastKnownInterface; ProcessClass lastKnownClass; vector> teams; + Reference machine; Future tracker; int64_t dataInFlightToServer; ErrorOr serverMetrics; @@ -57,6 +62,30 @@ struct TCServerInfo : public ReferenceCounted { } }; +struct TCMachineInfo : public ReferenceCounted { + std::vector> serversOnMachine; // SOMEDAY: change from vector to set + Standalone machineID; + std::vector> machineTeams; // SOMEDAY: split good and bad machine teams. 
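+ // The entry of this machine's representative server in DDTeamCollection::machineLocalityMap; + // reassigned whenever rebuildMachineLocalityMap() rebuilds that map.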
+ LocalityEntry localityEntry; + + explicit TCMachineInfo(Reference server, const LocalityEntry& entry) : localityEntry(entry) { + ASSERT(serversOnMachine.empty()); + serversOnMachine.push_back(server); + machineID = server->lastKnownInterface.locality.zoneId().get(); + } + + std::string getServersIDStr() { + std::stringstream ss; + if (serversOnMachine.empty()) return "[unset]"; + + for (auto& server : serversOnMachine) { + ss << server->id.toString() << " "; + } + + return ss.str(); + } +}; + ACTOR Future updateServerMetrics( TCServerInfo *server ) { state StorageServerInterface ssi = server->lastKnownInterface; state Future> metricsRequest = ssi.getPhysicalMetrics.tryGetReply( GetPhysicalMetricsRequest(), TaskDataDistributionLaunch ); @@ -103,22 +132,72 @@ ACTOR Future updateServerMetrics( Reference server ) { return Void(); } +// Machine team information +class TCMachineTeamInfo : public ReferenceCounted { +public: + vector> machines; + vector> machineIDs; + + explicit TCMachineTeamInfo(vector> const& machines) : machines(machines) { + machineIDs.reserve(machines.size()); + for (int i = 0; i < machines.size(); i++) { + machineIDs.push_back(machines[i]->machineID); + } + sort(machineIDs.begin(), machineIDs.end()); + } + + int size() { + ASSERT(machines.size() == machineIDs.size()); + return machineIDs.size(); + } + + std::string getMachineIDsStr() { + std::stringstream ss; + + if (machineIDs.empty()) return "[unset]"; + + for (auto& id : machineIDs) { + ss << id.contents().toString() << " "; + } + + return ss.str(); + } + + int getTotalMachineTeamNumber() { + int count = 0; + + for (auto& machine : machines) { + ASSERT(machine->machineTeams.size() >= 0); + count += machine->machineTeams.size(); + } + + return count; + } + + bool operator==(TCMachineTeamInfo& rhs) const { return this->machineIDs == rhs.machineIDs; } +}; + class TCTeamInfo : public ReferenceCounted, public IDataDistributionTeam { public: vector< Reference > servers; vector serverIDs; + Reference machineTeam; Future tracker; bool healthy; bool wrongConfiguration; //True if any of the servers in the team have the wrong configuration int priority; - TCTeamInfo( vector< Reference > const& servers ) - : servers(servers), healthy(true), priority(PRIORITY_TEAM_HEALTHY), wrongConfiguration(false) - { + explicit TCTeamInfo(vector> const& servers) + : servers(servers), healthy(true), priority(PRIORITY_TEAM_HEALTHY), wrongConfiguration(false) { + if (servers.empty()) { + TraceEvent(SevInfo, "ConstructTCTeamFromEmptyServers"); + } serverIDs.reserve(servers.size()); - for(int i=0; iid); + } } + virtual vector getLastKnownServerInterfaces() { vector v; v.reserve(servers.size()); @@ -128,6 +207,19 @@ public: } virtual int size() { return servers.size(); } virtual vector const& getServerIDs() { return serverIDs; } + + virtual std::string getServerIDsStr() { + std::stringstream ss; + + if (serverIDs.empty()) return "[unset]"; + + for (auto& id : serverIDs) { + ss << id.toString() << " "; + } + + return ss.str(); + } + virtual void addDataInFlightToTeam( int64_t delta ) { for(int i=0; idataInFlightToServer += delta; @@ -294,8 +386,6 @@ ACTOR Future waitForAllDataRemoved( Database cx, UID serverID, Version add // Wait for any change to the serverKeys for this server wait( delay(SERVER_KNOBS->ALL_DATA_REMOVED_DELAY, TaskDataDistribution) ); - //Void _ = tr.waitForChanges( KeyRangeRef( serverKeysPrefixFor(serverID), - // serverKeysPrefixFor(serverID).toString() + allKeys.end.toString() ) ); tr.reset(); } catch (Error& e) { wait( tr.onError(e) 
); @@ -327,7 +417,7 @@ ACTOR Future> getInitialDataDistribution( Dat BinaryReader rd( mode.get(), Unversioned() ); rd >> result->mode; } - if(!result->mode) + if (!result->mode) // result->mode can be changed to 0 when we disable data distribution return result; @@ -487,6 +577,12 @@ struct DDTeamCollection : ReferenceCounted { int64_t unhealthyServers; std::map priority_teams; std::map> server_info; + + // machine_info has all machines info; key must be unique across processes on the same machine + std::map, Reference> machine_info; + std::vector> machineTeams; // all machine teams + LocalityMap machineLocalityMap; // locality info of machines + vector> teams; vector> badTeams; Reference shardsAffectedByTeamFailure; @@ -645,6 +741,8 @@ struct DDTeamCollection : ReferenceCounted { // tracking is "edge triggered") // SOMEDAY: Account for capacity, load (when shardMetrics load is high) + // self->teams.size() can be 0 under the ConfigureTest.txt test when we change configurations + // The situation happens rarely. We may want to eliminate this situation someday if( !self->teams.size() ) { req.reply.send( Optional>() ); return Void(); @@ -665,6 +763,7 @@ struct DDTeamCollection : ReferenceCounted { for( int i = 0; i < req.sources.size(); i++ ) { if( !self->server_info.count( req.sources[i] ) ) { TEST( true ); // GetSimilarTeams source server now unknown + TraceEvent(SevWarn, "GetTeam").detail("ReqSourceUnknown", req.sources[i]); } else { auto& teamList = self->server_info[ req.sources[i] ]->teams; @@ -697,6 +796,8 @@ struct DDTeamCollection : ReferenceCounted { if( foundExact || (req.wantsTrueBest && bestOption.present() ) ) { ASSERT( bestOption.present() ); + // Check the team size: be sure team size is correct + ASSERT(bestOption.get()->size() == self->configuration.storageTeamSize); req.reply.send( bestOption ); return Void(); } @@ -747,6 +848,8 @@ struct DDTeamCollection : ReferenceCounted { } } + // Note: req.completeSources can be empty and all servers (and server teams) can be unhealthy. + // We will get stuck at this! This only happens when a DC fails. No need to consider it right now. if(!bestOption.present() && self->zeroHealthyTeams->get()) { //Attempt to find the unhealthy source server team and return it std::set completeSources; @@ -837,7 +940,7 @@ struct DDTeamCollection : ReferenceCounted { if( self->satisfiesPolicy(servers) ) { if(servers.size() == self->configuration.storageTeamSize || self->satisfiesPolicy(servers, self->configuration.storageTeamSize)) { servers.resize(self->configuration.storageTeamSize); - self->addTeam(servers); + self->addTeam(servers, true); } else { tempSet->clear(); for( auto it : servers ) { @@ -853,7 +956,7 @@ struct DDTeamCollection : ReferenceCounted { for(auto& it : self->resultEntries) { serverIds.push_back(*tempMap->getObject(it)); } - self->addTeam(serverIds.begin(), serverIds.end()); + self->addTeam(serverIds.begin(), serverIds.end(), true); } } else { serverIds.clear(); @@ -881,7 +984,7 @@ struct DDTeamCollection : ReferenceCounted { state std::set>::iterator teamIter = self->primary ? initTeams->primaryTeams.begin() : initTeams->remoteTeams.begin(); state std::set>::iterator teamIterEnd = self->primary ? 
initTeams->primaryTeams.end() : initTeams->remoteTeams.end(); for(; teamIter != teamIterEnd; ++teamIter) { - self->addTeam(teamIter->begin(), teamIter->end() ); + self->addTeam(teamIter->begin(), teamIter->end(), true); wait( yield() ); } @@ -894,7 +997,8 @@ struct DDTeamCollection : ReferenceCounted { ASSERT( serverCount == server_info.size() ); - int minTeams = 100000, maxTeams = 0; + int minTeams = std::numeric_limits::max(); + int maxTeams = std::numeric_limits::min(); double varTeams = 0; std::map>, int> machineTeams; @@ -904,12 +1008,14 @@ struct DDTeamCollection : ReferenceCounted { minTeams = std::min(minTeams, stc); maxTeams = std::max(maxTeams, stc); varTeams += (stc - teamsPerServer)*(stc - teamsPerServer); + // Use zoneId as server's machine id machineTeams[s->second->lastKnownInterface.locality.zoneId()] += stc; } } varTeams /= teamsPerServer*teamsPerServer; - int minMachineTeams = 100000, maxMachineTeams = 0; + int minMachineTeams = std::numeric_limits::max(); + int maxMachineTeams = std::numeric_limits::min(); for( auto m = machineTeams.begin(); m != machineTeams.end(); ++m ) { minMachineTeams = std::min( minMachineTeams, m->second ); maxMachineTeams = std::max( maxMachineTeams, m->second ); @@ -929,189 +1035,665 @@ struct DDTeamCollection : ReferenceCounted { } bool teamExists( vector &team ) { - bool exists = false; - for (int i=0;igetServerIDs() == team) { - exists = true; - break; + if (team.empty()) { + return false; + } + + UID& serverID = team[0]; + for (auto& usedTeam : server_info[serverID]->teams) { + if (team == usedTeam->getServerIDs()) { + return true; } } - return exists; + + return false; } - void addTeam( const vector>& newTeamServers ) { - Reference teamInfo( new TCTeamInfo( newTeamServers ) ); + // SOMEDAY: when machineTeams is changed from vector to set, we may check the existance faster + bool machineTeamExists(vector>& machineIDs) { return findMachineTeam(machineIDs).isValid(); } - bool badTeam = !satisfiesPolicy(teamInfo->servers) || teamInfo->servers.size() != configuration.storageTeamSize; - teamInfo->tracker = teamTracker( this, teamInfo, badTeam ); - if( badTeam ) { - badTeams.push_back( teamInfo ); - } else { - teams.push_back( teamInfo ); - for (int i=0;iteams.push_back( teamInfo ); + Reference findMachineTeam(vector>& machineIDs) { + if (machineIDs.empty()) { + return Reference(); + } + + Standalone machineID = machineIDs[0]; + for (auto& machineTeam : machine_info[machineID]->machineTeams) { + if (machineTeam->machineIDs == machineIDs) { + return machineTeam; } } + + return Reference(); } - template - void addTeam( InputIt begin, InputIt end) { - vector< Reference > newTeamServers; + // Assume begin to end is sorted by std::sort + // Assume InputIt is iterator to UID + // Note: We must allow creating empty teams because empty team is created when a remote DB is initialized. 
+ // The empty team is used as the starting point to move data to the remote DB + // begin : the start of the team member ID + // end : end of the team member ID + // isIntialTeam : False when the team is added by addTeamsBestOf(); True otherwise, e.g., + // when the team added at init() when we recreate teams by looking up DB + template + void addTeam(InputIt begin, InputIt end, bool isInitialTeam) { + vector> newTeamServers; for (auto i = begin; i != end; ++i) { if (server_info.find(*i) != server_info.end()) { newTeamServers.push_back(server_info[*i]); } } - addTeam( newTeamServers ); + addTeam(newTeamServers, isInitialTeam); } - void addTeam( std::set const& team ) { - addTeam(team.begin(), team.end()); - } + void addTeam(const vector>& newTeamServers, bool isInitialTeam) { + Reference teamInfo(new TCTeamInfo(newTeamServers)); + bool badTeam = !satisfiesPolicy(teamInfo->servers) || teamInfo->servers.size() != configuration.storageTeamSize; - ACTOR Future addAllTeams( DDTeamCollection* self, int location, vector* history, Reference> processes, vector>* output, int teamLimit, int* addedTeams ) { - wait( yield( TaskDataDistributionLaunch ) ); - - // Add team, if valid - if(history->size() == self->configuration.storageTeamSize) { - auto valid = self->configuration.storagePolicy->validate(*history, processes); - if(!valid) { - return Void(); - } - std::vector team; - for(auto it = history->begin(); it != history->end(); it++) { - team.push_back(*processes->getObject(*it)); - } - - if( !self->teamExists(team) && *addedTeams < teamLimit ) { - output->push_back(team); - (*addedTeams)++; - } - return Void(); + teamInfo->tracker = teamTracker(this, teamInfo, badTeam); + // ASSERT( teamInfo->serverIDs.size() > 0 ); //team can be empty at DB initialization + if (badTeam) { + badTeams.push_back(teamInfo); + return; } - //loop through remaining potential team members, add one and recursively call function - for(; location < processes->size(); location++) { - history->push_back(processes->getEntry(location)); - state int depth = history->size(); - wait( self->addAllTeams( self, location + 1, history, processes, output, teamLimit, addedTeams ) ); - ASSERT( history->size() == depth); // the "stack" should be unchanged by this call - history->pop_back(); - if(*addedTeams > teamLimit) - break; + // For a good team, we add it to teams and create machine team for it when necessary + teams.push_back(teamInfo); + for (int i = 0; i < newTeamServers.size(); ++i) { + newTeamServers[i]->teams.push_back(teamInfo); } - return Void(); + // Find or create machine team for the server team + // Add the reference of machineTeam (with machineIDs) into process team + vector> machineIDs; + for (auto server = newTeamServers.begin(); server != newTeamServers.end(); ++server) { + machineIDs.push_back((*server)->machine->machineID); + } + sort(machineIDs.begin(), machineIDs.end()); + Reference machineTeamInfo = findMachineTeam(machineIDs); + + // A team is not initial team if it is added by addTeamsBestOf() which always create a team with correct size + // A non-initial team must have its machine team created and its size must be correct + ASSERT(isInitialTeam || machineTeamInfo.isValid()); + + // Create a machine team if it does not exist + // Note an initial team may be added at init() even though the team size is not storageTeamSize + if (!machineTeamInfo.isValid() && !machineIDs.empty()) { + machineTeamInfo = addMachineTeam(machineIDs.begin(), machineIDs.end()); + } + + if (!machineTeamInfo.isValid()) { + 
TraceEvent(SevWarn, "AddTeamWarning") + .detail("NotFoundMachineTeam", "OKIfTeamIsEmpty") + .detail("TeamInfo", teamInfo->getDesc()); + } + + teamInfo->machineTeam = machineTeamInfo; } - ACTOR Future addAllTeams( DDTeamCollection* self, vector input, vector>* output, int teamLimit ) { - state int addedTeams = 0; - state vector history; - state Reference> processes(new LocalityMap()); - for(auto it = input.begin(); it != input.end(); it++) { - if(self->server_info[*it]) { - processes->add(self->server_info[*it]->lastKnownInterface.locality, &*it); + void addTeam(std::set const& team, bool isInitialTeam) { addTeam(team.begin(), team.end(), isInitialTeam); } + + // Add a machine team specified by input machines + Reference addMachineTeam(vector> machines) { + Reference machineTeamInfo(new TCMachineTeamInfo(machines)); + machineTeams.push_back(machineTeamInfo); + + // Assign machine teams to machine + for (auto machine : machines) { + machine->machineTeams.push_back(machineTeamInfo); + } + + return machineTeamInfo; + } + + // Add a machine team by using the machineIDs from begin to end + Reference addMachineTeam(vector>::iterator begin, + vector>::iterator end) { + vector> machines; + + for (auto i = begin; i != end; ++i) { + if (machine_info.find(*i) != machine_info.end()) { + machines.push_back(machine_info[*i]); + } else { + TraceEvent(SevWarn, "AddMachineTeamError").detail("MachineIDNotExist", i->contents().toString()); } } - wait( self->addAllTeams( self, 0, &history, processes, output, teamLimit, &addedTeams ) ); - return addedTeams; + + return addMachineTeam(machines); } - int addTeamsBestOf( int teamsToBuild ) { - int addedTeams = 0; - - LocalityMap totalServers; - + // Group storage servers (process) based on their machineId in LocalityData + // All created machines are healthy + // Return The number of healthy servers we grouped into machines + int constructMachinesFromServers() { + int totalServerIndex = 0; for(auto i = server_info.begin(); i != server_info.end(); ++i) { if (!server_status.get(i->first).isUnhealthy()) { - auto& id = i->first; - auto& locality = i->second->lastKnownInterface.locality; - totalServers.add(locality, &id); + checkAndCreateMachine(i->second); + totalServerIndex++; } } - if(totalServers.size() < configuration.storageTeamSize ) { - TraceEvent(SevWarn, "DataDistributionBuildTeams", masterId).detail("Reason","Not enough servers for a team").detail("Servers",totalServers.size()).detail("TeamSize", configuration.storageTeamSize); - return addedTeams; + return totalServerIndex; + } + + void traceServerInfo() { + int i = 0; + + TraceEvent("ServerInfo").detail("Size", server_info.size()); + for (auto& server : server_info) { + const UID& uid = server.first; + TraceEvent("ServerInfo") + .detail("ServerInfoIndex", i++) + .detail("ServerID", server.first.toString()) + .detail("ServerTeamOwned", server.second->teams.size()) + .detail("MachineID", server.second->machine->machineID.contents().toString()); } + for (auto& server : server_info) { + const UID& uid = server.first; + TraceEvent("ServerStatus", uid) + .detail("Healthy", !server_status.get(uid).isUnhealthy()) + .detail("MachineIsValid", server_info[uid]->machine.isValid()) + .detail("MachineTeamSize", + server_info[uid]->machine.isValid() ? 
server_info[uid]->machine->machineTeams.size() : -1); + } + } + + void traceServerTeamInfo() { + int i = 0; + + TraceEvent("ServerTeamInfo").detail("Size", teams.size()); + for (auto& team : teams) { + TraceEvent("ServerTeamInfo") + .detail("TeamIndex", i++) + .detail("Healthy", team->isHealthy()) + .detail("ServerNumber", team->serverIDs.size()) + .detail("MemberIDs", team->getServerIDsStr()); + } + } + + void traceMachineInfo() { + int i = 0; + + TraceEvent("MachineInfo").detail("Size", machine_info.size()); + for (auto& machine : machine_info) { + TraceEvent("MachineInfo") + .detail("MachineInfoIndex", i++) + .detail("MachineID", machine.first.contents().toString()) + .detail("MachineTeamOwned", machine.second->machineTeams.size()) + .detail("ServerNumOnMachine", machine.second->serversOnMachine.size()) + .detail("ServersID", machine.second->getServersIDStr()); + } + } + + void traceMachineTeamInfo() { + int i = 0; + + TraceEvent("MachineTeamInfo").detail("Size", machineTeams.size()); + for (auto& team : machineTeams) { + TraceEvent("MachineTeamInfo").detail("TeamIndex", i++).detail("MachineIDs", team->getMachineIDsStr()); + } + } + + void traceMachineLocalityMap() { + int i = 0; + + TraceEvent("MachineLocalityMap").detail("Size", machineLocalityMap.size()); + for (auto& uid : machineLocalityMap.getObjects()) { + Reference record = machineLocalityMap.getRecord(i); + if (record.isValid()) { + TraceEvent("MachineLocalityMap") + .detail("LocalityIndex", i++) + .detail("UID", uid->toString()) + .detail("LocalityRecord", record->toString()); + } else { + TraceEvent("MachineLocalityMap") + .detail("LocalityIndex", i++) + .detail("UID", uid->toString()) + .detail("LocalityRecord", "[NotFound]"); + } + } + } + + // To enable verbose debug info, set shouldPrint to true + void traceAllInfo(bool shouldPrint = false) { + if (!shouldPrint) return; + + TraceEvent("TraceAllInfo").detail("Primary", primary).detail("DesiredTeamSize", configuration.storageTeamSize); + traceServerInfo(); + traceServerTeamInfo(); + traceMachineInfo(); + traceMachineTeamInfo(); + traceMachineLocalityMap(); + } + + // We must rebuild machine locality map whenever the entry in the map is inserted or removed + void rebuildMachineLocalityMap() { + machineLocalityMap.clear(); + int numHealthyMachine = 0; + for (auto machine = machine_info.begin(); machine != machine_info.end(); ++machine) { + if (machine->second->serversOnMachine.empty()) { + TraceEvent(SevWarn, "RebuildMachineLocalityMapError") + .detail("Machine", machine->second->machineID.toString()) + .detail("NumServersOnMachine", 0); + continue; + } + if (!isMachineHealthy(machine->second)) { + continue; + } + Reference representativeServer = machine->second->serversOnMachine[0]; + auto& locality = representativeServer->lastKnownInterface.locality; + const LocalityEntry& localityEntry = machineLocalityMap.add(locality, &representativeServer->id); + machine->second->localityEntry = localityEntry; + ++numHealthyMachine; + } + } + + // Create machineTeamsToBuild number of machine teams + // No operation if machineTeamsToBuild is 0 + // Note: The creation of machine teams should not depend on server teams: + // No matter how server teams will be created, we will create the same set of machine teams; + // We should never use server team number in building machine teams. 
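(For intuition, an illustrative example that is not part of the patch: with storageTeamSize = 3 and healthy machines {m1 ... m5}, addBestMachineTeams() forms size-3 machine teams such as {m1, m2, m3} by running the storage policy over machineLocalityMap; addTeamsBestOf() later builds each server team by picking exactly one healthy server process from every machine of one chosen machine team, so each server team maps onto exactly one machine team.)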
+ // + // Five steps to create each machine team, which are document in the function + // Reuse ReplicationPolicy selectReplicas func to select machine team + // return number of added machine teams + int addBestMachineTeams(int targetMachineTeamsToBuild) { + int addedMachineTeams = 0; + int totalServerIndex = 0; + int machineTeamsToBuild = 0; + + ASSERT(targetMachineTeamsToBuild >= 0); + // Not build any machine team if asked to build none + if (targetMachineTeamsToBuild == 0) return 0; + + machineTeamsToBuild = targetMachineTeamsToBuild; + + // The number of machines is always no smaller than the storageTeamSize in a correct configuration + ASSERT(machine_info.size() >= configuration.storageTeamSize); + + // Step 1: Create machineLocalityMap which will be used in building machine team + rebuildMachineLocalityMap(); int loopCount = 0; - // add teams - while( addedTeams < teamsToBuild ) { - std::vector leastUsedServers; - int minTeamCount = CLIENT_KNOBS->TOO_MANY; - for(int i = 0; i < totalServers.size(); i++) { - LocalityEntry process = totalServers.getEntry(i); - UID id = *totalServers.getObject(process); - int teamCount = server_info[id]->teams.size(); - if(teamCount < minTeamCount) { - leastUsedServers.clear(); + // Add a team in each iteration + while (addedMachineTeams < machineTeamsToBuild) { + // Step 2: Get least used machines from which we choose machines as a machine team + std::vector> leastUsedMachines; // A less used machine has less number of teams + int minTeamCount = std::numeric_limits::max(); + for (auto& machine : machine_info) { + // Skip invalid machine whose representative server is not in server_info + ASSERT_WE_THINK(server_info.find(machine.second->serversOnMachine[0]->id) != server_info.end()); + // Skip unhealthy machines + if (!isMachineHealthy(machine.second)) continue; + + // Invariant: We only create correct size machine teams. + // When configuration (e.g., team size) is changed, the DDTeamCollection will be destroyed and rebuilt + // so that the invariant will not be violated. + int teamCount = machine.second->machineTeams.size(); + + if (teamCount < minTeamCount) { + leastUsedMachines.clear(); minTeamCount = teamCount; } - if(teamCount <= minTeamCount) { - leastUsedServers.push_back(process); + if (teamCount == minTeamCount) { + leastUsedMachines.push_back(machine.second); } } std::vector team; std::vector forcedAttributes; - if (leastUsedServers.size()) { - forcedAttributes.push_back(g_random->randomChoice(leastUsedServers)); + // Step 3: Create a representative process for each machine. + // Construct forcedAttribute from leastUsedMachines. + // We will use forcedAttribute to call existing function to form a team + if (leastUsedMachines.size()) { + // Randomly choose 1 least used machine + Reference tcMachineInfo = g_random->randomChoice(leastUsedMachines); + ASSERT(!tcMachineInfo->serversOnMachine.empty()); + LocalityEntry process = tcMachineInfo->localityEntry; + forcedAttributes.push_back(process); + } else { + // when leastUsedMachine is empty, we will never find a team later, so we can simply return. + return addedMachineTeams; } + // Step 4: Reuse Policy's selectReplicas() to create team for the representative process. 
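(A sketch of the interface reused in this step, as declared in fdbrpc/ReplicationPolicy.h earlier in this diff, with the template arguments restored as assumed here:

    virtual bool selectReplicas(LocalitySetRef& fromServers,
                                std::vector<LocalityEntry> const& alsoServers,
                                std::vector<LocalityEntry>& results) = 0;

Roughly speaking, it appends entries chosen from fromServers to results so that results together with alsoServers satisfy the policy, and returns false when no such selection exists. The loop below goes through the LocalityMap convenience wrapper, which calls into this with machineLocalityMap as fromServers and returns the chosen machines' representative UID pointers.)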
std::vector bestTeam; - int bestScore = CLIENT_KNOBS->TOO_MANY; - - int maxAttempts = SERVER_KNOBS->BEST_OF_AMT; - for( int i = 0; i < maxAttempts && i < 100; i++) { + int bestScore = std::numeric_limits::max(); + int maxAttempts = SERVER_KNOBS->BEST_OF_AMT; // BEST_OF_AMT = 4 + for (int i = 0; i < maxAttempts && i < 100; ++i) { + // Choose a team that balances the # of teams per server among the teams + // that have the least-utilized server team.clear(); - auto success = totalServers.selectReplicas(configuration.storagePolicy, forcedAttributes, team); - if(!success) { + auto success = machineLocalityMap.selectReplicas(configuration.storagePolicy, forcedAttributes, team); + if (!success) { break; } + ASSERT(forcedAttributes.size() > 0); + team.push_back((UID*)machineLocalityMap.getObject(forcedAttributes[0])); - if(forcedAttributes.size() > 0) { - team.push_back((UID*)totalServers.getObject(forcedAttributes[0])); - } - if( team.size() != configuration.storageTeamSize) { - maxAttempts += 1; + // selectReplicas() may NEVER return server not in server_info. + for (auto& pUID : team) { + ASSERT_WE_THINK(server_info.find(*pUID) != server_info.end()); } + // selectReplicas() should always return a team with correct size. otherwise, it has a bug + ASSERT(team.size() == configuration.storageTeamSize); + int score = 0; - for(auto process = team.begin(); process != team.end(); process++) { - score += server_info[**process]->teams.size(); + vector> machineIDs; + for (auto process = team.begin(); process != team.end(); process++) { + Reference server = server_info[**process]; + score += server->machine->machineTeams.size(); + Standalone machine_id = server->lastKnownInterface.locality.zoneId().get(); + machineIDs.push_back(machine_id); } - if(score < bestScore) { + // Only choose healthy machines into machine team + ASSERT_WE_THINK(isMachineTeamHealthy(machineIDs)); + + std::sort(machineIDs.begin(), machineIDs.end()); + if (machineTeamExists(machineIDs)) { + maxAttempts += 1; + continue; + } + + // SOMEDAY: randomly pick one from teams with the lowest score + if (score < bestScore) { + // bestTeam is the team which has the smallest number of teams its team members belong to. 
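+ // (score sums machineTeams.size() over the candidate team's machines, so a lower score + // favors machines that currently participate in fewer machine teams)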
bestTeam = team; bestScore = score; } } - if( bestTeam.size() == configuration.storageTeamSize) { - vector processIDs; - + // bestTeam should be a new valid team to be added into machine team now + // Step 5: Restore machine from its representative process team and get the machine team + if (bestTeam.size() == configuration.storageTeamSize) { + // machineIDs is used to quickly check if the machineIDs belong to an existed team + // machines keep machines reference for performance benefit by avoiding looking up machine by machineID + vector> machines; for (auto process = bestTeam.begin(); process < bestTeam.end(); process++) { - processIDs.push_back(**process); + Reference machine = server_info[**process]->machine; + machines.push_back(machine); } - std::sort(processIDs.begin(), processIDs.end()); - - if( !teamExists( processIDs ) ) { - addTeam(processIDs.begin(), processIDs.end()); - addedTeams++; - } - } - else { - TraceEvent(SevWarn, "DataDistributionBuildTeams", masterId).detail("Reason","Unable to make desiredTeams"); + addMachineTeam(machines); + addedMachineTeams++; + } else { + TraceEvent(SevWarn, "DataDistributionBuildTeams", masterId) + .detail("Primary", primary) + .detail("Reason", "Unable to make desired machine Teams"); break; } - if(++loopCount > 2*teamsToBuild*(configuration.storageTeamSize+1) ) { + + if (++loopCount > 2 * machineTeamsToBuild * (configuration.storageTeamSize + 1)) { break; } } + + return addedMachineTeams; + } + + bool isMachineTeamHealthy(vector> const& machineIDs) { + int healthyNum = 0; + + // A healthy machine team should have the desired number of machines + if (machineIDs.size() != configuration.storageTeamSize) return false; + + for (auto& id : machineIDs) { + auto& machine = machine_info[id]; + if (isMachineHealthy(machine)) { + healthyNum++; + } + } + return (healthyNum == machineIDs.size()); + } + + bool isMachineTeamHealthy(Reference const& machineTeam) { + int healthyNum = 0; + + // A healthy machine team should have the desired number of machines + if (machineTeam->size() != configuration.storageTeamSize) return false; + + for (auto& machine : machineTeam->machines) { + if (isMachineHealthy(machine)) { + healthyNum++; + } + } + return (healthyNum == machineTeam->machines.size()); + } + + bool isMachineHealthy(Reference const& machine) { + if (!machine.isValid() || machine_info.find(machine->machineID) == machine_info.end() || + machine->serversOnMachine.empty()) { + return false; + } + + // Healthy machine has at least one healthy server + for (auto& server : machine->serversOnMachine) { + if (!server_status.get(server->id).isUnhealthy()) { + return true; + } + } + + return false; + } + + // Return the healthy server with the least number of correct-size server teams + Reference findOneLeastUsedServer() { + vector> leastUsedServers; + int minTeamNumber = std::numeric_limits::max(); + for (auto& server : server_info) { + // Only pick healthy server, which is not failed or excluded. 
+ if (server_status.get(server.first).isUnhealthy()) continue; + + int numTeams = server.second->teams.size(); + if (numTeams < minTeamNumber) { + minTeamNumber = numTeams; + leastUsedServers.clear(); + } + if (minTeamNumber == numTeams) { + leastUsedServers.push_back(server.second); + } + } + + return g_random->randomChoice(leastUsedServers); + } + + // Randomly choose one machine team that has chosenServer and has the correct size + // When configuration is changed, we may have machine teams with old storageTeamSize + Reference findOneRandomMachineTeam(Reference chosenServer) { + if (!chosenServer->machine->machineTeams.empty()) { + std::vector> machineTeams; + for (auto& mt : chosenServer->machine->machineTeams) { + if (isMachineTeamHealthy(mt)) { + machineTeams.push_back(mt); + } + } + if (!machineTeams.empty()) { + return g_random->randomChoice(machineTeams); + } + } + + // If we cannot find a healthy machine team + TraceEvent("NoHealthyMachineTeamForServer") + .detail("ServerID", chosenServer->id) + .detail("MachineTeamsNumber", chosenServer->machine->machineTeams.size()); + return Reference(); + } + + // A server team should always come from servers on a machine team + // Check if it is true + bool isOnSameMachineTeam(Reference& team) { + std::vector> machineIDs; + for (auto& server : team->servers) { + if (!server->machine.isValid()) return false; + machineIDs.push_back(server->machine->machineID); + } + std::sort(machineIDs.begin(), machineIDs.end()); + + int numExistance = 0; + for (auto& server : team->servers) { + for (auto& candidateMachineTeam : server->machine->machineTeams) { + std::sort(candidateMachineTeam->machineIDs.begin(), candidateMachineTeam->machineIDs.end()); + if (machineIDs == candidateMachineTeam->machineIDs) { + numExistance++; + break; + } + } + } + return (numExistance == team->servers.size()); + } + + // Sanity check the property of teams in unit test + // Return true if all server teams belong to machine teams + bool sanityCheckTeams() { + for (auto& team : teams) { + if (isOnSameMachineTeam(team) == false) { + return false; + } + } + + return true; + } + + // Create server teams based on machine teams + // Before the number of machine teams reaches the threshold, build a machine team for each server team + // When it reaches the threshold, first try to build a server team with existing machine teams; if failed, + // build an extra machine team and record the event in trace + int addTeamsBestOf(int teamsToBuild) { + ASSERT(teamsToBuild > 0); + ASSERT_WE_THINK(machine_info.size() > 0 || server_info.size() == 0); + + int addedMachineTeams = 0; + int addedTeams = 0; + int loopCount = 0; + + // Exclude machine teams who have members in the wrong configuration. + // When we change configuration, we may have machine teams with storageTeamSize in the old configuration. 
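(A worked example with purely illustrative knob values, not taken from this patch: with 10 healthy machines, DESIRED_TEAMS_PER_SERVER = 5 and MAX_TEAMS_PER_SERVER = 25, the code below computes desiredMachineTeams = 5 * 10 = 50 and maxMachineTeams = 25 * 10 = 250; if 20 of 30 existing machine teams are still healthy, machineTeamsToBuild = min(50 - 20, 250 - 30) = 30, mirroring how teamsToBuild is derived in buildTeams().)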
+ int healthyMachineTeamCount = 0; + int totalMachineTeamCount = 0; + for (auto mt = machineTeams.begin(); mt != machineTeams.end(); ++mt) { + ASSERT((*mt)->machines.size() == configuration.storageTeamSize); + + if (isMachineTeamHealthy(*mt)) { + ++healthyMachineTeamCount; + } + ++totalMachineTeamCount; + } + + int totalHealthyMachineCount = 0; + for (auto m : machine_info) { + if (isMachineHealthy(m.second)) { + ++totalHealthyMachineCount; + } + } + + int desiredMachineTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyMachineCount; + int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount; + // machineTeamsToBuild mimics how the teamsToBuild is calculated in buildTeams() + int machineTeamsToBuild = + std::min(desiredMachineTeams - healthyMachineTeamCount, maxMachineTeams - totalMachineTeamCount); + + TraceEvent("BuildMachineTeams") + .detail("TotalHealthyMachine", totalHealthyMachineCount) + .detail("HealthyMachineTeamCount", healthyMachineTeamCount) + .detail("DesiredMachineTeams", desiredMachineTeams) + .detail("MaxMachineTeams", maxMachineTeams) + .detail("MachineTeamsToBuild", machineTeamsToBuild); + // Pre-build all machine teams until we have the desired number of machine teams + if (machineTeamsToBuild > 0) { + addedMachineTeams = addBestMachineTeams(machineTeamsToBuild); + } + + while (addedTeams < teamsToBuild) { + // Step 1: Create 1 best machine team + std::vector bestServerTeam; + int bestScore = std::numeric_limits::max(); + int maxAttempts = SERVER_KNOBS->BEST_OF_AMT; // BEST_OF_AMT = 4 + for (int i = 0; i < maxAttempts && i < 100; ++i) { + // Step 2: Choose 1 least used server and then choose 1 least used machine team from the server + Reference chosenServer = findOneLeastUsedServer(); + // Note: To avoid creating correlation of picked machine teams, we simply choose a random machine team + // instead of choosing the least used machine team. + // The correlation happens, for example, when we add two new machines, we may always choose the machine + // team with these two new machines because they are typically less used. + Reference chosenMachineTeam = findOneRandomMachineTeam(chosenServer); + + if (!chosenMachineTeam.isValid()) { + // We may face the situation that temporarily we have no healthy machine. 
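+ // Note: this retry still consumes one of the maxAttempts loop iterations, so if no healthy + // machine team ever appears, bestServerTeam stays empty and we fall through to the give-up + // path below this loop.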
+ TraceEvent(SevWarn, "MachineTeamNotFound") + .detail("Primary", primary) + .detail("MachineTeamNumber", machineTeams.size()); + continue; // try randomly to find another least used server + } + + // From here, chosenMachineTeam must have a healthy server team + // Step 3: Randomly pick 1 server from each machine in the chosen machine team to form a server team + vector serverTeam; + int chosenServerCount = 0; + for (auto& machine : chosenMachineTeam->machines) { + UID serverID; + if (machine == chosenServer->machine) { + serverID = chosenServer->id; + ++chosenServerCount; + } else { + std::vector> healthyProcesses; + for (auto it : machine->serversOnMachine) { + if (!server_status.get(it->id).isUnhealthy()) { + healthyProcesses.push_back(it); + } + } + serverID = g_random->randomChoice(healthyProcesses)->id; + } + serverTeam.push_back(serverID); + } + + ASSERT(chosenServerCount == 1); // chosenServer should be used exactly once + ASSERT(serverTeam.size() == configuration.storageTeamSize); + + std::sort(serverTeam.begin(), serverTeam.end()); + if (teamExists(serverTeam)) { + maxAttempts += 1; + continue; + } + + // Pick the server team with smallest score in all attempts + // SOMEDAY: Improve the code efficiency by using reservoir algorithm + int score = 0; + for (auto& server : serverTeam) { + score += server_info[server]->teams.size(); + } + if (score < bestScore) { + bestScore = score; + bestServerTeam = serverTeam; + } + } + + if (bestServerTeam.size() != configuration.storageTeamSize) { + // Not find any team and will unlikely find a team + break; + } + + // Step 4: Add the server team + addTeam(bestServerTeam.begin(), bestServerTeam.end(), false); + addedTeams++; + + if (++loopCount > 2 * teamsToBuild * (configuration.storageTeamSize + 1)) { + break; + } + } + + TraceEvent("AddTeamsBestOf") + .detail("Primary", primary) + .detail("AddedTeamNumber", addedTeams) + .detail("AimToBuildTeamNumber", teamsToBuild) + .detail("CurrentTeamNumber", teams.size()) + .detail("StorageTeamSize", configuration.storageTeamSize) + .detail("MachineTeamNum", machineTeams.size()); + return addedTeams; } @@ -1130,7 +1712,7 @@ struct DDTeamCollection : ReferenceCounted { int uniqueMachines = 0; std::set>> machines; - for(auto i = self->server_info.begin(); i != self->server_info.end(); ++i) { + for (auto i = self->server_info.begin(); i != self->server_info.end(); ++i) { if (!self->server_status.get(i->first).isUnhealthy()) { ++serverCount; LocalityData& serverLocation = i->second->lastKnownInterface.locality; @@ -1138,20 +1720,21 @@ struct DDTeamCollection : ReferenceCounted { } } uniqueMachines = machines.size(); + TraceEvent("BuildTeams") + .detail("ServerNumber", self->server_info.size()) + .detail("UniqueMachines", uniqueMachines) + .detail("StorageTeamSize", self->configuration.storageTeamSize); // If there are too few machines to even build teams or there are too few represented datacenters, build no new teams if( uniqueMachines >= self->configuration.storageTeamSize ) { desiredTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER*serverCount; int maxTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER*serverCount; - // Count only properly sized teams against the desired number of teams. This is to prevent "emergency" merged teams (see MoveKeys) - // from overwhelming the team count (since we really did not want that team in the first place). These larger teams will not be - // returned from getRandomTeam() (as used by bestTeam to find a new home for a shard). 
- // Also exclude teams who have members in the wrong configuration, since we don't want these teams either + // Exclude teams who have members in the wrong configuration, since we don't want these teams int teamCount = 0; int totalTeamCount = 0; - for(int i = 0; i < self->teams.size(); i++) { - if( self->teams[i]->getServerIDs().size() == self->configuration.storageTeamSize && !self->teams[i]->isWrongConfiguration() ) { + for (int i = 0; i < self->teams.size(); ++i) { + if (!self->teams[i]->isWrongConfiguration()) { if( self->teams[i]->isHealthy() ) { teamCount++; } @@ -1163,37 +1746,29 @@ struct DDTeamCollection : ReferenceCounted { .detail("UniqueMachines", uniqueMachines).detail("TeamSize", self->configuration.storageTeamSize).detail("Servers", serverCount) .detail("CurrentTrackedTeams", self->teams.size()).detail("HealthyTeamCount", teamCount).detail("TotalTeamCount", totalTeamCount); - teamCount = std::max(teamCount, desiredTeams + totalTeamCount - maxTeams ); + // teamsToBuild is calculated such that we will not build too many teams in the situation + // when all (or most of) teams become unhealthy temporarily and then healthy again + state int teamsToBuild = std::min(desiredTeams - teamCount, maxTeams - totalTeamCount); - if( desiredTeams > teamCount ) { + if (teamsToBuild > 0) { std::set desiredServerSet; - for(auto i = self->server_info.begin(); i != self->server_info.end(); ++i) - if (!self->server_status.get(i->first).isUnhealthy()) + for (auto i = self->server_info.begin(); i != self->server_info.end(); ++i) { + if (!self->server_status.get(i->first).isUnhealthy()) { desiredServerSet.insert(i->second->id); + } + } vector desiredServerVector( desiredServerSet.begin(), desiredServerSet.end() ); - state int teamsToBuild = desiredTeams - teamCount; - state vector> builtTeams; - if( self->configuration.storageTeamSize > 3) { - int addedTeams = self->addTeamsBestOf( teamsToBuild ); - TraceEvent("AddTeamsBestOf", self->masterId).detail("CurrentTeams", self->teams.size()).detail("AddedTeams", addedTeams); - } else { - int addedTeams = wait( self->addAllTeams( self, desiredServerVector, &builtTeams, teamsToBuild ) ); - - if( addedTeams < teamsToBuild ) { - for( int i = 0; i < builtTeams.size(); i++ ) { - std::sort(builtTeams[i].begin(), builtTeams[i].end()); - self->addTeam( builtTeams[i].begin(), builtTeams[i].end() ); - } - TraceEvent("AddAllTeams", self->masterId).detail("CurrentTeams", self->teams.size()).detail("AddedTeams", builtTeams.size()); - } - else { - int addedTeams = self->addTeamsBestOf( teamsToBuild ); - TraceEvent("AddTeamsBestOf", self->masterId).detail("CurrentTeams", self->teams.size()).detail("AddedTeams", addedTeams); - } + int addedTeams = self->addTeamsBestOf(teamsToBuild); + if (addedTeams <= 0 && self->teams.size() == 0) { + TraceEvent(SevWarn, "NoTeamAfterBuildTeam") + .detail("TeamNum", self->teams.size()) + .detail("Debug", "Check information below"); + // Debug: set true for traceAllInfo() to print out more information + self->traceAllInfo(); } } } @@ -1210,7 +1785,7 @@ struct DDTeamCollection : ReferenceCounted { void noHealthyTeams() { std::set desiredServerSet; std::string desc; - for(auto i = server_info.begin(); i != server_info.end(); ++i) { + for (auto i = server_info.begin(); i != server_info.end(); ++i) { ASSERT(i->first == i->second->id); if (!server_status.get(i->first).isFailed) { desiredServerSet.insert(i->first); @@ -1226,7 +1801,10 @@ struct DDTeamCollection : ReferenceCounted { } bool shouldHandleServer(const StorageServerInterface 
&newServer) { - return (includedDCs.empty() || std::find(includedDCs.begin(), includedDCs.end(), newServer.locality.dcId()) != includedDCs.end() || (otherTrackedDCs.present() && std::find(otherTrackedDCs.get().begin(), otherTrackedDCs.get().end(), newServer.locality.dcId()) == otherTrackedDCs.get().end())); + return (includedDCs.empty() || + std::find(includedDCs.begin(), includedDCs.end(), newServer.locality.dcId()) != includedDCs.end() || + (otherTrackedDCs.present() && std::find(otherTrackedDCs.get().begin(), otherTrackedDCs.get().end(), + newServer.locality.dcId()) == otherTrackedDCs.get().end())); } void addServer( StorageServerInterface newServer, ProcessClass processClass, Promise errorOut, Version addedVersion ) { @@ -1237,7 +1815,12 @@ struct DDTeamCollection : ReferenceCounted { TraceEvent("AddedStorageServer", masterId).detail("ServerID", newServer.id()).detail("ProcessClass", processClass.toString()).detail("WaitFailureToken", newServer.waitFailure.getEndpoint().token).detail("Address", newServer.waitFailure.getEndpoint().address); auto &r = server_info[newServer.id()] = Reference( new TCServerInfo( newServer, processClass, includedDCs.empty() || std::find(includedDCs.begin(), includedDCs.end(), newServer.locality.dcId()) != includedDCs.end(), storageServerSet ) ); + + // Establish the relation between server and machine + checkAndCreateMachine(r); + r->tracker = storageServerTracker( this, cx, r.getPtr(), &server_status, lock, masterId, &server_info, serverChanges, errorOut, addedVersion ); + doBuildTeams = true; // Adding a new server triggers to build new teams restartTeamBuilder.trigger(); } @@ -1268,14 +1851,97 @@ struct DDTeamCollection : ReferenceCounted { return found; } + // Check if the server belongs to a machine; if not, create the machine. + // Establish the two-direction link between server and machine + Reference checkAndCreateMachine(Reference server) { + ASSERT(server.isValid() && server_info.find(server->id) != server_info.end()); + auto& locality = server->lastKnownInterface.locality; + Standalone machine_id = locality.zoneId().get(); // locality to machine_id with std::string type + + Reference machineInfo; + if (machine_info.find(machine_id) == + machine_info.end()) { // uid is the first storage server process on the machine + TEST(true); + // For each machine, store the first server's localityEntry into machineInfo for later use. 
+ LocalityEntry localityEntry = machineLocalityMap.add(locality, &server->id); + machineInfo = Reference(new TCMachineInfo(server, localityEntry)); + machine_info.insert(std::make_pair(machine_id, machineInfo)); + } else { + machineInfo = machine_info.find(machine_id)->second; + machineInfo->serversOnMachine.push_back(server); + } + server->machine = machineInfo; + + return machineInfo; + } + + // Check if the serverTeam belongs to a machine team; If not, create the machine team + Reference checkAndCreateMachineTeam(Reference serverTeam) { + std::vector> machineIDs; + for (auto& server : serverTeam->servers) { + Reference machine = server->machine; + machineIDs.push_back(machine->machineID); + } + + std::sort(machineIDs.begin(), machineIDs.end()); + Reference machineTeam = findMachineTeam(machineIDs); + if (!machineTeam.isValid()) { // Create the machine team if it does not exist + machineTeam = addMachineTeam(machineIDs.begin(), machineIDs.end()); + } + + return machineTeam; + } + + // Remove the removedMachineInfo machine and any related machine team + void removeMachine(Reference removedMachineInfo) { + // Find machines that share teams with the removed machine + std::set> machinesWithAjoiningTeams; + for (auto& machineTeam : removedMachineInfo->machineTeams) { + machinesWithAjoiningTeams.insert(machineTeam->machineIDs.begin(), machineTeam->machineIDs.end()); + } + machinesWithAjoiningTeams.erase(removedMachineInfo->machineID); + // For each machine in a machine team with the removed machine, + // erase shared machine teams from the list of teams. + for (auto it = machinesWithAjoiningTeams.begin(); it != machinesWithAjoiningTeams.end(); ++it) { + auto& machineTeams = machine_info[*it]->machineTeams; + for (int t = 0; t < machineTeams.size(); t++) { + auto& machineTeam = machineTeams[t]; + if (std::count(machineTeam->machineIDs.begin(), machineTeam->machineIDs.end(), + removedMachineInfo->machineID)) { + machineTeams[t--] = machineTeams.back(); + machineTeams.pop_back(); + } + } + } + // Remove global machine team that includes removedMachineInfo + for (int t = 0; t < machineTeams.size(); t++) { + auto& machineTeam = machineTeams[t]; + if (std::count(machineTeam->machineIDs.begin(), machineTeam->machineIDs.end(), + removedMachineInfo->machineID)) { + machineTeams[t--] = machineTeams.back(); + machineTeams.pop_back(); + } + } + + // Remove removedMachineInfo from machine's global info + machine_info.erase(removedMachineInfo->machineID); + TraceEvent("MachineLocalityMapUpdate").detail("MachineUIDRemoved", removedMachineInfo->machineID.toString()); + + // We do not update macineLocalityMap when a machine is removed because we will do so when we use it in + // addBestMachineTeams() + // rebuildMachineLocalityMap(); + } + void removeServer( UID removedServer ) { TraceEvent("RemovedStorageServer", masterId).detail("ServerID", removedServer); // ASSERT( !shardsAffectedByTeamFailure->getServersForTeam( t ) for all t in teams that contain removedServer ) + Reference removedServerInfo = server_info[removedServer]; + // Step: Remove server team that relate to removedServer // Find all servers with which the removedServer shares teams std::set serversWithAjoiningTeams; - auto& sharedTeams = server_info[ removedServer ]->teams; - for( int i = 0; i < sharedTeams.size(); i++ ) { + auto& sharedTeams = removedServerInfo->teams; + for (int i = 0; i < sharedTeams.size(); ++i) { auto& teamIds = sharedTeams[i]->getServerIDs(); serversWithAjoiningTeams.insert( teamIds.begin(), teamIds.end() ); } @@ -1283,18 
+1949,38 @@ struct DDTeamCollection : ReferenceCounted { // For each server in a team with the removedServer, erase shared teams from the list of teams in that other server for( auto it = serversWithAjoiningTeams.begin(); it != serversWithAjoiningTeams.end(); ++it ) { - auto& teams = server_info[ *it ]->teams; - for( int t = 0; t < teams.size(); t++ ) { - auto& serverIds = teams[t]->getServerIDs(); + auto& serverTeams = server_info[*it]->teams; + for (int t = 0; t < serverTeams.size(); t++) { + auto& serverIds = serverTeams[t]->getServerIDs(); if ( std::count( serverIds.begin(), serverIds.end(), removedServer ) ) { - teams[t--] = teams.back(); - teams.pop_back(); + serverTeams[t--] = serverTeams.back(); + serverTeams.pop_back(); } } } - // remove removedServer from allServers, server_info - for(int s=0; s removedMachineInfo = removedServerInfo->machine; + for (int i = 0; i < removedMachineInfo->serversOnMachine.size(); ++i) { + if (removedMachineInfo->serversOnMachine[i] == removedServerInfo) { + // Safe even when removedServerInfo is the last one + removedMachineInfo->serversOnMachine[i--] = removedMachineInfo->serversOnMachine.back(); + removedMachineInfo->serversOnMachine.pop_back(); + break; + } + } + // Remove machine if no server on it + if (removedMachineInfo->serversOnMachine.size() == 0) { + removeMachine(removedMachineInfo); + } + // If the machine uses removedServer's locality and the machine still has servers, the the machine's + // representative server will be updated when it is used in addBestMachineTeams() + // Note that since we do not rebuildMachineLocalityMap() here, the machineLocalityMap can be stale. + // This is ok as long as we do not arbitrarily validate if machine team satisfies replication policy. + + // Step: Remove removedServer from server's global data + for (int s = 0; s < allServers.size(); s++) { if (allServers[s] == removedServer) { allServers[s--] = allServers.back(); allServers.pop_back(); @@ -1312,20 +1998,33 @@ struct DDTeamCollection : ReferenceCounted { // remove all teams that contain removedServer // SOMEDAY: can we avoid walking through all teams, since we have an index of teams in which removedServer participated - for(int t=0; tgetServerIDs().begin(), teams[t]->getServerIDs().end(), removedServer ) ) { + TraceEvent("TeamRemoved") + .detail("Primary", primary) + .detail("TeamServerIDs", teams[t]->getServerIDsStr()); teams[t]->tracker.cancel(); teams[t--] = teams.back(); teams.pop_back(); + removedCount++; } } + if (removedCount == 0) { + TraceEvent(SevInfo, "NoneTeamRemovedWhenServerRemoved") + .detail("Primary", primary) + .detail("Debug", "ThisShouldRarelyHappen_CheckInfoBelow"); + traceAllInfo(); + } + doBuildTeams = true; restartTeamBuilder.trigger(); TraceEvent("DataDistributionTeamCollectionUpdate", masterId) - .detail("Teams", teams.size()) - .detail("Servers", allServers.size()); + .detail("Teams", teams.size()) + .detail("BadTeams", badTeams.size()) + .detail("Servers", allServers.size()); } }; @@ -1338,6 +2037,7 @@ ACTOR Future teamTracker( DDTeamCollection* self, Reference te state bool lastHealthy; state bool lastOptimal; state bool lastWrongConfiguration = team->isWrongConfiguration(); + state bool lastZeroHealthy = self->zeroHealthyTeams->get(); state bool firstCheck = true; @@ -1348,6 +2048,9 @@ ACTOR Future teamTracker( DDTeamCollection* self, Reference te try { loop { + TraceEvent("TeamHealthChangeDetected", self->masterId) + .detail("Primary", self->primary) + .detail("IsReady", self->initialFailureReactionDelay.isReady()); // 
Check if the number of degraded machines has changed state vector> change; auto servers = team->getServerIDs(); @@ -1378,6 +2081,7 @@ ACTOR Future teamTracker( DDTeamCollection* self, Reference te team->setHealthy( healthy ); // Unhealthy teams won't be chosen by bestTeam bool optimal = team->isOptimal() && healthy; bool recheck = !healthy && (lastReady != self->initialFailureReactionDelay.isReady() || (lastZeroHealthy && !self->zeroHealthyTeams->get())); + lastReady = self->initialFailureReactionDelay.isReady(); lastZeroHealthy = self->zeroHealthyTeams->get(); @@ -1396,7 +2100,8 @@ ACTOR Future teamTracker( DDTeamCollection* self, Reference te lastOptimal = optimal; } - if( serversLeft != lastServersLeft || anyUndesired != lastAnyUndesired || anyWrongConfiguration != lastWrongConfiguration || recheck ) { + if (serversLeft != lastServersLeft || anyUndesired != lastAnyUndesired || + anyWrongConfiguration != lastWrongConfiguration || recheck) { // NOTE: do not check wrongSize if(logTeamEvents) { TraceEvent("TeamHealthChanged", self->masterId) .detail("Team", team->getDesc()).detail("ServersLeft", serversLeft) @@ -1416,13 +2121,16 @@ ACTOR Future teamTracker( DDTeamCollection* self, Reference te if( lastHealthy != healthy ) { lastHealthy = healthy; + // Update healthy team count when the team healthy changes self->healthyTeamCount += healthy ? 1 : -1; ASSERT( self->healthyTeamCount >= 0 ); self->zeroHealthyTeams->set(self->healthyTeamCount == 0); if( self->healthyTeamCount == 0 ) { - TraceEvent(SevWarn, "ZeroTeamsHealthySignalling", self->masterId).detail("SignallingTeam", team->getDesc()); + TraceEvent(SevWarn, "ZeroTeamsHealthySignalling", self->masterId) + .detail("SignallingTeam", team->getDesc()) + .detail("Primary", self->primary); } TraceEvent("TeamHealthDifference", self->masterId) @@ -1763,6 +2471,8 @@ ACTOR Future storageServerFailureTracker( return Void(); } +// Check the status of a storage server. 
+// Apply all requirements to the server and mark it as excluded if it fails to satisfies these requirements ACTOR Future storageServerTracker( DDTeamCollection* self, Database cx, @@ -1807,7 +2517,8 @@ ACTOR Future storageServerTracker( .detail("NumShards", self->shardsAffectedByTeamFailure->getNumberOfShards(server->id)) .detail("OtherNumShards", self->shardsAffectedByTeamFailure->getNumberOfShards(i->second->id)) .detail("OtherHealthy", !statusMap->get( i->second->id ).isUnhealthy()); - otherChanges.push_back( statusMap->onChange( i->second->id ) ); + // wait for the server's ip to be changed + otherChanges.push_back(statusMap->onChange(i->second->id)); if(!statusMap->get( i->second->id ).isUnhealthy()) { if(self->shardsAffectedByTeamFailure->getNumberOfShards(i->second->id) >= self->shardsAffectedByTeamFailure->getNumberOfShards(server->id)) { @@ -1833,7 +2544,10 @@ ACTOR Future storageServerTracker( if( server->lastKnownClass.machineClassFitness( ProcessClass::Storage ) > ProcessClass::UnsetFit ) { if( self->optimalTeamCount > 0 ) { - TraceEvent(SevWarn, "UndesiredStorageServer", masterId).detail("Server", server->id).detail("OptimalTeamCount", self->optimalTeamCount); + TraceEvent(SevWarn, "UndesiredStorageServer", masterId) + .detail("Server", server->id) + .detail("OptimalTeamCount", self->optimalTeamCount) + .detail("Fitness", server->lastKnownClass.machineClassFitness(ProcessClass::Storage)); status.isUndesired = true; } otherChanges.push_back( self->zeroOptimalTeams.onChange() ); @@ -1893,6 +2607,8 @@ ACTOR Future storageServerTracker( when( std::pair newInterface = wait( interfaceChanged ) ) { bool restartRecruiting = newInterface.first.waitFailure.getEndpoint().address != server->lastKnownInterface.waitFailure.getEndpoint().address; bool localityChanged = server->lastKnownInterface.locality != newInterface.first.locality; + bool machineLocalityChanged = server->lastKnownInterface.locality.zoneId().get() != + newInterface.first.locality.zoneId().get(); TraceEvent("StorageServerInterfaceChanged", masterId).detail("ServerID", server->id) .detail("NewWaitFailureToken", newInterface.first.waitFailure.getEndpoint().token) .detail("OldWaitFailureToken", server->lastKnownInterface.waitFailure.getEndpoint().token) @@ -1900,23 +2616,75 @@ ACTOR Future storageServerTracker( server->lastKnownInterface = newInterface.first; server->lastKnownClass = newInterface.second; - if(localityChanged) { - server->inDesiredDC = (self->includedDCs.empty() || std::find(self->includedDCs.begin(), self->includedDCs.end(), server->lastKnownInterface.locality.dcId()) != self->includedDCs.end()); - self->resetLocalitySet(); + if (localityChanged) { + TEST(true); // Server locality changed + + // The locality change of a server will affect machine teams related to the server if + // the server's machine locality is changed + if (machineLocalityChanged) { + // First handle the impact on the machine of the server on the old locality + Reference machine = server->machine; + ASSERT(machine->serversOnMachine.size() >= 1); + if (machine->serversOnMachine.size() == 1) { + // When server is the last server on the machine, + // remove the machine and the related machine team + self->removeMachine(machine); + } else { + // we remove the server from the machine, and + // update locality entry for the machine and the global machineLocalityMap + int serverIndex = -1; + for (int i = 0; i < machine->serversOnMachine.size(); ++i) { + if (machine->serversOnMachine[i].getPtr() == server) { + serverIndex = i; + 
machine->serversOnMachine[i] = machine->serversOnMachine.back(); + machine->serversOnMachine.pop_back(); + break; // Invariant: server only appear on the machine once + } + } + ASSERT(serverIndex != -1); + // NOTE: we do not update the machine's locality map even when + // its representative server is changed. + } + + // Second handle the impact on the destination machine where the server's new locality is; + // If the destination machine is new, create one; otherwise, add server to an existing one + // Update server's machine reference to the destination machine + Reference destMachine = + self->checkAndCreateMachine(self->server_info[server->id]); + ASSERT(destMachine.isValid()); + } + + // Ensure the server's server team belong to a machine team, and + // Get the newBadTeams due to the locality change vector> newBadTeams; - bool addedNewBadTeam = false; - for(auto it : server->teams) { - if(!self->satisfiesPolicy(it->servers)) { - newBadTeams.push_back(it); + for (auto& serverTeam : server->teams) { + if (!self->satisfiesPolicy(serverTeam->servers)) { + newBadTeams.push_back(serverTeam); + continue; + } + if (machineLocalityChanged) { + Reference machineTeam = self->checkAndCreateMachineTeam(serverTeam); + ASSERT(machineTeam.isValid()); + serverTeam->machineTeam = machineTeam; } } + + server->inDesiredDC = + (self->includedDCs.empty() || + std::find(self->includedDCs.begin(), self->includedDCs.end(), + server->lastKnownInterface.locality.dcId()) != self->includedDCs.end()); + self->resetLocalitySet(); + + bool addedNewBadTeam = false; for(auto it : newBadTeams) { if( self->removeTeam(it) ) { - self->addTeam(it->servers); + self->addTeam(it->servers, true); addedNewBadTeam = true; } } if(addedNewBadTeam && self->badTeamRemover.isReady()) { + TEST(true); // Server locality change created bad teams + self->doBuildTeams = true; self->badTeamRemover = removeBadTeams(self); self->addActor.send(self->badTeamRemover); } @@ -1955,7 +2723,7 @@ ACTOR Future storageServerTracker( } } } catch( Error &e ) { - if (e.code() != error_code_actor_cancelled) + if (e.code() != error_code_actor_cancelled && errorOut.canBeSet()) errorOut.sendError(e); throw; } @@ -1965,8 +2733,8 @@ ACTOR Future storageServerTracker( ACTOR Future monitorStorageServerRecruitment(DDTeamCollection* self) { state bool recruiting = false; TraceEvent("StorageServerRecruitment", self->masterId) - .detail("State", "Idle") - .trackLatest(("StorageServerRecruitment_" + self->masterId.toString()).c_str()); + .detail("State", "Idle") + .trackLatest(("StorageServerRecruitment_" + self->masterId.toString()).c_str()); loop { if( !recruiting ) { while(self->recruitingStream.get() == 0) { @@ -2209,8 +2977,15 @@ ACTOR Future dataDistributionTeamCollection( highestPriority = std::max(highestPriority, it.first); } } - TraceEvent("TotalDataInFlight", self->masterId).detail("Primary", self->primary).detail("TotalBytes", self->getDebugTotalDataInFlight()).detail("UnhealthyServers", self->unhealthyServers) - .detail("HighestPriority", highestPriority).trackLatest( self->primary ? "TotalDataInFlight" : "TotalDataInFlightRemote" ); + + TraceEvent("TotalDataInFlight", self->masterId) + .detail("Primary", self->primary) + .detail("TotalBytes", self->getDebugTotalDataInFlight()) + .detail("UnhealthyServers", self->unhealthyServers) + .detail("ServerNumber", self->server_info.size()) + .detail("StorageTeamSize", self->configuration.storageTeamSize) + .detail("HighestPriority", highestPriority) + .trackLatest(self->primary ? 
"TotalDataInFlight" : "TotalDataInFlightRemote"); loggingTrigger = delay( SERVER_KNOBS->DATA_DISTRIBUTION_LOGGING_INTERVAL ); } when( wait( self->serverTrackerErrorOut.getFuture() ) ) {} // Propagate errors from storageServerTracker @@ -2378,12 +3153,17 @@ ACTOR Future dataDistribution( TraceEvent("DDInitTookMoveKeysLock", mi.id()); state Reference initData = wait( getInitialDataDistribution(cx, mi.id(), lock, configuration.usableRegions > 1 ? remoteDcIds : std::vector>() ) ); if(initData->shards.size() > 1) { - TraceEvent("DDInitGotInitialDD", mi.id()).detail("B", printable(initData->shards.end()[-2].key)).detail("E", printable(initData->shards.end()[-1].key)).detail("Src", describe(initData->shards.end()[-2].primarySrc)).detail("Dest", describe(initData->shards.end()[-2].primaryDest)).trackLatest("InitialDD"); + TraceEvent("DDInitGotInitialDD", mi.id()) + .detail("B", printable(initData->shards.end()[-2].key)) + .detail("E", printable(initData->shards.end()[-1].key)) + .detail("Src", describe(initData->shards.end()[-2].primarySrc)) + .detail("Dest", describe(initData->shards.end()[-2].primaryDest)) + .trackLatest("InitialDD"); } else { TraceEvent("DDInitGotInitialDD", mi.id()).detail("B","").detail("E", "").detail("Src", "[no items]").detail("Dest", "[no items]").trackLatest("InitialDD"); } - if (initData->mode) break; + if (initData->mode) break; // mode may be set true by system operator using fdbcli TraceEvent("DataDistributionDisabled", mi.id()); TraceEvent("MovingData", mi.id()) @@ -2520,7 +3300,7 @@ DDTeamCollection* testTeamCollection(int teamSize, IRepPolicyRef policy, int pro Reference>( new AsyncVar(false) ) ); - for(int id = 1; id <= processCount; id++) { + for (int id = 1; id <= processCount; ++id) { UID uid(id, 0); StorageServerInterface interface; interface.uniqueID = uid; @@ -2529,40 +3309,109 @@ DDTeamCollection* testTeamCollection(int teamSize, IRepPolicyRef policy, int pro interface.locality.set(LiteralStringRef("data_hall"), Standalone(std::to_string(id % 3))); collection->server_info[uid] = Reference(new TCServerInfo(interface, ProcessClass(), true, collection->storageServerSet)); collection->server_status.set(uid, ServerStatus(false, false, interface.locality)); + collection->checkAndCreateMachine(collection->server_info[uid]); } return collection; } -TEST_CASE("/DataDistribution/AddAllTeams/isExhaustive") { +DDTeamCollection* testMachineTeamCollection(int teamSize, IRepPolicyRef policy, int processCount) { + Database database = DatabaseContext::create(Reference>(new AsyncVar()), + Never(), LocalityData(), false); + + DatabaseConfiguration conf; + conf.storageTeamSize = teamSize; + conf.storagePolicy = policy; + + DDTeamCollection* collection = + new DDTeamCollection(database, UID(0, 0), MoveKeysLock(), PromiseStream(), + Reference(new ShardsAffectedByTeamFailure()), conf, {}, {}, + PromiseStream>>(), Future(Void()), + Reference>(new AsyncVar(true)), true, + Reference>(new AsyncVar(false))); + + for (int id = 1; id <= processCount; id++) { + UID uid(id, 0); + StorageServerInterface interface; + interface.uniqueID = uid; + int process_id = id; + int dc_id = process_id / 1000; + int data_hall_id = process_id / 100; + int zone_id = process_id / 10; + int machine_id = process_id / 5; + + printf("testMachineTeamCollection: process_id:%d zone_id:%d machine_id:%d ip_addr:%s\n", process_id, zone_id, + machine_id, interface.address().toString().c_str()); + interface.locality.set(LiteralStringRef("processid"), Standalone(std::to_string(process_id))); + 
interface.locality.set(LiteralStringRef("machineid"), Standalone(std::to_string(machine_id))); + interface.locality.set(LiteralStringRef("zoneid"), Standalone(std::to_string(zone_id))); + interface.locality.set(LiteralStringRef("data_hall"), Standalone(std::to_string(data_hall_id))); + interface.locality.set(LiteralStringRef("dcid"), Standalone(std::to_string(dc_id))); + collection->server_info[uid] = + Reference(new TCServerInfo(interface, ProcessClass(), true, collection->storageServerSet)); + + collection->server_status.set(uid, ServerStatus(false, false, interface.locality)); + } + + int totalServerIndex = collection->constructMachinesFromServers(); + printf("testMachineTeamCollection: construct machines for %d servers\n", totalServerIndex); + + return collection; +} + +TEST_CASE("DataDistribution/AddTeamsBestOf/UseMachineID") { + wait(Future(Void())); + + int teamSize = 3; // replication size + int processSize = 60; + + IRepPolicyRef policy = IRepPolicyRef(new PolicyAcross(teamSize, "zoneid", IRepPolicyRef(new PolicyOne()))); + state DDTeamCollection* collection = testMachineTeamCollection(teamSize, policy, processSize); + + int result = collection->addTeamsBestOf(30); + + ASSERT(collection->sanityCheckTeams() == true); + + delete (collection); + + return Void(); +} + +TEST_CASE("DataDistribution/AddTeamsBestOf/NotUseMachineID") { + wait(Future(Void())); + + int teamSize = 3; // replication size + int processSize = 60; + + IRepPolicyRef policy = IRepPolicyRef(new PolicyAcross(teamSize, "zoneid", IRepPolicyRef(new PolicyOne()))); + state DDTeamCollection* collection = testMachineTeamCollection(teamSize, policy, processSize); + + if (collection == NULL) { + fprintf(stderr, "collection is null\n"); + return Void(); + } + + collection->addBestMachineTeams(30); // Create machine teams to help debug + int result = collection->addTeamsBestOf(30); + collection->sanityCheckTeams(); // Server team may happen to be on the same machine team, although unlikely + + if (collection) delete (collection); + + return Void(); +} + +TEST_CASE("DataDistribution/AddAllTeams/isExhaustive") { IRepPolicyRef policy = IRepPolicyRef(new PolicyAcross(3, "zoneid", IRepPolicyRef(new PolicyOne()))); state DDTeamCollection* collection = testTeamCollection(3, policy, 10); - vector processes; - for(auto process = collection->server_info.begin(); process != collection->server_info.end(); process++) { - processes.push_back(process->first); - } + int result = collection->addTeamsBestOf(200); - state vector> teams; - int result = wait(collection->addAllTeams(collection, processes, &teams, 200)); delete(collection); - for(int i = 0; i < teams.size(); i++) { - auto team = teams[i]; - } + // The maximum number of available server teams without considering machine locality is 120 + // The maximum number of available server teams with machine locality constraint is 120 - 40, because + // the 40 (5*4*2) server teams whose servers come from the same machine are invalid. 
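(A quick check of the arithmetic in the comment above, assuming the 10 test servers fall 2-per-machine across 5 machines, which is what the 5*4*2 factorization implies:)

#include <cstdio>

int main() {
    // C(10,3): all possible 3-server teams out of 10 servers.
    int total = 10 * 9 * 8 / (3 * 2 * 1); // 120
    // Teams that put two servers on the same machine: pick that machine (5),
    // take both of its servers, then pick the third server's machine (4)
    // and one of its two servers (2).
    int invalid = 5 * 4 * 2; // 40
    printf("valid machine-aware teams: %d\n", total - invalid); // 80
    return 0;
}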
ASSERT(result == 80); - ASSERT(teams[0] == std::vector({ UID(1,0), UID(2,0), UID(3,0) })); - ASSERT(teams[1] == std::vector({ UID(1,0), UID(2,0), UID(4,0) })); - ASSERT(teams[2] == std::vector({ UID(1,0), UID(2,0), UID(5,0) })); - ASSERT(teams[3] == std::vector({ UID(1,0), UID(2,0), UID(8,0) })); - ASSERT(teams[4] == std::vector({ UID(1,0), UID(2,0), UID(9,0) })); - ASSERT(teams[5] == std::vector({ UID(1,0), UID(2,0), UID(10,0) })); - ASSERT(teams[6] == std::vector({ UID(1,0), UID(3,0), UID(4,0) })); - ASSERT(teams[7] == std::vector({ UID(1,0), UID(3,0), UID(5,0) })); - ASSERT(teams[8] == std::vector({ UID(1,0), UID(3,0), UID(7,0) })); - ASSERT(teams[9] == std::vector({ UID(1,0), UID(3,0), UID(9,0) })); - ASSERT(teams[10] == std::vector({ UID(1,0), UID(3,0), UID(10,0) })); - ASSERT(teams[79] == std::vector({ UID(8,0), UID(9,0), UID(10,0) })); return Void(); } @@ -2571,29 +3420,11 @@ TEST_CASE("/DataDistribution/AddAllTeams/withLimit") { IRepPolicyRef policy = IRepPolicyRef(new PolicyAcross(3, "zoneid", IRepPolicyRef(new PolicyOne()))); state DDTeamCollection* collection = testTeamCollection(3, policy, 10); - vector processes; - for(auto process = collection->server_info.begin(); process != collection->server_info.end(); process++) { - processes.push_back(process->first); - } + int result = collection->addTeamsBestOf(10); - state vector> teams; - int result = wait(collection->addAllTeams(collection, processes, &teams, 10)); delete(collection); - for(int i = 0; i < teams.size(); i++) { - auto team = teams[i]; - } ASSERT(result == 10); - ASSERT(teams[0] == std::vector({ UID(1,0), UID(2,0), UID(3,0) })); - ASSERT(teams[1] == std::vector({ UID(1,0), UID(2,0), UID(4,0) })); - ASSERT(teams[2] == std::vector({ UID(1,0), UID(2,0), UID(5,0) })); - ASSERT(teams[3] == std::vector({ UID(1,0), UID(2,0), UID(8,0) })); - ASSERT(teams[4] == std::vector({ UID(1,0), UID(2,0), UID(9,0) })); - ASSERT(teams[5] == std::vector({ UID(1,0), UID(2,0), UID(10,0) })); - ASSERT(teams[6] == std::vector({ UID(1,0), UID(3,0), UID(4,0) })); - ASSERT(teams[7] == std::vector({ UID(1,0), UID(3,0), UID(5,0) })); - ASSERT(teams[8] == std::vector({ UID(1,0), UID(3,0), UID(7,0) })); - ASSERT(teams[9] == std::vector({ UID(1,0), UID(3,0), UID(9,0) })); return Void(); } @@ -2603,8 +3434,8 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/SkippingBusyServers") { IRepPolicyRef policy = IRepPolicyRef(new PolicyAcross(3, "zoneid", IRepPolicyRef(new PolicyOne()))); state DDTeamCollection* collection = testTeamCollection(3, policy, 10); - collection->addTeam(std::set({ UID(1,0), UID(2,0), UID(3,0) })); - collection->addTeam(std::set({ UID(1,0), UID(3,0), UID(4,0) })); + collection->addTeam(std::set({ UID(1, 0), UID(2, 0), UID(3, 0) }), true); + collection->addTeam(std::set({ UID(1, 0), UID(3, 0), UID(4, 0) }), true); int result = collection->addTeamsBestOf(8); @@ -2621,18 +3452,39 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/SkippingBusyServers") { return Void(); } +// Due to the randomness in choosing the machine team and the server team from the machine team, it is possible that +// we may not find the remaining several (e.g., 1 or 2) available teams. +// It is hard to conclude what is the minimum number of teams the addTeamsBestOf() should create in this situation. 
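(For reference, the best-of selection that this randomness affects boils down to scoring each candidate server team by the sum of its members' existing team counts and keeping the lowest; a minimal sketch with hypothetical stand-in types, not the real DDTeamCollection interfaces:)

#include <limits>
#include <map>
#include <vector>

// Hypothetical inputs: each candidate team is a list of server ids, and
// serverTeamCount maps a server id to how many teams it already belongs to.
std::vector<int> pickBestTeam(const std::vector<std::vector<int>>& candidates,
                              const std::map<int, int>& serverTeamCount) {
    std::vector<int> best;
    int bestScore = std::numeric_limits<int>::max();
    for (const auto& team : candidates) {
        int score = 0;
        for (int server : team) score += serverTeamCount.at(server); // ids assumed present
        if (score < bestScore) { // keep the least-loaded candidate
            bestScore = score;
            best = team;
        }
    }
    return best; // empty if there were no candidates
}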
TEST_CASE("/DataDistribution/AddTeamsBestOf/NotEnoughServers") { wait(Future(Void())); IRepPolicyRef policy = IRepPolicyRef(new PolicyAcross(3, "zoneid", IRepPolicyRef(new PolicyOne()))); state DDTeamCollection* collection = testTeamCollection(3, policy, 5); - collection->addTeam(std::set({ UID(1,0), UID(2,0), UID(3,0) })); - collection->addTeam(std::set({ UID(1,0), UID(3,0), UID(4,0) })); + collection->addTeam(std::set({ UID(1, 0), UID(2, 0), UID(3, 0) }), true); + collection->addTeam(std::set({ UID(1, 0), UID(3, 0), UID(4, 0) }), true); + int resultMachineTeams = collection->addBestMachineTeams(10); int result = collection->addTeamsBestOf(10); + + if (collection->machineTeams.size() != 10 || result != 8) { + collection->traceAllInfo(true); // Debug message + } + + // NOTE: Due to the pure randomness in selecting a machine for a machine team, + // we cannot guarantee that all machine teams are created. + // When we chnage the selectReplicas function to achieve such guarantee, we can enable the following ASSERT + ASSERT(collection->machineTeams.size() == 10); // Should create all machine teams + + // We need to guarantee a server always have at least a team so that the server can participate in data distribution + for (auto process = collection->server_info.begin(); process != collection->server_info.end(); process++) { + auto teamCount = process->second->teams.size(); + ASSERT(teamCount >= 1); + } + delete(collection); + // If we find all available teams, result will be 8 because we prebuild 2 teams ASSERT(result == 8); return Void(); diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index cd33ad5be8..38cc158e43 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -881,6 +881,7 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd ASSERT( rd.src.size() ); loop { state int stuckCount = 0; + // state int bestTeamStuckThreshold = 50; loop { state int tciIndex = 0; state bool foundTeams = true; @@ -897,6 +898,8 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd req.sources = rd.src; req.completeSources = rd.completeSources; Optional> bestTeam = wait(brokenPromiseToNever(self->teamCollections[tciIndex].getTeam.getReply(req))); + // If a DC has no healthy team, we stop checking the other DCs until + // the unhealthy DC is healthy again or is excluded. if(!bestTeam.present()) { foundTeams = false; break; @@ -922,9 +925,14 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd if (foundTeams && anyHealthy) { break; } + TEST(true); //did not find a healthy destination team on the first attempt stuckCount++; - TraceEvent(stuckCount > 50 ? SevWarnAlways : SevWarn, "BestTeamStuck", masterId).suppressFor(1.0).detail("Count", stuckCount); + TraceEvent(stuckCount > 50 ? 
SevWarnAlways : SevWarn, "BestTeamStuck", masterId) + .suppressFor(1.0) + .detail("Count", stuckCount) + .detail("TeamCollectionId", tciIndex) + .detail("NumOfTeamCollections", self->teamCollections.size()); wait( delay( SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskDataDistributionLaunch ) ); } @@ -936,7 +944,8 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd for(int i = 0; i < bestTeams.size(); i++) { auto& serverIds = bestTeams[i].first->getServerIDs(); destinationTeams.push_back(ShardsAffectedByTeamFailure::Team(serverIds, i == 0)); - if(allHealthy && anyWithSource && !bestTeams[i].second) { + if (allHealthy && anyWithSource && !bestTeams[i].second) { // bestTeams[i] is not the source of the + // shard int idx = g_random->randomInt(0, serverIds.size()); destIds.push_back(serverIds[idx]); healthyIds.push_back(serverIds[idx]); @@ -955,6 +964,18 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd } } + // Sanity check + state int totalIds = 0; + for (auto& destTeam : destinationTeams) { + totalIds += destTeam.servers.size(); + } + if (totalIds != self->teamSize) { + TraceEvent(SevWarn, "IncorrectDestTeamSize") + .suppressFor(1.0) + .detail("ExpectedTeamSize", self->teamSize) + .detail("DestTeamSize", totalIds); + } + self->shardsAffectedByTeamFailure->moveShard(rd.keys, destinationTeams); //FIXME: do not add data in flight to servers that were already in the src. @@ -977,6 +998,7 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd destIds.insert(destIds.end(), extraIds.begin(), extraIds.end()); healthyIds.insert(healthyIds.end(), extraIds.begin(), extraIds.end()); extraIds.clear(); + ASSERT(totalIds == destIds.size()); // Sanity check the destIDs before we move keys doMoveKeys = moveKeys(self->cx, rd.keys, destIds, healthyIds, self->lock, Promise(), &self->startMoveKeysParallelismLock, &self->finishMoveKeysParallelismLock, self->recoveryVersion, self->teamCollections.size() > 1, relocateShardInterval.pairID ); } else { self->fetchKeysComplete.insert( rd ); diff --git a/fdbserver/DiskQueue.actor.cpp b/fdbserver/DiskQueue.actor.cpp index 24970ef78a..92463259d6 100644 --- a/fdbserver/DiskQueue.actor.cpp +++ b/fdbserver/DiskQueue.actor.cpp @@ -770,6 +770,8 @@ private: uint64_t popped; int payloadSize; }; + // The on disk format depends on the size of PageHeader. 
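(The static_assert added on the next line pins PageHeader at the 36 bytes that the recovery code used to hard-code; purely as an illustration of the pattern, a standalone sketch with a hypothetical header layout, not the real PageHeader fields:)

#include <cstdint>

#pragma pack(push, 1)
struct ExampleHeader {      // hypothetical on-disk header
    uint8_t  magic[8];
    uint64_t seq;
    uint64_t popped;
    uint32_t payloadSize;
    uint32_t checksum;
    uint32_t flags;
};
#pragma pack(pop)

// If a field is added or padding sneaks in, this fails at compile time
// instead of silently changing the on-disk format.
static_assert(sizeof(ExampleHeader) == 36, "ExampleHeader must stay 36 bytes");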
+ static_assert( sizeof(PageHeader) == 36, "PageHeader must be packed" ); struct Page : PageHeader { static const int maxPayload = _PAGE_SIZE - sizeof(PageHeader); @@ -901,7 +903,7 @@ private: // Writes go at the end of our reads (but on the next page) self->nextPageSeq = self->nextReadLocation/sizeof(Page)*sizeof(Page); - if (self->nextReadLocation % sizeof(Page) > 36) self->nextPageSeq += sizeof(Page); + if (self->nextReadLocation % sizeof(Page) > sizeof(PageHeader)) self->nextPageSeq += sizeof(Page); TraceEvent("DQRecovered", self->dbgid).detail("LastPoppedSeq", self->lastPoppedSeq).detail("PoppedSeq", self->poppedSeq).detail("NextPageSeq", self->nextPageSeq).detail("File0Name", self->rawQueue->files[0].dbgFilename); self->recovered = true; diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index b660c31d5b..9f594c876a 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -20,6 +20,7 @@ #include "fdbserver/Knobs.h" #include "fdbrpc/Locality.h" +#include ServerKnobs const* SERVER_KNOBS = new ServerKnobs(); @@ -377,6 +378,8 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( MAX_STORAGE_SERVER_WATCH_BYTES, 100e6 ); if( randomize && BUGGIFY ) MAX_STORAGE_SERVER_WATCH_BYTES = 10e3; init( MAX_BYTE_SAMPLE_CLEAR_MAP_SIZE, 1e9 ); if( randomize && BUGGIFY ) MAX_BYTE_SAMPLE_CLEAR_MAP_SIZE = 1e3; init( LONG_BYTE_SAMPLE_RECOVERY_DELAY, 60.0 ); + init( BYTE_SAMPLE_LOAD_PARALLELISM, 32 ); if( randomize && BUGGIFY ) BYTE_SAMPLE_LOAD_PARALLELISM = 1; + init( BYTE_SAMPLE_LOAD_DELAY, 0.0 ); if( randomize && BUGGIFY ) BYTE_SAMPLE_LOAD_DELAY = 0.1; //Wait Failure init( BUGGIFY_OUTSTANDING_WAIT_FAILURE_REQUESTS, 2 ); diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 7248f41dba..11eab24db9 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -315,6 +315,8 @@ public: int MAX_STORAGE_SERVER_WATCH_BYTES; int MAX_BYTE_SAMPLE_CLEAR_MAP_SIZE; double LONG_BYTE_SAMPLE_RECOVERY_DELAY; + int BYTE_SAMPLE_LOAD_PARALLELISM; + double BYTE_SAMPLE_LOAD_DELAY; //Wait Failure int BUGGIFY_OUTSTANDING_WAIT_FAILURE_REQUESTS; diff --git a/fdbserver/LogProtocolMessage.h b/fdbserver/LogProtocolMessage.h index 76672c87e3..60cd886682 100644 --- a/fdbserver/LogProtocolMessage.h +++ b/fdbserver/LogProtocolMessage.h @@ -60,7 +60,7 @@ struct LogProtocolMessage { template void serialize(Ar& ar) { uint8_t poly = MutationRef::Reserved_For_LogProtocolMessage; - ar & poly; + serializer(ar, poly); applyVersionStartingHere(ar, IncludeVersion()); } @@ -70,4 +70,4 @@ struct LogProtocolMessage { template static bool isNextIn(Ar& ar) { return startsLogProtocolMessage(*(const uint8_t*)ar.peekBytes(1)); } }; -#endif \ No newline at end of file +#endif diff --git a/fdbserver/LogSystemConfig.h b/fdbserver/LogSystemConfig.h index 83a90aa9de..1839c59dde 100644 --- a/fdbserver/LogSystemConfig.h +++ b/fdbserver/LogSystemConfig.h @@ -45,8 +45,8 @@ struct OptionalInterface { template void serialize( Ar& ar ) { - ar & iface; - if( !iface.present() ) ar & ident; + serializer(ar, iface); + if( !iface.present() ) serializer(ar, ident); else ident = iface.get().id(); } @@ -111,7 +111,7 @@ struct TLogSet { template void serialize( Ar& ar ) { - ar & tLogs & logRouters & tLogWriteAntiQuorum & tLogReplicationFactor & tLogPolicy & tLogLocalities & isLocal & locality & startVersion & satelliteTagLocations; + serializer(ar, tLogs, logRouters, tLogWriteAntiQuorum, tLogReplicationFactor, tLogPolicy, tLogLocalities, isLocal, locality, startVersion, satelliteTagLocations); } }; @@ -144,7 +144,7 @@ struct OldTLogConf { 
template void serialize( Ar& ar ) { - ar & tLogs & epochEnd & logRouterTags; + serializer(ar, tLogs, epochEnd, logRouterTags); } }; @@ -304,7 +304,7 @@ struct LogSystemConfig { template void serialize( Ar& ar ) { - ar & logSystemType & tLogs & logRouterTags & oldTLogs & expectedLogSets & recruitmentID & stopped & recoveredAt; + serializer(ar, logSystemType, tLogs, logRouterTags, oldTLogs, expectedLogSets, recruitmentID, stopped, recoveredAt); } }; diff --git a/fdbserver/MasterInterface.h b/fdbserver/MasterInterface.h index 22f1dbbdde..d4b5f396ab 100644 --- a/fdbserver/MasterInterface.h +++ b/fdbserver/MasterInterface.h @@ -43,7 +43,7 @@ struct MasterInterface { template void serialize(Archive& ar) { ASSERT( ar.protocolVersion() >= 0x0FDB00A200040001LL ); - ar & locality & waitFailure & getRateInfo & tlogRejoin & changeCoordinators & getCommitVersion; + serializer(ar, locality, waitFailure, getRateInfo, tlogRejoin, changeCoordinators, getCommitVersion); } void initEndpoints() { @@ -61,7 +61,7 @@ struct GetRateInfoRequest { template void serialize(Ar& ar) { - ar & requesterID & totalReleasedTransactions & reply; + serializer(ar, requesterID, totalReleasedTransactions, reply); } }; @@ -71,7 +71,7 @@ struct GetRateInfoReply { template void serialize(Ar& ar) { - ar & transactionRate & leaseDuration; + serializer(ar, transactionRate, leaseDuration); } }; @@ -83,7 +83,7 @@ struct TLogRejoinRequest { explicit TLogRejoinRequest(const TLogInterface &interf) : myInterface(interf) { } template void serialize(Ar& ar) { - ar & myInterface & reply; + serializer(ar, myInterface, reply); } }; @@ -96,7 +96,7 @@ struct ChangeCoordinatorsRequest { template void serialize(Ar& ar) { - ar & newConnectionString & reply; + serializer(ar, newConnectionString, reply); } }; @@ -121,7 +121,7 @@ struct ResolverMoveRef { template void serialize( Ar& ar ) { - ar & range & dest; + serializer(ar, range, dest); } }; @@ -137,7 +137,7 @@ struct GetCommitVersionReply { template void serialize(Ar& ar) { - ar & resolverChanges & resolverChangesVersion & version & prevVersion & requestNum; + serializer(ar, resolverChanges, resolverChangesVersion, version, prevVersion, requestNum); } }; @@ -153,7 +153,7 @@ struct GetCommitVersionRequest { template void serialize(Ar& ar) { - ar & requestNum & mostRecentProcessedRequestNum & requestingProxy & reply; + serializer(ar, requestNum, mostRecentProcessedRequestNum, requestingProxy, reply); } }; @@ -175,8 +175,8 @@ struct LifetimeToken { template void serialize(Ar& ar) { - ar & ccID & count; + serializer(ar, ccID, count); } }; -#endif \ No newline at end of file +#endif diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 7fa6c214fe..75a2cc5800 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -206,6 +206,7 @@ struct ProxyCommitData { std::map tag_popped; Deque> txsPopVersions; Version lastTxsPop; + bool popRemoteTxs; //The tag related to a storage server rarely change, so we keep a vector of tags for each key range to be slightly more CPU efficient. //When a tag related to a storage server does change, we empty out all of these vectors to signify they must be repopulated. 
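(Aside: many hunks in this patch, in LogSystemConfig.h, MasterInterface.h, TLogInterface.h and others below, swap the chained ar & a & b & c serialization style for the variadic serializer(ar, a, b, c) call. A minimal sketch of the idea, assuming a simple recursive helper rather than FDB's actual serializer definition:)

#include <cstdint>
#include <string>

// Hypothetical variadic helper: applies the archive to each field in order.
template <class Ar, class T>
void serializer(Ar& ar, T& t) { ar & t; }
template <class Ar, class T, class... Rest>
void serializer(Ar& ar, T& t, Rest&... rest) { ar & t; serializer(ar, rest...); }

struct ExampleMessage {
    uint64_t version;
    std::string payload;
    bool isPrivate;

    template <class Ar>
    void serialize(Ar& ar) {
        // Old style:  ar & version & payload & isPrivate;
        serializer(ar, version, payload, isPrivate);
    }
};

Both spellings visit the fields in the same order for reading and writing archives; only the call syntax changes.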
@@ -228,12 +229,12 @@ struct ProxyCommitData { ProxyCommitData(UID dbgid, MasterInterface master, RequestStream getConsistentReadVersion, Version recoveryTransactionVersion, RequestStream commit, Reference> db, bool firstProxy) : dbgid(dbgid), stats(dbgid, &version, &committedVersion, &commitBatchesMemBytesCount), master(master), - logAdapter(NULL), txnStateStore(NULL), + logAdapter(NULL), txnStateStore(NULL), popRemoteTxs(false), committedVersion(recoveryTransactionVersion), version(0), minKnownCommittedVersion(0), lastVersionTime(0), commitVersionRequestNumber(1), mostRecentProcessedRequestNumber(0), getConsistentReadVersion(getConsistentReadVersion), commit(commit), lastCoalesceTime(0), - localCommitBatchesStarted(0), locked(false), commitBatchInterval(SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_INTERVAL_MIN), - firstProxy(firstProxy), cx(openDBOnServer(db, TaskDefaultEndpoint, true, true)), + localCommitBatchesStarted(0), locked(false), commitBatchInterval(SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_INTERVAL_MIN), + firstProxy(firstProxy), cx(openDBOnServer(db, TaskDefaultEndpoint, true, true)), singleKeyMutationEvent(LiteralStringRef("SingleKeyMutation")), commitBatchesMemBytesCount(0), lastTxsPop(0) {} }; @@ -915,7 +916,7 @@ ACTOR Future commitBatch( } wait(yield()); - if(!self->txsPopVersions.size() || msg.popTo > self->txsPopVersions.back().second) { + if( self->popRemoteTxs && msg.popTo > ( self->txsPopVersions.size() ? self->txsPopVersions.back().second : self->lastTxsPop ) ) { if(self->txsPopVersions.size() >= SERVER_KNOBS->MAX_TXS_POP_VERSION_HISTORY) { TraceEvent(SevWarnAlways, "DiscardingTxsPopHistory").suppressFor(1.0); self->txsPopVersions.pop_front(); @@ -1297,6 +1298,7 @@ ACTOR Future monitorRemoteCommitted(ProxyCommitData* self, ReferenceonChange()); continue; } + self->popRemoteTxs = true; state Future onChange = db->onChange(); loop { diff --git a/fdbserver/MoveKeys.h b/fdbserver/MoveKeys.h index a54dcc7529..7f6a4f5f5a 100644 --- a/fdbserver/MoveKeys.h +++ b/fdbserver/MoveKeys.h @@ -30,7 +30,7 @@ struct MoveKeysLock { UID prevOwner, myOwner, prevWrite; template - void serialize(Ar& ar) { ar & prevOwner & myOwner & prevWrite; } + void serialize(Ar& ar) { serializer(ar, prevOwner, myOwner, prevWrite); } }; Future takeMoveKeysLock( Database const& cx, UID const& masterId ); @@ -86,4 +86,4 @@ Future canRemoveStorageServer( Transaction* const& tr, UID const& serverID // Returns true if the given storage server has no keys assigned to it and may be safely removed // Obviously that could change later! 
-#endif \ No newline at end of file +#endif diff --git a/fdbserver/NetworkTest.h b/fdbserver/NetworkTest.h index d1ffec063b..37057bf3a1 100644 --- a/fdbserver/NetworkTest.h +++ b/fdbserver/NetworkTest.h @@ -40,7 +40,7 @@ struct NetworkTestRequest { NetworkTestRequest( Key key, uint32_t replySize ) : key(key), replySize(replySize) {} template void serialize(Ar& ar) { - ar & key & replySize & reply; + serializer(ar, key, replySize, reply); } }; @@ -50,7 +50,7 @@ struct NetworkTestReply { NetworkTestReply( Value value ) : value(value) {} template void serialize(Ar& ar) { - ar & value; + serializer(ar, value); } }; @@ -58,4 +58,4 @@ Future networkTestServer(); Future networkTestClient( std:: string const& testServers ); -#endif \ No newline at end of file +#endif diff --git a/fdbserver/OldTLogServer.actor.cpp b/fdbserver/OldTLogServer.actor.cpp index 937d7d8f1e..b0d92885b7 100644 --- a/fdbserver/OldTLogServer.actor.cpp +++ b/fdbserver/OldTLogServer.actor.cpp @@ -74,7 +74,7 @@ namespace oldTLog { template void serialize(Ar& ar) { - ar & tag & messageOffsets; + serializer(ar, tag, messageOffsets); } }; @@ -93,9 +93,9 @@ namespace oldTLog { template void serialize(Ar& ar) { if( ar.protocolVersion() >= 0x0FDB00A460010001) { - ar & version & messages & tags & knownCommittedVersion & id; + serializer(ar, version, messages, tags, knownCommittedVersion, id); } else if(ar.isDeserializing) { - ar & version & messages & tags; + serializer(ar, version, messages, tags); knownCommittedVersion = 0; id = UID(); } diff --git a/fdbserver/ResolverInterface.h b/fdbserver/ResolverInterface.h index 3b3a72898c..97d6c33b95 100644 --- a/fdbserver/ResolverInterface.h +++ b/fdbserver/ResolverInterface.h @@ -49,7 +49,7 @@ struct ResolverInterface { template void serialize( Ar& ar ) { - ar & uniqueID & locality & resolve & metrics & split & waitFailure; + serializer(ar, uniqueID, locality, resolve, metrics, split, waitFailure); } }; @@ -65,7 +65,7 @@ struct StateTransactionRef { template void serialize(Archive& ar) { - ar & committed & mutations; + serializer(ar, committed, mutations); } }; @@ -77,7 +77,7 @@ struct ResolveTransactionBatchReply { template void serialize(Archive& ar) { - ar & committed & stateMutations & arena & debugID; + serializer(ar, committed, stateMutations, arena, debugID); } }; @@ -95,7 +95,7 @@ struct ResolveTransactionBatchRequest { template void serialize(Archive& ar) { - ar & prevVersion & version & lastReceivedVersion & transactions & txnStateTransactions & reply & arena & debugID; + serializer(ar, prevVersion, version, lastReceivedVersion, transactions, txnStateTransactions, reply, arena, debugID); } }; @@ -104,7 +104,7 @@ struct ResolutionMetricsRequest { template void serialize(Archive& ar) { - ar & reply; + serializer(ar, reply); } }; @@ -113,7 +113,7 @@ struct ResolutionSplitReply { int64_t used; template void serialize(Archive& ar) { - ar & key & used; + serializer(ar, key, used); } }; @@ -126,7 +126,7 @@ struct ResolutionSplitRequest { template void serialize(Archive& ar) { - ar & range & offset & front & reply; + serializer(ar, range, offset, front, reply); } }; diff --git a/fdbserver/RestoreInterface.h b/fdbserver/RestoreInterface.h index 8529fff255..a6f546e58d 100644 --- a/fdbserver/RestoreInterface.h +++ b/fdbserver/RestoreInterface.h @@ -41,7 +41,7 @@ struct RestoreInterface { template void serialize( Ar& ar ) { - ar & test; + serializer(ar, test); } }; @@ -54,7 +54,7 @@ struct TestRequest { template void serialize(Ar& ar) { - ar & testData & reply; + serializer(ar, testData, 
reply); } }; @@ -66,7 +66,7 @@ struct TestReply { template void serialize(Ar& ar) { - ar & replyData; + serializer(ar, replyData); } }; diff --git a/fdbserver/ServerDBInfo.h b/fdbserver/ServerDBInfo.h index 8d8e8c188a..cd2f327a71 100644 --- a/fdbserver/ServerDBInfo.h +++ b/fdbserver/ServerDBInfo.h @@ -51,7 +51,7 @@ struct ServerDBInfo { template void serialize( Ar& ar ) { - ar & id & clusterInterface & client & master & resolvers & recoveryCount & masterLifetime & logSystemConfig & priorCommittedLogServers & recoveryState; + serializer(ar, id, clusterInterface, client, master, resolvers, recoveryCount, masterLifetime, logSystemConfig, priorCommittedLogServers, recoveryState); } }; diff --git a/fdbserver/TLogInterface.h b/fdbserver/TLogInterface.h index 7ea5ce5861..3acebb4c7b 100644 --- a/fdbserver/TLogInterface.h +++ b/fdbserver/TLogInterface.h @@ -65,8 +65,8 @@ struct TLogInterface { template void serialize( Ar& ar ) { ASSERT(ar.isDeserializing || uniqueID != UID()); - ar & uniqueID & sharedTLogID & locality & peekMessages & popMessages - & commit & lock & getQueuingMetrics & confirmRunning & waitFailure & recoveryFinished; + serializer(ar, uniqueID, sharedTLogID, locality, peekMessages, popMessages + , commit, lock, getQueuingMetrics, confirmRunning, waitFailure, recoveryFinished); } }; @@ -77,7 +77,7 @@ struct TLogRecoveryFinishedRequest { template void serialize( Ar& ar ) { - ar & reply; + serializer(ar, reply); } }; @@ -87,7 +87,7 @@ struct TLogLockResult { template void serialize( Ar& ar ) { - ar & end & knownCommittedVersion; + serializer(ar, end, knownCommittedVersion); } }; @@ -100,7 +100,7 @@ struct TLogConfirmRunningRequest { template void serialize( Ar& ar ) { - ar & debugID & reply; + serializer(ar, debugID, reply); } }; @@ -116,7 +116,7 @@ struct VersionUpdateRef { template void serialize( Ar& ar ) { - ar & version & mutations & isPrivateData; + serializer(ar, version, mutations, isPrivateData); } }; @@ -131,7 +131,7 @@ struct VerUpdateRef { template void serialize( Ar& ar ) { - ar & version & mutations & isPrivateData; + serializer(ar, version, mutations, isPrivateData); } }; @@ -146,7 +146,7 @@ struct TLogPeekReply { template void serialize(Ar& ar) { - ar & arena & messages & end & popped & maxKnownVersion & minKnownCommittedVersion & begin; + serializer(ar, arena, messages, end, popped, maxKnownVersion, minKnownCommittedVersion, begin); } }; @@ -163,7 +163,7 @@ struct TLogPeekRequest { template void serialize(Ar& ar) { - ar & arena & begin & tag & returnIfBlocked & sequence & reply; + serializer(ar, arena, begin, tag, returnIfBlocked, sequence, reply); } }; @@ -179,7 +179,7 @@ struct TLogPopRequest { template void serialize(Ar& ar) { - ar & arena & to & durableKnownCommittedVersion & tag & reply; + serializer(ar, arena, to, durableKnownCommittedVersion, tag, reply); } }; @@ -196,7 +196,7 @@ struct TagMessagesRef { template void serialize(Ar& ar) { - ar & tag & messageOffsets; + serializer(ar, tag, messageOffsets); } }; @@ -214,7 +214,7 @@ struct TLogCommitRequest { : arena(a), prevVersion(prevVersion), version(version), knownCommittedVersion(knownCommittedVersion), minKnownCommittedVersion(minKnownCommittedVersion), messages(messages), debugID(debugID) {} template void serialize( Ar& ar ) { - ar & prevVersion & version & knownCommittedVersion & minKnownCommittedVersion & messages & reply & arena & debugID; + serializer(ar, prevVersion, version, knownCommittedVersion, minKnownCommittedVersion, messages, reply, arena, debugID); } }; @@ -223,7 +223,7 @@ struct 
TLogQueuingMetricsRequest { template void serialize(Ar& ar) { - ar & reply; + serializer(ar, reply); } }; @@ -236,7 +236,7 @@ struct TLogQueuingMetricsReply { template void serialize(Ar& ar) { - ar & localTime & instanceID & bytesDurable & bytesInput & storageBytes & v; + serializer(ar, localTime, instanceID, bytesDurable, bytesInput, storageBytes, v); } }; diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index ef355022fc..90132bc48a 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -57,7 +57,7 @@ struct TLogQueueEntryRef { template void serialize(Ar& ar) { - ar & version & messages & knownCommittedVersion & id; + serializer(ar, version, messages, knownCommittedVersion, id); } size_t expectedSize() const { return messages.expectedSize(); @@ -76,11 +76,11 @@ struct AlternativeTLogQueueEntryRef { void serialize(Ar& ar) { ASSERT(!ar.isDeserializing && alternativeMessages); uint32_t msgSize = expectedSize(); - ar & version & msgSize; + serializer(ar, version, msgSize); for(auto& msg : *alternativeMessages) { ar.serializeBytes( msg.message ); } - ar & knownCommittedVersion & id; + serializer(ar, knownCommittedVersion, id); } uint32_t expectedSize() const { diff --git a/fdbserver/TesterInterface.h b/fdbserver/TesterInterface.h index 63991668ce..4394746826 100644 --- a/fdbserver/TesterInterface.h +++ b/fdbserver/TesterInterface.h @@ -37,7 +37,7 @@ struct WorkloadInterface { template void serialize( Ar& ar ) { - ar & setup & start & check & metrics & stop; + serializer(ar, setup, start, check, metrics, stop); } }; @@ -68,7 +68,7 @@ struct WorkloadRequest { template void serialize( Ar& ar ) { - ar & title & timeout & databasePingDelay & sharedRandomNumber & useDatabase & options & clientId & clientCount & reply & arena; + serializer(ar, title, timeout, databasePingDelay, sharedRandomNumber, useDatabase, options, clientId, clientCount, reply, arena); } }; @@ -79,7 +79,7 @@ struct TesterInterface { template void serialize(Ar& ar) { - ar & recruitments; + serializer(ar, recruitments); } }; @@ -88,6 +88,9 @@ Future testerServerCore( TesterInterface const& interf, Reference runTests( Reference const& connFile, test_type_t const& whatToRun, test_location_t const& whereToRun, int const& minTestersExpected, std::string const& fileName = std::string(), StringRef const& startingConfiguration = StringRef(), LocalityData const& locality = LocalityData() ); +Future runTests(Reference const& connFile, test_type_t const& whatToRun, + test_location_t const& whereToRun, int const& minTestersExpected, + std::string const& fileName = std::string(), StringRef const& startingConfiguration = StringRef(), + LocalityData const& locality = LocalityData()); #endif diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index c092ab3ba5..4ef9e06851 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -93,7 +93,7 @@ struct BTreePage { r += format("['%s']", c.getKeyRef().toHexString(20).c_str()); r += " -> "; - if(flags && IS_LEAF) + if(flags & IS_LEAF) r += format("'%s'", c.getValueRef().toHexString(20).c_str()); else r += format("Page id=%u", *(const uint32_t *)c.getValueRef().begin()); diff --git a/fdbserver/WorkerInterface.h b/fdbserver/WorkerInterface.h index 7c9925288e..b9de0c4484 100644 --- a/fdbserver/WorkerInterface.h +++ b/fdbserver/WorkerInterface.h @@ -62,7 +62,7 @@ struct WorkerInterface { template void serialize(Ar& ar) { - ar & clientInterface & locality & tLog & master & masterProxy 
& resolver & storage & logRouter & debugPing & coordinationPing & waitFailure & setMetricsRate & eventLogRequest & traceBatchDumpRequest & testerInterface & diskStoreRequest; + serializer(ar, clientInterface, locality, tLog, master, masterProxy, resolver, storage, logRouter, debugPing, coordinationPing, waitFailure, setMetricsRate, eventLogRequest, traceBatchDumpRequest, testerInterface, diskStoreRequest); } }; @@ -87,7 +87,7 @@ struct InitializeTLogRequest { template void serialize( Ar& ar ) { - ar & recruitmentID & recoverFrom & recoverAt & knownCommittedVersion & epoch & recoverTags & allTags & storeType & remoteTag & locality & isPrimary & startVersion & logRouterTags & reply; + serializer(ar, recruitmentID, recoverFrom, recoverAt, knownCommittedVersion, epoch, recoverTags, allTags, storeType, remoteTag, locality, isPrimary, startVersion, logRouterTags, reply); } }; @@ -102,7 +102,7 @@ struct InitializeLogRouterRequest { template void serialize(Ar& ar) { - ar & recoveryCount & routerTag & startVersion & tLogLocalities & tLogPolicy & locality & reply; + serializer(ar, recoveryCount, routerTag, startVersion, tLogLocalities, tLogPolicy, locality, reply); } }; @@ -116,7 +116,7 @@ struct RecruitMasterRequest { template void serialize(Ar& ar) { ASSERT( ar.protocolVersion() >= 0x0FDB00A200040001LL ); - ar & lifetime & forceRecovery & reply & arena; + serializer(ar, lifetime, forceRecovery, reply, arena); } }; @@ -129,7 +129,7 @@ struct InitializeMasterProxyRequest { template void serialize(Ar& ar) { - ar & master & recoveryCount & recoveryTransactionVersion & firstProxy & reply; + serializer(ar, master, recoveryCount, recoveryTransactionVersion, firstProxy, reply); } }; @@ -141,7 +141,7 @@ struct InitializeResolverRequest { template void serialize(Ar& ar) { - ar & recoveryCount & proxyCount & resolverCount & reply; + serializer(ar, recoveryCount, proxyCount, resolverCount, reply); } }; @@ -151,7 +151,7 @@ struct InitializeStorageReply { template void serialize(Ar& ar) { - ar & interf & addedVersion; + serializer(ar, interf, addedVersion); } }; @@ -164,7 +164,7 @@ struct InitializeStorageRequest { template void serialize( Ar& ar ) { - ar & seedTag & reqId & interfaceId & storeType & reply; + serializer(ar, seedTag, reqId, interfaceId, storeType, reply); } }; @@ -173,7 +173,7 @@ struct TraceBatchDumpRequest { template void serialize( Ar& ar ) { - ar & reply; + serializer(ar, reply); } }; @@ -183,7 +183,7 @@ struct LoadedReply { template void serialize(Ar& ar) { - ar & payload & id; + serializer(ar, payload, id); } }; @@ -195,7 +195,7 @@ struct LoadedPingRequest { template void serialize(Ar& ar) { - ar & id & loadReply & payload & reply; + serializer(ar, id, loadReply, payload, reply); } }; @@ -208,7 +208,7 @@ struct CoordinationPingMessage { template void serialize(Ar& ar) { - ar & clusterControllerId & timeStep; + serializer(ar, clusterControllerId, timeStep); } }; @@ -220,7 +220,7 @@ struct SetMetricsLogRateRequest { template void serialize(Ar& ar) { - ar & metricsLogsPerSecond; + serializer(ar, metricsLogsPerSecond); } }; @@ -234,7 +234,7 @@ struct EventLogRequest { template void serialize(Ar& ar) { - ar & getLastError & eventName & reply; + serializer(ar, getLastError, eventName, reply); } }; @@ -254,7 +254,7 @@ struct DebugEntryRef { template void serialize(Ar& ar) { - ar & time & address & context & version & mutation; + serializer(ar, time, address, context, version, mutation); } }; @@ -266,7 +266,7 @@ struct DiskStoreRequest { template void serialize(Ar& ar) { - ar & 
includePartialStores & reply; + serializer(ar, includePartialStores, reply); } }; diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index 75e57dbeb2..bb78fa33f5 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -77,7 +77,7 @@ enum { OPT_CONNFILE, OPT_SEEDCONNFILE, OPT_SEEDCONNSTRING, OPT_ROLE, OPT_LISTEN, OPT_PUBLICADDR, OPT_DATAFOLDER, OPT_LOGFOLDER, OPT_PARENTPID, OPT_NEWCONSOLE, OPT_NOBOX, OPT_TESTFILE, OPT_RESTARTING, OPT_RANDOMSEED, OPT_KEY, OPT_MEMLIMIT, OPT_STORAGEMEMLIMIT, OPT_MACHINEID, OPT_DCID, OPT_MACHINE_CLASS, OPT_BUGGIFY, OPT_VERSION, OPT_CRASHONERROR, OPT_HELP, OPT_NETWORKIMPL, OPT_NOBUFSTDOUT, OPT_BUFSTDOUTERR, OPT_TRACECLOCK, OPT_NUMTESTERS, OPT_DEVHELP, OPT_ROLLSIZE, OPT_MAXLOGS, OPT_MAXLOGSSIZE, OPT_KNOB, OPT_TESTSERVERS, OPT_TEST_ON_SERVERS, OPT_METRICSCONNFILE, OPT_METRICSPREFIX, - OPT_LOGGROUP, OPT_LOCALITY, OPT_IO_TRUST_SECONDS, OPT_IO_TRUST_WARN_ONLY, OPT_FILESYSTEM, OPT_KVFILE }; + OPT_LOGGROUP, OPT_LOCALITY, OPT_IO_TRUST_SECONDS, OPT_IO_TRUST_WARN_ONLY, OPT_FILESYSTEM, OPT_KVFILE, OPT_TRACE_FORMAT }; CSimpleOpt::SOption g_rgOptions[] = { { OPT_CONNFILE, "-C", SO_REQ_SEP }, @@ -151,6 +151,7 @@ CSimpleOpt::SOption g_rgOptions[] = { { OPT_METRICSPREFIX, "--metrics_prefix", SO_REQ_SEP }, { OPT_IO_TRUST_SECONDS, "--io_trust_seconds", SO_REQ_SEP }, { OPT_IO_TRUST_WARN_ONLY, "--io_trust_warn_only", SO_NONE }, + { OPT_TRACE_FORMAT , "--trace_format", SO_REQ_SEP }, #ifndef TLS_DISABLED TLS_OPTION_FLAGS @@ -551,6 +552,8 @@ static void printUsage( const char *name, bool devhelp ) { " Delete the oldest log file when the total size of all log\n" " files exceeds SIZE bytes. If set to 0, old log files will not\n" " be deleted. The default value is 100MiB.\n"); + printf(" --trace_format FORMAT\n" + " Select the format of the log files. xml (the default) and json are supported.\n"); printf(" -i ID, --machine_id ID\n" " Machine identifier key (up to 16 hex characters). 
Defaults\n" " to a random value shared by all fdbserver processes on this\n" @@ -1129,6 +1132,11 @@ int main(int argc, char* argv[]) { case OPT_IO_TRUST_WARN_ONLY: fileIoWarnOnly = true; break; + case OPT_TRACE_FORMAT: + if (!selectTraceFormatter(args.OptionArg())) { + fprintf(stderr, "WARNING: Unrecognized trace format `%s'\n", args.OptionArg()); + } + break; #ifndef TLS_DISABLED case TLSOptions::OPT_TLS_PLUGIN: args.OptionArg(); diff --git a/fdbserver/pubsub.h b/fdbserver/pubsub.h index cd0cf70bc1..970774e9de 100644 --- a/fdbserver/pubsub.h +++ b/fdbserver/pubsub.h @@ -69,7 +69,7 @@ public: template void serialize( Ar& ar ) { - ar & originatorFeed & messageId & data & data.arena(); + serializer(ar, originatorFeed, messageId, data, data.arena()); } }; diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index eeec67403d..f53b03d364 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -2834,48 +2834,76 @@ void StorageServerDisk::changeLogProtocol(Version version, uint64_t protocol) { data->addMutationToMutationLogOrStorage(version, MutationRef(MutationRef::SetValue, persistLogProtocol, BinaryWriter::toValue(protocol, Unversioned()))); } -ACTOR Future applyByteSampleResult( StorageServer* data, KeyRange range, Future>> result) { - Standalone> bs = wait( result ); - for( int j = 0; j < bs.size(); j++ ) { - KeyRef key = bs[j].key.removePrefix(persistByteSampleKeys.begin); - if(!data->byteSampleClears.rangeContaining(key).value()) { - data->metrics.byteSample.sample.insert( key, BinaryReader::fromStringRef(bs[j].value, Unversioned()), false ); +ACTOR Future applyByteSampleResult( StorageServer* data, IKeyValueStore* storage, Key begin, Key end, std::vector>>* results = NULL) { + state int totalFetches = 0; + state int totalKeys = 0; + state int totalBytes = 0; + loop { + Standalone> bs = wait( storage->readRange( KeyRangeRef(begin, end), SERVER_KNOBS->STORAGE_LIMIT_BYTES, SERVER_KNOBS->STORAGE_LIMIT_BYTES ) ); + if(results) results->push_back(bs); + int rangeSize = bs.expectedSize(); + totalFetches++; + totalKeys += bs.size(); + totalBytes += rangeSize; + for( int j = 0; j < bs.size(); j++ ) { + KeyRef key = bs[j].key.removePrefix(persistByteSampleKeys.begin); + if(!data->byteSampleClears.rangeContaining(key).value()) { + data->metrics.byteSample.sample.insert( key, BinaryReader::fromStringRef(bs[j].value, Unversioned()), false ); + } + } + if( rangeSize >= SERVER_KNOBS->STORAGE_LIMIT_BYTES ) { + Key nextBegin = keyAfter(bs.back().key); + data->byteSampleClears.insert(KeyRangeRef(begin, nextBegin).removePrefix(persistByteSampleKeys.begin), true); + data->byteSampleClearsTooLarge.set(data->byteSampleClears.size() > SERVER_KNOBS->MAX_BYTE_SAMPLE_CLEAR_MAP_SIZE); + begin = nextBegin; + if(begin == end) { + break; + } + } else { + data->byteSampleClears.insert(KeyRangeRef(begin.removePrefix(persistByteSampleKeys.begin), end == persistByteSampleKeys.end ? 
LiteralStringRef("\xff\xff\xff") : end.removePrefix(persistByteSampleKeys.begin)), true); + data->byteSampleClearsTooLarge.set(data->byteSampleClears.size() > SERVER_KNOBS->MAX_BYTE_SAMPLE_CLEAR_MAP_SIZE); + break; + } + + if(!results) { + wait(delay(SERVER_KNOBS->BYTE_SAMPLE_LOAD_DELAY)); } } - data->byteSampleClears.insert(range, true); - data->byteSampleClearsTooLarge.set(data->byteSampleClears.size() > SERVER_KNOBS->MAX_BYTE_SAMPLE_CLEAR_MAP_SIZE); - + TraceEvent("RecoveredByteSampleRange", data->thisServerID).detail("Begin", printable(begin)).detail("End", printable(end)).detail("Fetches", totalFetches).detail("Keys", totalKeys).detail("ReadBytes", totalBytes); return Void(); } -ACTOR Future restoreByteSample(StorageServer* data, IKeyValueStore* storage, Standalone> bsSample) { +ACTOR Future restoreByteSample(StorageServer* data, IKeyValueStore* storage, Promise byteSampleSampleRecovered) { + state std::vector>> byteSampleSample; + wait( applyByteSampleResult(data, storage, persistByteSampleSampleKeys.begin, persistByteSampleSampleKeys.end, &byteSampleSample) ); + byteSampleSampleRecovered.send(Void()); wait( delay( BUGGIFY ? g_random->random01() * 2.0 : 0.0001 ) ); - TraceEvent("RecoveredByteSampleSample", data->thisServerID).detail("Keys", bsSample.size()).detail("ReadBytes", bsSample.expectedSize()); - size_t bytes_per_fetch = 0; // Since the expected size also includes (as of now) the space overhead of the container, we calculate our own number here - for( int i = 0; i < bsSample.size(); i++ ) - bytes_per_fetch += BinaryReader::fromStringRef(bsSample[i].value, Unversioned()); - bytes_per_fetch /= 32; + for( auto& it : byteSampleSample ) { + for( auto& kv : it ) { + bytes_per_fetch += BinaryReader::fromStringRef(kv.value, Unversioned()); + } + } + bytes_per_fetch = (bytes_per_fetch/SERVER_KNOBS->BYTE_SAMPLE_LOAD_PARALLELISM) + 1; state std::vector> sampleRanges; int accumulatedSize = 0; - std::string prefix = PERSIST_PREFIX "BS/"; - Key lastStart = LiteralStringRef( PERSIST_PREFIX "BS/" ); // make sure the first range starts at the absolute beginning of the byte sample - for( auto it = bsSample.begin(); it != bsSample.end(); ++it ) { - if( accumulatedSize >= bytes_per_fetch ) { - accumulatedSize = 0; - Key realKey = it->key.removePrefix( prefix ); - KeyRange sampleRange = KeyRangeRef( lastStart, realKey ); - sampleRanges.push_back( applyByteSampleResult(data, sampleRange.removePrefix(persistByteSampleKeys.begin), storage->readRange( sampleRange )) ); - lastStart = realKey; + Key lastStart = persistByteSampleKeys.begin; // make sure the first range starts at the absolute beginning of the byte sample + for( auto& it : byteSampleSample ) { + for( auto& kv : it ) { + if( accumulatedSize >= bytes_per_fetch ) { + accumulatedSize = 0; + Key realKey = kv.key.removePrefix( persistByteSampleKeys.begin ); + sampleRanges.push_back( applyByteSampleResult(data, storage, lastStart, realKey) ); + lastStart = realKey; + } + accumulatedSize += BinaryReader::fromStringRef(kv.value, Unversioned()); } - accumulatedSize += BinaryReader::fromStringRef(it->value, Unversioned()); } // make sure that the last range goes all the way to the end of the byte sample - KeyRange sampleRange = KeyRangeRef( lastStart, LiteralStringRef( PERSIST_PREFIX "BS0" )); - sampleRanges.push_back( applyByteSampleResult(data, KeyRangeRef(lastStart.removePrefix(persistByteSampleKeys.begin), LiteralStringRef("\xff\xff\xff")), storage->readRange( sampleRange )) ); + sampleRanges.push_back( applyByteSampleResult(data, storage, 
lastStart, persistByteSampleKeys.end) ); wait( waitForAll( sampleRanges ) ); TraceEvent("RecoveredByteSampleChunkedRead", data->thisServerID).detail("Ranges",sampleRanges.size()); @@ -2893,11 +2921,14 @@ ACTOR Future restoreDurableState( StorageServer* data, IKeyValueStore* sto state Future> fLogProtocol = storage->readValue(persistLogProtocol); state Future>> fShardAssigned = storage->readRange(persistShardAssignedKeys); state Future>> fShardAvailable = storage->readRange(persistShardAvailableKeys); - state Future>> fByteSampleSample = storage->readRange(persistByteSampleSampleKeys); + + state Promise byteSampleSampleRecovered; + data->byteSampleRecovery = restoreByteSample(data, storage, byteSampleSampleRecovered); TraceEvent("ReadingDurableState", data->thisServerID); wait( waitForAll( (vector>>(), fFormat, fID, fVersion, fLogProtocol) ) ); - wait( waitForAll( (vector>>>(), fShardAssigned, fShardAvailable, fByteSampleSample) ) ); + wait( waitForAll( (vector>>>(), fShardAssigned, fShardAvailable) ) ); + wait( byteSampleSampleRecovered.getFuture() ); TraceEvent("RestoringDurableState", data->thisServerID); if (!fFormat.get().present()) { @@ -2952,9 +2983,6 @@ ACTOR Future restoreDurableState( StorageServer* data, IKeyValueStore* sto wait(yield()); } - wait( applyByteSampleResult(data, persistByteSampleSampleKeys.removePrefix(persistByteSampleKeys.begin), fByteSampleSample) ); - data->byteSampleRecovery = restoreByteSample(data, storage, fByteSampleSample.get()); - wait( delay( 0.0001 ) ); { @@ -3556,7 +3584,7 @@ void versionedMapTest() { const int NSIZE = sizeof(VersionedMap::PTreeT); const int ASIZE = NSIZE<=64 ? 64 : NextPowerOfTwo::Result; - auto before = FastAllocator< ASIZE >::getMemoryUsed(); + auto before = FastAllocator< ASIZE >::getTotalMemory(); for(int v=1; v<=1000; ++v) { vm.createNewVersion(v); @@ -3570,7 +3598,7 @@ void versionedMapTest() { } } - auto after = FastAllocator< ASIZE >::getMemoryUsed(); + auto after = FastAllocator< ASIZE >::getTotalMemory(); int count = 0; for(auto i = vm.atLatest().begin(); i != vm.atLatest().end(); ++i) diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index 5779c3da19..9e8ce4a067 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -622,6 +622,7 @@ ACTOR Future runWorkload( Database cx, std::vector< Test TraceEvent("TestRunning").detail( "WorkloadTitle", printable(spec.title) ) .detail("TesterCount", testers.size()).detail("Phases", spec.phases) .detail("TestTimeout", spec.timeout); + state vector< Future< WorkloadInterface > > workRequests; state vector> metricsResults; @@ -659,7 +660,7 @@ ACTOR Future runWorkload( Database cx, std::vector< Test if( spec.phases & TestWorkload::EXECUTION ) { TraceEvent("TestStarting").detail("WorkloadTitle", printable(spec.title)); - printf("running test...\n"); + printf("running test (%s)...\n", printable(spec.title).c_str()); std::vector< Future > starts; for(int i= 0; i < workloads.size(); i++) starts.push_back( workloads[i].start.template getReply() ); @@ -675,7 +676,7 @@ ACTOR Future runWorkload( Database cx, std::vector< Test state std::vector< Future > checks; TraceEvent("CheckingResults"); - printf("checking tests...\n"); + printf("checking test (%s)...\n", printable(spec.title).c_str()); for(int i= 0; i < workloads.size(); i++) checks.push_back( workloads[i].check.template getReply() ); wait( waitForAll( checks ) ); @@ -690,7 +691,7 @@ ACTOR Future runWorkload( Database cx, std::vector< Test if( spec.phases & TestWorkload::METRICS ) { state std::vector< 
Future> > metricTasks; - printf("fetching metrics...\n"); + printf("fetching metrics (%s)...\n", printable(spec.title).c_str()); TraceEvent("TestFetchingMetrics").detail("WorkloadTitle", printable(spec.title)); for(int i= 0; i < workloads.size(); i++) metricTasks.push_back( workloads[i].metrics.template getReply>() ); @@ -725,6 +726,7 @@ ACTOR Future changeConfiguration(Database cx, std::vector< TesterInterface spec.options.push_back_deep(spec.options.arena(), options); DistributedTestResults testResults = wait(runWorkload(cx, testers, spec)); + return Void(); } @@ -764,6 +766,7 @@ ACTOR Future checkConsistency(Database cx, std::vector< TesterInterface > spec.options[0].push_back_deep(spec.options.arena(), KeyValueRef(LiteralStringRef("failureIsError"), LiteralStringRef("true"))); lastRun = true; } + wait( repairDeadDatacenter(cx, dbInfo, "ConsistencyCheck") ); } } @@ -1033,6 +1036,7 @@ ACTOR Future runTests( Reference disabler = disableConnectionFailuresAfter(450, "Tester"); //Change the configuration (and/or create the database) if necessary + printf("startingConfiguration:%s start\n", startingConfiguration.toString().c_str()); if(useDB && startingConfiguration != StringRef()) { try { wait(timeoutError(changeConfiguration(cx, testers, startingConfiguration), 2000.0)); @@ -1066,8 +1070,9 @@ ACTOR Future runTests( Reference workerHandleErrors(FutureStream errors) { err.error.code() == error_code_coordinators_changed || // The worker server was cancelled err.error.code() == error_code_shutdown_in_progress; - if(!ok) + if (!ok) { err.error = checkIOTimeout(err.error); // Possibly convert error to io_timeout + } endRole(err.role, err.id, "Error", ok, err.error); @@ -366,12 +367,25 @@ ACTOR Future storageServerRollbackRebooter( Future prevStorageServer TraceEvent("StorageServerRequestedReboot", id); - StorageServerInterface ssi; - ssi.uniqueID = id; - ssi.locality = locality; - ssi.initEndpoints(); + StorageServerInterface recruited; + recruited.uniqueID = id; + recruited.locality = locality; + recruited.initEndpoints(); - prevStorageServer = storageServer( store, ssi, db, folder, Promise() ); + DUMPTOKEN(recruited.getVersion); + DUMPTOKEN(recruited.getValue); + DUMPTOKEN(recruited.getKey); + DUMPTOKEN(recruited.getKeyValues); + DUMPTOKEN(recruited.getShardState); + DUMPTOKEN(recruited.waitMetrics); + DUMPTOKEN(recruited.splitMetrics); + DUMPTOKEN(recruited.getPhysicalMetrics); + DUMPTOKEN(recruited.waitFailure); + DUMPTOKEN(recruited.getQueuingMetrics); + DUMPTOKEN(recruited.getKeyValueStoreType); + DUMPTOKEN(recruited.watchValue); + + prevStorageServer = storageServer( store, recruited, db, folder, Promise() ); prevStorageServer = handleIOErrors(prevStorageServer, store, id, store->onClosed()); } } diff --git a/fdbserver/workloads/BackupCorrectness.actor.cpp b/fdbserver/workloads/BackupCorrectness.actor.cpp index 1e21eefd75..16441c139b 100644 --- a/fdbserver/workloads/BackupCorrectness.actor.cpp +++ b/fdbserver/workloads/BackupCorrectness.actor.cpp @@ -188,40 +188,43 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload { if (BUGGIFY) { state KeyBackedTag backupTag = makeBackupTag(tag.toString()); TraceEvent("BARW_DoBackupWaitForRestorable", randomID).detail("Tag", backupTag.tagName); - // Wait until the backup is in a restorable state - state int resultWait = wait(backupAgent->waitBackup(cx, backupTag.tagName, false)); - UidAndAbortedFlagT uidFlag = wait(backupTag.getOrThrow(cx)); - state UID logUid = uidFlag.first; - state Reference lastBackupContainer = 
wait(BackupConfig(logUid).backupContainer().getD(cx)); + + // Wait until the backup is in a restorable state and get the status, URL, and UID atomically + state Reference lastBackupContainer; + state UID lastBackupUID; + state int resultWait = wait(backupAgent->waitBackup(cx, backupTag.tagName, false, &lastBackupContainer, &lastBackupUID)); state bool restorable = false; if(lastBackupContainer) { - state BackupDescription desc = wait(lastBackupContainer->describeBackup()); - wait(desc.resolveVersionTimes(cx)); - printf("BackupDescription:\n%s\n", desc.toString().c_str()); - restorable = desc.maxRestorableVersion.present(); + state Future fdesc = lastBackupContainer->describeBackup(); + wait(ready(fdesc)); + + if(!fdesc.isError()) { + state BackupDescription desc = fdesc.get(); + wait(desc.resolveVersionTimes(cx)); + printf("BackupDescription:\n%s\n", desc.toString().c_str()); + restorable = desc.maxRestorableVersion.present(); + } } TraceEvent("BARW_LastBackupContainer", randomID) .detail("BackupTag", printable(tag)) .detail("LastBackupContainer", lastBackupContainer ? lastBackupContainer->getURL() : "") - .detail("LogUid", logUid).detail("WaitStatus", resultWait).detail("Restorable", restorable); + .detail("LastBackupUID", lastBackupUID).detail("WaitStatus", resultWait).detail("Restorable", restorable); // Do not check the backup, if aborted if (resultWait == BackupAgentBase::STATE_ABORTED) { } // Ensure that a backup container was found else if (!lastBackupContainer) { - TraceEvent("BARW_MissingBackupContainer", randomID).detail("LogUid", logUid).detail("BackupTag", printable(tag)).detail("WaitStatus", resultWait); + TraceEvent(SevError, "BARW_MissingBackupContainer", randomID).detail("LastBackupUID", lastBackupUID).detail("BackupTag", printable(tag)).detail("WaitStatus", resultWait); printf("BackupCorrectnessMissingBackupContainer tag: %s status: %d\n", printable(tag).c_str(), resultWait); } // Check that backup is restorable - else { - if(!restorable) { - TraceEvent("BARW_NotRestorable", randomID).detail("LogUid", logUid).detail("BackupTag", printable(tag)) - .detail("BackupFolder", lastBackupContainer->getURL()).detail("WaitStatus", resultWait); - printf("BackupCorrectnessNotRestorable: tag: %s\n", printable(tag).c_str()); - } + else if(!restorable) { + TraceEvent(SevError, "BARW_NotRestorable", randomID).detail("LastBackupUID", lastBackupUID).detail("BackupTag", printable(tag)) + .detail("BackupFolder", lastBackupContainer->getURL()).detail("WaitStatus", resultWait); + printf("BackupCorrectnessNotRestorable: tag: %s\n", printable(tag).c_str()); } // Abort the backup, if not the first backup because the second backup may have aborted the backup by now diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index 4ef3df7207..047aee1cac 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp +++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -630,7 +630,18 @@ struct ConsistencyCheckWorkload : TestWorkload //In a quiescent database, check that the team size is the same as the desired team size if(self->firstClient && self->performQuiescentChecks && sourceStorageServers.size() != configuration.usableRegions*configuration.storageTeamSize) { - TraceEvent("ConsistencyCheck_InvalidTeamSize").detail("ShardBegin", printable(range.begin)).detail("ShardEnd", printable(range.end)).detail("TeamSize", sourceStorageServers.size()).detail("DesiredTeamSize", configuration.storageTeamSize); + TraceEvent("ConsistencyCheck_InvalidTeamSize") + 
.detail("ShardBegin", printable(range.begin)) + .detail("ShardEnd", printable(range.end)) + .detail("SourceTeamSize", sourceStorageServers.size()) + .detail("DestServerSize", destStorageServers.size()) + .detail("ConfigStorageTeamSize", configuration.storageTeamSize) + .detail("UsableRegions", configuration.usableRegions); + // Record the server reponsible for the problematic shards + int i = 0; + for (auto& id : sourceStorageServers) { + TraceEvent("IncorrectSizeTeamInfo").detail("ServerUID", id).detail("TeamIndex", i++); + } self->testFailure("Invalid team size"); return false; } @@ -1070,7 +1081,10 @@ struct ConsistencyCheckWorkload : TestWorkload } } if( !found ) { - TraceEvent("ConsistencyCheck_NoStorage").detail("Address", workers[i].first.address()); + TraceEvent("ConsistencyCheck_NoStorage") + .detail("Address", workers[i].first.address()) + .detail("ProcessClassEqualToStorageClass", + (int)(workers[i].second == ProcessClass::StorageClass)); missingStorage.insert(workers[i].first.locality.dcId()); } } diff --git a/fdbserver/workloads/Ping.actor.cpp b/fdbserver/workloads/Ping.actor.cpp index 8a65fb4c1c..f61f8b1aec 100644 --- a/fdbserver/workloads/Ping.actor.cpp +++ b/fdbserver/workloads/Ping.actor.cpp @@ -34,7 +34,7 @@ struct PingWorkloadInterface { template void serialize( Ar& ar ) { - ar & payloadPing; + serializer(ar, payloadPing); } }; diff --git a/fdbserver/workloads/RemoveServersSafely.actor.cpp b/fdbserver/workloads/RemoveServersSafely.actor.cpp index c870f0e1e2..095a910737 100644 --- a/fdbserver/workloads/RemoveServersSafely.actor.cpp +++ b/fdbserver/workloads/RemoveServersSafely.actor.cpp @@ -247,6 +247,7 @@ struct RemoveServersSafelyWorkload : TestWorkload { int randomIndex; bool bCanKillProcess; ISimulator::ProcessInfo* randomProcess; + for (int killsLeft = killProcArray.size(); killsLeft > 0; killsLeft --) { // Select a random kill process @@ -267,9 +268,11 @@ struct RemoveServersSafelyWorkload : TestWorkload { if (bCanKillProcess) { killableProcesses.push_back(randomProcess); killableAddrs.push_back(AddressExclusion(randomProcess->address.ip, randomProcess->address.port)); - TraceEvent("RemoveAndKill").detail("Step", "identifyVictim") - .detail("VictimCount", killableAddrs.size()).detail("Victim",randomProcess->toString()) - .detail("Victims", describe(killableAddrs)); + TraceEvent("RemoveAndKill") + .detail("Step", "identifyVictim") + .detail("VictimCount", killableAddrs.size()) + .detail("Victim", randomProcess->toString()) + .detail("Victims", describe(killableAddrs)); } // Move the process to the keep array else { diff --git a/fdbserver/workloads/Sideband.actor.cpp b/fdbserver/workloads/Sideband.actor.cpp index 461d66f664..87b8d278cb 100644 --- a/fdbserver/workloads/Sideband.actor.cpp +++ b/fdbserver/workloads/Sideband.actor.cpp @@ -32,7 +32,7 @@ struct SidebandMessage { template void serialize( Ar& ar ) { - ar & key & commitVersion; + serializer(ar, key, commitVersion); } }; @@ -43,7 +43,7 @@ struct SidebandInterface { template void serialize( Ar& ar ) { - ar & updates; + serializer(ar, updates); } }; diff --git a/flow/Arena.h b/flow/Arena.h index 1b72fc088c..65399f6acb 100644 --- a/flow/Arena.h +++ b/flow/Arena.h @@ -375,7 +375,7 @@ public: //T tmp; //ar >> tmp; //*this = tmp; - ar & (*(T*)this) & arena(); + serializer(ar, (*(T*)this), arena()); } /*static Standalone fakeStandalone( const T& t ) { diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt new file mode 100644 index 0000000000..35ad067a99 --- /dev/null +++ b/flow/CMakeLists.txt @@ -0,0 +1,97 @@ 
+find_package(Threads REQUIRED) + +set(FLOW_SRCS + ActorCollection.actor.cpp + ActorCollection.h + Arena.h + AsioReactor.h + CompressedInt.actor.cpp + CompressedInt.h + Deque.cpp + Deque.h + DeterministicRandom.h + Error.cpp + Error.h + EventTypes.actor.h + FastAlloc.cpp + FastAlloc.h + FastRef.h + FaultInjection.cpp + FaultInjection.h + FileTraceLogWriter.cpp + FileTraceLogWriter.h + Hash3.c + Hash3.h + IDispatched.h + IRandom.h + IThreadPool.cpp + IThreadPool.h + IndexedSet.actor.h + IndexedSet.cpp + IndexedSet.h + JsonTraceLogFormatter.cpp + JsonTraceLogFormatter.h + Knobs.cpp + Knobs.h + MetricSample.h + Net2.actor.cpp + Net2Packet.cpp + Net2Packet.h + Platform.cpp + Platform.h + Profiler.actor.cpp + Profiler.h + SignalSafeUnwind.cpp + SignalSafeUnwind.h + SimpleOpt.h + Stats.actor.cpp + Stats.h + SystemMonitor.cpp + SystemMonitor.h + TDMetric.actor.h + TDMetric.cpp + ThreadHelper.actor.h + ThreadHelper.cpp + ThreadPrimitives.cpp + ThreadPrimitives.h + ThreadSafeQueue.h + Trace.cpp + Trace.h + UnitTest.cpp + UnitTest.h + XmlTraceLogFormatter.h + XmlTraceLogFormatter.cpp + actorcompiler.h + boost.cpp + error_definitions.h + flow.cpp + flow.h + genericactors.actor.cpp + genericactors.actor.h + network.cpp + network.h + serialize.h + stacktrace.amalgamation.cpp + stacktrace.h + version.cpp) + +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/hgVersion.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/hgVersion.h) + +actor_set(FLOW_BUILD "${FLOW_SRCS}") +add_library(flow STATIC ${FLOW_BUILD}) +actor_compile(flow "${FLOW_SRCS}") +target_include_directories(flow SYSTEM PUBLIC ${CMAKE_THREAD_LIBS_INIT}) +target_include_directories(flow PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) +if (NOT APPLE) + set (FLOW_LIBS ${FLOW_LIBS} rt) +endif() +target_link_libraries(flow PRIVATE ${FLOW_LIBS}) +target_link_libraries(flow PUBLIC boost_target Threads::Threads ${CMAKE_DL_LIBS}) +target_compile_definitions(flow PUBLIC TLS_DISABLED) + +if(APPLE) + find_library(IO_KIT IOKit) + find_library(CORE_FOUNDATION CoreFoundation) + target_link_libraries(flow PRIVATE ${IO_KIT} ${CORE_FOUNDATION}) +endif() + diff --git a/flow/CompressedInt.h b/flow/CompressedInt.h index 97cfc0d486..e473f0bf80 100644 --- a/flow/CompressedInt.h +++ b/flow/CompressedInt.h @@ -51,7 +51,7 @@ struct CompressedInt { template void serialize(Ar &ar) { if(ar.isDeserializing) { uint8_t b; - ar & b; + serializer(ar, b); int bytesToRead = 0; // Additional bytes to read after the required first byte bool positive = (b & 0x80) != 0; // Sign bit if(!positive) @@ -62,7 +62,7 @@ struct CompressedInt { // Scan the unary len bits across multiple bytes if needed while(1) { if(hb == 0) { // Go to next byte if needed - ar & b; // Read byte + serializer(ar, b); // Read byte if(!positive) b = ~b; // Negative, so invert bytes read @@ -78,7 +78,7 @@ struct CompressedInt { value = b; // b contains the highest byte of value while(bytesToRead-- != 0) { - ar & b; // Read byte + serializer(ar, b); // Read byte if(!positive) b = ~b; // Negative, so invert bytes read value <<= 8; // Shift value up to make room for new byte diff --git a/flow/Error.cpp b/flow/Error.cpp index 1222f8842f..6a6c31e9ff 100644 --- a/flow/Error.cpp +++ b/flow/Error.cpp @@ -115,5 +115,5 @@ void ErrorCodeTable::addCode(int code, const char *name, const char *description } bool isAssertDisabled(int line) { - return FLOW_KNOBS->DISABLE_ASSERTS == -1 || FLOW_KNOBS->DISABLE_ASSERTS == line; + return FLOW_KNOBS && (FLOW_KNOBS->DISABLE_ASSERTS == -1 || FLOW_KNOBS->DISABLE_ASSERTS == line); } diff 
--git a/flow/Error.h b/flow/Error.h index 4d52679586..545fccb78a 100644 --- a/flow/Error.h +++ b/flow/Error.h @@ -48,7 +48,7 @@ public: template void serialize( Ar& ar ) { - ar & error_code; + serializer(ar, error_code); } Error() : error_code(invalid_error_code), flags(0) {} @@ -68,6 +68,8 @@ private: enum Flags { FLAG_INJECTED_FAULT=1 }; }; +Error systemErrorCodeToError(); + #undef ERROR #define ERROR(name, number, description) inline Error name() { return Error( number ); }; enum { error_code_##name = number }; #include "error_definitions.h" diff --git a/flow/FastAlloc.cpp b/flow/FastAlloc.cpp index d0e59440cf..4f9adf70de 100644 --- a/flow/FastAlloc.cpp +++ b/flow/FastAlloc.cpp @@ -61,13 +61,16 @@ template INIT_SEG thread_local typename FastAllocator::ThreadData FastAllocator::threadData; +template +thread_local bool FastAllocator::threadInitialized = false; + #ifdef VALGRIND template unsigned long FastAllocator::vLock = 1; #endif template -void* FastAllocator::freelist = 0; +void* FastAllocator::freelist = nullptr; typedef void (*ThreadInitFunction)(); @@ -118,32 +121,30 @@ void recordAllocation( void *ptr, size_t size ) { #error Instrumentation not supported on this platform #endif + uint32_t a = 0, b = 0; if( nptrs > 0 ) { - uint32_t a = 0, b = 0; hashlittle2( buffer, nptrs * sizeof(void *), &a, &b ); - - { - double countDelta = std::max(1.0, ((double)SAMPLE_BYTES) / size); - size_t sizeDelta = std::max(SAMPLE_BYTES, size); - ThreadSpinLockHolder holder( memLock ); - auto it = backTraceLookup.find( a ); - if( it == backTraceLookup.end() ) { - auto& bt = backTraceLookup[ a ]; - bt.backTrace = new std::vector(); - for (int j = 0; j < nptrs; j++) { - bt.backTrace->push_back( buffer[j] ); - } - bt.totalSize = sizeDelta; - bt.count = countDelta; - bt.sampleCount = 1; - } else { - it->second.totalSize += sizeDelta; - it->second.count += countDelta; - it->second.sampleCount++; - } - memSample[(int64_t)ptr] = std::make_pair(a, size); - } } + + double countDelta = std::max(1.0, ((double)SAMPLE_BYTES) / size); + size_t sizeDelta = std::max(SAMPLE_BYTES, size); + ThreadSpinLockHolder holder( memLock ); + auto it = backTraceLookup.find( a ); + if( it == backTraceLookup.end() ) { + auto& bt = backTraceLookup[ a ]; + bt.backTrace = new std::vector(); + for (int j = 0; j < nptrs; j++) { + bt.backTrace->push_back( buffer[j] ); + } + bt.totalSize = sizeDelta; + bt.count = countDelta; + bt.sampleCount = 1; + } else { + it->second.totalSize += sizeDelta; + it->second.count += countDelta; + it->second.sampleCount++; + } + memSample[(int64_t)ptr] = std::make_pair(a, size); } memSample_entered = false; #endif @@ -188,20 +189,28 @@ struct FastAllocator::GlobalData { CRITICAL_SECTION mutex; std::vector magazines; // These magazines are always exactly magazine_size ("full") std::vector> partial_magazines; // Magazines that are not "full" and their counts. Only created by releaseThreadMagazines(). 
- long long memoryUsed; - GlobalData() : memoryUsed(0) { + long long totalMemory; + long long partialMagazineUnallocatedMemory; + long long activeThreads; + GlobalData() : totalMemory(0), partialMagazineUnallocatedMemory(0), activeThreads(0) { InitializeCriticalSection(&mutex); } }; template -long long FastAllocator::getMemoryUsed() { - return globalData()->memoryUsed; +long long FastAllocator::getTotalMemory() { + return globalData()->totalMemory; +} + +// This does not include memory held by various threads that's available for allocation +template +long long FastAllocator::getApproximateMemoryUnused() { + return globalData()->magazines.size() * magazine_size * Size + globalData()->partialMagazineUnallocatedMemory; } template -long long FastAllocator::getMemoryUnused() { - return globalData()->magazines.size() * magazine_size * Size; +long long FastAllocator::getActiveThreads() { + return globalData()->activeThreads; } static int64_t getSizeCode(int i) { @@ -221,15 +230,21 @@ static int64_t getSizeCode(int i) { template void *FastAllocator::allocate() { + if(!threadInitialized) { + initThread(); + } + #if FASTALLOC_THREAD_SAFE ThreadData& thr = threadData; if (!thr.freelist) { + ASSERT(thr.count == 0); if (thr.alternate) { thr.freelist = thr.alternate; - thr.alternate = 0; + thr.alternate = nullptr; thr.count = magazine_size; - } else + } else { getMagazine(); + } } --thr.count; void* p = thr.freelist; @@ -237,6 +252,7 @@ void *FastAllocator::allocate() { VALGRIND_MAKE_MEM_DEFINED(p, sizeof(void*)); #endif thr.freelist = *(void**)p; + ASSERT(!thr.freelist == (thr.count == 0)); // freelist is empty if and only if count is 0 //check( p, true ); #else void* p = freelist; @@ -257,15 +273,22 @@ void *FastAllocator::allocate() { template void FastAllocator::release(void *ptr) { + if(!threadInitialized) { + initThread(); + } + #if FASTALLOC_THREAD_SAFE ThreadData& thr = threadData; if (thr.count == magazine_size) { if (thr.alternate) // Two full magazines, return one releaseMagazine( thr.alternate ); thr.alternate = thr.freelist; - thr.freelist = 0; + thr.freelist = nullptr; thr.count = 0; } + + ASSERT(!thr.freelist == (thr.count == 0)); // freelist is empty if and only if count is 0 + ++thr.count; *(void**)ptr = thr.freelist; //check(ptr, false); @@ -334,9 +357,27 @@ void FastAllocator::check(void* ptr, bool alloc) { #endif } +template +void FastAllocator::initThread() { + threadInitialized = true; + if (threadInitFunction) { + threadInitFunction(); + } + + EnterCriticalSection(&globalData()->mutex); + ++globalData()->activeThreads; + LeaveCriticalSection(&globalData()->mutex); + + threadData.freelist = nullptr; + threadData.alternate = nullptr; + threadData.count = 0; +} + template void FastAllocator::getMagazine() { - if (threadInitFunction) threadInitFunction(); + ASSERT(threadInitialized); + ASSERT(!threadData.freelist && !threadData.alternate && threadData.count == 0); + EnterCriticalSection(&globalData()->mutex); if (globalData()->magazines.size()) { void* m = globalData()->magazines.back(); @@ -348,12 +389,13 @@ void FastAllocator::getMagazine() { } else if (globalData()->partial_magazines.size()) { std::pair p = globalData()->partial_magazines.back(); globalData()->partial_magazines.pop_back(); + globalData()->partialMagazineUnallocatedMemory -= p.first * Size; LeaveCriticalSection(&globalData()->mutex); threadData.freelist = p.second; threadData.count = p.first; return; } - globalData()->memoryUsed += magazine_size*Size; + globalData()->totalMemory += magazine_size*Size; 
LeaveCriticalSection(&globalData()->mutex); // Allocate a new page of data from the system allocator @@ -361,7 +403,7 @@ void FastAllocator::getMagazine() { interlockedIncrement(&pageCount); #endif - void** block = 0; + void** block = nullptr; #if FAST_ALLOCATOR_DEBUG #ifdef WIN32 static int alt = 0; alt++; @@ -386,30 +428,42 @@ void FastAllocator::getMagazine() { check( &block[i*PSize], false ); } - block[(magazine_size-1)*PSize+1] = block[(magazine_size-1)*PSize] = 0; + block[(magazine_size-1)*PSize+1] = block[(magazine_size-1)*PSize] = nullptr; check( &block[(magazine_size-1)*PSize], false ); threadData.freelist = block; threadData.count = magazine_size; } template void FastAllocator::releaseMagazine(void* mag) { + ASSERT(threadInitialized); EnterCriticalSection(&globalData()->mutex); globalData()->magazines.push_back(mag); LeaveCriticalSection(&globalData()->mutex); } template void FastAllocator::releaseThreadMagazines() { - ThreadData& thr = threadData; + if(threadInitialized) { + threadInitialized = false; + ThreadData& thr = threadData; - if (thr.freelist || thr.alternate) { EnterCriticalSection(&globalData()->mutex); - if (thr.freelist) globalData()->partial_magazines.push_back( std::make_pair(thr.count, thr.freelist) ); - if (thr.alternate) globalData()->magazines.push_back(thr.alternate); + if (thr.freelist || thr.alternate) { + if (thr.freelist) { + ASSERT(thr.count > 0 && thr.count <= magazine_size); + globalData()->partial_magazines.push_back( std::make_pair(thr.count, thr.freelist) ); + globalData()->partialMagazineUnallocatedMemory += thr.count * Size; + } + if (thr.alternate) { + globalData()->magazines.push_back(thr.alternate); + } + } + --globalData()->activeThreads; LeaveCriticalSection(&globalData()->mutex); + + thr.count = 0; + thr.alternate = nullptr; + thr.freelist = nullptr; } - thr.count = 0; - thr.alternate = 0; - thr.freelist = 0; } void releaseAllThreadMagazines() { @@ -427,15 +481,15 @@ void releaseAllThreadMagazines() { int64_t getTotalUnusedAllocatedMemory() { int64_t unusedMemory = 0; - unusedMemory += FastAllocator<16>::getMemoryUnused(); - unusedMemory += FastAllocator<32>::getMemoryUnused(); - unusedMemory += FastAllocator<64>::getMemoryUnused(); - unusedMemory += FastAllocator<128>::getMemoryUnused(); - unusedMemory += FastAllocator<256>::getMemoryUnused(); - unusedMemory += FastAllocator<512>::getMemoryUnused(); - unusedMemory += FastAllocator<1024>::getMemoryUnused(); - unusedMemory += FastAllocator<2048>::getMemoryUnused(); - unusedMemory += FastAllocator<4096>::getMemoryUnused(); + unusedMemory += FastAllocator<16>::getApproximateMemoryUnused(); + unusedMemory += FastAllocator<32>::getApproximateMemoryUnused(); + unusedMemory += FastAllocator<64>::getApproximateMemoryUnused(); + unusedMemory += FastAllocator<128>::getApproximateMemoryUnused(); + unusedMemory += FastAllocator<256>::getApproximateMemoryUnused(); + unusedMemory += FastAllocator<512>::getApproximateMemoryUnused(); + unusedMemory += FastAllocator<1024>::getApproximateMemoryUnused(); + unusedMemory += FastAllocator<2048>::getApproximateMemoryUnused(); + unusedMemory += FastAllocator<4096>::getApproximateMemoryUnused(); return unusedMemory; } diff --git a/flow/FastAlloc.h b/flow/FastAlloc.h index 7aa450adce..06f35711c3 100644 --- a/flow/FastAlloc.h +++ b/flow/FastAlloc.h @@ -106,8 +106,9 @@ public: static void release(void* ptr); static void check( void* ptr, bool alloc ); - static long long getMemoryUsed(); - static long long getMemoryUnused(); + static long long getTotalMemory(); + static 
long long getApproximateMemoryUnused(); + static long long getActiveThreads(); static void releaseThreadMagazines(); @@ -129,6 +130,7 @@ private: void* alternate; // alternate is either a full magazine, or an empty one }; static thread_local ThreadData threadData; + static thread_local bool threadInitialized; static GlobalData* globalData() { #ifdef VALGRIND ANNOTATE_RWLOCK_ACQUIRED(vLock, 1); @@ -144,7 +146,8 @@ private: static void* freelist; FastAllocator(); // not implemented - static void getMagazine(); // sets threadData.freelist and threadData.count + static void initThread(); + static void getMagazine(); static void releaseMagazine(void*); }; diff --git a/flow/IRandom.h b/flow/IRandom.h index e4a6c65a6f..c8362e23ef 100644 --- a/flow/IRandom.h +++ b/flow/IRandom.h @@ -51,7 +51,7 @@ public: template void serialize_unversioned(Ar& ar) { // Changing this serialization format will affect key definitions, so can't simply be versioned! - ar & part[0] & part[1]; + serializer(ar, part[0], part[1]); } }; diff --git a/flow/JsonTraceLogFormatter.cpp b/flow/JsonTraceLogFormatter.cpp new file mode 100644 index 0000000000..8574f35a80 --- /dev/null +++ b/flow/JsonTraceLogFormatter.cpp @@ -0,0 +1,85 @@ +/* + * JsonTraceLogFormatter.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flow/flow.h" +#include "flow/JsonTraceLogFormatter.h" + +#include + +void JsonTraceLogFormatter::addref() { + ReferenceCounted::addref(); +} + +void JsonTraceLogFormatter::delref() { + ReferenceCounted::delref(); +} + +const char* JsonTraceLogFormatter::getExtension() { + return "json"; +} + +const char* JsonTraceLogFormatter::getHeader() { + return ""; +} + +const char* JsonTraceLogFormatter::getFooter() { + return ""; +} + +namespace { + +void escapeString(std::stringstream& ss, const std::string& source) { + for (auto c : source) { + if (c == '"') { + ss << "\\\""; + } else if (c == '\\') { + ss << "\\\\"; + } else if (c == '\n') { + ss << "\\n"; + } else if (c == '\r') { + ss << "\\r"; + } else if (isprint(c)) { + ss << c; + } else { + constexpr char hex[] = "0123456789abcdef"; + int x = int{ static_cast(c) }; + ss << "\\x" << hex[x / 16] << hex[x % 16]; + } + } +} + +} // namespace + +std::string JsonTraceLogFormatter::formatEvent(const TraceEventFields& fields) { + std::stringstream ss; + ss << "{ "; + for (auto iter = fields.begin(); iter != fields.end(); ++iter) { + if (iter != fields.begin()) { + ss << ", "; + } + ss << "\""; + escapeString(ss, iter->first); + ss << "\": \""; + escapeString(ss, iter->second); + ss << "\""; + } + ss << " }\r\n"; + return ss.str(); +} diff --git a/flow/JsonTraceLogFormatter.h b/flow/JsonTraceLogFormatter.h new file mode 100644 index 0000000000..78ce3bb276 --- /dev/null +++ b/flow/JsonTraceLogFormatter.h @@ -0,0 +1,32 @@ +/* + * JsonTraceLogFormatter.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flow/FastRef.h" +#include "flow/Trace.h" + +struct JsonTraceLogFormatter : public ITraceLogFormatter, ReferenceCounted { + const char* getExtension() override; + const char* getHeader() override; // Called when starting a new file + const char* getFooter() override; // Called when ending a file + std::string formatEvent(const TraceEventFields&) override; // Called for each event + + void addref() override; + void delref() override; +}; diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp index 9d7034e1e2..98f472a0bc 100644 --- a/flow/Knobs.cpp +++ b/flow/Knobs.cpp @@ -20,6 +20,7 @@ #include "flow/Knobs.h" #include "flow/flow.h" +#include FlowKnobs const* FLOW_KNOBS = new FlowKnobs(); diff --git a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp index 267be58a8f..15ca0ba9b4 100644 --- a/flow/Net2.actor.cpp +++ b/flow/Net2.actor.cpp @@ -164,7 +164,6 @@ public: ASIOReactor reactor; INetworkConnections *network; // initially this, but can be changed - tcp::resolver tcpResolver; int64_t tsc_begin, tsc_end; double taskBegin; @@ -484,7 +483,6 @@ Net2::Net2(NetworkAddress localAddress, bool useThreadPool, bool useMetrics) : useThreadPool(useThreadPool), network(this), reactor(this), - tcpResolver(reactor.ios), stopped(false), tasksIssued(0), // Until run() is called, yield() will always yield @@ -841,11 +839,13 @@ Future< Reference > Net2::connect( NetworkAddress toAddr, std::stri } ACTOR static Future> resolveTCPEndpoint_impl( Net2 *self, std::string host, std::string service) { - Promise> result; + state tcp::resolver tcpResolver(self->reactor.ios); + Promise> promise; + state Future> result = promise.getFuture(); - self->tcpResolver.async_resolve(tcp::resolver::query(host, service), [=](const boost::system::error_code &ec, tcp::resolver::iterator iter) { + tcpResolver.async_resolve(tcp::resolver::query(host, service), [=](const boost::system::error_code &ec, tcp::resolver::iterator iter) { if(ec) { - result.sendError(lookup_failed()); + promise.sendError(lookup_failed()); return; } @@ -853,18 +853,27 @@ ACTOR static Future> resolveTCPEndpoint_impl( Net2 * tcp::resolver::iterator end; while(iter != end) { - // The easiest way to get an ip:port formatted endpoint with this interface is with a string stream because - // endpoint::to_string doesn't exist but operator<< does. 
- std::stringstream s; - s << iter->endpoint(); - addrs.push_back(NetworkAddress::parse(s.str())); + auto endpoint = iter->endpoint(); + // Currently only ipv4 is supported by NetworkAddress + auto addr = endpoint.address(); + if(addr.is_v4()) { + addrs.push_back(NetworkAddress(addr.to_v4().to_ulong(), endpoint.port())); + } ++iter; } - result.send(addrs); + + if(addrs.empty()) { + promise.sendError(lookup_failed()); + } + else { + promise.send(addrs); + } }); - std::vector addresses = wait(result.getFuture()); - return addresses; + wait(ready(result)); + tcpResolver.cancel(); + + return result.get(); } Future> Net2::resolveTCPEndpoint( std::string host, std::string service) { @@ -1014,7 +1023,7 @@ struct TestGVR { template void serialize( Ar& ar ) { - ar & key & version & debugID & reply; + serializer(ar, key, version, debugID, reply); } }; @@ -1094,7 +1103,7 @@ void net2_test() { Endpoint destination; - printf(" Used: %lld\n", FastAllocator<4096>::getMemoryUsed()); + printf(" Used: %lld\n", FastAllocator<4096>::getTotalMemory()); char junk[100]; @@ -1144,6 +1153,6 @@ void net2_test() { printf("SimSend x 1Kx10K: %0.2f sec\n", timer()-before); printf(" Bytes: %d\n", totalBytes); - printf(" Used: %lld\n", FastAllocator<4096>::getMemoryUsed()); + printf(" Used: %lld\n", FastAllocator<4096>::getTotalMemory()); */ }; diff --git a/flow/Platform.cpp b/flow/Platform.cpp index b1f283f9ed..43fdd1ea17 100644 --- a/flow/Platform.cpp +++ b/flow/Platform.cpp @@ -98,6 +98,8 @@ #include /* Needed for crash handler */ #include +/* Needed for gnu_dev_{major,minor} */ +#include #endif #ifdef __APPLE__ @@ -432,22 +434,40 @@ void getMachineRAMInfo(MachineRAMInfo& memInfo) { #endif } +Error systemErrorCodeToError() { +#if defined(_WIN32) + if(GetLastError() == ERROR_IO_DEVICE) { + return io_error(); + } +#elif defined(__unixish__) + if(errno == EIO || errno == EROFS) { + return io_error(); + } +#else + #error Port me! 
+#endif + + return platform_error(); +} + void getDiskBytes(std::string const& directory, int64_t& free, int64_t& total) { INJECT_FAULT( platform_error, "getDiskBytes" ); #if defined(__unixish__) #ifdef __linux__ struct statvfs buf; if (statvfs(directory.c_str(), &buf)) { - TraceEvent(SevError, "GetDiskBytesStatvfsError").detail("Directory", directory).GetLastError(); - throw platform_error(); + Error e = systemErrorCodeToError(); + TraceEvent(SevError, "GetDiskBytesStatvfsError").detail("Directory", directory).GetLastError().error(e); + throw e; } uint64_t blockSize = buf.f_frsize; #elif defined(__APPLE__) struct statfs buf; if (statfs(directory.c_str(), &buf)) { - TraceEvent(SevError, "GetDiskBytesStatfsError").detail("Directory", directory).GetLastError(); - throw platform_error(); + Error e = systemErrorCodeToError(); + TraceEvent(SevError, "GetDiskBytesStatfsError").detail("Directory", directory).GetLastError().error(e); + throw e; } uint64_t blockSize = buf.f_bsize; @@ -466,7 +486,9 @@ void getDiskBytes(std::string const& directory, int64_t& free, int64_t& total) { ULARGE_INTEGER totalSpace; ULARGE_INTEGER totalFreeSpace; if( !GetDiskFreeSpaceEx( fullPath.c_str(), &freeSpace, &totalSpace, &totalFreeSpace ) ) { - TraceEvent(SevError, "DiskFreeError").detail("Path", fullPath).GetLastError(); + Error e = systemErrorCodeToError(); + TraceEvent(SevError, "DiskFreeError").detail("Path", fullPath).GetLastError().error(e); + throw e; } total = std::min( (uint64_t) std::numeric_limits::max(), totalSpace.QuadPart ); free = std::min( (uint64_t) std::numeric_limits::max(), freeSpace.QuadPart ); @@ -623,7 +645,7 @@ void getDiskStatistics(std::string const& directory, uint64_t& currentIOs, uint6 unsigned int minorId; disk_stream >> majorId; disk_stream >> minorId; - if(majorId == (unsigned int) major(buf.st_dev) && minorId == (unsigned int) minor(buf.st_dev)) { + if(majorId == (unsigned int) gnu_dev_major(buf.st_dev) && minorId == (unsigned int) gnu_dev_minor(buf.st_dev)) { std::string ignore; uint64_t rd_ios; /* # of reads completed */ // This is the total number of reads completed successfully. @@ -814,8 +836,9 @@ void getDiskStatistics(std::string const& directory, uint64_t& currentIOs, uint6 struct statfs buf; if (statfs(directory.c_str(), &buf)) { - TraceEvent(SevError, "GetDiskStatisticsStatfsError").detail("Directory", directory).GetLastError(); - throw platform_error(); + Error e = systemErrorCodeToError(); + TraceEvent(SevError, "GetDiskStatisticsStatfsError").detail("Directory", directory).GetLastError().error(e); + throw e; } const char* dev = strrchr(buf.f_mntfromname, '/'); @@ -1349,12 +1372,12 @@ void getLocalTime(const time_t *timep, struct tm *result) { #ifdef _WIN32 if(localtime_s(result, timep) != 0) { TraceEvent(SevError, "GetLocalTimeError").GetLastError(); - throw platform_error; + throw platform_error(); } #elif defined(__unixish__) if(localtime_r(timep, result) == NULL) { TraceEvent(SevError, "GetLocalTimeError").GetLastError(); - throw platform_error; + throw platform_error(); } #else #error Port me! @@ -1709,8 +1732,9 @@ bool deleteFile( std::string const& filename ) { #else #error Port me! 
#endif - TraceEvent(SevError, "DeleteFile").detail("Filename", filename).GetLastError(); - throw platform_error(); + Error e = systemErrorCodeToError(); + TraceEvent(SevError, "DeleteFile").detail("Filename", filename).GetLastError().error(e); + throw errno; } static void createdDirectory() { INJECT_FAULT( platform_error, "createDirectory" ); } @@ -1734,8 +1758,9 @@ bool createDirectory( std::string const& directory ) { return createDirectory( directory ); } } - TraceEvent(SevError, "CreateDirectory").detail("Directory", directory).GetLastError(); - throw platform_error(); + Error e = systemErrorCodeToError(); + TraceEvent(SevError, "CreateDirectory").detail("Directory", directory).GetLastError().error(e); + throw e; #elif (defined(__linux__) || defined(__APPLE__)) size_t sep = 0; do { @@ -1744,12 +1769,16 @@ bool createDirectory( std::string const& directory ) { if (errno == EEXIST) continue; - TraceEvent(SevError, "CreateDirectory").detail("Directory", directory).GetLastError(); - if (errno == EACCES) - throw file_not_writable(); - else { - throw platform_error(); + Error e; + if(errno == EACCES) { + e = file_not_writable(); } + else { + e = systemErrorCodeToError(); + } + + TraceEvent(SevError, "CreateDirectory").detail("Directory", directory).GetLastError().error(e); + throw e; } createdDirectory(); } while (sep != std::string::npos && sep != directory.length() - 1); @@ -1768,8 +1797,9 @@ std::string abspath( std::string const& filename ) { #ifdef _WIN32 char nameBuffer[MAX_PATH]; if (!GetFullPathName(filename.c_str(), MAX_PATH, nameBuffer, NULL)) { - TraceEvent(SevError, "AbsolutePathError").detail("Filename", filename).GetLastError(); - throw platform_error(); + Error e = systemErrorCodeToError(); + TraceEvent(SevError, "AbsolutePathError").detail("Filename", filename).GetLastError().error(e); + throw e; } // Not totally obvious from the help whether GetFullPathName canonicalizes slashes, so let's do it... for(char*x = nameBuffer; *x; x++) @@ -1789,8 +1819,9 @@ std::string abspath( std::string const& filename ) { return joinPath( abspath( "." 
), filename ); } } - TraceEvent(SevError, "AbsolutePathError").detail("Filename", filename).GetLastError(); - throw platform_error(); + Error e = systemErrorCodeToError(); + TraceEvent(SevError, "AbsolutePathError").detail("Filename", filename).GetLastError().error(e); + throw e; } return std::string(r); #else @@ -2033,6 +2064,19 @@ bool fileExists(std::string const& filename) { return true; } +bool directoryExists(std::string const& path) { +#ifdef _WIN32 + DWORD bits = ::GetFileAttributes(path.c_str()); + return bits != INVALID_FILE_ATTRIBUTES && (bits & FILE_ATTRIBUTE_DIRECTORY); +#else + DIR *d = opendir(path.c_str()); + if(d == nullptr) + return false; + closedir(d); + return true; +#endif +} + int64_t fileSize(std::string const& filename) { #ifdef _WIN32 struct _stati64 file_status; @@ -2190,7 +2234,7 @@ std::string getDefaultPluginPath( const char* plugin_name ) { }; // namespace platform #ifdef ALLOC_INSTRUMENTATION -#define TRACEALLOCATOR( size ) TraceEvent("MemSample").detail("Count", FastAllocator::getMemoryUnused()/size).detail("TotalSize", FastAllocator::getMemoryUnused()).detail("SampleCount", 1).detail("Hash", "FastAllocatedUnused" #size ).detail("Bt", "na") +#define TRACEALLOCATOR( size ) TraceEvent("MemSample").detail("Count", FastAllocator::getApproximateMemoryUnused()/size).detail("TotalSize", FastAllocator::getApproximateMemoryUnused()).detail("SampleCount", 1).detail("Hash", "FastAllocatedUnused" #size ).detail("Bt", "na") #ifdef __linux__ #include #endif diff --git a/flow/Platform.h b/flow/Platform.h index d0039ff3a5..56b6fa9cdf 100644 --- a/flow/Platform.h +++ b/flow/Platform.h @@ -291,6 +291,9 @@ void threadYield(); // Attempt to yield to other processes or threads // Returns true iff the file exists bool fileExists(std::string const& filename); +// Returns true iff the directory exists +bool directoryExists(std::string const& path); + // Returns size of file in bytes int64_t fileSize(std::string const& filename); diff --git a/flow/README.md b/flow/README.md index b42b137c85..50677ab0ed 100644 --- a/flow/README.md +++ b/flow/README.md @@ -240,7 +240,7 @@ struct CountingServerInterface { // serialization code required for use on a network template void serialize( Ar& ar ) { - ar & addCount & subtractCount & getCount; + serializer(ar, addCount, subtractCount, getCount); } }; ``` diff --git a/flow/SystemMonitor.cpp b/flow/SystemMonitor.cpp index 187b7405c8..80053b3262 100644 --- a/flow/SystemMonitor.cpp +++ b/flow/SystemMonitor.cpp @@ -41,8 +41,8 @@ void systemMonitor() { customSystemMonitor("ProcessMetrics", &statState, true ); } -#define TRACEALLOCATOR( size ) TraceEvent("MemSample").detail("Count", FastAllocator::getMemoryUnused()/size).detail("TotalSize", FastAllocator::getMemoryUnused()).detail("SampleCount", 1).detail("Hash", "FastAllocatedUnused" #size ).detail("Bt", "na") -#define DETAILALLOCATORMEMUSAGE( size ) detail("AllocatedMemory"#size, FastAllocator::getMemoryUsed()).detail("ApproximateUnusedMemory"#size, FastAllocator::getMemoryUnused()) +#define TRACEALLOCATOR( size ) TraceEvent("MemSample").detail("Count", FastAllocator::getApproximateMemoryUnused()/size).detail("TotalSize", FastAllocator::getApproximateMemoryUnused()).detail("SampleCount", 1).detail("Hash", "FastAllocatedUnused" #size ).detail("Bt", "na") +#define DETAILALLOCATORMEMUSAGE( size ) detail("TotalMemory"#size, FastAllocator::getTotalMemory()).detail("ApproximateUnusedMemory"#size, FastAllocator::getApproximateMemoryUnused()).detail("ActiveThreads"#size, FastAllocator::getActiveThreads()) 
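The flow/README.md hunk above shows the serialization migration applied throughout this patch: the old `ar & a & b & c` operator chain is replaced by the variadic `serializer()` helper added to flow/serialize.h later in this diff. A minimal sketch of a struct written against the new helper (the struct, its fields, and the surrounding includes are illustrative only, not part of the patch):

```
// Illustrative only: a hypothetical message struct using the new variadic serializer().
#include <cstdint>
#include <string>
#include "flow/serialize.h" // provides serializer(ar, items...) after this patch

struct CounterSnapshot {
	int64_t count;        // running counter value
	double lastUpdate;    // timestamp of the last update
	std::string owner;    // id of the process that produced the snapshot

	template <class Ar>
	void serialize(Ar& ar) {
		// Old style: ar & count & lastUpdate & owner;
		// New style: one variadic call that visits each field in order.
		serializer(ar, count, lastUpdate, owner);
	}
};
```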
SystemStatistics customSystemMonitor(std::string eventName, StatisticsState *statState, bool machineMetrics) { SystemStatistics currentStats = getSystemStatistics(machineState.folder.present() ? machineState.folder.get() : "", diff --git a/flow/TDMetric.actor.h b/flow/TDMetric.actor.h index 98d1922382..33bf512fe3 100755 --- a/flow/TDMetric.actor.h +++ b/flow/TDMetric.actor.h @@ -35,6 +35,7 @@ #include "flow/CompressedInt.h" #include #include +#include #include "flow/actorcompiler.h" // This must be the last #include. struct MetricNameRef { @@ -338,9 +339,9 @@ struct FieldHeader { sum += v; } template void serialize(Ar &ar) { - ar & version; + serializer(ar, version); ASSERT(version == 1); - ar & count & sum; + serializer(ar, count, sum); } }; @@ -1126,9 +1127,9 @@ struct FieldHeader> { previous_time = v.time; } template void serialize(Ar &ar) { - ar & version; + serializer(ar, version); ASSERT(version == 1); - ar & count & area; + serializer(ar, count, area); } }; diff --git a/flow/Trace.cpp b/flow/Trace.cpp index c4ade68d3f..2a669b4ec9 100644 --- a/flow/Trace.cpp +++ b/flow/Trace.cpp @@ -22,10 +22,12 @@ #include "flow/Trace.h" #include "flow/FileTraceLogWriter.h" #include "flow/XmlTraceLogFormatter.h" +#include "flow/JsonTraceLogFormatter.h" #include "flow/flow.h" #include "flow/DeterministicRandom.h" #include #include +#include #include #include "flow/IThreadPool.h" @@ -41,6 +43,18 @@ #undef min #endif +namespace { + Reference createLogFormatter(const std::string& f) { + if (f == "json") { + return Reference(new JsonTraceLogFormatter()); + } else if (f == "xml") { + return Reference(new XmlTraceLogFormatter()); + } else { + UNREACHABLE(); + } + } +} // namespace + class DummyThreadPool : public IThreadPool, ReferenceCounted { public: ~DummyThreadPool() {} @@ -122,10 +136,10 @@ static int TRACE_LOG_MAX_PREOPEN_BUFFER = 1000000; static int TRACE_EVENT_MAX_SIZE = 4000; struct TraceLog { + Reference formatter; private: Reference logWriter; - Reference formatter; std::vector eventBuffer; int loggedLength; int bufferLength; @@ -562,6 +576,17 @@ TraceEventFields LatestEventCache::getLatestError() { static TraceLog g_traceLog; +bool selectTraceFormatter(std::string format) { + ASSERT(!g_traceLog.isOpen()); + std::transform(format.begin(), format.end(), format.begin(), ::tolower); + if (format == "xml" || format == "json") { + g_traceLog.formatter = createLogFormatter(format); + return true; + } else { + return false; + } +} + ThreadFuture flushTraceFile() { if (!g_traceLog.isOpen()) return Void(); diff --git a/flow/Trace.h b/flow/Trace.h index d8d7beadb7..03da3c36bb 100644 --- a/flow/Trace.h +++ b/flow/Trace.h @@ -262,6 +262,10 @@ void initTraceEventMetrics(); void closeTraceFile(); bool traceFileIsOpen(); +// Changes the format of trace files. Returns false if the format is unrecognized. No longer safe to call after a call +// to openTraceFile. 
+// to openTraceFile.
+bool selectTraceFormatter(std::string format); + void addTraceRole(std::string role); void removeTraceRole(std::string role); diff --git a/flow/error_definitions.h b/flow/error_definitions.h index 0858afc207..9d560e1280 100755 --- a/flow/error_definitions.h +++ b/flow/error_definitions.h @@ -102,7 +102,7 @@ ERROR( io_timeout, 1521, "A disk IO operation failed to complete in a timely man ERROR( file_corrupt, 1522, "A structurally corrupt data file was detected" ) ERROR( http_request_failed, 1523, "HTTP response code not received or indicated failure" ) ERROR( http_auth_failed, 1524, "HTTP request failed due to bad credentials" ) - +ERROR( http_bad_request_id, 1525, "HTTP response contained an unexpected X-Request-ID header" ) // 2xxx Attempt (presumably by a _client_) to do something illegal. If an error is known to // be internally caused, it should be 41xx @@ -178,6 +178,7 @@ ERROR( backup_invalid_info, 2315, "Backup Container URL invalid") ERROR( backup_cannot_expire, 2316, "Cannot expire requested data from backup without violating minimum restorability") ERROR( backup_auth_missing, 2317, "Cannot find authentication details (such as a password or secret key) for the specified Backup Container URL") ERROR( backup_auth_unreadable, 2318, "Cannot read or parse one or more sources of authentication information for Backup Container URLs") +ERROR( backup_does_not_exist, 2319, "Backup does not exist") ERROR( restore_invalid_version, 2361, "Invalid restore version") ERROR( restore_corrupted_data, 2362, "Corrupted backup data") ERROR( restore_missing_data, 2363, "Missing backup data") diff --git a/flow/flow.h b/flow/flow.h index 3118a52eeb..cf67de117b 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -170,10 +170,10 @@ public: // SOMEDAY: specialize for space efficiency? if (valid && Ar::isDeserializing) (*(T *)&value).~T(); - ar & valid; + serializer(ar, valid); if (valid) { if (Ar::isDeserializing) new (&value) T(); - ar & *(T*)&value; + serializer(ar, *(T*)&value); } } @@ -247,26 +247,13 @@ public: template void serialize(Ar& ar) { // SOMEDAY: specialize for space efficiency? - ar & error; + serializer(ar, error); if (present()) { if (Ar::isDeserializing) new (&value) T(); - ar & *(T*)&value; + serializer(ar, *(T*)&value); } } - bool operator == (ErrorOr const& o) const { - return error == o.error && (!present() || get() == o.get()); - } - bool operator != (ErrorOr const& o) const { - return !(*this == o); - } - - bool operator < (ErrorOr const& o) const { - if (error != o.error) return error < o.error; - if (!present()) return false; - return get() < o.get(); - } - bool isError() const { return error.code() != invalid_error_code; } bool isError(int code) const { return error.code() == code; } Error getError() const { ASSERT(isError()); return error; } diff --git a/flow/flow.vcxproj b/flow/flow.vcxproj index da4545142b..764cbb8981 100644 --- a/flow/flow.vcxproj +++ b/flow/flow.vcxproj @@ -20,8 +20,10 @@ + + diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 22e3060715..6d6293f46a 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -298,6 +298,16 @@ Future store(Future what, T &out) { return map(what, [&out](T const &v) { out = v; return Void(); }); } +template +Future storeOrThrow(Future> what, T &out, Error e = key_not_found()) { + return map(what, [&out,e](Optional const &o) { + if(!o.present()) + throw e; + out = o.get(); + return Void(); + }); +} + //Waits for a future to be ready, and then applies an asynchronous function to it. 
ACTOR template()(fake()).getValue() )> Future mapAsync(Future what, F actorFunc) diff --git a/flow/hgVersion.h.cmake b/flow/hgVersion.h.cmake new file mode 100644 index 0000000000..7083caa285 --- /dev/null +++ b/flow/hgVersion.h.cmake @@ -0,0 +1,2 @@ +#pragma once +#define hgVersion "${CURRENT_GIT_VERSION}" diff --git a/flow/serialize.h b/flow/serialize.h index 9b382d6296..89cb011676 100644 --- a/flow/serialize.h +++ b/flow/serialize.h @@ -62,15 +62,20 @@ inline typename Archive::READER& operator >> (Archive& ar, Item& item ) { return ar; } -template -inline typename Archive::WRITER& operator & (Archive& ar, Item& item ) { +template +void serializer(Archive& ar) {} + +template +typename Archive::WRITER& serializer(Archive& ar, const Item& item, const Items&... items) { save(ar, item); + serializer(ar, items...); return ar; } -template -inline typename Archive::READER& operator & (Archive& ar, Item& item ) { +template +typename Archive::READER& serializer(Archive& ar, Item& item, Items&... items) { load(ar, item); + serializer(ar, items...); return ar; } @@ -121,7 +126,7 @@ template class Serializer< Archive, std::pair, void > { public: static void serialize( Archive& ar, std::pair& p ) { - ar & p.first & p.second; + serializer(ar, p.first, p.second); } }; diff --git a/packaging/deb/foundationdb-server.control.in b/packaging/deb/foundationdb-server.control.in index 346bc1e13c..4b827c449c 100644 --- a/packaging/deb/foundationdb-server.control.in +++ b/packaging/deb/foundationdb-server.control.in @@ -4,7 +4,8 @@ Section: database Priority: optional Architecture: amd64 Conflicts: foundationdb (<< 0.1.4) -Depends: foundationdb-clients (= VERSION-RELEASE), adduser, libc6 (>= 2.11), python (>= 2.6) +Depends: foundationdb-clients (= VERSION-RELEASE), adduser, libc6 (>= 2.11) +Recommends: python (>= 2.6) Maintainer: FoundationDB Homepage: https://www.foundationdb.org Description: FoundationDB server diff --git a/packaging/description b/packaging/description new file mode 100644 index 0000000000..dfb33ed611 --- /dev/null +++ b/packaging/description @@ -0,0 +1,3 @@ +FoundationDB is a scalable, fault-tolerant, ordered key-value store +with full ACID transactions. + diff --git a/packaging/docker/Dockerfile b/packaging/docker/Dockerfile new file mode 100644 index 0000000000..b5cd01ee00 --- /dev/null +++ b/packaging/docker/Dockerfile @@ -0,0 +1,71 @@ +# Dockerfile +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +FROM ubuntu:18.04 + +# Install dependencies + +RUN apt-get update && \ + apt-get install -y wget=1.19.4-1ubuntu2 \ + dnsutils=1:9.11.3+dfsg-1ubuntu1.3 && \ + rm -r /var/lib/apt/lists/* + +# Install FoundationDB Binaries + +ARG FDB_VERSION +ARG FDB_WEBSITE=https://www.foundationdb.org + +WORKDIR /var/fdb/tmp +RUN wget $FDB_WEBSITE/downloads/$FDB_VERSION/linux/fdb_$FDB_VERSION.tar.gz && \ + tar -xzf fdb_$FDB_VERSION.tar.gz --strip-components=1 && \ + rm fdb_$FDB_VERSION.tar.gz && \ + chmod u+x fdb* && \ + mv fdb* /usr/bin && \ + rm -r /var/fdb/tmp + +WORKDIR /var/fdb + +# Install FoundationDB Client Libraries + +ARG FDB_ADDITIONAL_VERSIONS="5.1.7" + +COPY download_multiversion_libraries.bash scripts/ + +RUN wget $FDB_WEBSITE/downloads/$FDB_VERSION/linux/libfdb_c_$FDB_VERSION.so -O /usr/lib/libfdb_c.so && \ + bash scripts/download_multiversion_libraries.bash $FDB_WEBSITE $FDB_ADDITIONAL_VERSIONS + +# Set Up Runtime Scripts and Directoris + +COPY fdb.bash scripts/ +COPY create_server_environment.bash scripts/ +COPY create_cluster_file.bash scripts/ +RUN chmod u+x scripts/*.bash && \ + mkdir -p logs +VOLUME /var/fdb/data + +CMD /var/fdb/scripts/fdb.bash + +# Runtime Configuration Options + +ENV FDB_PORT 4500 +ENV FDB_CLUSTER_FILE /var/fdb/fdb.cluster +ENV FDB_NETWORKING_MODE container +ENV FDB_COORDINATOR "" +ENV FDB_CLUSTER_FILE_CONTENTS "" +ENV FDB_PROCESS_CLASS unset \ No newline at end of file diff --git a/packaging/docker/README.md b/packaging/docker/README.md new file mode 100644 index 0000000000..a8d6f48de8 --- /dev/null +++ b/packaging/docker/README.md @@ -0,0 +1,71 @@ +# Overview + +This directory provides a Docker image for running FoundationDB. + +The image in this directory is based on Ubuntu 18.04, but the commands and +scripts used to build it should be suitable for most other distros with small +tweaks to the installation of dependencies. + +The image relies on the following dependencies: + +* bash +* wget +* dig +* glibc + +# Build Configuration + +This image supports several build arguments for build-time configuration. + +### FDB_VERSION + +The version of FoundationDB to install in the container. This is required. + +### FDB_WEBSITE + +The base URL for the FoundationDB website. The default is +`https://www.foundationdb.org`. + +### FDB_ADDITIONAL_VERSIONS + +A list of additional client library versions to include in this image. These +libraries will be in a special multiversion library folder. + +# Runtime Configuration + +This image supports several environment variables for run-time configuration. + +### FDB_PORT + +The port that FoundationDB should bind to. The default is 4500. + +### FDB_NETWORKING_MODE + +A networking mode that controls what address FoundationDB listens on. If this +is `container` (the default), then the server will listen on its public IP +within the docker network, and will only be accessible from other containers. + +If this is `host`, then the server will listen on `127.0.0.1`, and will not be +accessible from other containers. You should use `host` networking mode if you +want to access your container from your host machine, and you should also +map the port to the same port on your host machine when you run the container. + +### FDB_COORDINATOR + +A name of another FDB instance to use as a coordinator process. This can be +helpful when setting up a larger cluster inside a docker network, for instance +when using Docker Compose. The name you provide must be resolvable through the +DNS on the container you are running. 
+ +# Copying Into Other Images + +You can also use this image to provide files for images that are clients of a +FoundationDB cluster, by using the `from` argument of the `COPY` command. Some +files you may want to copy are: + +* `/usr/lib/libfdb_c.so`: The primary FoundationDB client library +* `/usr/lib/fdb/multiversion/libfdb_*.so`: Additional versions of the client + library, which you can use if you are setting up a multiversion client. +* `/var/fdb/scripts/create_cluster_file.bash`: A script for setting up the + cluster file based on an `FDB_COORDINATOR` environment variable. +* `/usr/bin/fdbcli`: The FoundationDB CLI. \ No newline at end of file diff --git a/packaging/docker/create_cluster_file.bash b/packaging/docker/create_cluster_file.bash new file mode 100644 index 0000000000..b701b03d1a --- /dev/null +++ b/packaging/docker/create_cluster_file.bash @@ -0,0 +1,51 @@ +#! /bin/bash + +# +# create_cluster_file.bash +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This script creates a cluster file for a server or client. +# This takes the cluster file path from the FDB_CLUSTER_FILE +# environment variable, with a default of /etc/foundationdb/fdb.cluster +# +# The name of the coordinator must be defined in the FDB_COORDINATOR environment +# variable, and it must be a name that can be resolved through DNS. + +function create_cluster_file() { + FDB_CLUSTER_FILE=${FDB_CLUSTER_FILE:-/etc/foundationdb/fdb.cluster} + mkdir -p $(dirname $FDB_CLUSTER_FILE) + + if [[ -n "$FDB_CLUSTER_FILE_CONTENTS" ]]; then + echo "$FDB_CLUSTER_FILE_CONTENTS" > $FDB_CLUSTER_FILE + elif [[ -n $FDB_COORDINATOR ]]; then + coordinator_ip=$(dig +short $FDB_COORDINATOR) + if [[ -z "$coordinator_ip" ]]; then + echo "Failed to look up coordinator address for $FDB_COORDINATOR" 1>&2 + exit 1 + fi + echo "docker:docker@$coordinator_ip:4500" > $FDB_CLUSTER_FILE + else + echo "FDB_COORDINATOR environment variable not defined" 1>&2 + exit 1 + fi +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + create_cluster_file "$@" +fi \ No newline at end of file diff --git a/packaging/docker/create_server_environment.bash b/packaging/docker/create_server_environment.bash new file mode 100644 index 0000000000..67979839b9 --- /dev/null +++ b/packaging/docker/create_server_environment.bash @@ -0,0 +1,46 @@ +#! /bin/bash + +# +# create_server_environment.bash +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +source /var/fdb/scripts/create_cluster_file.bash + +function create_server_environment() { + fdb_dir=/var/fdb + env_file=$fdb_dir/.fdbenv + + : > $env_file + + if [[ "$FDB_NETWORKING_MODE" == "host" ]]; then + public_ip=127.0.0.1 + elif [[ "$FDB_NETWORKING_MODE" == "container" ]]; then + public_ip=$(grep `hostname` /etc/hosts | sed -e "s/\s *`hostname`.*//") + else + echo "Unknown FDB Networking mode \"$FDB_NETWORKING_MODE\"" 1>&2 + exit 1 + fi + + echo "export PUBLIC_IP=$public_ip" >> $env_file + if [[ -z $FDB_COORDINATOR ]]; then + FDB_CLUSTER_FILE_CONTENTS="docker:docker@$public_ip:$FDB_PORT" + fi + + create_cluster_file +} \ No newline at end of file diff --git a/packaging/docker/download_multiversion_libraries.bash b/packaging/docker/download_multiversion_libraries.bash new file mode 100644 index 0000000000..4df401c6ea --- /dev/null +++ b/packaging/docker/download_multiversion_libraries.bash @@ -0,0 +1,31 @@ +#! /bin/bash + +# +# download_multiversion_libraries.bash +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +mkdir -p /usr/lib/fdb/multiversion +website=$1 +shift +for version in $*; do + origin=$website/downloads/$version/linux/libfdb_c_$version.so + destination=/usr/lib/fdb/multiversion/libfdb_c_$version.so + echo "Downloading $origin to $destination" + wget $origin -O $destination +done \ No newline at end of file diff --git a/packaging/docker/fdb.bash b/packaging/docker/fdb.bash new file mode 100644 index 0000000000..3fb322c431 --- /dev/null +++ b/packaging/docker/fdb.bash @@ -0,0 +1,29 @@ +#! /bin/bash + +# +# fdb.bash +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +source /var/fdb/scripts/create_server_environment.bash +create_server_environment +source /var/fdb/.fdbenv +echo "Starting FDB server on $PUBLIC_IP:4500" +fdbserver --listen_address 0.0.0.0:$FDB_PORT --public_address $PUBLIC_IP:4500 \ + --datadir /var/fdb/data --logdir /var/fdb/logs \ + --locality_zoneid=`hostname` --locality_machineid=`hostname` --class $FDB_PROCESS_CLASS \ No newline at end of file diff --git a/packaging/docker/samples/python/.env b/packaging/docker/samples/python/.env new file mode 100644 index 0000000000..bef2d23f7f --- /dev/null +++ b/packaging/docker/samples/python/.env @@ -0,0 +1,20 @@ +# .env +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +COMPOSE_PROJECT_NAME=fdbpythonsample \ No newline at end of file diff --git a/packaging/docker/samples/python/app/Dockerfile b/packaging/docker/samples/python/app/Dockerfile new file mode 100644 index 0000000000..8172f5aaea --- /dev/null +++ b/packaging/docker/samples/python/app/Dockerfile @@ -0,0 +1,41 @@ +# Dockerfile +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +FROM python:3.6 + +RUN apt-get update; apt-get install -y dnsutils + +RUN mkdir -p /app +WORKDIR /app + +COPY --from=foundationdb:5.2.5 /usr/lib/libfdb_c.so /usr/lib +COPY --from=foundationdb:5.2.5 /usr/bin/fdbcli /usr/bin/ +COPY --from=foundationdb:5.2.5 /var/fdb/scripts/create_cluster_file.bash /app + +COPY requirements.txt /app +RUN pip install -r requirements.txt + +COPY start.bash /app +COPY server.py /app +RUN chmod u+x /app/start.bash + +CMD /app/start.bash + +ENV FLASK_APP=server.py +ENV FLASK_ENV=development \ No newline at end of file diff --git a/packaging/docker/samples/python/app/requirements.txt b/packaging/docker/samples/python/app/requirements.txt new file mode 100644 index 0000000000..ecae3f8996 --- /dev/null +++ b/packaging/docker/samples/python/app/requirements.txt @@ -0,0 +1,21 @@ +# requirements.txt +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +Flask==1.0.2 +foundationdb==5.1.5 \ No newline at end of file diff --git a/packaging/docker/samples/python/app/server.py b/packaging/docker/samples/python/app/server.py new file mode 100644 index 0000000000..e7b0029113 --- /dev/null +++ b/packaging/docker/samples/python/app/server.py @@ -0,0 +1,48 @@ +# server.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from flask import Flask +import fdb + +app = Flask(__name__) + +fdb.api_version(510) +db=fdb.open() + +COUNTER_KEY=fdb.tuple.pack(('counter',)) +def _increment_counter(tr): + counter_value = tr[COUNTER_KEY] + if counter_value == None: + counter = 1 + else: + counter = fdb.tuple.unpack(counter_value)[0] + 1 + tr[COUNTER_KEY] = fdb.tuple.pack((counter,)) + return counter + +@app.route("/counter", methods=['GET']) +def get_counter(): + counter_value = db[COUNTER_KEY] + if counter_value == None: + return '0' + else: + return str(fdb.tuple.unpack(counter_value)[0]) + +@app.route("/counter/increment", methods=['POST']) +def increment_counter(): + return str(_increment_counter(db)) \ No newline at end of file diff --git a/packaging/docker/samples/python/app/start.bash b/packaging/docker/samples/python/app/start.bash new file mode 100644 index 0000000000..f8baf94a2a --- /dev/null +++ b/packaging/docker/samples/python/app/start.bash @@ -0,0 +1,23 @@ +#! /bin/bash + +# start.bash +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +/app/create_cluster_file.bash +FLASK_APP=server.py flask run --host=0.0.0.0 \ No newline at end of file diff --git a/packaging/docker/samples/python/docker-compose.yml b/packaging/docker/samples/python/docker-compose.yml new file mode 100644 index 0000000000..2280414688 --- /dev/null +++ b/packaging/docker/samples/python/docker-compose.yml @@ -0,0 +1,36 @@ +# docker-compose.yaml +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +version: '3' +services: + fdb: + image: foundationdb:5.2.5 + environment: + FDB_COORDINATOR: fdb-coordinator + fdb-coordinator: + image: foundationdb:5.2.5 + environment: + FDB_COORDINATOR: fdb-coordinator + app: + build: + context: app + ports: + - 5000:5000 + environment: + FDB_COORDINATOR: fdb-coordinator \ No newline at end of file diff --git a/packaging/fdb.cluster.cmake b/packaging/fdb.cluster.cmake new file mode 100644 index 0000000000..632fa1a4d3 --- /dev/null +++ b/packaging/fdb.cluster.cmake @@ -0,0 +1 @@ +${CLUSTER_DESCRIPTION1}:${CLUSTER_DESCRIPTION1}@127.0.0.1:4500 diff --git a/packaging/msi/FDBInstaller.wxs b/packaging/msi/FDBInstaller.wxs index 71db7fcb11..769d8abb1e 100644 --- a/packaging/msi/FDBInstaller.wxs +++ b/packaging/msi/FDBInstaller.wxs @@ -32,7 +32,7 @@ /etc/foundationdb/fdb.cluster + chown foundationdb:foundationdb /etc/foundationdb/fdb.cluster + chmod 0664 /etc/foundationdb/fdb.cluster + NEWDB=1 + fi + + if pidof systemd + then + /usr/bin/systemctl enable foundationdb >/dev/null 2>&1 + /usr/bin/systemctl start foundationdb >/dev/null 2>&1 + else + /sbin/chkconfig --add foundationdb >/dev/null 2>&1 + /sbin/service foundationdb start >/dev/null 2>&1 + fi + + if [ "$NEWDB" != "" ]; then + /usr/bin/fdbcli -C /etc/foundationdb/fdb.cluster --exec "configure new single memory" --timeout 20 >/dev/null 2>&1 + fi +else + if pidof systemd + then + /usr/bin/systemctl condrestart foundationdb >/dev/null 2>&1 + else + /sbin/service foundationdb condrestart >/dev/null 2>&1 + fi +fi +exit 0 + diff --git a/packaging/rpm/scripts/preclients.sh b/packaging/rpm/scripts/preclients.sh new file mode 100644 index 0000000000..fb543b5029 --- /dev/null +++ b/packaging/rpm/scripts/preclients.sh @@ -0,0 +1,3 @@ +getent group foundationdb >/dev/null || groupadd -r foundationdb >/dev/null +getent passwd foundationdb >/dev/null || useradd -c "FoundationDB" -g foundationdb -s /bin/false -r -d /var/lib/foundationdb foundationdb >/dev/null +exit 0 diff --git a/packaging/rpm/scripts/preserver.sh b/packaging/rpm/scripts/preserver.sh new file mode 100644 index 0000000000..14fd4150db --- /dev/null +++ b/packaging/rpm/scripts/preserver.sh @@ -0,0 +1,24 @@ +# This should be ensured by the foundationdb-clients package, but it can't hurt... 
+getent group foundationdb >/dev/null || groupadd -r foundationdb >/dev/null +getent passwd foundationdb >/dev/null || useradd -c "FoundationDB" -g foundationdb -s /bin/false -r -d /var/lib/foundationdb foundationdb >/dev/null + +if [ $1 -gt 1 ]; then + # old versions could leave this behind + rm -f /usr/lib/foundationdb/argparse.pyc +fi + +CURRENTVER=$(rpm -q --queryformat %%{VERSION} foundationdb-server) || : +if [ "$CURRENTVER" = "0.1.5" ] || [ "$CURRENTVER" = "0.1.4" ]; then + mv /etc/foundationdb/foundationdb.conf /etc/foundationdb/foundationdb.conf.rpmsave +fi +if [ $1 -eq 0 ]; then + if pidof systemd + then + /usr/bin/systemctl stop foundationdb >/dev/null 2>&1 + /usr/bin/systemctl disable foundationdb >/dev/null 2>&1 + else + /sbin/service foundationdb stop >/dev/null 2>&1 + /sbin/chkconfig --del foundationdb >/dev/null 2>&1 + fi +fi +exit 0 diff --git a/packaging/rpm/scripts/preunserver.sh b/packaging/rpm/scripts/preunserver.sh new file mode 100644 index 0000000000..88943c53e8 --- /dev/null +++ b/packaging/rpm/scripts/preunserver.sh @@ -0,0 +1,12 @@ +if [ $1 -eq 0 ]; then + + if pidof systemd + then + /usr/bin/systemctl stop foundationdb >/dev/null 2>&1 + /usr/bin/systemctl disable foundationdb >/dev/null 2>&1 + else + /sbin/service foundationdb stop >/dev/null 2>&1 + /sbin/chkconfig --del foundationdb >/dev/null 2>&1 + fi +fi +exit 0
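Taken together, the flow-level additions in this patch (directoryExists() in Platform.h, selectTraceFormatter() in Trace.h, and storeOrThrow() in genericactors.actor.h) are small, self-contained utilities. The sketch below is not part of the patch; it only illustrates how they might be called from an actor, assuming the signatures shown in the hunks above, and every name in it (the actor, its parameters, and the trace event names) is hypothetical:

```
// Sketch only: exercises the helpers added by this patch from a hypothetical actor.
#include "flow/Platform.h"
#include "flow/Trace.h"
#include "flow/genericactors.actor.h"
#include "flow/actorcompiler.h" // flow convention: must be the last #include

ACTOR Future<Void> exampleStartup(std::string dataDir, Future<Optional<std::string>> configValue) {
	// The trace format has to be chosen before the trace file is opened;
	// selectTraceFormatter() returns false for unrecognized formats.
	if (!selectTraceFormatter("json")) {
		TraceEvent(SevWarnAlways, "UnknownTraceFormat");
	}

	// directoryExists() complements the existing fileExists() helper.
	if (!directoryExists(dataDir)) {
		TraceEvent(SevWarnAlways, "MissingDataDir").detail("Path", dataDir);
	}

	// storeOrThrow() unwraps an Optional<T>, throwing key_not_found() (the default) when it is empty.
	state std::string configValueOut;
	wait(storeOrThrow(configValue, configValueOut));

	return Void();
}
```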