Merge branch 'release-6.2' of github.com:apple/foundationdb into feature-redwood

Stephen Atherton 2019-10-23 10:12:54 -07:00
commit 0e51a248b4
45 changed files with 988 additions and 462 deletions

View File

@ -18,7 +18,7 @@
# limitations under the License.
cmake_minimum_required(VERSION 3.12)
project(foundationdb
VERSION 6.2.5
VERSION 6.2.7
DESCRIPTION "FoundationDB is a scalable, fault-tolerant, ordered key-value store with full ACID transactions."
HOMEPAGE_URL "http://www.foundationdb.org/"
LANGUAGES C CXX ASM)

View File

@ -107,12 +107,10 @@ fdb_error_t fdb_network_set_option( FDBNetworkOption option,
API->setNetworkOption( (FDBNetworkOptions::Option)option, value ? StringRef( value, value_length ) : Optional<StringRef>() ); );
}
extern "C"
fdb_error_t fdb_setup_network_impl() {
CATCH_AND_RETURN( API->setupNetwork(); );
}
extern "C"
fdb_error_t fdb_setup_network_v13( const char* localAddress ) {
fdb_error_t errorCode = fdb_network_set_option( FDB_NET_OPTION_LOCAL_ADDRESS, (uint8_t const*)localAddress, strlen(localAddress) );
if(errorCode != 0)
@ -159,7 +157,6 @@ fdb_error_t fdb_future_block_until_ready( FDBFuture* f ) {
CATCH_AND_RETURN( TSAVB(f)->blockUntilReady(); );
}
extern "C" DLLEXPORT
fdb_bool_t fdb_future_is_error_v22( FDBFuture* f ) {
return TSAVB(f)->isError();
}
@ -200,12 +197,10 @@ fdb_error_t fdb_future_set_callback( FDBFuture* f,
CATCH_AND_RETURN( TSAVB(f)->callOrSetAsCallback( cb, ignore, 0 ); );
}
extern "C" DLLEXPORT
fdb_error_t fdb_future_get_error_impl( FDBFuture* f ) {
return TSAVB(f)->getErrorCode();
}
extern "C" DLLEXPORT
fdb_error_t fdb_future_get_error_v22( FDBFuture* f, const char** description ) {
if ( !( TSAVB(f)->isError() ) )
return error_code_future_not_error;
@ -232,14 +227,12 @@ fdb_error_t fdb_future_get_key( FDBFuture* f, uint8_t const** out_key,
*out_key_length = key.size(); );
}
extern "C" DLLEXPORT
fdb_error_t fdb_future_get_cluster_v609( FDBFuture* f, FDBCluster** out_cluster ) {
CATCH_AND_RETURN(
*out_cluster = (FDBCluster*)
( (TSAV( char*, f )->get() ) ); );
}
extern "C" DLLEXPORT
fdb_error_t fdb_future_get_database_v609( FDBFuture* f, FDBDatabase** out_database ) {
CATCH_AND_RETURN(
*out_database = (FDBDatabase*)
@ -258,7 +251,6 @@ fdb_error_t fdb_future_get_value( FDBFuture* f, fdb_bool_t* out_present,
} );
}
extern "C"
fdb_error_t fdb_future_get_keyvalue_array_impl(
FDBFuture* f, FDBKeyValue const** out_kv,
int* out_count, fdb_bool_t* out_more )
@ -270,7 +262,6 @@ fdb_error_t fdb_future_get_keyvalue_array_impl(
*out_more = rrr.more; );
}
extern "C"
fdb_error_t fdb_future_get_keyvalue_array_v13(
FDBFuture* f, FDBKeyValue const** out_kv, int* out_count)
{
@ -280,7 +271,7 @@ fdb_error_t fdb_future_get_keyvalue_array_v13(
*out_count = rrr.size(); );
}
extern "C"
extern "C" DLLEXPORT
fdb_error_t fdb_future_get_string_array(
FDBFuture* f, const char*** out_strings, int* out_count)
{
@ -291,7 +282,6 @@ fdb_error_t fdb_future_get_string_array(
);
}
extern "C" DLLEXPORT
FDBFuture* fdb_create_cluster_v609( const char* cluster_file_path ) {
char *path;
if(cluster_file_path) {
@ -305,7 +295,6 @@ FDBFuture* fdb_create_cluster_v609( const char* cluster_file_path ) {
return (FDBFuture*)ThreadFuture<char*>(path).extractPtr();
}
extern "C" DLLEXPORT
fdb_error_t fdb_cluster_set_option_v609( FDBCluster* c,
FDBClusterOption option,
uint8_t const* value,
@ -315,12 +304,19 @@ fdb_error_t fdb_cluster_set_option_v609( FDBCluster* c,
return error_code_success;
}
extern "C" DLLEXPORT
void fdb_cluster_destroy_v609( FDBCluster* c ) {
CATCH_AND_DIE( delete[] CLUSTER(c); );
}
extern "C" DLLEXPORT
// This exists so that fdb_cluster_create_database doesn't need to call the public symbol fdb_create_database.
// If it does and this is an external client loaded through the multi-version API, then it may inadvertently call
// the version of the function in the primary library if it was loaded into the global symbols.
fdb_error_t fdb_create_database_impl( const char* cluster_file_path, FDBDatabase** out_database ) {
CATCH_AND_RETURN(
*out_database = (FDBDatabase*)API->createDatabase( cluster_file_path ? cluster_file_path : "" ).extractPtr();
);
}
FDBFuture* fdb_cluster_create_database_v609( FDBCluster* c, uint8_t const* db_name,
int db_name_length )
{
@ -329,7 +325,7 @@ FDBFuture* fdb_cluster_create_database_v609( FDBCluster* c, uint8_t const* db_na
}
FDBDatabase *db;
fdb_error_t err = fdb_create_database(CLUSTER(c), &db);
fdb_error_t err = fdb_create_database_impl(CLUSTER(c), &db);
if(err) {
return (FDBFuture*)ThreadFuture<Reference<IDatabase>>(Error(err)).extractPtr();
}
@ -339,9 +335,7 @@ FDBFuture* fdb_cluster_create_database_v609( FDBCluster* c, uint8_t const* db_na
extern "C" DLLEXPORT
fdb_error_t fdb_create_database( const char* cluster_file_path, FDBDatabase** out_database ) {
CATCH_AND_RETURN(
*out_database = (FDBDatabase*)API->createDatabase( cluster_file_path ? cluster_file_path : "" ).extractPtr();
);
return fdb_create_database_impl( cluster_file_path, out_database );
}
extern "C" DLLEXPORT
@ -393,21 +387,18 @@ FDBFuture* fdb_transaction_get_read_version( FDBTransaction* tr ) {
return (FDBFuture*)( TXN(tr)->getReadVersion().extractPtr() );
}
extern "C"
FDBFuture* fdb_transaction_get_impl( FDBTransaction* tr, uint8_t const* key_name,
int key_name_length, fdb_bool_t snapshot ) {
return (FDBFuture*)
( TXN(tr)->get( KeyRef( key_name, key_name_length ), snapshot ).extractPtr() );
}
extern "C"
FDBFuture* fdb_transaction_get_v13( FDBTransaction* tr, uint8_t const* key_name,
int key_name_length )
{
return fdb_transaction_get_impl( tr, key_name, key_name_length, 0 );
}
extern "C"
FDBFuture* fdb_transaction_get_key_impl( FDBTransaction* tr, uint8_t const* key_name,
int key_name_length, fdb_bool_t or_equal,
int offset, fdb_bool_t snapshot ) {
@ -418,7 +409,6 @@ FDBFuture* fdb_transaction_get_key_impl( FDBTransaction* tr, uint8_t const* key_
snapshot ).extractPtr() );
}
extern "C"
FDBFuture* fdb_transaction_get_key_v13( FDBTransaction* tr, uint8_t const* key_name,
int key_name_length, fdb_bool_t or_equal,
int offset ) {
@ -426,14 +416,13 @@ FDBFuture* fdb_transaction_get_key_v13( FDBTransaction* tr, uint8_t const* key_n
or_equal, offset, false );
}
extern "C"
extern "C" DLLEXPORT
FDBFuture* fdb_transaction_get_addresses_for_key( FDBTransaction* tr, uint8_t const* key_name,
int key_name_length ){
return (FDBFuture*)( TXN(tr)->getAddressesForKey( KeyRef(key_name, key_name_length) ).extractPtr() );
}
extern "C"
FDBFuture* fdb_transaction_get_range_impl(
FDBTransaction* tr, uint8_t const* begin_key_name,
int begin_key_name_length, fdb_bool_t begin_or_equal, int begin_offset,
@ -504,7 +493,6 @@ FDBFuture* fdb_transaction_get_range_impl(
snapshot, reverse ).extractPtr() );
}
extern "C"
FDBFuture* fdb_transaction_get_range_selector_v13(
FDBTransaction* tr, uint8_t const* begin_key_name, int begin_key_name_length,
fdb_bool_t begin_or_equal, int begin_offset, uint8_t const* end_key_name,
@ -516,7 +504,6 @@ FDBFuture* fdb_transaction_get_range_selector_v13(
limit, 0, FDB_STREAMING_MODE_EXACT, 0, false, false);
}
extern "C"
FDBFuture* fdb_transaction_get_range_v13(
FDBTransaction* tr, uint8_t const* begin_key_name, int begin_key_name_length,
uint8_t const* end_key_name, int end_key_name_length, int limit )
@ -599,7 +586,6 @@ FDBFuture* fdb_transaction_get_versionstamp( FDBTransaction* tr )
return (FDBFuture*)(TXN(tr)->getVersionstamp().extractPtr());
}
extern "C"
fdb_error_t fdb_transaction_set_option_impl( FDBTransaction* tr,
FDBTransactionOption option,
uint8_t const* value,
@ -609,7 +595,6 @@ fdb_error_t fdb_transaction_set_option_impl( FDBTransaction* tr,
TXN(tr)->setOption( (FDBTransactionOptions::Option)option, value ? StringRef( value, value_length ) : Optional<StringRef>() ); );
}
extern "C"
void fdb_transaction_set_option_v13( FDBTransaction* tr,
FDBTransactionOption option )
{
@ -679,6 +664,10 @@ fdb_error_t fdb_select_api_version_impl( int runtime_version, int header_version
// Versioned API changes -- descending order by version (new changes at top)
// FDB_API_CHANGED( function, ver ) means there is a new implementation as of ver, and a function function_(ver-1) is the old implementation
// FDB_API_REMOVED( function, ver ) means the function was removed as of ver, and function_(ver-1) is the old implementation
//
// WARNING: use caution when implementing removed functions by calling public API functions. This can lead to undesired behavior when
// using the multi-version API. Instead, it is better to have both the removed and public functions call an internal implementation function.
// See fdb_create_database_impl for an example.
FDB_API_REMOVED( fdb_future_get_version, 620 );
FDB_API_REMOVED( fdb_create_cluster, 610 );
FDB_API_REMOVED( fdb_cluster_create_database, 610 );
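
The warning above recommends routing both the current public function and its removed, versioned predecessor through a private implementation, as fdb_create_database_impl now does. A minimal standalone sketch of that pattern, with hypothetical names (do_work, do_work_v609) that are not part of fdb_c:

    // Both entry points call the internal implementation, so an external client
    // loaded through the multi-version API never resolves the public symbol out
    // of the primary library by accident.
    #include <cstdio>

    static int do_work_impl(const char* arg) {         // internal, not exported
        return std::printf("work(%s)\n", arg) < 0 ? 1 : 0;
    }

    extern "C" int do_work(const char* arg) {           // current public API
        return do_work_impl(arg);
    }

    extern "C" int do_work_v609(const char* arg) {      // pre-610 compatibility shim
        return do_work_impl(arg ? arg : "");             // never calls do_work() directly
    }

    int main() { return do_work_v609(nullptr); }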

View File

@ -294,6 +294,13 @@ extern "C" {
DLLEXPORT WARN_UNUSED_RESULT FDBFuture*
fdb_cluster_create_database( FDBCluster* c, uint8_t const* db_name,
int db_name_length );
#else
#define fdb_future_get_cluster(f, oc) FDB_REMOVED_FUNCTION
#define fdb_future_get_database(f, od) FDB_REMOVED_FUNCTION
#define fdb_create_cluster(cfp) FDB_REMOVED_FUNCTION
#define fdb_cluster_destroy(c) FDB_REMOVED_FUNCTION
#define fdb_cluster_set_option(c, o, v, vl) FDB_REMOVED_FUNCTION
#define fdb_cluster_create_database(c, dn, dnl) FDB_REMOVED_FUNCTION
#endif
#if FDB_API_VERSION < 23
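
The #else block above is what makes any use of the removed cluster API fail at compile time once FDB_API_VERSION is 610 or higher (see the release notes later in this commit). A minimal sketch of the same conditional-definition technique, using hypothetical names rather than the real fdb_c macros:

    // Below the cutoff the legacy entry point is declared; at or above it, any
    // use expands to an undeclared identifier and the build fails.
    #include <cstdio>

    #define MY_API_VERSION 620

    #if MY_API_VERSION < 610
    extern "C" int my_old_function(int x);
    #else
    #define my_old_function(x) MY_REMOVED_FUNCTION
    #endif

    int main() {
        // my_old_function(1);  // uncommenting this line breaks the build at 620
        std::printf("compiled against API version %d\n", MY_API_VERSION);
        return 0;
    }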

View File

@ -1,13 +1,13 @@
FROM centos:6
LABEL version=0.1.7
ENV DOCKER_IMAGEVER=0.1.7
LABEL version=0.1.8
ENV DOCKER_IMAGEVER=0.1.8
# Install dependencies for developer tools, bindings,\
# documentation, actorcompiler, and packaging tools\
RUN yum install -y yum-utils &&\
yum-config-manager --enable rhel-server-rhscl-7-rpms &&\
yum -y install centos-release-scl epel-release &&\
yum -y install devtoolset-8 java-1.8.0-openjdk-devel \
yum -y install devtoolset-8-8.1-1.el6 java-1.8.0-openjdk-devel \
rh-python36-python-devel devtoolset-8-valgrind-devel \
mono-core rh-ruby24 golang python27 rpm-build debbuild \
python-pip npm dos2unix valgrind-devel ccache distcc &&\

View File

@ -2,7 +2,7 @@ version: "3"
services:
common: &common
image: foundationdb/foundationdb-build:0.1.7
image: foundationdb/foundationdb-build:0.1.8
build-setup: &build-setup
<<: *common

View File

@ -221,9 +221,14 @@ else()
# Check whether we can use dtrace probes
include(CheckSymbolExists)
check_symbol_exists(DTRACE_PROBE sys/sdt.h SUPPORT_DTRACE)
check_symbol_exists(aligned_alloc stdlib.h HAS_ALIGNED_ALLOC)
message(STATUS "Has aligned_alloc: ${HAS_ALIGNED_ALLOC}")
if(SUPPORT_DTRACE)
add_compile_definitions(DTRACE_PROBES)
endif()
if(HAS_ALIGNED_ALLOC)
add_compile_definitions(HAS_ALIGNED_ALLOC)
endif()
if(CMAKE_COMPILER_IS_GNUCXX)
set(USE_LTO OFF CACHE BOOL "Do link time optimization")
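
When the C library provides aligned_alloc, the new check_symbol_exists call sets HAS_ALIGNED_ALLOC and the corresponding compile definition is added. A minimal sketch of how source code can consume such a definition; the posix_memalign fallback here is only an illustrative assumption, not necessarily what FoundationDB uses:

    // Pick an aligned allocator based on the CMake-provided definition.
    #include <stdlib.h>

    void* allocateAligned(size_t alignment, size_t size) {
    #ifdef HAS_ALIGNED_ALLOC
        // C11 aligned_alloc requires size to be a multiple of alignment.
        return aligned_alloc(alignment, size);
    #else
        void* p = nullptr;                        // illustrative POSIX fallback
        return posix_memalign(&p, alignment, size) == 0 ? p : nullptr;
    #endif
    }

    int main() {
        void* p = allocateAligned(4096, 4096);
        free(p);
        return p ? 0 : 1;
    }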

View File

@ -380,6 +380,24 @@ The ``list`` subcommand will list the backups at a given 'base' or shortened Bac
This is a shortened Backup URL which looks just like a Backup URL but without the backup <name> so that the list command will discover and list all of the backups in the bucket.
.. program:: fdbbackup cleanup
``cleanup``
------------
The ``cleanup`` subcommand will list orphaned backups and DRs and optionally remove their mutations.
::
user@host$ fdbbackup cleanup [--delete_data] [--min_cleanup_seconds] [-C <CLUSTER_FILE>]
``--delete_data``
This flag will cause ``cleanup`` to remove mutations for the most stale backup or DR.
``--min_cleanup_seconds``
Specifies the amount of time a backup or DR needs to be stale before ``cleanup`` will remove mutations for it. By default this is set to one hour.
``fdbrestore`` command line tool
================================

View File

@ -599,7 +599,8 @@ Regions are configured in FoundationDB as a json document. For example::
"datacenters":[{
"id":"WC1",
"priority":1,
"satellite":1
"satellite":1,
"satellite_logs":2
}],
"satellite_redundancy_mode":"one_satellite_double",
"satellite_logs":2
@ -659,7 +660,8 @@ This is the region configuration that implements the example::
},{
"id":"WC2",
"priority":0,
"satellite":1
"satellite":1,
"satellite_logs":2
}],
"satellite_redundancy_mode":"one_satellite_double"
},{

View File

@ -10,38 +10,38 @@ macOS
The macOS installation package is supported on macOS 10.7+. It includes the client and (optionally) the server.
* `FoundationDB-6.2.4.pkg <https://www.foundationdb.org/downloads/6.2.4/macOS/installers/FoundationDB-6.2.4.pkg>`_
* `FoundationDB-6.2.6.pkg <https://www.foundationdb.org/downloads/6.2.6/macOS/installers/FoundationDB-6.2.6.pkg>`_
Ubuntu
------
The Ubuntu packages are supported on 64-bit Ubuntu 12.04+, but beware of the Linux kernel bug in Ubuntu 12.x.
* `foundationdb-clients-6.2.4-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.4/ubuntu/installers/foundationdb-clients_6.2.4-1_amd64.deb>`_
* `foundationdb-server-6.2.4-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.4/ubuntu/installers/foundationdb-server_6.2.4-1_amd64.deb>`_ (depends on the clients package)
* `foundationdb-clients-6.2.6-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.6/ubuntu/installers/foundationdb-clients_6.2.6-1_amd64.deb>`_
* `foundationdb-server-6.2.6-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.6/ubuntu/installers/foundationdb-server_6.2.6-1_amd64.deb>`_ (depends on the clients package)
RHEL/CentOS EL6
---------------
The RHEL/CentOS EL6 packages are supported on 64-bit RHEL/CentOS 6.x.
* `foundationdb-clients-6.2.4-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.4/rhel6/installers/foundationdb-clients-6.2.4-1.el6.x86_64.rpm>`_
* `foundationdb-server-6.2.4-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.4/rhel6/installers/foundationdb-server-6.2.4-1.el6.x86_64.rpm>`_ (depends on the clients package)
* `foundationdb-clients-6.2.6-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.6/rhel6/installers/foundationdb-clients-6.2.6-1.el6.x86_64.rpm>`_
* `foundationdb-server-6.2.6-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.6/rhel6/installers/foundationdb-server-6.2.6-1.el6.x86_64.rpm>`_ (depends on the clients package)
RHEL/CentOS EL7
---------------
The RHEL/CentOS EL7 packages are supported on 64-bit RHEL/CentOS 7.x.
* `foundationdb-clients-6.2.4-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.4/rhel7/installers/foundationdb-clients-6.2.4-1.el7.x86_64.rpm>`_
* `foundationdb-server-6.2.4-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.4/rhel7/installers/foundationdb-server-6.2.4-1.el7.x86_64.rpm>`_ (depends on the clients package)
* `foundationdb-clients-6.2.6-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.6/rhel7/installers/foundationdb-clients-6.2.6-1.el7.x86_64.rpm>`_
* `foundationdb-server-6.2.6-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.6/rhel7/installers/foundationdb-server-6.2.6-1.el7.x86_64.rpm>`_ (depends on the clients package)
Windows
-------
The Windows installer is supported on 64-bit Windows XP and later. It includes the client and (optionally) the server.
* `foundationdb-6.2.4-x64.msi <https://www.foundationdb.org/downloads/6.2.4/windows/installers/foundationdb-6.2.4-x64.msi>`_
* `foundationdb-6.2.6-x64.msi <https://www.foundationdb.org/downloads/6.2.6/windows/installers/foundationdb-6.2.6-x64.msi>`_
API Language Bindings
=====================
@ -58,18 +58,18 @@ On macOS and Windows, the FoundationDB Python API bindings are installed as part
If you need to use the FoundationDB Python API from other Python installations or paths, download the Python package:
* `foundationdb-6.2.4.tar.gz <https://www.foundationdb.org/downloads/6.2.4/bindings/python/foundationdb-6.2.4.tar.gz>`_
* `foundationdb-6.2.6.tar.gz <https://www.foundationdb.org/downloads/6.2.6/bindings/python/foundationdb-6.2.6.tar.gz>`_
Ruby 1.9.3/2.0.0+
-----------------
* `fdb-6.2.4.gem <https://www.foundationdb.org/downloads/6.2.4/bindings/ruby/fdb-6.2.4.gem>`_
* `fdb-6.2.6.gem <https://www.foundationdb.org/downloads/6.2.6/bindings/ruby/fdb-6.2.6.gem>`_
Java 8+
-------
* `fdb-java-6.2.4.jar <https://www.foundationdb.org/downloads/6.2.4/bindings/java/fdb-java-6.2.4.jar>`_
* `fdb-java-6.2.4-javadoc.jar <https://www.foundationdb.org/downloads/6.2.4/bindings/java/fdb-java-6.2.4-javadoc.jar>`_
* `fdb-java-6.2.6.jar <https://www.foundationdb.org/downloads/6.2.6/bindings/java/fdb-java-6.2.6.jar>`_
* `fdb-java-6.2.6-javadoc.jar <https://www.foundationdb.org/downloads/6.2.6/bindings/java/fdb-java-6.2.6-javadoc.jar>`_
Go 1.11+
--------

View File

@ -498,7 +498,8 @@
"datacenters":[{
"id":"mr",
"priority":1,
"satellite":1
"satellite":1,
"satellite_logs":2
}],
"satellite_redundancy_mode":{
"$enum":[
@ -577,6 +578,7 @@
"max_machine_failures_without_losing_availability":0,
"total_disk_used_bytes":0,
"total_kv_size_bytes":0, // estimated
"system_kv_size_bytes":0, // estimated
"partitions_count":2,
"moving_data":{
"total_written_bytes":0,

View File

@ -2,6 +2,13 @@
Release Notes
#############
6.1.13
======
* Loading a 6.1 or newer ``fdb_c`` library as a secondary client using the multi-version client could lead to an infinite recursion when run with API versions older than 610. `(PR #2169) <https://github.com/apple/foundationdb/pull/2169>`_
* Using C API functions that were removed in 6.1 when using API version 610 or above now results in a compilation error. `(PR #2169) <https://github.com/apple/foundationdb/pull/2169>`_
* ``fdbrestore`` commands other than ``start`` required a default cluster file to be found but did not actually use it. `(PR #1912) <https://github.com/apple/foundationdb/pull/1912>`_.
6.1.12
======

View File

@ -2,7 +2,7 @@
Release Notes
#############
6.2.5
6.2.6
=====
Performance
@ -27,6 +27,7 @@ Performance
* Log routers will prefer to peek from satellites at ``log_version >= 4``. `(PR #1795) <https://github.com/apple/foundationdb/pull/1795>`_.
* In clusters using a region configuration, clients will read from the remote region if all of the servers in the primary region are overloaded. [6.2.3] `(PR #2019) <https://github.com/apple/foundationdb/pull/2019>`_.
* Significantly improved the rate at which the transaction logs in a remote region can pull data from the primary region. [6.2.4] `(PR #2101) <https://github.com/apple/foundationdb/pull/2101>`_.
* Raised the data distribution priority of splitting shards because delaying splits can cause hot write shards. [6.2.6] `(PR #2234) <https://github.com/apple/foundationdb/pull/2234>`_.
Fixes
-----
@ -47,6 +48,16 @@ Fixes
* Configuring regions would fail with an internal error if the cluster contained storage servers that didn't set a datacenter ID. `(PR #2017) <https://github.com/apple/foundationdb/pull/2017>`_.
* Clients no longer prefer reading from servers with the same zone ID, because it could create hot shards. [6.2.3] `(PR #2019) <https://github.com/apple/foundationdb/pull/2019>`_.
* Data distribution could fail to start if any storage servers had misconfigured locality information. This problem could persist even after the offending storage servers were removed or fixed. [6.2.5] `(PR #2110) <https://github.com/apple/foundationdb/pull/2110>`_.
* Data distribution was running at too high of a priority, which sometimes caused other roles on the same process to stall. [6.2.5] `(PR #2170) <https://github.com/apple/foundationdb/pull/2170>`_.
* Loading a 6.1 or newer ``fdb_c`` library as a secondary client using the multi-version client could lead to an infinite recursion when run with API versions older than 610. [6.2.5] `(PR #2169) <https://github.com/apple/foundationdb/pull/2169>`_
* Using C API functions that were removed in 6.1 when using API version 610 or above now results in a compilation error. [6.2.5] `(PR #2169) <https://github.com/apple/foundationdb/pull/2169>`_
* Coordinator changes could fail to complete if the database wasn't allowing any transactions to start. [6.2.6] `(PR #2191) <https://github.com/apple/foundationdb/pull/2191>`_
* Status would report incorrect fault tolerance metrics when a remote region was configured and the primary region lost a storage replica. [6.2.6] `(PR #2230) <https://github.com/apple/foundationdb/pull/2230>`_
* The cluster would not change to a new set of satellite transaction logs when they became available in a better satellite location. [6.2.6] `(PR #2241) <https://github.com/apple/foundationdb/pull/2241>`_.
* The existence of ``proxy`` or ``resolver`` class processes prevented ``stateless`` class processes from being recruited as proxies or resolvers. [6.2.6] `(PR #2241) <https://github.com/apple/foundationdb/pull/2241>`_.
* Committing transactions larger than 1 MB could cause the proxy to stall for up to a second. [6.2.6] `(PR #2250) <https://github.com/apple/foundationdb/pull/2250>`_.
* The cluster controller could become saturated in clusters with large numbers of connected clients using TLS. [6.2.6] `(PR #2252) <https://github.com/apple/foundationdb/pull/2252>`_.
* Backup and DR would not share a mutation stream if they were started on different versions of FoundationDB. Either backup or DR must be restarted to resolve this issue. [6.2.6] `(PR #2202) <https://github.com/apple/foundationdb/pull/2202>`_.
Status
------
@ -64,6 +75,7 @@ Status
* Add ``coordinator`` to the list of roles that can be reported for a process. [6.2.3] `(PR #2006) <https://github.com/apple/foundationdb/pull/2006>`_.
* Added ``worst_durability_lag_storage_server`` and ``limiting_durability_lag_storage_server`` to the ``cluster.qos`` section, each with subfields ``versions`` and ``seconds``. These report the durability lag values being used by ratekeeper to potentially limit the transaction rate. [6.2.3] `(PR #2003) <https://github.com/apple/foundationdb/pull/2003>`_.
* Added ``worst_data_lag_storage_server`` and ``limiting_data_lag_storage_server`` to the ``cluster.qos`` section, each with subfields ``versions`` and ``seconds``. These are meant to replace ``worst_version_lag_storage_server`` and ``limiting_version_lag_storage_server``, which are now deprecated. [6.2.3] `(PR #2003) <https://github.com/apple/foundationdb/pull/2003>`_.
* Added ``system_kv_size_bytes`` to the ``cluster.data`` section to record the size of the system keyspace. [6.2.5] `(PR #2170) <https://github.com/apple/foundationdb/pull/2170>`_.
Bindings
--------
@ -78,6 +90,12 @@ Bindings
* Added a transaction option to control whether ``get_addresses_for_key`` includes a port in the address. This will be deprecated in API version 700, and addresses will include ports by default. [6.2.4] `(PR #2060) <https://github.com/apple/foundationdb/pull/2060>`_.
* Python: ``Versionstamp`` comparisons didn't work in Python 3. [6.2.4] `(PR #2089) <https://github.com/apple/foundationdb/pull/2089>`_.
Features
--------
* Added the ``cleanup`` command to ``fdbbackup`` which can be used to remove orphaned backups or DRs. [6.2.5] `(PR #2170) <https://github.com/apple/foundationdb/pull/2170>`_.
* Added the ability to configure ``satellite_logs`` by satellite location. A per-satellite value overrides the region-level ``satellite_logs`` configuration if both are present. [6.2.6] `(PR #2241) <https://github.com/apple/foundationdb/pull/2241>`_.
Other Changes
-------------
@ -112,6 +130,10 @@ Fixes only impacting 6.2.0+
* The cluster controller would saturate its CPU for a few seconds when sending configuration information to all of the worker processes. [6.2.4] `(PR #2086) <https://github.com/apple/foundationdb/pull/2086>`_.
* The data distributor would build all possible team combinations if it was tracking an unhealthy server with less than 10 teams. [6.2.4] `(PR #2099) <https://github.com/apple/foundationdb/pull/2099>`_.
* The cluster controller could crash if a coordinator was unreachable when compiling cluster status. [6.2.4] `(PR #2065) <https://github.com/apple/foundationdb/pull/2065>`_.
* A storage server could crash if it took longer than 10 minutes to fetch a key range from another server. [6.2.5] `(PR #2170) <https://github.com/apple/foundationdb/pull/2170>`_.
* Excluding or including servers would restart the data distributor. [6.2.5] `(PR #2170) <https://github.com/apple/foundationdb/pull/2170>`_.
* The data distributor could read invalid memory when estimating database size. [6.2.6] `(PR #2225) <https://github.com/apple/foundationdb/pull/2225>`_.
Earlier release notes
---------------------

View File

@ -77,7 +77,7 @@ enum enumProgramExe {
};
enum enumBackupType {
BACKUP_UNDEFINED=0, BACKUP_START, BACKUP_MODIFY, BACKUP_STATUS, BACKUP_ABORT, BACKUP_WAIT, BACKUP_DISCONTINUE, BACKUP_PAUSE, BACKUP_RESUME, BACKUP_EXPIRE, BACKUP_DELETE, BACKUP_DESCRIBE, BACKUP_LIST, BACKUP_DUMP
BACKUP_UNDEFINED=0, BACKUP_START, BACKUP_MODIFY, BACKUP_STATUS, BACKUP_ABORT, BACKUP_WAIT, BACKUP_DISCONTINUE, BACKUP_PAUSE, BACKUP_RESUME, BACKUP_EXPIRE, BACKUP_DELETE, BACKUP_DESCRIBE, BACKUP_LIST, BACKUP_DUMP, BACKUP_CLEANUP
};
enum enumDBType {
@ -95,7 +95,7 @@ enum {
OPT_EXPIRE_BEFORE_VERSION, OPT_EXPIRE_BEFORE_DATETIME, OPT_EXPIRE_DELETE_BEFORE_DAYS,
OPT_EXPIRE_RESTORABLE_AFTER_VERSION, OPT_EXPIRE_RESTORABLE_AFTER_DATETIME, OPT_EXPIRE_MIN_RESTORABLE_DAYS,
OPT_BASEURL, OPT_BLOB_CREDENTIALS, OPT_DESCRIBE_DEEP, OPT_DESCRIBE_TIMESTAMPS,
OPT_DUMP_BEGIN, OPT_DUMP_END, OPT_JSON,
OPT_DUMP_BEGIN, OPT_DUMP_END, OPT_JSON, OPT_DELETE_DATA, OPT_MIN_CLEANUP_SECONDS,
// Backup and Restore constants
OPT_TAGNAME, OPT_BACKUPKEYS, OPT_WAITFORDONE,
@ -253,6 +253,7 @@ CSimpleOpt::SOption g_rgBackupStatusOptions[] = {
{ OPT_HELP, "--help", SO_NONE },
{ OPT_DEVHELP, "--dev-help", SO_NONE },
{ OPT_JSON, "--json", SO_NONE},
{ OPT_KNOB, "--knob_", SO_REQ_SEP },
#ifndef TLS_DISABLED
TLS_OPTION_FLAGS
#endif
@ -282,6 +283,37 @@ CSimpleOpt::SOption g_rgBackupAbortOptions[] = {
{ OPT_HELP, "-h", SO_NONE },
{ OPT_HELP, "--help", SO_NONE },
{ OPT_DEVHELP, "--dev-help", SO_NONE },
{ OPT_KNOB, "--knob_", SO_REQ_SEP },
#ifndef TLS_DISABLED
TLS_OPTION_FLAGS
#endif
SO_END_OF_OPTIONS
};
CSimpleOpt::SOption g_rgBackupCleanupOptions[] = {
#ifdef _WIN32
{ OPT_PARENTPID, "--parentpid", SO_REQ_SEP },
#endif
{ OPT_CLUSTERFILE, "-C", SO_REQ_SEP },
{ OPT_CLUSTERFILE, "--cluster_file", SO_REQ_SEP },
{ OPT_TRACE, "--log", SO_NONE },
{ OPT_TRACE_DIR, "--logdir", SO_REQ_SEP },
{ OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP },
{ OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP },
{ OPT_QUIET, "-q", SO_NONE },
{ OPT_QUIET, "--quiet", SO_NONE },
{ OPT_VERSION, "--version", SO_NONE },
{ OPT_VERSION, "-v", SO_NONE },
{ OPT_CRASHONERROR, "--crash", SO_NONE },
{ OPT_MEMLIMIT, "-m", SO_REQ_SEP },
{ OPT_MEMLIMIT, "--memory", SO_REQ_SEP },
{ OPT_HELP, "-?", SO_NONE },
{ OPT_HELP, "-h", SO_NONE },
{ OPT_HELP, "--help", SO_NONE },
{ OPT_DEVHELP, "--dev-help", SO_NONE },
{ OPT_KNOB, "--knob_", SO_REQ_SEP },
{ OPT_DELETE_DATA, "--delete_data", SO_NONE },
{ OPT_MIN_CLEANUP_SECONDS, "--min_cleanup_seconds", SO_REQ_SEP },
#ifndef TLS_DISABLED
TLS_OPTION_FLAGS
#endif
@ -313,6 +345,7 @@ CSimpleOpt::SOption g_rgBackupDiscontinueOptions[] = {
{ OPT_HELP, "-h", SO_NONE },
{ OPT_HELP, "--help", SO_NONE },
{ OPT_DEVHELP, "--dev-help", SO_NONE },
{ OPT_KNOB, "--knob_", SO_REQ_SEP },
#ifndef TLS_DISABLED
TLS_OPTION_FLAGS
#endif
@ -344,6 +377,7 @@ CSimpleOpt::SOption g_rgBackupWaitOptions[] = {
{ OPT_HELP, "-h", SO_NONE },
{ OPT_HELP, "--help", SO_NONE },
{ OPT_DEVHELP, "--dev-help", SO_NONE },
{ OPT_KNOB, "--knob_", SO_REQ_SEP },
#ifndef TLS_DISABLED
TLS_OPTION_FLAGS
#endif
@ -371,6 +405,7 @@ CSimpleOpt::SOption g_rgBackupPauseOptions[] = {
{ OPT_HELP, "-h", SO_NONE },
{ OPT_HELP, "--help", SO_NONE },
{ OPT_DEVHELP, "--dev-help", SO_NONE },
{ OPT_KNOB, "--knob_", SO_REQ_SEP },
#ifndef TLS_DISABLED
TLS_OPTION_FLAGS
#endif
@ -640,6 +675,7 @@ CSimpleOpt::SOption g_rgDBStartOptions[] = {
{ OPT_HELP, "-h", SO_NONE },
{ OPT_HELP, "--help", SO_NONE },
{ OPT_DEVHELP, "--dev-help", SO_NONE },
{ OPT_KNOB, "--knob_", SO_REQ_SEP },
#ifndef TLS_DISABLED
TLS_OPTION_FLAGS
#endif
@ -673,6 +709,7 @@ CSimpleOpt::SOption g_rgDBStatusOptions[] = {
{ OPT_HELP, "-h", SO_NONE },
{ OPT_HELP, "--help", SO_NONE },
{ OPT_DEVHELP, "--dev-help", SO_NONE },
{ OPT_KNOB, "--knob_", SO_REQ_SEP },
#ifndef TLS_DISABLED
TLS_OPTION_FLAGS
#endif
@ -705,6 +742,7 @@ CSimpleOpt::SOption g_rgDBSwitchOptions[] = {
{ OPT_HELP, "-h", SO_NONE },
{ OPT_HELP, "--help", SO_NONE },
{ OPT_DEVHELP, "--dev-help", SO_NONE },
{ OPT_KNOB, "--knob_", SO_REQ_SEP },
#ifndef TLS_DISABLED
TLS_OPTION_FLAGS
#endif
@ -737,6 +775,7 @@ CSimpleOpt::SOption g_rgDBAbortOptions[] = {
{ OPT_HELP, "-h", SO_NONE },
{ OPT_HELP, "--help", SO_NONE },
{ OPT_DEVHELP, "--dev-help", SO_NONE },
{ OPT_KNOB, "--knob_", SO_REQ_SEP },
#ifndef TLS_DISABLED
TLS_OPTION_FLAGS
#endif
@ -766,6 +805,7 @@ CSimpleOpt::SOption g_rgDBPauseOptions[] = {
{ OPT_HELP, "-h", SO_NONE },
{ OPT_HELP, "--help", SO_NONE },
{ OPT_DEVHELP, "--dev-help", SO_NONE },
{ OPT_KNOB, "--knob_", SO_REQ_SEP },
#ifndef TLS_DISABLED
TLS_OPTION_FLAGS
#endif
@ -1186,6 +1226,7 @@ enumBackupType getBackupType(std::string backupType)
values["start"] = BACKUP_START;
values["status"] = BACKUP_STATUS;
values["abort"] = BACKUP_ABORT;
values["cleanup"] = BACKUP_CLEANUP;
values["wait"] = BACKUP_WAIT;
values["discontinue"] = BACKUP_DISCONTINUE;
values["pause"] = BACKUP_PAUSE;
@ -1863,6 +1904,21 @@ ACTOR Future<Void> abortBackup(Database db, std::string tagName) {
return Void();
}
ACTOR Future<Void> cleanupMutations(Database db, bool deleteData) {
try
{
wait(cleanupBackup(db, deleteData));
}
catch (Error& e) {
if(e.code() == error_code_actor_cancelled)
throw;
fprintf(stderr, "ERROR: %s\n", e.what());
throw;
}
return Void();
}
ACTOR Future<Void> waitBackup(Database db, std::string tagName, bool stopWhenDone) {
try
{
@ -2540,6 +2596,9 @@ int main(int argc, char* argv[]) {
case BACKUP_ABORT:
args = new CSimpleOpt(argc - 1, &argv[1], g_rgBackupAbortOptions, SO_O_EXACT);
break;
case BACKUP_CLEANUP:
args = new CSimpleOpt(argc - 1, &argv[1], g_rgBackupCleanupOptions, SO_O_EXACT);
break;
case BACKUP_WAIT:
args = new CSimpleOpt(argc - 1, &argv[1], g_rgBackupWaitOptions, SO_O_EXACT);
break;
@ -2712,6 +2771,7 @@ int main(int argc, char* argv[]) {
std::string restoreClusterFileDest;
std::string restoreClusterFileOrig;
bool jsonOutput = false;
bool deleteData = false;
BackupModifyOptions modifyOptions;
@ -2791,6 +2851,12 @@ int main(int argc, char* argv[]) {
case OPT_DRYRUN:
dryRun = true;
break;
case OPT_DELETE_DATA:
deleteData = true;
break;
case OPT_MIN_CLEANUP_SECONDS:
knobs.push_back( std::make_pair( "min_cleanup_seconds", args->OptionArg() ) );
break;
case OPT_FORCE:
forceAction = true;
break;
@ -3354,6 +3420,12 @@ int main(int argc, char* argv[]) {
f = stopAfter( abortBackup(db, tagName) );
break;
case BACKUP_CLEANUP:
if(!initCluster())
return FDB_EXIT_ERROR;
f = stopAfter( cleanupMutations(db, deleteData) );
break;
case BACKUP_WAIT:
if(!initCluster())
return FDB_EXIT_ERROR;

View File

@ -485,7 +485,7 @@ bool copyParameter(Reference<Task> source, Reference<Task> dest, Key key);
Version getVersionFromString(std::string const& value);
Standalone<VectorRef<KeyRangeRef>> getLogRanges(Version beginVersion, Version endVersion, Key destUidValue, int blockSize = CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE);
Standalone<VectorRef<KeyRangeRef>> getApplyRanges(Version beginVersion, Version endVersion, Key backupUid);
Future<Void> eraseLogData(Database cx, Key logUidValue, Key destUidValue, Optional<Version> endVersion = Optional<Version>(), bool checkBackupUid = false, Version backupUid = 0);
Future<Void> eraseLogData(Reference<ReadYourWritesTransaction> tr, Key logUidValue, Key destUidValue, Optional<Version> endVersion = Optional<Version>(), bool checkBackupUid = false, Version backupUid = 0);
Key getApplyKey( Version version, Key backupUid );
std::pair<uint64_t, uint32_t> decodeBKMutationLogKey(Key key);
Standalone<VectorRef<MutationRef>> decodeBackupLogValue(StringRef value);
@ -503,6 +503,7 @@ ACTOR Future<Void> readCommitted(Database cx, PromiseStream<RCGroup> results, Fu
ACTOR Future<Void> applyMutations(Database cx, Key uid, Key addPrefix, Key removePrefix, Version beginVersion,
Version* endVersion, RequestStream<CommitTransactionRequest> commit,
NotifiedVersion* committedVersion, Reference<KeyRangeMap<Version>> keyVersion);
ACTOR Future<Void> cleanupBackup(Database cx, bool deleteData);
typedef BackupAgentBase::enumState EBackupState;
template<> inline Tuple Codec<EBackupState>::pack(EBackupState const &val) { return Tuple().append(val); }

View File

@ -708,7 +708,7 @@ ACTOR Future<Void> applyMutations(Database cx, Key uid, Key addPrefix, Key remov
}
}
ACTOR static Future<Void> _eraseLogData(Database cx, Key logUidValue, Key destUidValue, Optional<Version> endVersion, bool checkBackupUid, Version backupUid) {
ACTOR static Future<Void> _eraseLogData(Reference<ReadYourWritesTransaction> tr, Key logUidValue, Key destUidValue, Optional<Version> endVersion, bool checkBackupUid, Version backupUid) {
state Key backupLatestVersionsPath = destUidValue.withPrefix(backupLatestVersionsPrefix);
state Key backupLatestVersionsKey = logUidValue.withPrefix(backupLatestVersionsPath);
@ -716,104 +716,199 @@ ACTOR static Future<Void> _eraseLogData(Database cx, Key logUidValue, Key destUi
return Void();
}
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
if (checkBackupUid) {
Subspace sourceStates = Subspace(databaseBackupPrefixRange.begin).get(BackupAgentBase::keySourceStates).get(logUidValue);
Optional<Value> v = wait( tr->get( sourceStates.pack(DatabaseBackupAgent::keyFolderId) ) );
if(v.present() && BinaryReader::fromStringRef<Version>(v.get(), Unversioned()) > backupUid)
return Void();
}
state Standalone<RangeResultRef> backupVersions = wait(tr->getRange(KeyRangeRef(backupLatestVersionsPath, strinc(backupLatestVersionsPath)), CLIENT_KNOBS->TOO_MANY));
// Make sure version history key does exist and lower the beginVersion if needed
state Version currBeginVersion = invalidVersion;
for (auto backupVersion : backupVersions) {
Key currLogUidValue = backupVersion.key.removePrefix(backupLatestVersionsPrefix).removePrefix(destUidValue);
if (currLogUidValue == logUidValue) {
currBeginVersion = BinaryReader::fromStringRef<Version>(backupVersion.value, Unversioned());
break;
}
}
// Do not clear anything if version history key cannot be found
if (currBeginVersion == invalidVersion) {
return Void();
}
state Version currEndVersion = std::numeric_limits<Version>::max();
if(endVersion.present()) {
currEndVersion = std::min(currEndVersion, endVersion.get());
}
state Version nextSmallestVersion = currEndVersion;
bool clearLogRangesRequired = true;
// More than one backup/DR with the same range
if (backupVersions.size() > 1) {
for (auto backupVersion : backupVersions) {
Key currLogUidValue = backupVersion.key.removePrefix(backupLatestVersionsPrefix).removePrefix(destUidValue);
Version currVersion = BinaryReader::fromStringRef<Version>(backupVersion.value, Unversioned());
if (currLogUidValue == logUidValue) {
continue;
} else if (currVersion > currBeginVersion) {
nextSmallestVersion = std::min(currVersion, nextSmallestVersion);
} else {
// If we can find a version less than or equal to beginVersion, clearing log ranges is not required
clearLogRangesRequired = false;
break;
}
}
}
if (endVersion.present() || backupVersions.size() != 1 || BUGGIFY) {
if (!endVersion.present()) {
// Clear current backup version history
tr->clear(backupLatestVersionsKey);
if(backupVersions.size() == 1) {
tr->clear(prefixRange(destUidValue.withPrefix(logRangesRange.begin)));
}
} else {
// Update current backup latest version
tr->set(backupLatestVersionsKey, BinaryWriter::toValue<Version>(currEndVersion, Unversioned()));
}
// Clear log ranges if needed
if (clearLogRangesRequired) {
if((nextSmallestVersion - currBeginVersion) / CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE >= std::numeric_limits<uint8_t>::max() || BUGGIFY) {
Key baLogRangePrefix = destUidValue.withPrefix(backupLogKeys.begin);
for(int h = 0; h <= std::numeric_limits<uint8_t>::max(); h++) {
uint64_t bv = bigEndian64(Version(0));
uint64_t ev = bigEndian64(nextSmallestVersion);
uint8_t h1 = h;
Key vblockPrefix = StringRef(&h1, sizeof(uint8_t)).withPrefix(baLogRangePrefix);
tr->clear(KeyRangeRef(StringRef((uint8_t*)&bv, sizeof(uint64_t)).withPrefix(vblockPrefix),
StringRef((uint8_t*)&ev, sizeof(uint64_t)).withPrefix(vblockPrefix)));
}
} else {
Standalone<VectorRef<KeyRangeRef>> ranges = getLogRanges(currBeginVersion, nextSmallestVersion, destUidValue);
for (auto& range : ranges) {
tr->clear(range);
}
}
}
} else {
// Clear version history
tr->clear(prefixRange(backupLatestVersionsPath));
// Clear everything under blog/[destUid]
tr->clear(prefixRange(destUidValue.withPrefix(backupLogKeys.begin)));
// Disable committing mutations into blog
tr->clear(prefixRange(destUidValue.withPrefix(logRangesRange.begin)));
}
if(!endVersion.present() && backupVersions.size() == 1) {
Standalone<RangeResultRef> existingDestUidValues = wait(tr->getRange(KeyRangeRef(destUidLookupPrefix, strinc(destUidLookupPrefix)), CLIENT_KNOBS->TOO_MANY));
for(auto it : existingDestUidValues) {
if( it.value == destUidValue ) {
tr->clear(it.key);
}
}
}
return Void();
}
Future<Void> eraseLogData(Reference<ReadYourWritesTransaction> tr, Key logUidValue, Key destUidValue, Optional<Version> endVersion, bool checkBackupUid, Version backupUid) {
return _eraseLogData(tr, logUidValue, destUidValue, endVersion, checkBackupUid, backupUid);
}
ACTOR Future<Void> cleanupLogMutations(Database cx, Value destUidValue, bool deleteData) {
state Key backupLatestVersionsPath = destUidValue.withPrefix(backupLatestVersionsPrefix);
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
loop{
state Optional<Key> removingLogUid;
state std::set<Key> loggedLogUids;
loop {
try {
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
if (checkBackupUid) {
Subspace sourceStates = Subspace(databaseBackupPrefixRange.begin).get(BackupAgentBase::keySourceStates).get(logUidValue);
Optional<Value> v = wait( tr->get( sourceStates.pack(DatabaseBackupAgent::keyFolderId) ) );
if(v.present() && BinaryReader::fromStringRef<Version>(v.get(), Unversioned()) > backupUid)
return Void();
}
state Standalone<RangeResultRef> backupVersions = wait(tr->getRange(KeyRangeRef(backupLatestVersionsPath, strinc(backupLatestVersionsPath)), CLIENT_KNOBS->TOO_MANY));
state Version readVer = tr->getReadVersion().get();
// Make sure version history key does exist and lower the beginVersion if needed
state Version currBeginVersion = invalidVersion;
for (auto backupVersion : backupVersions) {
Key currLogUidValue = backupVersion.key.removePrefix(backupLatestVersionsPrefix).removePrefix(destUidValue);
state Version minVersion = std::numeric_limits<Version>::max();
state Key minVersionLogUid;
if (currLogUidValue == logUidValue) {
currBeginVersion = BinaryReader::fromStringRef<Version>(backupVersion.value, Unversioned());
break;
state int backupIdx = 0;
for (; backupIdx < backupVersions.size(); backupIdx++) {
state Version currVersion = BinaryReader::fromStringRef<Version>(backupVersions[backupIdx].value, Unversioned());
state Key currLogUid = backupVersions[backupIdx].key.removePrefix(backupLatestVersionsPrefix).removePrefix(destUidValue);
if( currVersion < minVersion ) {
minVersionLogUid = currLogUid;
minVersion = currVersion;
}
}
// Do not clear anything if version history key cannot be found
if (currBeginVersion == invalidVersion) {
return Void();
}
if(!loggedLogUids.count(currLogUid)) {
state Future<Optional<Value>> foundDRKey = tr->get(Subspace(databaseBackupPrefixRange.begin).get(BackupAgentBase::keySourceStates).get(currLogUid).pack(DatabaseBackupAgent::keyStateStatus));
state Future<Optional<Value>> foundBackupKey = tr->get(Subspace(currLogUid.withPrefix(LiteralStringRef("uid->config/")).withPrefix(fileBackupPrefixRange.begin)).pack(LiteralStringRef("stateEnum")));
wait(success(foundDRKey) && success(foundBackupKey));
state Version currEndVersion = currBeginVersion + CLIENT_KNOBS->CLEAR_LOG_RANGE_COUNT * CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE;
if(endVersion.present()) {
currEndVersion = std::min(currEndVersion, endVersion.get());
}
state Version nextSmallestVersion = currEndVersion;
bool clearLogRangesRequired = true;
// More than one backup/DR with the same range
if (backupVersions.size() > 1) {
for (auto backupVersion : backupVersions) {
Key currLogUidValue = backupVersion.key.removePrefix(backupLatestVersionsPrefix).removePrefix(destUidValue);
Version currVersion = BinaryReader::fromStringRef<Version>(backupVersion.value, Unversioned());
if (currLogUidValue == logUidValue) {
continue;
} else if (currVersion > currBeginVersion) {
nextSmallestVersion = std::min(currVersion, nextSmallestVersion);
if(foundDRKey.get().present() && foundBackupKey.get().present()) {
printf("WARNING: Found a tag which looks like both a backup and a DR. This tag was %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND));
} else if(foundDRKey.get().present() && !foundBackupKey.get().present()) {
printf("Found a DR which was %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND));
} else if(!foundDRKey.get().present() && foundBackupKey.get().present()) {
printf("Found a Backup which was %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND));
} else {
// If we can find a version less than or equal to beginVersion, clearing log ranges is not required
clearLogRangesRequired = false;
break;
printf("WARNING: Found a unknown tag which was %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND));
}
loggedLogUids.insert(currLogUid);
}
}
if (!endVersion.present() && backupVersions.size() == 1) {
// Clear version history
tr->clear(prefixRange(backupLatestVersionsPath));
// Clear everything under blog/[destUid]
tr->clear(prefixRange(destUidValue.withPrefix(backupLogKeys.begin)));
// Disable committing mutations into blog
tr->clear(prefixRange(destUidValue.withPrefix(logRangesRange.begin)));
if( readVer - minVersion > CLIENT_KNOBS->MIN_CLEANUP_SECONDS*CLIENT_KNOBS->CORE_VERSIONSPERSECOND && deleteData && (!removingLogUid.present() || minVersionLogUid == removingLogUid.get()) ) {
removingLogUid = minVersionLogUid;
wait(eraseLogData(tr, minVersionLogUid, destUidValue));
wait(tr->commit());
printf("\nSuccessfully removed the tag which was %.4f hours behind.\n", (readVer - minVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND));
} else if(removingLogUid.present() && minVersionLogUid != removingLogUid.get()) {
printf("\nWARNING: The oldest tag was possibly removed, run again without `--delete_data' to check.\n");
} else if( deleteData ) {
printf("\nWARNING: Did not delete data because the tag was not at least %.4f hours behind. Change `--min_cleanup_seconds' to adjust this threshold.\n", CLIENT_KNOBS->MIN_CLEANUP_SECONDS/3600.0);
} else {
if (!endVersion.present() && currEndVersion >= nextSmallestVersion) {
// Clear current backup version history
tr->clear(backupLatestVersionsKey);
} else {
// Update current backup latest version
tr->set(backupLatestVersionsKey, BinaryWriter::toValue<Version>(currEndVersion, Unversioned()));
}
printf("\nPassing `--delete_data' would delete the tag which was %.4f hours behind.\n", (readVer - minVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND));
}
// Clear log ranges if needed
if (clearLogRangesRequired) {
Standalone<VectorRef<KeyRangeRef>> ranges = getLogRanges(currBeginVersion, nextSmallestVersion, destUidValue);
for (auto& range : ranges) {
tr->clear(range);
}
}
}
wait(tr->commit());
if (!endVersion.present() && (backupVersions.size() == 1 || currEndVersion >= nextSmallestVersion)) {
return Void();
}
if(endVersion.present() && currEndVersion == endVersion.get()) {
return Void();
}
tr->reset();
} catch (Error &e) {
return Void();
} catch( Error& e) {
wait(tr->onError(e));
}
}
}
Future<Void> eraseLogData(Database cx, Key logUidValue, Key destUidValue, Optional<Version> endVersion, bool checkBackupUid, Version backupUid) {
return _eraseLogData(cx, logUidValue, destUidValue, endVersion, checkBackupUid, backupUid);
ACTOR Future<Void> cleanupBackup(Database cx, bool deleteData) {
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
loop {
try {
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
state Standalone<RangeResultRef> destUids = wait(tr->getRange(KeyRangeRef(destUidLookupPrefix, strinc(destUidLookupPrefix)), CLIENT_KNOBS->TOO_MANY));
for(auto destUid : destUids) {
wait(cleanupLogMutations(cx, destUid.value, deleteData));
}
return Void();
} catch( Error& e) {
wait(tr->onError(e));
}
}
}
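
cleanupLogMutations above treats a tag as removable only when its latest logged version trails the transaction's read version by more than MIN_CLEANUP_SECONDS worth of versions, and it reports the lag in hours. A minimal sketch of that arithmetic, assuming the conventional FoundationDB rate of 1,000,000 versions per second for CORE_VERSIONSPERSECOND and the 3600-second default for MIN_CLEANUP_SECONDS shown in the ClientKnobs changes later in this commit:

    // Staleness check: a tag qualifies for cleanup when it lags the read
    // version by more than the configured number of seconds.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const double versionsPerSecond = 1e6;     // assumed CORE_VERSIONSPERSECOND
        const double minCleanupSeconds = 3600.0;  // default MIN_CLEANUP_SECONDS knob

        int64_t readVersion = 20000000000;        // hypothetical current read version
        int64_t tagVersion  = 12000000000;        // hypothetical stale tag version

        double hoursBehind = (readVersion - tagVersion) / (3600.0 * versionsPerSecond);
        bool eligible = (readVersion - tagVersion) > minCleanupSeconds * versionsPerSecond;

        std::printf("tag is %.4f hours behind, eligible for cleanup: %s\n",
                    hoursBehind, eligible ? "yes" : "no");
        return 0;
    }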

View File

@ -482,11 +482,17 @@ namespace dbBackup {
wait(checkTaskVersion(cx, task, EraseLogRangeTaskFunc::name, EraseLogRangeTaskFunc::version));
Version endVersion = BinaryReader::fromStringRef<Version>(task->params[DatabaseBackupAgent::keyEndVersion], Unversioned());
wait(eraseLogData(taskBucket->src, task->params[BackupAgentBase::keyConfigLogUid], task->params[BackupAgentBase::destUid], Optional<Version>(endVersion), true, BinaryReader::fromStringRef<Version>(task->params[BackupAgentBase::keyFolderId], Unversioned())));
return Void();
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(taskBucket->src));
loop {
try {
Version endVersion = BinaryReader::fromStringRef<Version>(task->params[DatabaseBackupAgent::keyEndVersion], Unversioned());
wait(eraseLogData(tr, task->params[BackupAgentBase::keyConfigLogUid], task->params[BackupAgentBase::destUid], Optional<Version>(endVersion), true, BinaryReader::fromStringRef<Version>(task->params[BackupAgentBase::keyFolderId], Unversioned())));
wait(tr->commit());
return Void();
} catch( Error &e ) {
wait(tr->onError(e));
}
}
}
ACTOR static Future<Key> addTask(Reference<ReadYourWritesTransaction> tr, Reference<TaskBucket> taskBucket, Reference<Task> parentTask, Version endVersion, TaskCompletionKey completionKey, Reference<TaskFuture> waitFor = Reference<TaskFuture>()) {
@ -833,8 +839,7 @@ namespace dbBackup {
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(taskBucket->src));
state Key logUidValue = task->params[DatabaseBackupAgent::keyConfigLogUid];
state Key destUidValue = task->params[BackupAgentBase::destUid];
state Version beginVersion;
state Version endVersion;
state Version backupUid = BinaryReader::fromStringRef<Version>(task->params[BackupAgentBase::keyFolderId], Unversioned());
loop {
try {
@ -844,25 +849,13 @@ namespace dbBackup {
if(v.present() && BinaryReader::fromStringRef<Version>(v.get(), Unversioned()) > BinaryReader::fromStringRef<Version>(task->params[DatabaseBackupAgent::keyFolderId], Unversioned()))
return Void();
state Key latestVersionKey = logUidValue.withPrefix(task->params[BackupAgentBase::destUid].withPrefix(backupLatestVersionsPrefix));
state Optional<Key> bVersion = wait(tr->get(latestVersionKey));
if (!bVersion.present()) {
return Void();
}
beginVersion = BinaryReader::fromStringRef<Version>(bVersion.get(), Unversioned());
endVersion = tr->getReadVersion().get();
break;
wait(eraseLogData(tr, logUidValue, destUidValue, Optional<Version>(), true, backupUid));
wait(tr->commit());
return Void();
} catch(Error &e) {
wait(tr->onError(e));
}
}
Version backupUid = BinaryReader::fromStringRef<Version>(task->params[BackupAgentBase::keyFolderId], Unversioned());
wait(eraseLogData(taskBucket->src, logUidValue, destUidValue, Optional<Version>(), true, backupUid));
return Void();
}
ACTOR static Future<Key> addTask(Reference<ReadYourWritesTransaction> tr, Reference<TaskBucket> taskBucket, Reference<Task> parentTask, TaskCompletionKey completionKey, Reference<TaskFuture> waitFor = Reference<TaskFuture>()) {
@ -1303,19 +1296,25 @@ namespace dbBackup {
}
if (backupRanges.size() == 1) {
state Key destUidLookupPath = BinaryWriter::toValue(backupRanges[0], IncludeVersion()).withPrefix(destUidLookupPrefix);
Optional<Key> existingDestUidValue = wait(srcTr->get(destUidLookupPath));
if (existingDestUidValue.present()) {
if (destUidValue == existingDestUidValue.get()) {
// due to unknown commit result
break;
} else {
// existing backup/DR is running
return Void();
Standalone<RangeResultRef> existingDestUidValues = wait(srcTr->getRange(KeyRangeRef(destUidLookupPrefix, strinc(destUidLookupPrefix)), CLIENT_KNOBS->TOO_MANY));
bool found = false;
for(auto it : existingDestUidValues) {
if( BinaryReader::fromStringRef<KeyRange>(it.key.removePrefix(destUidLookupPrefix), IncludeVersion()) == backupRanges[0] ) {
if(destUidValue != it.value) {
// existing backup/DR is running
return Void();
} else {
// due to unknown commit result
found = true;
break;
}
}
}
if(found) {
break;
}
srcTr->set(destUidLookupPath, destUidValue);
srcTr->set(BinaryWriter::toValue(backupRanges[0], IncludeVersion(ProtocolVersion::withSharedMutations())).withPrefix(destUidLookupPrefix), destUidValue);
}
Key versionKey = logUidValue.withPrefix(destUidValue).withPrefix(backupLatestVersionsPrefix);
@ -1473,13 +1472,18 @@ namespace dbBackup {
// Initialize destUid
if (backupRanges.size() == 1) {
state Key destUidLookupPath = BinaryWriter::toValue(backupRanges[0], IncludeVersion()).withPrefix(destUidLookupPrefix);
Optional<Key> existingDestUidValue = wait(srcTr->get(destUidLookupPath));
if (existingDestUidValue.present()) {
destUidValue = existingDestUidValue.get();
} else {
Standalone<RangeResultRef> existingDestUidValues = wait(srcTr->getRange(KeyRangeRef(destUidLookupPrefix, strinc(destUidLookupPrefix)), CLIENT_KNOBS->TOO_MANY));
bool found = false;
for(auto it : existingDestUidValues) {
if( BinaryReader::fromStringRef<KeyRange>(it.key.removePrefix(destUidLookupPrefix), IncludeVersion()) == backupRanges[0] ) {
destUidValue = it.value;
found = true;
break;
}
}
if( !found ) {
destUidValue = BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned());
srcTr->set(destUidLookupPath, destUidValue);
srcTr->set(BinaryWriter::toValue(backupRanges[0], IncludeVersion(ProtocolVersion::withSharedMutations())).withPrefix(destUidLookupPrefix), destUidValue);
}
}
@ -2179,22 +2183,23 @@ public:
}
}
if(partial)
return Void();
state Future<Void> partialTimeout = partial ? delay(30.0) : Never();
state Reference<ReadYourWritesTransaction> srcTr(new ReadYourWritesTransaction(backupAgent->taskBucket->src));
state Version beginVersion;
state Version endVersion;
state bool clearSrcDb = true;
loop {
try {
srcTr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
srcTr->setOption(FDBTransactionOptions::LOCK_AWARE);
Optional<Value> v = wait( srcTr->get( backupAgent->sourceStates.get(logUidValue).pack(DatabaseBackupAgent::keyFolderId) ) );
state Future<Optional<Value>> backupVersionF = srcTr->get( backupAgent->sourceStates.get(logUidValue).pack(DatabaseBackupAgent::keyFolderId) );
wait(success(backupVersionF) || partialTimeout);
if(partialTimeout.isReady()) {
return Void();
}
if(v.present() && BinaryReader::fromStringRef<Version>(v.get(), Unversioned()) > BinaryReader::fromStringRef<Version>(backupUid, Unversioned())) {
clearSrcDb = false;
if(backupVersionF.get().present() && BinaryReader::fromStringRef<Version>(backupVersionF.get().get(), Unversioned()) > BinaryReader::fromStringRef<Version>(backupUid, Unversioned())) {
break;
}
@ -2208,18 +2213,31 @@ public:
Key latestVersionKey = logUidValue.withPrefix(destUidValue.withPrefix(backupLatestVersionsPrefix));
Optional<Key> bVersion = wait(srcTr->get(latestVersionKey));
if (bVersion.present()) {
beginVersion = BinaryReader::fromStringRef<Version>(bVersion.get(), Unversioned());
state Future<Optional<Key>> bVersionF = srcTr->get(latestVersionKey);
wait(success(bVersionF) || partialTimeout);
if(partialTimeout.isReady()) {
return Void();
}
if (bVersionF.get().present()) {
beginVersion = BinaryReader::fromStringRef<Version>(bVersionF.get().get(), Unversioned());
} else {
clearSrcDb = false;
break;
}
srcTr->set( backupAgent->sourceStates.pack(DatabaseBackupAgent::keyStateStatus), StringRef(DatabaseBackupAgent::getStateText(BackupAgentBase::STATE_PARTIALLY_ABORTED) ));
srcTr->set( backupAgent->sourceStates.get(logUidValue).pack(DatabaseBackupAgent::keyFolderId), backupUid );
wait(srcTr->commit());
wait( eraseLogData(srcTr, logUidValue, destUidValue) || partialTimeout );
if(partialTimeout.isReady()) {
return Void();
}
wait(srcTr->commit() || partialTimeout);
if(partialTimeout.isReady()) {
return Void();
}
endVersion = srcTr->getCommittedVersion() + 1;
break;
@ -2229,10 +2247,6 @@ public:
}
}
if (clearSrcDb && !abortOldBackup) {
wait(eraseLogData(backupAgent->taskBucket->src, logUidValue, destUidValue));
}
tr = Reference<ReadYourWritesTransaction>(new ReadYourWritesTransaction(cx));
loop {
try {

View File

@ -73,6 +73,7 @@ void parse( std::vector<RegionInfo>* regions, ValueRef const& v ) {
s.get("id", idStr);
satInfo.dcId = idStr;
s.get("priority", satInfo.priority);
s.tryGet("satellite_logs", satInfo.satelliteDesiredTLogCount);
info.satellites.push_back(satInfo);
} else {
if (foundNonSatelliteDatacenter) throw invalid_option();
@ -365,6 +366,9 @@ StatusArray DatabaseConfiguration::getRegionJSON() const {
satObj["id"] = s.dcId.toString();
satObj["priority"] = s.priority;
satObj["satellite"] = 1;
if(s.satelliteDesiredTLogCount != -1) {
satObj["satellite_logs"] = s.satelliteDesiredTLogCount;
}
dcArr.push_back(satObj);
}

View File

@ -32,6 +32,7 @@
struct SatelliteInfo {
Key dcId;
int32_t priority;
int32_t satelliteDesiredTLogCount = -1;
SatelliteInfo() : priority(0) {}
@ -41,7 +42,7 @@ struct SatelliteInfo {
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, dcId, priority);
serializer(ar, dcId, priority, satelliteDesiredTLogCount);
}
};
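
The new satelliteDesiredTLogCount field defaults to -1, which the configuration code treats as "not set": getRegionJSON only emits a per-satellite satellite_logs entry when the value is non-negative, and per the release notes a per-satellite value overrides the region-level satellite_logs count. A minimal sketch of that resolution rule, using a hypothetical helper (resolveSatelliteLogs) rather than actual FoundationDB recruitment code:

    // Resolve the desired satellite TLog count for one satellite.
    #include <cstdint>
    #include <cstdio>

    struct SatelliteInfoSketch {
        int32_t priority;
        int32_t satelliteDesiredTLogCount;   // -1 means "use the region-level value"
    };

    int32_t resolveSatelliteLogs(const SatelliteInfoSketch& s, int32_t regionSatelliteLogs) {
        return s.satelliteDesiredTLogCount != -1 ? s.satelliteDesiredTLogCount
                                                 : regionSatelliteLogs;
    }

    int main() {
        SatelliteInfoSketch wc1{1, 2};       // satellite with "satellite_logs":2
        SatelliteInfoSketch wc2{0, -1};      // satellite without an override
        std::printf("WC1 -> %d logs, WC2 -> %d logs\n",
                    resolveSatelliteLogs(wc1, 4), resolveSatelliteLogs(wc2, 4));
        return 0;
    }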

View File

@ -1988,6 +1988,7 @@ namespace fileBackup {
const uint32_t BackupLogRangeTaskFunc::version = 1;
REGISTER_TASKFUNC(BackupLogRangeTaskFunc);
// This task stopped being used in 6.2; however, the code remains here to handle upgrades.
struct EraseLogRangeTaskFunc : BackupTaskFuncBase {
static StringRef name;
static const uint32_t version;
@ -2005,21 +2006,6 @@ namespace fileBackup {
}
} Params;
ACTOR static Future<Void> _execute(Database cx, Reference<TaskBucket> taskBucket, Reference<FutureBucket> futureBucket, Reference<Task> task) {
state Reference<FlowLock> lock(new FlowLock(CLIENT_KNOBS->BACKUP_LOCK_BYTES));
wait(checkTaskVersion(cx, task, EraseLogRangeTaskFunc::name, EraseLogRangeTaskFunc::version));
state Version endVersion = Params.endVersion().get(task);
state Key destUidValue = Params.destUidValue().get(task);
state BackupConfig config(task);
state Key logUidValue = config.getUidAsKey();
wait(eraseLogData(cx, logUidValue, destUidValue, endVersion != 0 ? Optional<Version>(endVersion) : Optional<Version>()));
return Void();
}
ACTOR static Future<Key> addTask(Reference<ReadYourWritesTransaction> tr, Reference<TaskBucket> taskBucket, UID logUid, TaskCompletionKey completionKey, Key destUidValue, Version endVersion = 0, Reference<TaskFuture> waitFor = Reference<TaskFuture>()) {
Key key = wait(addBackupTask(EraseLogRangeTaskFunc::name,
EraseLogRangeTaskFunc::version,
@ -2036,16 +2022,23 @@ namespace fileBackup {
return key;
}
ACTOR static Future<Void> _finish(Reference<ReadYourWritesTransaction> tr, Reference<TaskBucket> taskBucket, Reference<FutureBucket> futureBucket, Reference<Task> task) {
state Reference<TaskFuture> taskFuture = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]);
wait(taskFuture->set(tr, taskBucket) && taskBucket->finish(tr, task));
wait(checkTaskVersion(tr->getDatabase(), task, EraseLogRangeTaskFunc::name, EraseLogRangeTaskFunc::version));
state Version endVersion = Params.endVersion().get(task);
state Key destUidValue = Params.destUidValue().get(task);
state BackupConfig config(task);
state Key logUidValue = config.getUidAsKey();
wait(taskFuture->set(tr, taskBucket) && taskBucket->finish(tr, task) && eraseLogData(tr, logUidValue, destUidValue, endVersion != 0 ? Optional<Version>(endVersion) : Optional<Version>()));
return Void();
}
Future<Void> execute(Database cx, Reference<TaskBucket> tb, Reference<FutureBucket> fb, Reference<Task> task) { return _execute(cx, tb, fb, task); };
Future<Void> execute(Database cx, Reference<TaskBucket> tb, Reference<FutureBucket> fb, Reference<Task> task) { return Void(); };
Future<Void> finish(Reference<ReadYourWritesTransaction> tr, Reference<TaskBucket> tb, Reference<FutureBucket> fb, Reference<Task> task) { return _finish(tr, tb, fb, task); };
};
StringRef EraseLogRangeTaskFunc::name = LiteralStringRef("file_backup_erase_logs_5.2");
@ -2132,7 +2125,7 @@ namespace fileBackup {
// Do not erase at the first time
if (prevBeginVersion > 0) {
state Key destUidValue = wait(config.destUidValue().getOrThrow(tr));
wait(success(EraseLogRangeTaskFunc::addTask(tr, taskBucket, config.getUid(), TaskCompletionKey::joinWith(logDispatchBatchFuture), destUidValue, beginVersion)));
wait( eraseLogData(tr, config.getUidAsKey(), destUidValue, Optional<Version>(beginVersion)) );
}
wait(taskBucket->finish(tr, task));
@ -2183,7 +2176,7 @@ namespace fileBackup {
tr->setOption(FDBTransactionOptions::COMMIT_ON_FIRST_PROXY);
state Key destUidValue = wait(backup.destUidValue().getOrThrow(tr));
wait(success(EraseLogRangeTaskFunc::addTask(tr, taskBucket, backup.getUid(), TaskCompletionKey::noSignal(), destUidValue)));
wait( eraseLogData(tr, backup.getUidAsKey(), destUidValue) );
backup.stateEnum().set(tr, EBackupState::STATE_COMPLETED);
@ -3626,13 +3619,18 @@ public:
state Key destUidValue(BinaryWriter::toValue(uid, Unversioned()));
if (normalizedRanges.size() == 1) {
state Key destUidLookupPath = BinaryWriter::toValue(normalizedRanges[0], IncludeVersion()).withPrefix(destUidLookupPrefix);
Optional<Key> existingDestUidValue = wait(tr->get(destUidLookupPath));
if (existingDestUidValue.present()) {
destUidValue = existingDestUidValue.get();
} else {
Standalone<RangeResultRef> existingDestUidValues = wait(tr->getRange(KeyRangeRef(destUidLookupPrefix, strinc(destUidLookupPrefix)), CLIENT_KNOBS->TOO_MANY));
bool found = false;
for(auto it : existingDestUidValues) {
if( BinaryReader::fromStringRef<KeyRange>(it.key.removePrefix(destUidLookupPrefix), IncludeVersion()) == normalizedRanges[0] ) {
destUidValue = it.value;
found = true;
break;
}
}
if( !found ) {
destUidValue = BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned());
tr->set(destUidLookupPath, destUidValue);
tr->set(BinaryWriter::toValue(normalizedRanges[0], IncludeVersion(ProtocolVersion::withSharedMutations())).withPrefix(destUidLookupPrefix), destUidValue);
}
}
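This hunk replaces the single-key lookup with a scan of every mapping under destUidLookupPrefix, reusing the UID whose stored range matches the requested one and otherwise minting a new UID and writing it back under the new encoding. A minimal standalone sketch of that lookup-or-create logic, using simplified stand-in types instead of FDB keys and transactions:

```cpp
// Sketch only: Range stands in for a serialized KeyRange, Uid for the
// destination UID value; the real code reads and writes these through a
// ReadYourWritesTransaction.
#include <map>
#include <string>

using Range = std::string;
using Uid   = std::string;

Uid lookupOrCreateDestUid(std::map<Range, Uid>& destUidLookup,
                          const Range& normalizedRange,
                          const Uid& freshUid) {
    for (const auto& [range, uid] : destUidLookup) {
        if (range == normalizedRange) {
            return uid;                        // reuse the existing mapping
        }
    }
    destUidLookup[normalizedRange] = freshUid; // no match: register a new UID
    return freshUid;
}
```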
@ -3820,8 +3818,7 @@ public:
state Key destUidValue = wait(config.destUidValue().getOrThrow(tr));
wait(success(tr->getReadVersion()));
wait(success(fileBackup::EraseLogRangeTaskFunc::addTask(tr, backupAgent->taskBucket, config.getUid(), TaskCompletionKey::noSignal(), destUidValue)));
wait( eraseLogData(tr, config.getUidAsKey(), destUidValue) );
config.stateEnum().set(tr, EBackupState::STATE_COMPLETED);
@ -3861,7 +3858,7 @@ public:
// Cancel backup task through tag
wait(tag.cancel(tr));
wait(success(fileBackup::EraseLogRangeTaskFunc::addTask(tr, backupAgent->taskBucket, config.getUid(), TaskCompletionKey::noSignal(), destUidValue)));
wait(eraseLogData(tr, config.getUidAsKey(), destUidValue));
config.stateEnum().set(tr, EBackupState::STATE_ABORTED);

View File

@ -145,7 +145,7 @@ ClientKnobs::ClientKnobs(bool randomize) {
init( BACKUP_ERROR_DELAY, 10.0 );
init( BACKUP_STATUS_DELAY, 40.0 );
init( BACKUP_STATUS_JITTER, 0.05 );
init( CLEAR_LOG_RANGE_COUNT, 1500); // transaction size / (size of '\xff\x02/blog/' + size of UID + size of hash result) = 200,000 / (8 + 16 + 8)
init( MIN_CLEANUP_SECONDS, 3600.0 );
// Configuration
init( DEFAULT_AUTO_PROXIES, 3 );

View File

@ -131,7 +131,6 @@ public:
int BACKUP_COPY_TASKS;
int BACKUP_BLOCK_SIZE;
int BACKUP_TASKS_PER_AGENT;
int CLEAR_LOG_RANGE_COUNT;
int SIM_BACKUP_TASKS_PER_AGENT;
int BACKUP_RANGEFILE_BLOCK_SIZE;
int BACKUP_LOGFILE_BLOCK_SIZE;
@ -147,6 +146,7 @@ public:
double BACKUP_ERROR_DELAY;
double BACKUP_STATUS_DELAY;
double BACKUP_STATUS_JITTER;
double MIN_CLEANUP_SECONDS;
// Configuration
int32_t DEFAULT_AUTO_PROXIES;

View File

@ -67,7 +67,7 @@ std::map<std::string, std::string> configForToken( std::string const& mode ) {
std::string key = mode.substr(0, pos);
std::string value = mode.substr(pos+1);
if( (key == "logs" || key == "proxies" || key == "resolvers" || key == "remote_logs" || key == "log_routers" || key == "satellite_logs" || key == "usable_regions" || key == "repopulate_anti_quorum") && isInteger(value) ) {
if( (key == "logs" || key == "proxies" || key == "resolvers" || key == "remote_logs" || key == "log_routers" || key == "usable_regions" || key == "repopulate_anti_quorum") && isInteger(value) ) {
out[p+key] = value;
}
@ -916,6 +916,7 @@ ACTOR Future<CoordinatorsResult::Type> changeQuorum( Database cx, Reference<IQuo
try {
tr.setOption( FDBTransactionOptions::LOCK_AWARE );
tr.setOption( FDBTransactionOptions::USE_PROVISIONAL_PROXIES );
tr.setOption( FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE );
Optional<Value> currentKey = wait( tr.get( coordinatorsKey ) );
if (!currentKey.present())
@ -1208,8 +1209,6 @@ ACTOR Future<Void> excludeServers( Database cx, vector<AddressExclusion> servers
tr.setOption( FDBTransactionOptions::USE_PROVISIONAL_PROXIES );
tr.addReadConflictRange( singleKeyRange(excludedServersVersionKey) ); //To conflict with parallel includeServers
tr.addReadConflictRange( singleKeyRange(moveKeysLockOwnerKey) );
tr.set( moveKeysLockOwnerKey, versionKey );
tr.set( excludedServersVersionKey, excludeVersionKey );
for(auto& s : servers)
tr.set( encodeExcludedServersKey(s), StringRef() );
@ -1240,9 +1239,6 @@ ACTOR Future<Void> includeServers( Database cx, vector<AddressExclusion> servers
// includeServers might be used in an emergency transaction, so make sure it is retry-self-conflicting and CAUSAL_WRITE_RISKY
tr.setOption( FDBTransactionOptions::CAUSAL_WRITE_RISKY );
tr.addReadConflictRange( singleKeyRange(excludedServersVersionKey) );
tr.addReadConflictRange( singleKeyRange(moveKeysLockOwnerKey) );
tr.set( moveKeysLockOwnerKey, versionKey );
tr.set( excludedServersVersionKey, excludeVersionKey );
for(auto& s : servers ) {

View File

@ -382,7 +382,9 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"layer_status_incomplete",
"database_availability_timeout",
"consistencycheck_suspendkey_fetch_timeout",
"consistencycheck_disabled"
"consistencycheck_disabled",
"duplicate_mutation_streams",
"duplicate_mutation_fetch_timeout"
]
},
"issues":[
@ -522,7 +524,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"datacenters":[{
"id":"mr",
"priority":1,
"satellite":1
"satellite":1,
"satellite_logs":2
}],
"satellite_redundancy_mode":{
"$enum":[
@ -603,6 +606,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"max_machine_failures_without_losing_availability":0,
"total_disk_used_bytes":0,
"total_kv_size_bytes":0,
"system_kv_size_bytes":0,
"partitions_count":2,
"moving_data":{
"total_written_bytes":0,
@ -731,7 +735,8 @@ const KeyRef JSONSchemas::clusterConfigurationSchema = LiteralStringRef(R"config
"datacenters":[{
"id":"mr",
"priority":1,
"satellite":1
"satellite":1,
"satellite_logs":2
}],
"satellite_redundancy_mode":{
"$enum":[

View File

@ -179,7 +179,8 @@ public:
countConnClosedWithoutError.init(LiteralStringRef("Net2.CountConnClosedWithoutError"));
}
Reference<struct Peer> getPeer( NetworkAddress const& address, bool openConnection = true );
Reference<struct Peer> getPeer( NetworkAddress const& address );
Reference<struct Peer> getOrOpenPeer( NetworkAddress const& address, bool startConnectionKeeper=true );
// Returns true if given network address 'address' is one of the address we are listening on.
bool isLocalAddress(const NetworkAddress& address) const;
@ -410,7 +411,6 @@ ACTOR Future<Void> connectionKeeper( Reference<Peer> self,
try {
if (!conn) { // Always, except for the first loop with an incoming connection
self->outgoingConnectionIdle = true;
// Wait until there is something to send.
while (self->unsent.empty()) {
if (FlowTransport::transport().isClient() && self->destination.isPublic() &&
@ -654,7 +654,7 @@ ACTOR static void deliver(TransportData* self, Endpoint destination, ArenaReader
if (self->isLocalAddress(destination.getPrimaryAddress())) {
sendLocal(self, SerializeSource<Endpoint>(Endpoint(self->localAddresses, destination.token)), Endpoint(destination.addresses, WLTOKEN_ENDPOINT_NOT_FOUND));
} else {
Reference<Peer> peer = self->getPeer(destination.getPrimaryAddress());
Reference<Peer> peer = self->getOrOpenPeer(destination.getPrimaryAddress());
sendPacket(self, peer, SerializeSource<Endpoint>(Endpoint(self->localAddresses, destination.token)), Endpoint(destination.addresses, WLTOKEN_ENDPOINT_NOT_FOUND), false);
}
}
@ -908,7 +908,7 @@ ACTOR static Future<Void> connectionReader(
peerAddress = NetworkAddress(pkt.canonicalRemoteIp(), pkt.canonicalRemotePort, true,
peerAddress.isTLS());
}
peer = transport->getPeer(peerAddress);
peer = transport->getOrOpenPeer(peerAddress, false);
peer->compatible = compatible;
peer->incompatibleProtocolVersionNewer = incompatibleProtocolVersionNewer;
if (!compatible) {
@ -987,18 +987,25 @@ ACTOR static Future<Void> listen( TransportData* self, NetworkAddress listenAddr
}
}
Reference<Peer> TransportData::getPeer( NetworkAddress const& address, bool openConnection ) {
Reference<Peer> TransportData::getPeer( NetworkAddress const& address ) {
auto peer = peers.find(address);
if (peer != peers.end()) {
return peer->second;
}
if(!openConnection) {
return Reference<Peer>();
return Reference<Peer>();
}
Reference<Peer> TransportData::getOrOpenPeer( NetworkAddress const& address, bool startConnectionKeeper ) {
auto peer = getPeer(address);
if(!peer) {
peer = Reference<Peer>( new Peer(this, address) );
if(startConnectionKeeper) {
peer->connect = connectionKeeper(peer);
}
peers[address] = peer;
}
Reference<Peer> newPeer = Reference<Peer>( new Peer(this, address) );
newPeer->connect = connectionKeeper(newPeer);
peers[address] = newPeer;
return newPeer;
return peer;
}
bool TransportData::isLocalAddress(const NetworkAddress& address) const {
@ -1077,7 +1084,7 @@ void FlowTransport::addPeerReference(const Endpoint& endpoint, bool isStream) {
else if (FlowTransport::transport().isClient())
IFailureMonitor::failureMonitor().setStatus(endpoint.getPrimaryAddress(), FailureStatus(false));
Reference<Peer> peer = self->getPeer(endpoint.getPrimaryAddress());
Reference<Peer> peer = self->getOrOpenPeer(endpoint.getPrimaryAddress());
if(peer->peerReferences == -1) {
peer->peerReferences = 1;
} else {
@ -1087,7 +1094,7 @@ void FlowTransport::addPeerReference(const Endpoint& endpoint, bool isStream) {
void FlowTransport::removePeerReference(const Endpoint& endpoint, bool isStream) {
if (!isStream || !endpoint.getPrimaryAddress().isValid()) return;
Reference<Peer> peer = self->getPeer(endpoint.getPrimaryAddress(), false);
Reference<Peer> peer = self->getPeer(endpoint.getPrimaryAddress());
if(peer) {
peer->peerReferences--;
if(peer->peerReferences < 0) {
@ -1246,7 +1253,7 @@ ReliablePacket* FlowTransport::sendReliable( ISerializeSource const& what, const
sendLocal( self, what, destination );
return nullptr;
}
Reference<Peer> peer = self->getPeer(destination.getPrimaryAddress());
Reference<Peer> peer = self->getOrOpenPeer(destination.getPrimaryAddress());
return sendPacket( self, peer, what, destination, true );
}
@ -1260,7 +1267,14 @@ Reference<Peer> FlowTransport::sendUnreliable( ISerializeSource const& what, con
sendLocal( self, what, destination );
return Reference<Peer>();
}
Reference<Peer> peer = self->getPeer(destination.getPrimaryAddress(), openConnection);
Reference<Peer> peer;
if(openConnection) {
peer = self->getOrOpenPeer(destination.getPrimaryAddress());
}
else {
peer = self->getPeer(destination.getPrimaryAddress());
}
sendPacket( self, peer, what, destination, false );
return peer;
}
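The transport change splits the old getPeer(address, openConnection) into a pure lookup (getPeer), which may return a null reference, and a lookup-or-create path (getOrOpenPeer) that can also start the connection keeper. A minimal standalone sketch of that split, with simplified stand-in types rather than the real TransportData/Peer classes:

```cpp
// Sketch only: std::shared_ptr and std::map stand in for Reference<Peer>
// and the peers map; keeperStarted stands in for launching connectionKeeper().
#include <map>
#include <memory>
#include <string>

struct PeerSketch {
    std::string address;
    bool keeperStarted = false;
};
using PeerRef = std::shared_ptr<PeerSketch>;

struct TransportSketch {
    std::map<std::string, PeerRef> peers;

    PeerRef getPeer(const std::string& address) {
        auto it = peers.find(address);
        return it == peers.end() ? nullptr : it->second;   // pure lookup
    }

    PeerRef getOrOpenPeer(const std::string& address, bool startConnectionKeeper = true) {
        if (auto existing = getPeer(address)) return existing;
        auto peer = std::make_shared<PeerSketch>();
        peer->address = address;
        peer->keeperStarted = startConnectionKeeper;        // create on miss
        peers[address] = peer;
        return peer;
    }
};
```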

View File

@ -125,7 +125,7 @@ struct Peer : public ReferenceCounted<Peer> {
int outstandingReplies;
explicit Peer(TransportData* transport, NetworkAddress const& destination)
: transport(transport), destination(destination), outgoingConnectionIdle(false), lastConnectTime(0.0),
: transport(transport), destination(destination), outgoingConnectionIdle(true), lastConnectTime(0.0),
reconnectionDelay(FLOW_KNOBS->INITIAL_RECONNECTION_TIME), compatible(true), outstandingReplies(0),
incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0), lastDataPacketSentTime(now()) {}

View File

@ -398,8 +398,14 @@ public:
try {
bool remoteDCUsedAsSatellite = false;
std::set<Optional<Key>> satelliteDCs;
int32_t desiredSatelliteTLogs = 0;
for(int s = startDC; s < std::min<int>(startDC + (satelliteFallback ? region.satelliteTLogUsableDcsFallback : region.satelliteTLogUsableDcs), region.satellites.size()); s++) {
satelliteDCs.insert(region.satellites[s].dcId);
if(region.satellites[s].satelliteDesiredTLogCount == -1 || desiredSatelliteTLogs == -1) {
desiredSatelliteTLogs = -1;
} else {
desiredSatelliteTLogs += region.satellites[s].satelliteDesiredTLogCount;
}
if (region.satellites[s].dcId == remoteRegion.dcId) {
remoteDCUsedAsSatellite = true;
}
@ -413,9 +419,9 @@ public:
std::transform(remoteLogs.begin(), remoteLogs.end(), std::back_inserter(exclusionWorkerIds), [](const WorkerDetails &in) { return in.interf.id(); });
}
if(satelliteFallback) {
return getWorkersForTlogs( conf, region.satelliteTLogReplicationFactorFallback, conf.getDesiredSatelliteLogs(region.dcId)*region.satelliteTLogUsableDcsFallback/region.satelliteTLogUsableDcs, region.satelliteTLogPolicyFallback, id_used, checkStable, satelliteDCs, exclusionWorkerIds);
return getWorkersForTlogs( conf, region.satelliteTLogReplicationFactorFallback, desiredSatelliteTLogs>0 ? desiredSatelliteTLogs : conf.getDesiredSatelliteLogs(region.dcId)*region.satelliteTLogUsableDcsFallback/region.satelliteTLogUsableDcs, region.satelliteTLogPolicyFallback, id_used, checkStable, satelliteDCs, exclusionWorkerIds);
} else {
return getWorkersForTlogs( conf, region.satelliteTLogReplicationFactor, conf.getDesiredSatelliteLogs(region.dcId), region.satelliteTLogPolicy, id_used, checkStable, satelliteDCs, exclusionWorkerIds);
return getWorkersForTlogs( conf, region.satelliteTLogReplicationFactor, desiredSatelliteTLogs>0 ? desiredSatelliteTLogs : conf.getDesiredSatelliteLogs(region.dcId), region.satelliteTLogPolicy, id_used, checkStable, satelliteDCs, exclusionWorkerIds);
}
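Here the per-satellite desired counts are summed, with any unset count (-1) collapsing the whole sum to "unset" so recruitment falls back to the region-wide satellite_logs value. A minimal standalone sketch of that aggregation, with hypothetical names:

```cpp
// Sketch only: mirrors the loop above and the later
// "desiredSatelliteTLogs > 0 ? desiredSatelliteTLogs : default" usage.
#include <cstdint>
#include <vector>

int32_t desiredSatelliteTLogs(const std::vector<int32_t>& perSatelliteCounts,
                              int32_t regionDefault) {
    int32_t total = 0;
    for (int32_t c : perSatelliteCounts) {
        if (c == -1 || total == -1) {
            total = -1;          // any unset count makes the total unset
        } else {
            total += c;
        }
    }
    return total > 0 ? total : regionDefault;
}
```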
} catch (Error &e) {
if(e.code() != error_code_no_more_servers) {
@ -462,7 +468,7 @@ public:
deterministicRandom()->randomShuffle(w);
for( int i=0; i < w.size(); i++ ) {
id_used[w[i].interf.locality.processId()]++;
return WorkerFitnessInfo(w[i], it.first.first, it.first.second);
return WorkerFitnessInfo(w[i], std::max(ProcessClass::GoodFit, it.first.first), it.first.second);
}
}
}
@ -518,18 +524,8 @@ public:
RoleFitness() : bestFit(ProcessClass::NeverAssign), worstFit(ProcessClass::NeverAssign), role(ProcessClass::NoRole), count(0), worstIsDegraded(false) {}
RoleFitness(RoleFitness first, RoleFitness second, ProcessClass::ClusterRole role) : bestFit(std::min(first.worstFit, second.worstFit)), worstFit(std::max(first.worstFit, second.worstFit)), count(first.count + second.count), role(role) {
if(first.worstFit > second.worstFit) {
worstIsDegraded = first.worstIsDegraded;
} else if(second.worstFit > first.worstFit) {
worstIsDegraded = second.worstIsDegraded;
} else {
worstIsDegraded = first.worstIsDegraded || second.worstIsDegraded;
}
}
RoleFitness( vector<WorkerDetails> workers, ProcessClass::ClusterRole role ) : role(role) {
worstFit = ProcessClass::BestFit;
worstFit = ProcessClass::GoodFit;
worstIsDegraded = false;
bestFit = ProcessClass::NeverAssign;
for(auto& it : workers) {
@ -576,6 +572,35 @@ public:
std::string toString() const { return format("%d %d %d %d", bestFit, worstFit, count, worstIsDegraded); }
};
struct RoleFitnessPair {
RoleFitness proxy;
RoleFitness resolver;
RoleFitnessPair() {}
RoleFitnessPair(RoleFitness const& proxy, RoleFitness const& resolver) : proxy(proxy), resolver(resolver) {}
bool operator < (RoleFitnessPair const& r) const {
if(proxy.betterFitness(r.proxy)) {
return true;
}
if(r.proxy.betterFitness(proxy)) {
return false;
}
if(resolver.betterFitness(r.resolver)) {
return true;
}
if(r.resolver.betterFitness(resolver)) {
return false;
}
if(proxy.count != r.proxy.count) {
return proxy.count > r.proxy.count;
}
return resolver.count > r.resolver.count;
}
bool operator == (RoleFitnessPair const& r) const { return proxy == r.proxy && resolver == r.resolver; }
};
std::set<Optional<Standalone<StringRef>>> getDatacenters( DatabaseConfiguration const& conf, bool checkStable = false ) {
std::set<Optional<Standalone<StringRef>>> result;
for( auto& it : id_worker )
@ -776,7 +801,7 @@ public:
auto datacenters = getDatacenters( req.configuration );
RoleFitness bestFitness;
RoleFitnessPair bestFitness;
int numEquivalent = 1;
Optional<Key> bestDC;
@ -793,7 +818,7 @@ public:
proxies.push_back(first_proxy.worker);
resolvers.push_back(first_resolver.worker);
auto fitness = RoleFitness( RoleFitness(proxies, ProcessClass::Proxy), RoleFitness(resolvers, ProcessClass::Resolver), ProcessClass::NoRole );
RoleFitnessPair fitness( RoleFitness(proxies, ProcessClass::Proxy), RoleFitness(resolvers, ProcessClass::Resolver) );
if(dcId == clusterControllerDcId) {
bestFitness = fitness;
@ -839,7 +864,8 @@ public:
if( now() - startTime < SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY &&
( RoleFitness(SERVER_KNOBS->EXPECTED_TLOG_FITNESS, req.configuration.getDesiredLogs(), ProcessClass::TLog).betterCount(RoleFitness(tlogs, ProcessClass::TLog)) ||
RoleFitness(std::min(SERVER_KNOBS->EXPECTED_PROXY_FITNESS, SERVER_KNOBS->EXPECTED_RESOLVER_FITNESS), std::max(SERVER_KNOBS->EXPECTED_PROXY_FITNESS, SERVER_KNOBS->EXPECTED_RESOLVER_FITNESS), req.configuration.getDesiredProxies()+req.configuration.getDesiredResolvers(), ProcessClass::NoRole).betterCount(bestFitness) ) ) {
RoleFitness(SERVER_KNOBS->EXPECTED_PROXY_FITNESS, req.configuration.getDesiredProxies(), ProcessClass::Proxy).betterCount(bestFitness.proxy) ||
RoleFitness(SERVER_KNOBS->EXPECTED_RESOLVER_FITNESS, req.configuration.getDesiredResolvers(), ProcessClass::Resolver).betterCount(bestFitness.resolver) ) ) {
throw operation_failed();
}
@ -985,10 +1011,14 @@ public:
std::map< Optional<Standalone<StringRef>>, int> id_used;
id_used[clusterControllerProcessId]++;
WorkerFitnessInfo mworker = getWorkerForRoleInDatacenter(clusterControllerDcId, ProcessClass::Master, ProcessClass::NeverAssign, db.config, id_used, true);
auto newMasterFit = mworker.worker.processClass.machineClassFitness( ProcessClass::Master );
if(db.config.isExcludedServer(mworker.worker.interf.address())) {
newMasterFit = std::max(newMasterFit, ProcessClass::ExcludeFit);
}
if ( oldMasterFit < mworker.fitness )
if ( oldMasterFit < newMasterFit )
return false;
if ( oldMasterFit > mworker.fitness || ( dbi.master.locality.processId() == clusterControllerProcessId && mworker.worker.interf.locality.processId() != clusterControllerProcessId ) )
if ( oldMasterFit > newMasterFit || ( dbi.master.locality.processId() == clusterControllerProcessId && mworker.worker.interf.locality.processId() != clusterControllerProcessId ) )
return true;
std::set<Optional<Key>> primaryDC;
@ -1018,6 +1048,7 @@ public:
if(oldTLogFit < newTLogFit) return false;
bool oldSatelliteFallback = false;
for(auto& logSet : dbi.logSystemConfig.tLogs) {
if(logSet.isLocal && logSet.locality == tagLocalitySatellite) {
oldSatelliteFallback = logSet.tLogPolicy->info() != region.satelliteTLogPolicy->info();
@ -1031,11 +1062,42 @@ public:
auto newSatelliteTLogs = region.satelliteTLogReplicationFactor > 0 ? getWorkersForSatelliteLogs(db.config, region, remoteRegion, id_used, newSatelliteFallback, true) : satellite_tlogs;
RoleFitness newSatelliteTLogFit(newSatelliteTLogs, ProcessClass::TLog);
if(oldSatelliteTLogFit < newSatelliteTLogFit)
return false;
std::map<Optional<Key>,int32_t> satellite_priority;
for(auto& r : region.satellites) {
satellite_priority[r.dcId] = r.priority;
}
int32_t oldSatelliteRegionFit = std::numeric_limits<int32_t>::max();
for(auto& it : satellite_tlogs) {
if(satellite_priority.count(it.interf.locality.dcId())) {
oldSatelliteRegionFit = std::min(oldSatelliteRegionFit, satellite_priority[it.interf.locality.dcId()]);
} else {
oldSatelliteRegionFit = -1;
}
}
int32_t newSatelliteRegionFit = std::numeric_limits<int32_t>::max();
for(auto& it : newSatelliteTLogs) {
if(satellite_priority.count(it.interf.locality.dcId())) {
newSatelliteRegionFit = std::min(newSatelliteRegionFit, satellite_priority[it.interf.locality.dcId()]);
} else {
newSatelliteRegionFit = -1;
}
}
if(oldSatelliteFallback && !newSatelliteFallback)
return true;
if(!oldSatelliteFallback && newSatelliteFallback)
return false;
if(oldSatelliteRegionFit < newSatelliteRegionFit)
return true;
if(oldSatelliteRegionFit > newSatelliteRegionFit)
return false;
if(oldSatelliteTLogFit < newSatelliteTLogFit)
return false;
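The new region-fit check compares, for the old and the newly recruited satellite TLog sets, the minimum configured satellite priority among the DCs hosting the logs, collapsing to -1 when a log runs in a DC that is not a configured satellite. A minimal standalone sketch of that metric, with simplified stand-in types:

```cpp
// Sketch only: dcIds and priorities stand in for the locality dcId lookups
// and the satellite_priority map built above.
#include <algorithm>
#include <cstdint>
#include <limits>
#include <map>
#include <string>
#include <vector>

int32_t satelliteRegionFit(const std::vector<std::string>& tlogDcIds,
                           const std::map<std::string, int32_t>& satellitePriority) {
    int32_t fit = std::numeric_limits<int32_t>::max();
    for (const auto& dc : tlogDcIds) {
        auto it = satellitePriority.find(dc);
        if (it != satellitePriority.end()) {
            fit = std::min(fit, it->second);   // best configured priority seen so far
        } else {
            fit = -1;                          // a TLog outside the configured satellites
        }
    }
    return fit;
}
```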
RoleFitness oldRemoteTLogFit(remote_tlogs, ProcessClass::TLog);
std::vector<UID> exclusionWorkerIds;
auto fn = [](const WorkerDetails &in) { return in.interf.id(); };
@ -1059,7 +1121,7 @@ public:
}
if(oldLogRoutersFit < newLogRoutersFit) return false;
// Check proxy/resolver fitness
RoleFitness oldInFit(RoleFitness(proxyClasses, ProcessClass::Proxy), RoleFitness(resolverClasses, ProcessClass::Resolver), ProcessClass::NoRole);
RoleFitnessPair oldInFit(RoleFitness(proxyClasses, ProcessClass::Proxy), RoleFitness(resolverClasses, ProcessClass::Resolver));
auto first_resolver = getWorkerForRoleInDatacenter( clusterControllerDcId, ProcessClass::Resolver, ProcessClass::ExcludeFit, db.config, id_used, true );
auto first_proxy = getWorkerForRoleInDatacenter( clusterControllerDcId, ProcessClass::Proxy, ProcessClass::ExcludeFit, db.config, id_used, true );
@ -1069,12 +1131,15 @@ public:
proxies.push_back(first_proxy.worker);
resolvers.push_back(first_resolver.worker);
RoleFitness newInFit(RoleFitness(proxies, ProcessClass::Proxy), RoleFitness(resolvers, ProcessClass::Resolver), ProcessClass::NoRole);
if(oldInFit.betterFitness(newInFit)) return false;
if(oldTLogFit > newTLogFit || oldInFit > newInFit || (oldSatelliteFallback && !newSatelliteFallback) || oldSatelliteTLogFit > newSatelliteTLogFit || oldRemoteTLogFit > newRemoteTLogFit || oldLogRoutersFit > newLogRoutersFit) {
TraceEvent("BetterMasterExists", id).detail("OldMasterFit", oldMasterFit).detail("NewMasterFit", mworker.fitness)
RoleFitnessPair newInFit(RoleFitness(proxies, ProcessClass::Proxy), RoleFitness(resolvers, ProcessClass::Resolver));
if(oldInFit.proxy.betterFitness(newInFit.proxy) || oldInFit.resolver.betterFitness(newInFit.resolver)) {
return false;
}
if(oldTLogFit > newTLogFit || oldInFit > newInFit || oldSatelliteTLogFit > newSatelliteTLogFit || oldRemoteTLogFit > newRemoteTLogFit || oldLogRoutersFit > newLogRoutersFit) {
TraceEvent("BetterMasterExists", id).detail("OldMasterFit", oldMasterFit).detail("NewMasterFit", newMasterFit)
.detail("OldTLogFit", oldTLogFit.toString()).detail("NewTLogFit", newTLogFit.toString())
.detail("OldInFit", oldInFit.toString()).detail("NewInFit", newInFit.toString())
.detail("OldProxyFit", oldInFit.proxy.toString()).detail("NewProxyFit", newInFit.proxy.toString())
.detail("OldResolverFit", oldInFit.resolver.toString()).detail("NewResolverFit", newInFit.resolver.toString())
.detail("OldSatelliteFit", oldSatelliteTLogFit.toString()).detail("NewSatelliteFit", newSatelliteTLogFit.toString())
.detail("OldRemoteFit", oldRemoteTLogFit.toString()).detail("NewRemoteFit", newRemoteTLogFit.toString())
.detail("OldRouterFit", oldLogRoutersFit.toString()).detail("NewRouterFit", newLogRoutersFit.toString())
@ -1161,11 +1226,36 @@ public:
Optional<UID> recruitingRatekeeperID;
AsyncVar<bool> recruitRatekeeper;
CounterCollection clusterControllerMetrics;
Counter openDatabaseRequests;
Counter registerWorkerRequests;
Counter getWorkersRequests;
Counter getClientWorkersRequests;
Counter registerMasterRequests;
Counter getServerDBInfoRequests;
Counter statusRequests;
Counter failureMonitoringRequests;
Counter serversFailed;
Counter serversUnfailed;
ClusterControllerData( ClusterControllerFullInterface const& ccInterface, LocalityData const& locality )
: clusterControllerProcessId(locality.processId()), clusterControllerDcId(locality.dcId()),
id(ccInterface.id()), ac(false), outstandingRequestChecker(Void()), gotProcessClasses(false),
gotFullyRecoveredConfig(false), startTime(now()), datacenterVersionDifference(0),
versionDifferenceUpdated(false), recruitingDistributor(false), recruitRatekeeper(false)
versionDifferenceUpdated(false), recruitingDistributor(false), recruitRatekeeper(false),
clusterControllerMetrics("ClusterController", id.toString()),
openDatabaseRequests("OpenDatabaseRequests", clusterControllerMetrics),
registerWorkerRequests("RegisterWorkerRequests", clusterControllerMetrics),
getWorkersRequests("GetWorkersRequests", clusterControllerMetrics),
getClientWorkersRequests("GetClientWorkersRequests", clusterControllerMetrics),
registerMasterRequests("RegisterMasterRequests", clusterControllerMetrics),
getServerDBInfoRequests("GetServerDBInfoRequests", clusterControllerMetrics),
statusRequests("StatusRequests", clusterControllerMetrics),
failureMonitoringRequests("FailureMonitoringRequests", clusterControllerMetrics),
serversFailed("ServersFailed", clusterControllerMetrics),
serversUnfailed("ServersUnfailed", clusterControllerMetrics)
{
CachedSerialization<ServerDBInfo> newInfoCache = db.serverInfo->get();
auto& serverInfo = newInfoCache.mutate();
@ -1518,7 +1608,7 @@ struct FailureStatusInfo {
};
//The failure monitor client relies on the fact that the failure detection server will not declare itself failed
ACTOR Future<Void> failureDetectionServer( UID uniqueID, ClusterControllerData::DBInfo* db, FutureStream< FailureMonitoringRequest > requests ) {
ACTOR Future<Void> failureDetectionServer( UID uniqueID, ClusterControllerData* self, FutureStream< FailureMonitoringRequest > requests ) {
state Version currentVersion = 0;
state std::map<NetworkAddressList, FailureStatusInfo> currentStatus; // The status at currentVersion
state std::deque<SystemFailureStatus> statusHistory; // The last change in statusHistory is from currentVersion-1 to currentVersion
@ -1527,6 +1617,7 @@ ACTOR Future<Void> failureDetectionServer( UID uniqueID, ClusterControllerData::
loop choose {
when ( FailureMonitoringRequest req = waitNext( requests ) ) {
++self->failureMonitoringRequests;
if ( req.senderStatus.present() ) {
// Update the status of requester, if necessary
auto& stat = currentStatus[ req.addresses ];
@ -1536,6 +1627,12 @@ ACTOR Future<Void> failureDetectionServer( UID uniqueID, ClusterControllerData::
stat.insertRequest(now());
if (req.senderStatus != stat.status) {
if(newStat.failed) {
++self->serversFailed;
}
else {
++self->serversUnfailed;
}
TraceEvent("FailureDetectionStatus", uniqueID).detail("System", req.addresses.toString()).detail("Status", newStat.failed ? "Failed" : "OK").detail("Why", "Request");
statusHistory.push_back( SystemFailureStatus( req.addresses, newStat ) );
++currentVersion;
@ -1615,7 +1712,7 @@ ACTOR Future<Void> failureDetectionServer( UID uniqueID, ClusterControllerData::
//TraceEvent("FailureDetectionPoll", uniqueID).detail("PivotDelay", pivotDelay).detail("Clients", currentStatus.size());
//TraceEvent("FailureDetectionAcceptableDelay").detail("Delay", acceptableDelay1000);
bool tooManyLogGenerations = std::max(db->unfinishedRecoveries, db->logGenerations) > CLIENT_KNOBS->FAILURE_MAX_GENERATIONS;
bool tooManyLogGenerations = std::max(self->db.unfinishedRecoveries, self->db.logGenerations) > CLIENT_KNOBS->FAILURE_MAX_GENERATIONS;
for(auto it = currentStatus.begin(); it != currentStatus.end(); ) {
double delay = t - it->second.lastRequestTime;
@ -1624,7 +1721,8 @@ ACTOR Future<Void> failureDetectionServer( UID uniqueID, ClusterControllerData::
( delay > pivotDelay * 2 + FLOW_KNOBS->SERVER_REQUEST_INTERVAL + CLIENT_KNOBS->FAILURE_MIN_DELAY || delay > CLIENT_KNOBS->FAILURE_MAX_DELAY ) ) ) {
//printf("Failure Detection Server: Status of '%s' is now '%s' after %f sec\n", it->first.toString().c_str(), "Failed", now() - it->second.lastRequestTime);
TraceEvent("FailureDetectionStatus", uniqueID).detail("System", describe(it->first)).detail("Status","Failed").detail("Why", "Timeout").detail("LastRequestAge", delay)
.detail("PivotDelay", pivotDelay).detail("UnfinishedRecoveries", db->unfinishedRecoveries).detail("LogGenerations", db->logGenerations);
.detail("PivotDelay", pivotDelay).detail("UnfinishedRecoveries", self->db.unfinishedRecoveries).detail("LogGenerations", self->db.logGenerations);
++self->serversFailed;
statusHistory.push_back( SystemFailureStatus( it->first, FailureStatus(true) ) );
++currentVersion;
it = currentStatus.erase(it);
@ -2005,6 +2103,7 @@ ACTOR Future<Void> statusServer(FutureStream< StatusRequest> requests,
try {
// Wait til first request is ready
StatusRequest req = waitNext(requests);
++self->statusRequests;
requests_batch.push_back(req);
// Earliest time at which we may begin a new request
@ -2584,7 +2683,7 @@ ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf,
state uint64_t step = 0;
state Future<ErrorOr<Void>> error = errorOr( actorCollection( self.addActor.getFuture() ) );
self.addActor.send( failureDetectionServer( self.id, &self.db, interf.clientInterface.failureMonitoring.getFuture() ) );
self.addActor.send( failureDetectionServer( self.id, &self, interf.clientInterface.failureMonitoring.getFuture() ) );
self.addActor.send( clusterWatchDatabase( &self, &self.db ) ); // Start the master database
self.addActor.send( self.updateWorkerList.init( self.db.db ) );
self.addActor.send( statusServer( interf.clientInterface.databaseStatus.getFuture(), &self, coordinators));
@ -2598,6 +2697,8 @@ ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf,
self.addActor.send( handleForcedRecoveries(&self, interf) );
self.addActor.send( monitorDataDistributor(&self) );
self.addActor.send( monitorRatekeeper(&self) );
self.addActor.send( traceCounters("ClusterControllerMetrics", self.id, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &self.clusterControllerMetrics, self.id.toString() + "/ClusterControllerMetrics") );
//printf("%s: I am the cluster controller\n", g_network->getLocalAddress().toString().c_str());
loop choose {
@ -2613,6 +2714,7 @@ ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf,
return Void();
}
when( OpenDatabaseRequest req = waitNext( interf.clientInterface.openDatabase.getFuture() ) ) {
++self.openDatabaseRequests;
self.addActor.send(clusterOpenDatabase(&self.db, req));
}
when( RecruitFromConfigurationRequest req = waitNext( interf.recruitFromConfiguration.getFuture() ) ) {
@ -2625,9 +2727,11 @@ ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf,
clusterRecruitStorage( &self, req );
}
when( RegisterWorkerRequest req = waitNext( interf.registerWorker.getFuture() ) ) {
++self.registerWorkerRequests;
registerWorker( req, &self );
}
when( GetWorkersRequest req = waitNext( interf.getWorkers.getFuture() ) ) {
++self.getWorkersRequests;
vector<WorkerDetails> workers;
for(auto& it : self.id_worker) {
@ -2645,6 +2749,7 @@ ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf,
req.reply.send( workers );
}
when( GetClientWorkersRequest req = waitNext( interf.clientInterface.getClientWorkers.getFuture() ) ) {
++self.getClientWorkersRequests;
vector<ClientWorkerInterface> workers;
for(auto& it : self.id_worker) {
if (it.second.details.processClass.classType() != ProcessClass::TesterClass) {
@ -2661,9 +2766,11 @@ ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf,
TraceEvent("CoordinationPingSent", self.id).detail("TimeStep", message.timeStep);
}
when( RegisterMasterRequest req = waitNext( interf.registerMaster.getFuture() ) ) {
++self.registerMasterRequests;
clusterRegisterMaster( &self, req );
}
when( GetServerDBInfoRequest req = waitNext( interf.getServerDBInfo.getFuture() ) ) {
++self.getServerDBInfoRequests;
self.addActor.send(
clusterGetServerInfo(&self.db, req.knownServerInfoID, req.issues, req.incompatiblePeers, req.reply));
}
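The counters added throughout this file follow one pattern: a per-request-type Counter registered in a shared CounterCollection, bumped when the corresponding request arrives and periodically flushed by traceCounters. A minimal standalone sketch of that pattern, using plain std types and hypothetical names instead of the FDB Counter machinery:

```cpp
// Sketch only: a named-counter map bumped per request type and flushed to a
// log line; trace() stands in for traceCounters().
#include <cstdint>
#include <iostream>
#include <map>
#include <string>

struct CounterCollectionSketch {
    std::map<std::string, uint64_t> counters;

    uint64_t& operator[](const std::string& name) { return counters[name]; }

    void trace(const std::string& prefix) const {
        for (const auto& [name, value] : counters)
            std::cout << prefix << "." << name << "=" << value << "\n";
    }
};

int main() {
    CounterCollectionSketch clusterControllerMetrics;
    ++clusterControllerMetrics["OpenDatabaseRequests"];    // on OpenDatabaseRequest
    ++clusterControllerMetrics["RegisterWorkerRequests"];  // on RegisterWorkerRequest
    clusterControllerMetrics.trace("ClusterControllerMetrics");
}
```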

View File

@ -189,7 +189,7 @@ public:
int priority;
explicit TCTeamInfo(vector<Reference<TCServerInfo>> const& servers)
: servers(servers), healthy(true), priority(PRIORITY_TEAM_HEALTHY), wrongConfiguration(false) {
: servers(servers), healthy(true), priority(SERVER_KNOBS->PRIORITY_TEAM_HEALTHY), wrongConfiguration(false) {
if (servers.empty()) {
TraceEvent(SevInfo, "ConstructTCTeamFromEmptyServers");
}
@ -2558,7 +2558,7 @@ ACTOR Future<Void> machineTeamRemover(DDTeamCollection* self) {
}
// To avoid removing machine teams too fast, which is unlikely to happen anyway
wait( delay(SERVER_KNOBS->TR_REMOVE_MACHINE_TEAM_DELAY) );
wait( delay(SERVER_KNOBS->TR_REMOVE_MACHINE_TEAM_DELAY, TaskPriority::DataDistribution) );
wait(waitUntilHealthy(self));
// Wait for the badTeamRemover() to avoid the potential race between adding the bad team (add the team tracker)
@ -2681,7 +2681,7 @@ ACTOR Future<Void> serverTeamRemover(DDTeamCollection* self) {
removeServerTeamDelay = removeServerTeamDelay / 100;
}
// To avoid removing server teams too fast, which is unlikely to happen anyway
wait(delay(removeServerTeamDelay));
wait(delay(removeServerTeamDelay, TaskPriority::DataDistribution));
wait(waitUntilHealthy(self, SERVER_KNOBS->TR_REMOVE_SERVER_TEAM_EXTRA_DELAY));
// Wait for the badTeamRemover() to avoid the potential race between
@ -2865,25 +2865,25 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
state int lastPriority = team->getPriority();
if( serversLeft < self->configuration.storageTeamSize ) {
if( serversLeft == 0 )
team->setPriority( PRIORITY_TEAM_0_LEFT );
team->setPriority( SERVER_KNOBS->PRIORITY_TEAM_0_LEFT );
else if( serversLeft == 1 )
team->setPriority( PRIORITY_TEAM_1_LEFT );
team->setPriority( SERVER_KNOBS->PRIORITY_TEAM_1_LEFT );
else if( serversLeft == 2 )
team->setPriority( PRIORITY_TEAM_2_LEFT );
team->setPriority( SERVER_KNOBS->PRIORITY_TEAM_2_LEFT );
else
team->setPriority( PRIORITY_TEAM_UNHEALTHY );
team->setPriority( SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY );
}
else if ( badTeam || anyWrongConfiguration ) {
if ( redundantTeam ) {
team->setPriority( PRIORITY_TEAM_REDUNDANT );
team->setPriority( SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT );
} else {
team->setPriority( PRIORITY_TEAM_UNHEALTHY );
team->setPriority( SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY );
}
}
else if( anyUndesired )
team->setPriority( PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER );
team->setPriority( SERVER_KNOBS->PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER );
else
team->setPriority( PRIORITY_TEAM_HEALTHY );
team->setPriority( SERVER_KNOBS->PRIORITY_TEAM_HEALTHY );
if(lastPriority != team->getPriority()) {
self->priority_teams[lastPriority]--;
@ -2901,13 +2901,13 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
for(int i=0; i<shards.size(); i++) {
int maxPriority = team->getPriority();
if(maxPriority < PRIORITY_TEAM_0_LEFT) {
if(maxPriority < SERVER_KNOBS->PRIORITY_TEAM_0_LEFT) {
auto teams = self->shardsAffectedByTeamFailure->getTeamsFor( shards[i] );
for( int j=0; j < teams.first.size()+teams.second.size(); j++) {
// t is the team in primary DC or the remote DC
auto& t = j < teams.first.size() ? teams.first[j] : teams.second[j-teams.first.size()];
if( !t.servers.size() ) {
maxPriority = PRIORITY_TEAM_0_LEFT;
maxPriority = SERVER_KNOBS->PRIORITY_TEAM_0_LEFT;
break;
}
@ -2931,8 +2931,8 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
// false We want to differentiate the redundant_team from unhealthy_team in
// terms of relocate priority
maxPriority =
std::max<int>(maxPriority, redundantTeam ? PRIORITY_TEAM_REDUNDANT
: PRIORITY_TEAM_UNHEALTHY);
std::max<int>(maxPriority, redundantTeam ? SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT
: SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY);
}
} else {
TEST(true); // A removed server is still associated with a team in SABTF
@ -3064,7 +3064,7 @@ ACTOR Future<vector<std::pair<StorageServerInterface, ProcessClass>>> getServerL
}
ACTOR Future<Void> waitServerListChange( DDTeamCollection* self, FutureStream<Void> serverRemoved ) {
state Future<Void> checkSignal = delay(SERVER_KNOBS->SERVER_LIST_DELAY);
state Future<Void> checkSignal = delay(SERVER_KNOBS->SERVER_LIST_DELAY, TaskPriority::DataDistributionLaunch);
state Future<vector<std::pair<StorageServerInterface, ProcessClass>>> serverListAndProcessClasses = Never();
state bool isFetchingResults = false;
state Transaction tr(self->cx);
@ -3102,7 +3102,7 @@ ACTOR Future<Void> waitServerListChange( DDTeamCollection* self, FutureStream<Vo
}
tr = Transaction(self->cx);
checkSignal = delay(SERVER_KNOBS->SERVER_LIST_DELAY);
checkSignal = delay(SERVER_KNOBS->SERVER_LIST_DELAY, TaskPriority::DataDistributionLaunch);
}
when( waitNext( serverRemoved ) ) {
if( isFetchingResults ) {
@ -3136,7 +3136,7 @@ ACTOR Future<Void> waitHealthyZoneChange( DDTeamCollection* self ) {
healthyZoneTimeout = Never();
} else if (p.second > tr.getReadVersion().get()) {
double timeoutSeconds = (p.second - tr.getReadVersion().get())/(double)SERVER_KNOBS->VERSIONS_PER_SECOND;
healthyZoneTimeout = delay(timeoutSeconds);
healthyZoneTimeout = delay(timeoutSeconds, TaskPriority::DataDistribution);
if(self->healthyZone.get() != p.first) {
TraceEvent("MaintenanceZoneStart", self->distributorId).detail("ZoneID", printable(p.first)).detail("EndVersion", p.second).detail("Duration", timeoutSeconds);
self->healthyZone.set(p.first);
@ -3591,7 +3591,7 @@ ACTOR Future<Void> checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self) {
loop {
try {
wait(delay(SERVER_KNOBS->DD_CHECK_INVALID_LOCALITY_DELAY));
wait(delay(SERVER_KNOBS->DD_CHECK_INVALID_LOCALITY_DELAY, TaskPriority::DataDistribution));
// Because worker's processId can be changed when its locality is changed, we cannot watch on the old
// processId; this actor is inactive most of the time, so iterating over all workers incurs little performance overhead.
@ -3770,7 +3770,7 @@ ACTOR Future<Void> storageRecruiter( DDTeamCollection* self, Reference<AsyncVar<
}
when( wait( self->restartRecruiting.onTrigger() ) ) {}
}
wait( delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY) );
wait( delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY, TaskPriority::DataDistribution) );
} catch( Error &e ) {
if(e.code() != error_code_timed_out) {
throw;
@ -3830,7 +3830,7 @@ ACTOR Future<Void> remoteRecovered( Reference<AsyncVar<struct ServerDBInfo>> db
ACTOR Future<Void> monitorHealthyTeams( DDTeamCollection* self ) {
loop choose {
when ( wait(self->zeroHealthyTeams->get() ? delay(SERVER_KNOBS->DD_ZERO_HEALTHY_TEAM_DELAY) : Never()) ) {
when ( wait(self->zeroHealthyTeams->get() ? delay(SERVER_KNOBS->DD_ZERO_HEALTHY_TEAM_DELAY, TaskPriority::DataDistribution) : Never()) ) {
self->doBuildTeams = true;
wait( DDTeamCollection::checkBuildTeams(self) );
}
@ -4174,9 +4174,21 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self)
.detail( "InFlight", 0 )
.detail( "InQueue", 0 )
.detail( "AverageShardSize", -1 )
.detail( "LowPriorityRelocations", 0 )
.detail( "HighPriorityRelocations", 0 )
.detail( "UnhealthyRelocations", 0 )
.detail( "HighestPriority", 0 )
.detail( "BytesWritten", 0 )
.detail( "PriorityRecoverMove", 0 )
.detail( "PriorityRebalanceUnderutilizedTeam", 0 )
.detail( "PriorityRebalannceOverutilizedTeam", 0)
.detail( "PriorityTeamHealthy", 0 )
.detail( "PriorityTeamContainsUndesiredServer", 0 )
.detail( "PriorityTeamRedundant", 0 )
.detail( "PriorityMergeShard", 0 )
.detail( "PriorityTeamUnhealthy", 0 )
.detail( "PriorityTeam2Left", 0 )
.detail( "PriorityTeam1Left", 0 )
.detail( "PriorityTeam0Left", 0 )
.detail( "PrioritySplitShard", 0 )
.trackLatest( "MovingData" );
TraceEvent("TotalDataInFlight", self->ddId).detail("Primary", true).detail("TotalBytes", 0).detail("UnhealthyServers", 0).detail("HighestPriority", 0).trackLatest("TotalDataInFlight");
@ -4219,7 +4231,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self)
if (!unhealthy && configuration.usableRegions > 1) {
unhealthy = initData->shards[shard].remoteSrc.size() != configuration.storageTeamSize;
}
output.send( RelocateShard( keys, unhealthy ? PRIORITY_TEAM_UNHEALTHY : PRIORITY_RECOVER_MOVE ) );
output.send( RelocateShard( keys, unhealthy ? SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY : SERVER_KNOBS->PRIORITY_RECOVER_MOVE ) );
}
wait( yield(TaskPriority::DataDistribution) );
}

View File

@ -38,33 +38,6 @@ struct RelocateShard {
RelocateShard( KeyRange const& keys, int priority ) : keys(keys), priority(priority) {}
};
// Higher priorities are executed first
// Priority/100 is the "priority group"/"superpriority". Priority inversion
// is possible within but not between priority groups; fewer priority groups
// mean better worst case time bounds
enum {
PRIORITY_REBALANCE_SHARD = 100,
PRIORITY_RECOVER_MOVE = 110,
PRIORITY_REBALANCE_UNDERUTILIZED_TEAM = 120,
PRIORITY_REBALANCE_OVERUTILIZED_TEAM = 121,
PRIORITY_TEAM_HEALTHY = 140,
PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER = 150,
// Set removing_redundant_team priority lower than merge/split_shard_priority,
// so that removing redundant teams does not block merge/split shards.
PRIORITY_TEAM_REDUNDANT = 200,
PRIORITY_MERGE_SHARD = 340,
PRIORITY_SPLIT_SHARD = 350,
PRIORITY_TEAM_UNHEALTHY = 800,
PRIORITY_TEAM_2_LEFT = 809,
PRIORITY_TEAM_1_LEFT = 900,
PRIORITY_TEAM_0_LEFT = 999
};
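With this enum removed, the data-distribution priorities live in SERVER_KNOBS (see the PRIORITY_* references elsewhere in this diff); the priority-group idea from the deleted comment still applies to the knob values. A minimal standalone sketch of the same values as runtime-configurable fields, with a hypothetical struct name:

```cpp
// Sketch only: the field names mirror the knobs used in this diff and the
// defaults copy the values of the removed enum.
struct DataDistributionPrioritiesSketch {
    int PRIORITY_REBALANCE_SHARD = 100;
    int PRIORITY_RECOVER_MOVE = 110;
    int PRIORITY_REBALANCE_UNDERUTILIZED_TEAM = 120;
    int PRIORITY_REBALANCE_OVERUTILIZED_TEAM = 121;
    int PRIORITY_TEAM_HEALTHY = 140;
    int PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER = 150;
    int PRIORITY_TEAM_REDUNDANT = 200;
    int PRIORITY_MERGE_SHARD = 340;
    int PRIORITY_SPLIT_SHARD = 350;
    int PRIORITY_TEAM_UNHEALTHY = 800;
    int PRIORITY_TEAM_2_LEFT = 809;
    int PRIORITY_TEAM_1_LEFT = 900;
    int PRIORITY_TEAM_0_LEFT = 999;

    // Priority/100 is the "priority group"; inversion is possible within
    // but not between groups.
    static int priorityGroup(int priority) { return priority / 100; }
};
```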
enum {
SOME_SHARED = 2,
NONE_SHARED = 3

View File

@ -37,6 +37,9 @@
struct RelocateData {
KeyRange keys;
int priority;
int boundaryPriority;
int healthPriority;
double startTime;
UID randomId;
int workFactor;
@ -45,34 +48,42 @@ struct RelocateData {
bool wantsNewServers;
TraceInterval interval;
RelocateData() : startTime(-1), priority(-1), workFactor(0), wantsNewServers(false), interval("QueuedRelocation") {}
RelocateData( RelocateShard const& rs ) : keys(rs.keys), priority(rs.priority), startTime(now()), randomId(deterministicRandom()->randomUniqueID()), workFactor(0),
RelocateData() : startTime(-1), priority(-1), boundaryPriority(-1), healthPriority(-1), workFactor(0), wantsNewServers(false), interval("QueuedRelocation") {}
explicit RelocateData( RelocateShard const& rs ) : keys(rs.keys), priority(rs.priority), boundaryPriority(isBoundaryPriority(rs.priority) ? rs.priority : -1), healthPriority(isHealthPriority(rs.priority) ? rs.priority : -1), startTime(now()), randomId(deterministicRandom()->randomUniqueID()), workFactor(0),
wantsNewServers(
rs.priority == PRIORITY_REBALANCE_SHARD ||
rs.priority == PRIORITY_REBALANCE_OVERUTILIZED_TEAM ||
rs.priority == PRIORITY_REBALANCE_UNDERUTILIZED_TEAM ||
rs.priority == PRIORITY_SPLIT_SHARD ||
rs.priority == PRIORITY_TEAM_REDUNDANT ||
rs.priority == SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM ||
rs.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM ||
rs.priority == SERVER_KNOBS->PRIORITY_SPLIT_SHARD ||
rs.priority == SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT ||
mergeWantsNewServers(rs.keys, rs.priority)), interval("QueuedRelocation") {}
static bool mergeWantsNewServers(KeyRangeRef keys, int priority) {
return priority == PRIORITY_MERGE_SHARD &&
return priority == SERVER_KNOBS->PRIORITY_MERGE_SHARD &&
(SERVER_KNOBS->MERGE_ONTO_NEW_TEAM == 2 ||
(SERVER_KNOBS->MERGE_ONTO_NEW_TEAM == 1 && keys.begin.startsWith(LiteralStringRef("\xff"))));
}
static bool isHealthPriority(int priority) {
return priority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY ||
priority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT ||
priority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT ||
priority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT ||
priority == SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT ||
priority == SERVER_KNOBS->PRIORITY_TEAM_HEALTHY ||
priority == SERVER_KNOBS->PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER;
}
static bool isBoundaryPriority(int priority) {
return priority == SERVER_KNOBS->PRIORITY_SPLIT_SHARD ||
priority == SERVER_KNOBS->PRIORITY_MERGE_SHARD;
}
bool operator> (const RelocateData& rhs) const {
return priority != rhs.priority ? priority > rhs.priority : ( startTime != rhs.startTime ? startTime < rhs.startTime : randomId > rhs.randomId );
}
bool operator== (const RelocateData& rhs) const {
return priority == rhs.priority && keys == rhs.keys && startTime == rhs.startTime && workFactor == rhs.workFactor && src == rhs.src && completeSources == rhs.completeSources && wantsNewServers == rhs.wantsNewServers && randomId == rhs.randomId;
}
bool changesBoundaries() {
return priority == PRIORITY_MERGE_SHARD ||
priority == PRIORITY_SPLIT_SHARD ||
priority == PRIORITY_RECOVER_MOVE;
return priority == rhs.priority && boundaryPriority == rhs.boundaryPriority && healthPriority == rhs.healthPriority && keys == rhs.keys && startTime == rhs.startTime && workFactor == rhs.workFactor && src == rhs.src && completeSources == rhs.completeSources && wantsNewServers == rhs.wantsNewServers && randomId == rhs.randomId;
}
};
@ -285,9 +296,9 @@ int getWorkFactor( RelocateData const& relocation ) {
// Avoid the divide by 0!
ASSERT( relocation.src.size() );
if( relocation.priority >= PRIORITY_TEAM_1_LEFT )
if( relocation.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT || relocation.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT )
return WORK_FULL_UTILIZATION / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER;
else if( relocation.priority >= PRIORITY_TEAM_2_LEFT )
else if( relocation.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT )
return WORK_FULL_UTILIZATION / 2 / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER;
else // for now we assume that any message at a lower priority can best be assumed to have a full team left for work
return WORK_FULL_UTILIZATION / relocation.src.size() / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER;
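The work-factor rule now keys off healthPriority instead of comparing raw priority values: the fewer replicas a team has left, the larger the share of a source server's capacity one relocation may consume. A minimal standalone sketch of that rule, with hypothetical constants:

```cpp
// Sketch only: WORK_FULL_UTILIZATION and the parallelism constant are
// placeholders for the corresponding FDB knobs.
#include <cstdint>

constexpr int64_t WORK_FULL_UTILIZATION = 10000;
constexpr int64_t PARALLELISM_PER_SOURCE = 4;

enum class HealthPriority { None, TeamUnhealthy, Team2Left, Team1Left, Team0Left };

int64_t workFactor(HealthPriority health, int sourceServers) {
    if (health == HealthPriority::Team1Left || health == HealthPriority::Team0Left)
        return WORK_FULL_UTILIZATION / PARALLELISM_PER_SOURCE;            // a whole team's worth of work
    if (health == HealthPriority::Team2Left)
        return WORK_FULL_UTILIZATION / 2 / PARALLELISM_PER_SOURCE;        // half a team
    return WORK_FULL_UTILIZATION / sourceServers / PARALLELISM_PER_SOURCE; // spread across source servers
}
```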
@ -384,20 +395,22 @@ struct DDQueueData {
std::map<int, int> priority_relocations;
int unhealthyRelocations;
void startRelocation(int priority) {
void startRelocation(int priority, int healthPriority) {
// Although PRIORITY_TEAM_REDUNDANT has lower priority than split and merge shard movement,
// we must count it into unhealthyRelocations, because the team remover relies on unhealthyRelocations to
// ensure one team remover does not start before the previous one finishes removing a team and moving its data away.
// NOTE: split and merge shards have higher priority. If they had to wait for unhealthyRelocations = 0,
// deadlock could happen: a split/merge shard would wait for unhealthyRelocations while blocking team_redundant moves.
if (priority >= PRIORITY_TEAM_UNHEALTHY || priority == PRIORITY_TEAM_REDUNDANT) {
if (healthPriority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT ||
healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT) {
unhealthyRelocations++;
rawProcessingUnhealthy->set(true);
}
priority_relocations[priority]++;
}
void finishRelocation(int priority) {
if (priority >= PRIORITY_TEAM_UNHEALTHY || priority == PRIORITY_TEAM_REDUNDANT) {
void finishRelocation(int priority, int healthPriority) {
if (healthPriority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT ||
healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT) {
unhealthyRelocations--;
ASSERT(unhealthyRelocations >= 0);
if(unhealthyRelocations == 0) {
@ -524,7 +537,7 @@ struct DDQueueData {
state Transaction tr(cx);
// FIXME: is the merge case needed
if( input.priority == PRIORITY_MERGE_SHARD ) {
if( input.priority == SERVER_KNOBS->PRIORITY_MERGE_SHARD ) {
wait( delay( 0.5, decrementPriority(decrementPriority(TaskPriority::DataDistribution )) ) );
} else {
wait( delay( 0.0001, TaskPriority::DataDistributionLaunch ) );
@ -586,10 +599,14 @@ struct DDQueueData {
}
//This function cannot handle relocation requests which split a shard into three pieces
void queueRelocation( RelocateData rd, std::set<UID> &serversToLaunchFrom ) {
void queueRelocation( RelocateShard rs, std::set<UID> &serversToLaunchFrom ) {
//TraceEvent("QueueRelocationBegin").detail("Begin", rd.keys.begin).detail("End", rd.keys.end);
// remove all items from both queues that are fully contained in the new relocation (i.e. will be overwritten)
RelocateData rd(rs);
bool hasHealthPriority = RelocateData::isHealthPriority( rd.priority );
bool hasBoundaryPriority = RelocateData::isBoundaryPriority( rd.priority );
auto ranges = queueMap.intersectingRanges( rd.keys );
for(auto r = ranges.begin(); r != ranges.end(); ++r ) {
RelocateData& rrs = r->value();
@ -611,9 +628,13 @@ struct DDQueueData {
if( foundActiveFetching || foundActiveRelocation ) {
rd.wantsNewServers |= rrs.wantsNewServers;
rd.startTime = std::min( rd.startTime, rrs.startTime );
if ((rrs.priority >= PRIORITY_TEAM_UNHEALTHY || rrs.priority == PRIORITY_TEAM_REDUNDANT) &&
rd.changesBoundaries())
rd.priority = std::max( rd.priority, rrs.priority );
if(!hasHealthPriority) {
rd.healthPriority = std::max(rd.healthPriority, rrs.healthPriority);
}
if(!hasBoundaryPriority) {
rd.boundaryPriority = std::max(rd.boundaryPriority, rrs.boundaryPriority);
}
rd.priority = std::max(rd.priority, std::max(rd.boundaryPriority, rd.healthPriority));
}
if( rd.keys.contains( rrs.keys ) ) {
@ -631,7 +652,7 @@ struct DDQueueData {
/*TraceEvent(rrs.interval.end(), mi.id()).detail("Result","Cancelled")
.detail("WasFetching", foundActiveFetching).detail("Contained", rd.keys.contains( rrs.keys ));*/
queuedRelocations--;
finishRelocation(rrs.priority);
finishRelocation(rrs.priority, rrs.healthPriority);
}
}
@ -658,7 +679,7 @@ struct DDQueueData {
.detail("KeyBegin", rrs.keys.begin).detail("KeyEnd", rrs.keys.end)
.detail("Priority", rrs.priority).detail("WantsNewServers", rrs.wantsNewServers);*/
queuedRelocations++;
startRelocation(rrs.priority);
startRelocation(rrs.priority, rrs.healthPriority);
fetchingSourcesQueue.insert( rrs );
getSourceActors.insert( rrs.keys, getSourceServersForRange( cx, rrs, fetchSourceServersComplete ) );
@ -678,7 +699,7 @@ struct DDQueueData {
.detail("KeyBegin", newData.keys.begin).detail("KeyEnd", newData.keys.end)
.detail("Priority", newData.priority).detail("WantsNewServers", newData.wantsNewServers);*/
queuedRelocations++;
startRelocation(newData.priority);
startRelocation(newData.priority, newData.healthPriority);
foundActiveRelocation = true;
}
@ -773,7 +794,7 @@ struct DDQueueData {
for(auto it = intersectingInFlight.begin(); it != intersectingInFlight.end(); ++it) {
if (fetchKeysComplete.count(it->value()) && inFlightActors.liveActorAt(it->range().begin) &&
!rd.keys.contains(it->range()) && it->value().priority >= rd.priority &&
rd.priority < PRIORITY_TEAM_UNHEALTHY) {
rd.healthPriority < SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY) {
/*TraceEvent("OverlappingInFlight", distributorId)
.detail("KeyBegin", it->value().keys.begin)
.detail("KeyEnd", it->value().keys.end)
@ -813,7 +834,7 @@ struct DDQueueData {
//TraceEvent(rd.interval.end(), distributorId).detail("Result","Success");
queuedRelocations--;
finishRelocation(rd.priority);
finishRelocation(rd.priority, rd.healthPriority);
// now we are launching: remove this entry from the queue of all the src servers
for( int i = 0; i < rd.src.size(); i++ ) {
@ -841,7 +862,7 @@ struct DDQueueData {
launch( rrs, busymap );
activeRelocations++;
startRelocation(rrs.priority);
startRelocation(rrs.priority, rrs.healthPriority);
inFlightActors.insert( rrs.keys, dataDistributionRelocator( this, rrs ) );
}
@ -912,10 +933,10 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
bestTeams.clear();
while( tciIndex < self->teamCollections.size() ) {
double inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_HEALTHY;
if(rd.priority >= PRIORITY_TEAM_UNHEALTHY) inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_UNHEALTHY;
if(rd.priority >= PRIORITY_TEAM_1_LEFT) inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_ONE_LEFT;
if(rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY || rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT) inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_UNHEALTHY;
if(rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT || rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT) inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_ONE_LEFT;
auto req = GetTeamRequest(rd.wantsNewServers, rd.priority == PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, true, inflightPenalty);
auto req = GetTeamRequest(rd.wantsNewServers, rd.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, true, inflightPenalty);
req.sources = rd.src;
req.completeSources = rd.completeSources;
Optional<Reference<IDataDistributionTeam>> bestTeam = wait(brokenPromiseToNever(self->teamCollections[tciIndex].getTeam.getReply(req)));
@ -1154,7 +1175,7 @@ ACTOR Future<bool> rebalanceTeams( DDQueueData* self, int priority, Reference<ID
std::vector<KeyRange> shards = self->shardsAffectedByTeamFailure->getShardsFor( ShardsAffectedByTeamFailure::Team( sourceTeam->getServerIDs(), primary ) );
for( int i = 0; i < shards.size(); i++ ) {
if( moveShard == shards[i] ) {
TraceEvent(priority == PRIORITY_REBALANCE_OVERUTILIZED_TEAM ? "BgDDMountainChopper" : "BgDDValleyFiller", self->distributorId)
TraceEvent(priority == SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM ? "BgDDMountainChopper" : "BgDDValleyFiller", self->distributorId)
.detail("SourceBytes", sourceBytes)
.detail("DestBytes", destBytes)
.detail("ShardBytes", metrics.bytes)
@ -1197,7 +1218,7 @@ ACTOR Future<Void> BgDDMountainChopper( DDQueueData* self, int teamCollectionInd
std::max(rebalancePollingInterval, SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL);
continue;
}
if (self->priority_relocations[PRIORITY_REBALANCE_OVERUTILIZED_TEAM] <
if (self->priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM] <
SERVER_KNOBS->DD_REBALANCE_PARALLELISM) {
state Optional<Reference<IDataDistributionTeam>> randomTeam = wait(brokenPromiseToNever(
self->teamCollections[teamCollectionIndex].getTeam.getReply(GetTeamRequest(true, false, true))));
@ -1208,7 +1229,7 @@ ACTOR Future<Void> BgDDMountainChopper( DDQueueData* self, int teamCollectionInd
GetTeamRequest(true, true, false))));
if (loadedTeam.present()) {
bool moved =
wait(rebalanceTeams(self, PRIORITY_REBALANCE_OVERUTILIZED_TEAM, loadedTeam.get(),
wait(rebalanceTeams(self, SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM, loadedTeam.get(),
randomTeam.get(), teamCollectionIndex == 0));
if (moved) {
resetCount = 0;
@ -1266,7 +1287,7 @@ ACTOR Future<Void> BgDDValleyFiller( DDQueueData* self, int teamCollectionIndex)
std::max(rebalancePollingInterval, SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL);
continue;
}
if (self->priority_relocations[PRIORITY_REBALANCE_UNDERUTILIZED_TEAM] <
if (self->priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM] <
SERVER_KNOBS->DD_REBALANCE_PARALLELISM) {
state Optional<Reference<IDataDistributionTeam>> randomTeam = wait(brokenPromiseToNever(
self->teamCollections[teamCollectionIndex].getTeam.getReply(GetTeamRequest(true, false, false))));
@ -1276,7 +1297,7 @@ ACTOR Future<Void> BgDDValleyFiller( DDQueueData* self, int teamCollectionIndex)
if (unloadedTeam.present()) {
if (unloadedTeam.get()->getMinFreeSpaceRatio() > SERVER_KNOBS->FREE_SPACE_RATIO_DD_CUTOFF) {
bool moved =
wait(rebalanceTeams(self, PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, randomTeam.get(),
wait(rebalanceTeams(self, SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, randomTeam.get(),
unloadedTeam.get(), teamCollectionIndex == 0));
if (moved) {
resetCount = 0;
@ -1382,7 +1403,7 @@ ACTOR Future<Void> dataDistributionQueue(
}
when ( RelocateData done = waitNext( self.relocationComplete.getFuture() ) ) {
self.activeRelocations--;
self.finishRelocation(done.priority);
self.finishRelocation(done.priority, done.healthPriority);
self.fetchKeysComplete.erase( done );
//self.logRelocation( done, "ShardRelocatorDone" );
actors.add( tag( delay(0, TaskPriority::DataDistributionLaunch), done.keys, rangesComplete ) );
@ -1400,24 +1421,32 @@ ACTOR Future<Void> dataDistributionQueue(
recordMetrics = delay(SERVER_KNOBS->DD_QUEUE_LOGGING_INTERVAL);
int lowPriorityRelocations = 0, highPriorityRelocations = 0, highestPriorityRelocation = 0;
int highestPriorityRelocation = 0;
for( auto it = self.priority_relocations.begin(); it != self.priority_relocations.end(); ++it ) {
if (it->second)
if (it->second) {
highestPriorityRelocation = std::max(highestPriorityRelocation, it->first);
if( it->first < 200 )
lowPriorityRelocations += it->second;
else
highPriorityRelocations += it->second;
}
}
TraceEvent("MovingData", distributorId)
.detail( "InFlight", self.activeRelocations )
.detail( "InQueue", self.queuedRelocations )
.detail( "AverageShardSize", req.getFuture().isReady() ? req.getFuture().get() : -1 )
.detail( "LowPriorityRelocations", lowPriorityRelocations )
.detail( "HighPriorityRelocations", highPriorityRelocations )
.detail( "UnhealthyRelocations", self.unhealthyRelocations )
.detail( "HighestPriority", highestPriorityRelocation )
.detail( "BytesWritten", self.bytesWritten )
.detail( "PriorityRecoverMove", self.priority_relocations[SERVER_KNOBS->PRIORITY_RECOVER_MOVE] )
.detail( "PriorityRebalanceUnderutilizedTeam", self.priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM] )
.detail( "PriorityRebalannceOverutilizedTeam", self.priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM] )
.detail( "PriorityTeamHealthy", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_HEALTHY] )
.detail( "PriorityTeamContainsUndesiredServer", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER] )
.detail( "PriorityTeamRedundant", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT] )
.detail( "PriorityMergeShard", self.priority_relocations[SERVER_KNOBS->PRIORITY_MERGE_SHARD] )
.detail( "PriorityTeamUnhealthy", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY] )
.detail( "PriorityTeam2Left", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_2_LEFT] )
.detail( "PriorityTeam1Left", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_1_LEFT] )
.detail( "PriorityTeam0Left", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_0_LEFT] )
.detail( "PrioritySplitShard", self.priority_relocations[SERVER_KNOBS->PRIORITY_SPLIT_SHARD] )
.trackLatest( "MovingData" );
}
when ( wait( self.error.getFuture() ) ) {} // Propagate errors from dataDistributionRelocator

View File

@ -69,6 +69,7 @@ struct DataDistributionTracker {
KeyRangeMap< ShardTrackedData > shards;
ActorCollection sizeChanges;
int64_t systemSizeEstimate;
Reference<AsyncVar<int64_t>> dbSizeEstimate;
Reference<AsyncVar<Optional<int64_t>>> maxShardSize;
Future<Void> maxShardSizeUpdater;
@ -81,7 +82,7 @@ struct DataDistributionTracker {
Reference<AsyncVar<bool>> anyZeroHealthyTeams;
DataDistributionTracker(Database cx, UID distributorId, Promise<Void> const& readyToStart, PromiseStream<RelocateShard> const& output, Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure, Reference<AsyncVar<bool>> anyZeroHealthyTeams)
: cx(cx), distributorId( distributorId ), dbSizeEstimate( new AsyncVar<int64_t>() ),
: cx(cx), distributorId( distributorId ), dbSizeEstimate( new AsyncVar<int64_t>() ), systemSizeEstimate(0),
maxShardSize( new AsyncVar<Optional<int64_t>>() ),
sizeChanges(false), readyToStart(readyToStart), output( output ), shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), anyZeroHealthyTeams(anyZeroHealthyTeams) {}
@ -138,8 +139,7 @@ int64_t getMaxShardSize( double dbSizeEstimate ) {
ACTOR Future<Void> trackShardBytes(
DataDistributionTracker* self,
KeyRange keys,
Reference<AsyncVar<Optional<StorageMetrics>>> shardSize,
bool addToSizeEstimate = true)
Reference<AsyncVar<Optional<StorageMetrics>>> shardSize)
{
wait( delay( 0, TaskPriority::DataDistribution ) );
@ -203,8 +203,12 @@ ACTOR Future<Void> trackShardBytes(
.detail("OldShardSize", shardSize->get().present() ? shardSize->get().get().metrics.bytes : 0)
.detail("TrackerID", trackerID);*/
if( shardSize->get().present() && addToSizeEstimate )
if( shardSize->get().present() ) {
self->dbSizeEstimate->set( self->dbSizeEstimate->get() + metrics.bytes - shardSize->get().get().bytes );
if(keys.begin >= systemKeys.begin) {
self->systemSizeEstimate += metrics.bytes - shardSize->get().get().bytes;
}
}
shardSize->set( metrics );
}
@ -254,10 +258,15 @@ ACTOR Future<int64_t> getFirstSize( Reference<AsyncVar<Optional<StorageMetrics>>
}
}
ACTOR Future<Void> changeSizes( DataDistributionTracker* self, KeyRangeRef keys, int64_t oldShardsEndingSize ) {
ACTOR Future<Void> changeSizes( DataDistributionTracker* self, KeyRange keys, int64_t oldShardsEndingSize ) {
state vector<Future<int64_t>> sizes;
state vector<Future<int64_t>> systemSizes;
for (auto it : self->shards.intersectingRanges(keys) ) {
sizes.push_back( getFirstSize( it->value().stats ) );
Future<int64_t> thisSize = getFirstSize( it->value().stats );
sizes.push_back( thisSize );
if(it->range().begin >= systemKeys.begin) {
systemSizes.push_back( thisSize );
}
}
wait( waitForAll( sizes ) );
@ -267,12 +276,20 @@ ACTOR Future<Void> changeSizes( DataDistributionTracker* self, KeyRangeRef keys,
for ( int i = 0; i < sizes.size(); i++ )
newShardsStartingSize += sizes[i].get();
int64_t newSystemShardsStartingSize = 0;
for ( int i = 0; i < systemSizes.size(); i++ )
newSystemShardsStartingSize += systemSizes[i].get();
int64_t totalSizeEstimate = self->dbSizeEstimate->get();
/*TraceEvent("TrackerChangeSizes")
.detail("TotalSizeEstimate", totalSizeEstimate)
.detail("EndSizeOfOldShards", oldShardsEndingSize)
.detail("StartingSizeOfNewShards", newShardsStartingSize);*/
self->dbSizeEstimate->set( totalSizeEstimate + newShardsStartingSize - oldShardsEndingSize );
self->systemSizeEstimate += newSystemShardsStartingSize;
if(keys.begin >= systemKeys.begin) {
self->systemSizeEstimate -= oldShardsEndingSize;
}
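// Note (assumption, not stated in the change): oldShardsEndingSize is subtracted from systemSizeEstimate
// only when the affected range begins inside the system keyspace, since only those old shards were ever
// counted toward the system estimate.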
return Void();
}
@ -352,12 +369,12 @@ ACTOR Future<Void> shardSplitter(
for( int i = 0; i < skipRange; i++ ) {
KeyRangeRef r(splitKeys[i], splitKeys[i+1]);
self->shardsAffectedByTeamFailure->defineShard( r );
self->output.send( RelocateShard( r, PRIORITY_SPLIT_SHARD) );
self->output.send( RelocateShard( r, SERVER_KNOBS->PRIORITY_SPLIT_SHARD) );
}
for( int i = numShards-1; i > skipRange; i-- ) {
KeyRangeRef r(splitKeys[i], splitKeys[i+1]);
self->shardsAffectedByTeamFailure->defineShard( r );
self->output.send( RelocateShard( r, PRIORITY_SPLIT_SHARD) );
self->output.send( RelocateShard( r, SERVER_KNOBS->PRIORITY_SPLIT_SHARD) );
}
self->sizeChanges.add( changeSizes( self, keys, shardSize->get().get().bytes ) );
@ -458,7 +475,7 @@ Future<Void> shardMerger(
restartShardTrackers( self, mergeRange, endingStats );
self->shardsAffectedByTeamFailure->defineShard( mergeRange );
self->output.send( RelocateShard( mergeRange, PRIORITY_MERGE_SHARD ) );
self->output.send( RelocateShard( mergeRange, SERVER_KNOBS->PRIORITY_MERGE_SHARD ) );
// We are about to be cancelled by the call to restartShardTrackers
return Void();
@ -641,7 +658,7 @@ ACTOR Future<Void> fetchShardMetrics_impl( DataDistributionTracker* self, GetMet
ACTOR Future<Void> fetchShardMetrics( DataDistributionTracker* self, GetMetricsRequest req ) {
choose {
when( wait( fetchShardMetrics_impl( self, req ) ) ) {}
when( wait( delay( SERVER_KNOBS->DD_SHARD_METRICS_TIMEOUT ) ) ) {
when( wait( delay( SERVER_KNOBS->DD_SHARD_METRICS_TIMEOUT, TaskPriority::DataDistribution ) ) ) {
TEST(true); // DD_SHARD_METRICS_TIMEOUT
StorageMetrics largeMetrics;
largeMetrics.bytes = SERVER_KNOBS->MAX_SHARD_BYTES;
@ -676,6 +693,7 @@ ACTOR Future<Void> dataDistributionTracker(
TraceEvent("DDTrackerStats", self.distributorId)
.detail("Shards", self.shards.size())
.detail("TotalSizeBytes", self.dbSizeEstimate->get())
.detail("SystemSizeBytes", self.systemSizeEstimate)
.trackLatest( "DDTrackerStats" );
loggingTrigger = delay(SERVER_KNOBS->DATA_DISTRIBUTION_LOGGING_INTERVAL);

View File

@ -106,6 +106,19 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( INFLIGHT_PENALTY_ONE_LEFT, 1000.0 );
init( MERGE_ONTO_NEW_TEAM, 1 ); if( randomize && BUGGIFY ) MERGE_ONTO_NEW_TEAM = deterministicRandom()->coinflip() ? 0 : 2;
init( PRIORITY_RECOVER_MOVE, 110 );
init( PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, 120 );
init( PRIORITY_REBALANCE_OVERUTILIZED_TEAM, 121 );
init( PRIORITY_TEAM_HEALTHY, 140 );
init( PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER, 150 );
init( PRIORITY_TEAM_REDUNDANT, 200 );
init( PRIORITY_MERGE_SHARD, 340 );
init( PRIORITY_TEAM_UNHEALTHY, 700 );
init( PRIORITY_TEAM_2_LEFT, 709 );
init( PRIORITY_TEAM_1_LEFT, 800 );
init( PRIORITY_TEAM_0_LEFT, 809 );
init( PRIORITY_SPLIT_SHARD, 900 ); if( randomize && BUGGIFY ) PRIORITY_SPLIT_SHARD = 350;
// Data distribution
init( RETRY_RELOCATESHARD_DELAY, 0.1 );
init( DATA_DISTRIBUTION_FAILURE_REACTION_TIME, 60.0 ); if( randomize && BUGGIFY ) DATA_DISTRIBUTION_FAILURE_REACTION_TIME = 1.0;
@ -304,6 +317,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( ENFORCED_MIN_RECOVERY_DURATION, 0.085 ); if( shortRecoveryDuration ) ENFORCED_MIN_RECOVERY_DURATION = 0.01;
init( REQUIRED_MIN_RECOVERY_DURATION, 0.080 ); if( shortRecoveryDuration ) REQUIRED_MIN_RECOVERY_DURATION = 0.01;
init( ALWAYS_CAUSAL_READ_RISKY, false );
init( MAX_COMMIT_UPDATES, 100000 ); if( randomize && BUGGIFY ) MAX_COMMIT_UPDATES = 1;
// Master Server
// masterCommitter() in the master server will allow lower priority tasks (e.g. DataDistibution)
@ -485,6 +499,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( STATUS_MIN_TIME_BETWEEN_REQUESTS, 0.0 );
init( MAX_STATUS_REQUESTS_PER_SECOND, 256.0 );
init( CONFIGURATION_ROWS_TO_FETCH, 20000 );
init( DISABLE_DUPLICATE_LOG_WARNING, false );
// IPager
init( PAGER_RESERVED_PAGES, 1 );

View File

@ -106,6 +106,24 @@ public:
double INFLIGHT_PENALTY_ONE_LEFT;
int MERGE_ONTO_NEW_TEAM; // Merges will request new servers. 0 for off, 1 for \xff only, 2 for all shards.
// Higher priorities are executed first
// Priority/100 is the "priority group"/"superpriority". Priority inversion
// is possible within but not between priority groups; fewer priority groups
// mean better worst-case time bounds
// Maximum allowable priority is 999.
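// Worked example (using the default values set in Knobs.cpp in this change, purely illustrative):
// the group is priority / 100, so PRIORITY_TEAM_1_LEFT (800) and PRIORITY_TEAM_0_LEFT (809) are both
// in group 8 and may be inverted with each other, while PRIORITY_SPLIT_SHARD (900) is in group 9 and
// cannot be inverted with group-8 work.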
int PRIORITY_RECOVER_MOVE;
int PRIORITY_REBALANCE_UNDERUTILIZED_TEAM;
int PRIORITY_REBALANCE_OVERUTILIZED_TEAM;
int PRIORITY_TEAM_HEALTHY;
int PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER;
int PRIORITY_TEAM_REDUNDANT;
int PRIORITY_MERGE_SHARD;
int PRIORITY_TEAM_UNHEALTHY;
int PRIORITY_TEAM_2_LEFT;
int PRIORITY_TEAM_1_LEFT;
int PRIORITY_TEAM_0_LEFT;
int PRIORITY_SPLIT_SHARD;
// Data distribution
double RETRY_RELOCATESHARD_DELAY;
double DATA_DISTRIBUTION_FAILURE_REACTION_TIME;
@ -244,6 +262,7 @@ public:
double ENFORCED_MIN_RECOVERY_DURATION;
double REQUIRED_MIN_RECOVERY_DURATION;
bool ALWAYS_CAUSAL_READ_RISKY;
int MAX_COMMIT_UPDATES;
// Master Server
double COMMIT_SLEEP_TIME;
@ -423,6 +442,7 @@ public:
double STATUS_MIN_TIME_BETWEEN_REQUESTS;
double MAX_STATUS_REQUESTS_PER_SECOND;
int CONFIGURATION_ROWS_TO_FETCH;
bool DISABLE_DUPLICATE_LOG_WARNING;
// IPager
int PAGER_RESERVED_PAGES;

View File

@ -236,6 +236,7 @@ struct ProxyCommitData {
Optional<LatencyBandConfig> latencyBandConfig;
double lastStartCommit;
double lastCommitLatency;
int updateCommitRequests = 0;
NotifiedDouble lastCommitTime;
//The tags related to a storage server rarely change, so we keep a vector of tags for each key range to be slightly more CPU efficient.
@ -810,27 +811,32 @@ ACTOR Future<Void> commitBatch(
// Serialize and backup the mutations as a single mutation
if ((self->vecBackupKeys.size() > 1) && logRangeMutations.size()) {
Key val;
MutationRef backupMutation;
uint32_t* partBuffer = NULL;
state std::map<Key, MutationListRef>::iterator logRangeMutation = logRangeMutations.begin();
// Serialize the log range mutations within the map
for (auto& logRangeMutation : logRangeMutations)
for (; logRangeMutation != logRangeMutations.end(); ++logRangeMutation)
{
if(yieldBytes > SERVER_KNOBS->DESIRED_TOTAL_BYTES) {
yieldBytes = 0;
wait(yield());
}
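// Note (assumption about intent): logRangeMutation is now a state iterator so the loop can survive the
// wait(yield()) above, letting large backup mutation sets be serialized without monopolizing the proxy.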
yieldBytes += logRangeMutation->second.expectedSize();
BinaryWriter wr(Unversioned());
// Serialize the log destination
wr.serializeBytes( logRangeMutation.first );
wr.serializeBytes( logRangeMutation->first );
// Write the log keys and version information
wr << (uint8_t)hashlittle(&v, sizeof(v), 0);
wr << bigEndian64(commitVersion);
MutationRef backupMutation;
backupMutation.type = MutationRef::SetValue;
partBuffer = NULL;
uint32_t* partBuffer = NULL;
val = BinaryWriter::toValue(logRangeMutation.second, IncludeVersion());
Key val = BinaryWriter::toValue(logRangeMutation->second, IncludeVersion());
for (int part = 0; part * CLIENT_KNOBS->MUTATION_BLOCK_SIZE < val.size(); part++) {
@ -852,7 +858,7 @@ ACTOR Future<Void> commitBatch(
// Define the mutation type and location
backupMutation.param1 = wr.toValue();
ASSERT( backupMutation.param1.startsWith(logRangeMutation.first) ); // We are writing into the configured destination
ASSERT( backupMutation.param1.startsWith(logRangeMutation->first) ); // We are writing into the configured destination
auto& tags = self->tagsForKey(backupMutation.param1);
toCommit.addTags(tags);
@ -1040,7 +1046,9 @@ ACTOR Future<Void> commitBatch(
ACTOR Future<Void> updateLastCommit(ProxyCommitData* self, Optional<UID> debugID = Optional<UID>()) {
state double confirmStart = now();
self->lastStartCommit = confirmStart;
self->updateCommitRequests++;
wait(self->logSystem->confirmEpochLive(debugID));
self->updateCommitRequests--;
self->lastCommitLatency = now()-confirmStart;
self->lastCommitTime = std::max(self->lastCommitTime.get(), confirmStart);
return Void();
@ -1448,7 +1456,12 @@ ACTOR Future<Void> lastCommitUpdater(ProxyCommitData* self, PromiseStream<Future
if(elapsed < interval) {
wait( delay(interval + 0.0001 - elapsed) );
} else {
addActor.send(updateLastCommit(self));
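// Sketch of the intent (an assumption): updateCommitRequests counts in-flight confirmEpochLive() probes,
// so capping it at MAX_COMMIT_UPDATES prevents a slow log system from accumulating an unbounded backlog
// of updateLastCommit actors.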
if(self->updateCommitRequests < SERVER_KNOBS->MAX_COMMIT_UPDATES) {
addActor.send(updateLastCommit(self));
} else {
TraceEvent(g_network->isSimulated() ? SevInfo : SevWarnAlways, "TooManyLastCommitUpdates").suppressFor(1.0);
self->lastStartCommit = now();
}
}
}
}

View File

@ -697,7 +697,7 @@ ACTOR Future<Void> configurationMonitor(Reference<AsyncVar<ServerDBInfo>> dbInfo
conf->fromKeyValues( (VectorRef<KeyValueRef>) results );
state Future<Void> watchFuture = tr.watch(moveKeysLockOwnerKey);
state Future<Void> watchFuture = tr.watch(moveKeysLockOwnerKey) || tr.watch(excludedServersVersionKey);
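// Note (assumption): operator|| on futures readies watchFuture when either key changes, so an update to
// excludedServersVersionKey now re-triggers the configuration read, in addition to moveKeysLock ownership changes.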
wait( tr.commit() );
wait( watchFuture );
break;

View File

@ -945,11 +945,8 @@ void SimulationConfig::generateNormalConfig(int minimumReplication, int minimumR
}
}
if (deterministicRandom()->random01() < 0.25) {
int logs = deterministicRandom()->randomInt(1,7);
primaryObj["satellite_logs"] = logs;
remoteObj["satellite_logs"] = logs;
}
if (deterministicRandom()->random01() < 0.25) primaryObj["satellite_logs"] = deterministicRandom()->randomInt(1,7);
if (deterministicRandom()->random01() < 0.25) remoteObj["satellite_logs"] = deterministicRandom()->randomInt(1,7);
//We cannot run with a remote DC when MAX_READ_TRANSACTION_LIFE_VERSIONS is too small, because the log routers will not be able to keep up.
if (minimumRegions <= 1 && (deterministicRandom()->random01() < 0.25 || SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS < SERVER_KNOBS->VERSIONS_PER_SECOND)) {
@ -998,12 +995,14 @@ void SimulationConfig::generateNormalConfig(int minimumReplication, int minimumR
primarySatelliteObj["id"] = useNormalDCsAsSatellites ? "1" : "2";
primarySatelliteObj["priority"] = 1;
primarySatelliteObj["satellite"] = 1;
if (deterministicRandom()->random01() < 0.25) primarySatelliteObj["satellite_logs"] = deterministicRandom()->randomInt(1,7);
primaryDcArr.push_back(primarySatelliteObj);
StatusObject remoteSatelliteObj;
remoteSatelliteObj["id"] = useNormalDCsAsSatellites ? "0" : "3";
remoteSatelliteObj["priority"] = 1;
remoteSatelliteObj["satellite"] = 1;
if (deterministicRandom()->random01() < 0.25) remoteSatelliteObj["satellite_logs"] = deterministicRandom()->randomInt(1,7);
remoteDcArr.push_back(remoteSatelliteObj);
if (datacenters > 4) {
@ -1011,12 +1010,14 @@ void SimulationConfig::generateNormalConfig(int minimumReplication, int minimumR
primarySatelliteObjB["id"] = useNormalDCsAsSatellites ? "2" : "4";
primarySatelliteObjB["priority"] = 1;
primarySatelliteObjB["satellite"] = 1;
if (deterministicRandom()->random01() < 0.25) primarySatelliteObjB["satellite_logs"] = deterministicRandom()->randomInt(1,7);
primaryDcArr.push_back(primarySatelliteObjB);
StatusObject remoteSatelliteObjB;
remoteSatelliteObjB["id"] = useNormalDCsAsSatellites ? "2" : "5";
remoteSatelliteObjB["priority"] = 1;
remoteSatelliteObjB["satellite"] = 1;
if (deterministicRandom()->random01() < 0.25) remoteSatelliteObjB["satellite_logs"] = deterministicRandom()->randomInt(1,7);
remoteDcArr.push_back(remoteSatelliteObjB);
}
if (useNormalDCsAsSatellites) {

View File

@ -1122,33 +1122,102 @@ ACTOR static Future<JsonBuilderObject> latencyProbeFetcher(Database cx, JsonBuil
return statusObj;
}
ACTOR static Future<Void> consistencyCheckStatusFetcher(Database cx, JsonBuilderArray *messages, std::set<std::string> *incomplete_reasons, bool isAvailable) {
if(isAvailable) {
try {
state Transaction tr(cx);
loop {
try {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
Optional<Value> ccSuspendVal = wait(timeoutError(BUGGIFY ? Never() : tr.get(fdbShouldConsistencyCheckBeSuspended), 5.0));
bool ccSuspend = ccSuspendVal.present() ? BinaryReader::fromStringRef<bool>(ccSuspendVal.get(), Unversioned()) : false;
if(ccSuspend) {
messages->push_back(JsonString::makeMessage("consistencycheck_disabled", "Consistency checker is disabled."));
}
break;
} catch(Error &e) {
if(e.code() == error_code_timed_out) {
messages->push_back(JsonString::makeMessage("consistencycheck_suspendkey_fetch_timeout",
format("Timed out trying to fetch `%s` from the database.", printable(fdbShouldConsistencyCheckBeSuspended).c_str()).c_str()));
break;
}
wait(tr.onError(e));
ACTOR static Future<Void> consistencyCheckStatusFetcher(Database cx, JsonBuilderArray *messages, std::set<std::string> *incomplete_reasons) {
try {
state Transaction tr(cx);
loop {
try {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
Optional<Value> ccSuspendVal = wait(timeoutError(BUGGIFY ? Never() : tr.get(fdbShouldConsistencyCheckBeSuspended), 5.0));
bool ccSuspend = ccSuspendVal.present() ? BinaryReader::fromStringRef<bool>(ccSuspendVal.get(), Unversioned()) : false;
if(ccSuspend) {
messages->push_back(JsonString::makeMessage("consistencycheck_disabled", "Consistency checker is disabled."));
}
break;
} catch(Error &e) {
if(e.code() == error_code_timed_out) {
messages->push_back(JsonString::makeMessage("consistencycheck_suspendkey_fetch_timeout",
format("Timed out trying to fetch `%s` from the database.", printable(fdbShouldConsistencyCheckBeSuspended).c_str()).c_str()));
break;
}
wait(tr.onError(e));
}
} catch(Error &e) {
incomplete_reasons->insert(format("Unable to retrieve consistency check settings (%s).", e.what()));
}
} catch(Error &e) {
incomplete_reasons->insert(format("Unable to retrieve consistency check settings (%s).", e.what()));
}
return Void();
}
struct LogRangeAndUID {
KeyRange range;
UID destID;
LogRangeAndUID(KeyRange const& range, UID const& destID) : range(range), destID(destID) {}
bool operator < (LogRangeAndUID const& r) const {
if(range.begin != r.range.begin) return range.begin < r.range.begin;
if(range.end != r.range.end) return range.end < r.range.end;
return destID < r.destID;
}
};
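// Reading of the fetcher below (a summary, not authoritative): each destUidLookupPrefix entry maps a key
// range to the UID of a mutation log stream. Two live entries covering the same range imply backup and DR
// are each writing their own stream of that range's mutations, which raises duplicate_mutation_streams;
// entries whose range/UID pair no longer appears under logRangesRange are treated as stale and cleared.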
ACTOR static Future<Void> logRangeWarningFetcher(Database cx, JsonBuilderArray *messages, std::set<std::string> *incomplete_reasons) {
try {
state Transaction tr(cx);
state Future<Void> timeoutFuture = timeoutError(Future<Void>(Never()), 5.0);
loop {
try {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
state Future<Standalone<RangeResultRef>> existingDestUidValues = tr.getRange(KeyRangeRef(destUidLookupPrefix, strinc(destUidLookupPrefix)), CLIENT_KNOBS->TOO_MANY);
state Future<Standalone<RangeResultRef>> existingLogRanges = tr.getRange(logRangesRange, CLIENT_KNOBS->TOO_MANY);
wait( (success(existingDestUidValues) && success(existingLogRanges)) || timeoutFuture );
std::set<LogRangeAndUID> loggingRanges;
for(auto& it : existingLogRanges.get()) {
Key logDestination;
UID logUid;
KeyRef logRangeBegin = logRangesDecodeKey(it.key, &logUid);
Key logRangeEnd = logRangesDecodeValue(it.value, &logDestination);
loggingRanges.insert(LogRangeAndUID(KeyRangeRef(logRangeBegin, logRangeEnd), logUid));
}
std::set<std::pair<Key,Key>> existingRanges;
for(auto& it : existingDestUidValues.get()) {
KeyRange range = BinaryReader::fromStringRef<KeyRange>(it.key.removePrefix(destUidLookupPrefix), IncludeVersion());
UID logUid = BinaryReader::fromStringRef<UID>(it.value, Unversioned());
if(loggingRanges.count(LogRangeAndUID(range, logUid))) {
std::pair<Key,Key> rangePair = std::make_pair(range.begin,range.end);
if(existingRanges.count(rangePair)) {
messages->push_back(JsonString::makeMessage("duplicate_mutation_streams", format("Backup and DR are not sharing the same stream of mutations for `%s` - `%s`", printable(range.begin).c_str(), printable(range.end).c_str()).c_str()));
break;
}
existingRanges.insert(rangePair);
} else {
//This cleanup is done during status, because it should only be required once after upgrading to 6.2.7 or later.
//There is no other good location to detect that the metadata is mismatched.
TraceEvent(SevWarnAlways, "CleaningDestUidLookup").detail("K", it.key.printable()).detail("V", it.value.printable());
tr.clear(it.key);
}
}
wait(tr.commit() || timeoutFuture);
break;
} catch(Error &e) {
if(e.code() == error_code_timed_out) {
messages->push_back(JsonString::makeMessage("duplicate_mutation_fetch_timeout",
format("Timed out trying to fetch `%s` from the database.", printable(destUidLookupPrefix).c_str()).c_str()));
break;
}
wait(tr.onError(e));
}
}
} catch(Error &e) {
incomplete_reasons->insert(format("Unable to retrieve log ranges (%s).", e.what()));
}
return Void();
}
@ -1274,7 +1343,7 @@ static JsonBuilderObject configurationFetcher(Optional<DatabaseConfiguration> co
return statusObj;
}
ACTOR static Future<JsonBuilderObject> dataStatusFetcher(WorkerDetails ddWorker, int *minReplicasRemaining) {
ACTOR static Future<JsonBuilderObject> dataStatusFetcher(WorkerDetails ddWorker, DatabaseConfiguration configuration, int *minReplicasRemaining) {
state JsonBuilderObject statusObjData;
try {
@ -1328,6 +1397,7 @@ ACTOR static Future<JsonBuilderObject> dataStatusFetcher(WorkerDetails ddWorker,
if (dataStats.size())
{
statusObjData.setKeyRawNumber("total_kv_size_bytes",dataStats.getValue("TotalSizeBytes"));
statusObjData.setKeyRawNumber("system_kv_size_bytes",dataStats.getValue("SystemSizeBytes"));
statusObjData.setKeyRawNumber("partitions_count",dataStats.getValue("Shards"));
}
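// Hypothetical status excerpt built from the fields above (values illustrative only):
//   "data" : { "total_kv_size_bytes" : 12884901888, "system_kv_size_bytes" : 104857600, "partitions_count" : 1824, ... }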
@ -1338,13 +1408,14 @@ ACTOR static Future<JsonBuilderObject> dataStatusFetcher(WorkerDetails ddWorker,
continue;
}
int replicas = configuration.storageTeamSize;
bool primary = inFlight.getInt("Primary");
int highestPriority = inFlight.getInt("HighestPriority");
if (movingHighestPriority < PRIORITY_TEAM_REDUNDANT) {
if (movingHighestPriority < SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT) {
highestPriority = movingHighestPriority;
} else if (partitionsInFlight > 0) {
highestPriority = std::max<int>(highestPriority, PRIORITY_MERGE_SHARD);
highestPriority = std::max<int>(highestPriority, SERVER_KNOBS->PRIORITY_MERGE_SHARD);
}
JsonBuilderObject team_tracker;
@ -1353,53 +1424,47 @@ ACTOR static Future<JsonBuilderObject> dataStatusFetcher(WorkerDetails ddWorker,
team_tracker.setKeyRawNumber("unhealthy_servers",inFlight.getValue("UnhealthyServers"));
JsonBuilderObject stateSectionObj;
if (highestPriority >= PRIORITY_TEAM_0_LEFT) {
if (highestPriority >= SERVER_KNOBS->PRIORITY_TEAM_0_LEFT) {
stateSectionObj["healthy"] = false;
stateSectionObj["name"] = "missing_data";
stateSectionObj["description"] = "No replicas remain of some data";
stateSectionObj["min_replicas_remaining"] = 0;
if(primary) {
*minReplicasRemaining = 0;
}
replicas = 0;
}
else if (highestPriority >= PRIORITY_TEAM_1_LEFT) {
else if (highestPriority >= SERVER_KNOBS->PRIORITY_TEAM_1_LEFT) {
stateSectionObj["healthy"] = false;
stateSectionObj["name"] = "healing";
stateSectionObj["description"] = "Only one replica remains of some data";
stateSectionObj["min_replicas_remaining"] = 1;
if(primary) {
*minReplicasRemaining = 1;
}
replicas = 1;
}
else if (highestPriority >= PRIORITY_TEAM_2_LEFT) {
else if (highestPriority >= SERVER_KNOBS->PRIORITY_TEAM_2_LEFT) {
stateSectionObj["healthy"] = false;
stateSectionObj["name"] = "healing";
stateSectionObj["description"] = "Only two replicas remain of some data";
stateSectionObj["min_replicas_remaining"] = 2;
if(primary) {
*minReplicasRemaining = 2;
}
replicas = 2;
}
else if (highestPriority >= PRIORITY_TEAM_UNHEALTHY) {
else if (highestPriority >= SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY) {
stateSectionObj["healthy"] = false;
stateSectionObj["name"] = "healing";
stateSectionObj["description"] = "Restoring replication factor";
} else if (highestPriority >= PRIORITY_MERGE_SHARD) {
} else if (highestPriority >= SERVER_KNOBS->PRIORITY_MERGE_SHARD) {
stateSectionObj["healthy"] = true;
stateSectionObj["name"] = "healthy_repartitioning";
stateSectionObj["description"] = "Repartitioning.";
} else if (highestPriority >= PRIORITY_TEAM_REDUNDANT) {
} else if (highestPriority >= SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT) {
stateSectionObj["healthy"] = true;
stateSectionObj["name"] = "optimizing_team_collections";
stateSectionObj["description"] = "Optimizing team collections";
} else if (highestPriority >= PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER) {
} else if (highestPriority >= SERVER_KNOBS->PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER) {
stateSectionObj["healthy"] = true;
stateSectionObj["name"] = "healthy_removing_server";
stateSectionObj["description"] = "Removing storage server";
} else if (highestPriority == PRIORITY_TEAM_HEALTHY) {
} else if (highestPriority == SERVER_KNOBS->PRIORITY_TEAM_HEALTHY) {
stateSectionObj["healthy"] = true;
stateSectionObj["name"] = "healthy";
} else if (highestPriority >= PRIORITY_REBALANCE_SHARD) {
} else if (highestPriority >= SERVER_KNOBS->PRIORITY_RECOVER_MOVE) {
stateSectionObj["healthy"] = true;
stateSectionObj["name"] = "healthy_rebalancing";
stateSectionObj["description"] = "Rebalancing";
@ -1415,6 +1480,13 @@ ACTOR static Future<JsonBuilderObject> dataStatusFetcher(WorkerDetails ddWorker,
statusObjData["state"] = stateSectionObj;
}
}
if(primary) {
*minReplicasRemaining = std::max(*minReplicasRemaining, 0) + replicas;
}
else if(replicas > 0) {
*minReplicasRemaining = std::max(*minReplicasRemaining, 0) + 1;
}
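// Reading of the accumulation above (an assumption): the primary region contributes its worst-case
// replica count and each non-primary region that still has replicas adds one, so the value returned to
// the caller spans regions rather than a single team collection.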
}
statusObjData["team_trackers"] = teamTrackers;
}
@ -2219,7 +2291,13 @@ ACTOR Future<StatusReply> clusterGetStatus(
statusObj["latency_probe"] = latencyProbeResults;
}
wait(consistencyCheckStatusFetcher(cx, &messages, &status_incomplete_reasons, isAvailable));
state std::vector<Future<Void>> warningFutures;
if(isAvailable) {
warningFutures.push_back( consistencyCheckStatusFetcher(cx, &messages, &status_incomplete_reasons) );
if(!SERVER_KNOBS->DISABLE_DUPLICATE_LOG_WARNING) {
warningFutures.push_back( logRangeWarningFetcher(cx, &messages, &status_incomplete_reasons) );
}
}
// Start getting storage servers now (using system priority) concurrently. Using sys priority because having storage servers
// in status output is important to give context to error messages in status that reference a storage server role ID.
@ -2234,7 +2312,7 @@ ACTOR Future<StatusReply> clusterGetStatus(
state int minReplicasRemaining = -1;
std::vector<Future<JsonBuilderObject>> futures2;
futures2.push_back(dataStatusFetcher(ddWorker, &minReplicasRemaining));
futures2.push_back(dataStatusFetcher(ddWorker, configuration.get(), &minReplicasRemaining));
futures2.push_back(workloadStatusFetcher(db, workers, mWorker, rkWorker, &qos, &data_overlay, &status_incomplete_reasons, storageServerFuture));
futures2.push_back(layerStatusFetcher(cx, &messages, &status_incomplete_reasons));
futures2.push_back(lockedStatusFetcher(db, &messages, &status_incomplete_reasons));
@ -2313,6 +2391,7 @@ ACTOR Future<StatusReply> clusterGetStatus(
else {
messages.push_back(JsonBuilder::makeMessage("proxies_error", "Timed out trying to retrieve proxies."));
}
wait( waitForAll(warningFutures) );
}
else {
// Set layers status to { _valid: false, error: "configurationMissing"}

View File

@ -1213,7 +1213,7 @@ ACTOR Future<Void> configurationMonitor( Reference<MasterData> self ) {
self->registrationTrigger.trigger();
}
state Future<Void> watchFuture = tr.watch(moveKeysLockOwnerKey);
state Future<Void> watchFuture = tr.watch(moveKeysLockOwnerKey) || tr.watch(excludedServersVersionKey);
wait(tr.commit());
wait(watchFuture);
break;

View File

@ -106,6 +106,7 @@ struct AddingShard : NonCopyable {
Version transferredVersion;
enum Phase { WaitPrevious, Fetching, Waiting };
Phase phase;
AddingShard( StorageServer* server, KeyRangeRef const& keys );
@ -1948,8 +1949,9 @@ void splitMutation(StorageServer* data, KeyRangeMap<T>& map, MutationRef const&
ACTOR Future<Void> logFetchKeysWarning(AddingShard* shard) {
state double startTime = now();
loop {
wait(delay(600));
TraceEvent(SevWarnAlways, "FetchKeysTooLong").detail("Duration", now() - startTime).detail("Phase", shard->phase).detail("Begin", shard->keys.begin.printable()).detail("End", shard->keys.end.printable());
state double waitSeconds = BUGGIFY ? 5.0 : 600.0;
wait(delay(waitSeconds));
TraceEvent(waitSeconds > 300.0 ? SevWarnAlways : SevInfo, "FetchKeysTooLong").detail("Duration", now() - startTime).detail("Phase", shard->phase).detail("Begin", shard->keys.begin.printable()).detail("End", shard->keys.end.printable());
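// Note (assumption): under BUGGIFY the 5s interval exercises this path in simulation, and its severity
// stays at SevInfo so the early firing is not reported as a warning.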
}
}
@ -2068,6 +2070,7 @@ ACTOR Future<Void> fetchKeys( StorageServer *data, AddingShard* shard ) {
shard->server->addShard( ShardInfo::addingSplitLeft( KeyRangeRef(keys.begin, nfk), shard ) );
shard->server->addShard( ShardInfo::newAdding( data, KeyRangeRef(nfk, keys.end) ) );
shard = data->shards.rangeContaining( keys.begin ).value()->adding;
warningLogger = logFetchKeysWarning(shard);
AddingShard* otherShard = data->shards.rangeContaining( nfk ).value()->adding;
keys = shard->keys;

View File

@ -75,12 +75,14 @@ std::string generateRegions() {
primarySatelliteObj["id"] = "2";
primarySatelliteObj["priority"] = 1;
primarySatelliteObj["satellite"] = 1;
if (deterministicRandom()->random01() < 0.25) primarySatelliteObj["satellite_logs"] = deterministicRandom()->randomInt(1,7);
primaryDcArr.push_back(primarySatelliteObj);
StatusObject remoteSatelliteObj;
remoteSatelliteObj["id"] = "3";
remoteSatelliteObj["priority"] = 1;
remoteSatelliteObj["satellite"] = 1;
if (deterministicRandom()->random01() < 0.25) remoteSatelliteObj["satellite_logs"] = deterministicRandom()->randomInt(1,7);
remoteDcArr.push_back(remoteSatelliteObj);
if(g_simulator.physicalDatacenters > 5 && deterministicRandom()->random01() < 0.5) {
@ -88,12 +90,14 @@ std::string generateRegions() {
primarySatelliteObjB["id"] = "4";
primarySatelliteObjB["priority"] = 1;
primarySatelliteObjB["satellite"] = 1;
if (deterministicRandom()->random01() < 0.25) primarySatelliteObjB["satellite_logs"] = deterministicRandom()->randomInt(1,7);
primaryDcArr.push_back(primarySatelliteObjB);
StatusObject remoteSatelliteObjB;
remoteSatelliteObjB["id"] = "5";
remoteSatelliteObjB["priority"] = 1;
remoteSatelliteObjB["satellite"] = 1;
if (deterministicRandom()->random01() < 0.25) remoteSatelliteObjB["satellite_logs"] = deterministicRandom()->randomInt(1,7);
remoteDcArr.push_back(remoteSatelliteObjB);
int satellite_replication_type = deterministicRandom()->randomInt(0,3);
@ -146,11 +150,8 @@ std::string generateRegions() {
}
}
if (deterministicRandom()->random01() < 0.25) {
int logs = deterministicRandom()->randomInt(1,7);
primaryObj["satellite_logs"] = logs;
remoteObj["satellite_logs"] = logs;
}
if (deterministicRandom()->random01() < 0.25) primaryObj["satellite_logs"] = deterministicRandom()->randomInt(1,7);
if (deterministicRandom()->random01() < 0.25) remoteObj["satellite_logs"] = deterministicRandom()->randomInt(1,7);
int remote_replication_type = deterministicRandom()->randomInt(0, 4);
switch (remote_replication_type) {

View File

@ -44,7 +44,7 @@ struct DDMetricsWorkload : TestWorkload {
TraceEventFields md = wait( timeoutError(masterWorker.eventLogRequest.getReply(
EventLogRequest( LiteralStringRef( "MovingData" ) ) ), 1.0 ) );
int relocations;
sscanf(md.getValue("HighPriorityRelocations").c_str(), "%d", &relocations);
sscanf(md.getValue("UnhealthyRelocations").c_str(), "%d", &relocations);
return relocations;
}

View File

@ -67,7 +67,7 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) {
init( MAX_RECONNECTION_TIME, 0.5 );
init( RECONNECTION_TIME_GROWTH_RATE, 1.2 );
init( RECONNECTION_RESET_TIME, 5.0 );
init( CONNECTION_ACCEPT_DELAY, 0.01 );
init( CONNECTION_ACCEPT_DELAY, 0.5 );
init( USE_OBJECT_SERIALIZER, 1 );
init( TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY, 5.0 );
init( TOO_MANY_CONNECTIONS_CLOSED_TIMEOUT, 20.0 );
@ -212,6 +212,7 @@ bool Knobs::setKnob( std::string const& knob, std::string const& value ) {
}
*bool_knobs[knob] = v;
}
return true;
}
if (int64_knobs.count(knob) || int_knobs.count(knob)) {
int64_t v;

View File

@ -524,6 +524,7 @@ inline static void aligned_free(void* ptr) { free(ptr); }
inline static void* aligned_alloc(size_t alignment, size_t size) { return memalign(alignment, size); }
#endif
#elif defined(__APPLE__)
#if !defined(HAS_ALIGNED_ALLOC)
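// Guard rationale (assumption): newer Apple SDKs provide aligned_alloc themselves, so this fallback is
// only compiled when the build does not define HAS_ALIGNED_ALLOC.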
#include <cstdlib>
inline static void* aligned_alloc(size_t alignment, size_t size) {
// Linux's aligned_alloc() requires alignment to be a power of 2. While posix_memalign()
@ -540,6 +541,7 @@ inline static void* aligned_alloc(size_t alignment, size_t size) {
posix_memalign(&ptr, alignment, size);
return ptr;
}
#endif
inline static void aligned_free(void* ptr) { free(ptr); }
#endif

View File

@ -79,6 +79,7 @@ public: // introduced features
PROTOCOL_VERSION_FEATURE(0x0FDB00A400040000LL, OpenDatabase);
PROTOCOL_VERSION_FEATURE(0x0FDB00A446020000LL, Locality);
PROTOCOL_VERSION_FEATURE(0x0FDB00A460010000LL, MultiGenerationTLog);
PROTOCOL_VERSION_FEATURE(0x0FDB00A460010000LL, SharedMutations);
PROTOCOL_VERSION_FEATURE(0x0FDB00A551000000LL, MultiVersionClient);
PROTOCOL_VERSION_FEATURE(0x0FDB00A560010000LL, TagLocality);
PROTOCOL_VERSION_FEATURE(0x0FDB00B060000000LL, Fearless);

View File

@ -32,7 +32,7 @@
<Wix xmlns='http://schemas.microsoft.com/wix/2006/wi'>
<Product Name='$(var.Title)'
Id='{6BAF0715-4FDE-4F0D-8CA7-E4CAD53519B8}'
Id='{B69CF2EA-9CDC-4373-83E6-3615F9AE393B}'
UpgradeCode='{A95EA002-686E-4164-8356-C715B7F8B1C8}'
Version='$(var.Version)'
Manufacturer='$(var.Manufacturer)'

View File

@ -1,7 +1,7 @@
<?xml version="1.0"?>
<Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<Version>6.2.5</Version>
<Version>6.2.7</Version>
<PackageName>6.2</PackageName>
</PropertyGroup>
</Project>