From d076999f9b8387f2b8d2f9322c861499e3418871 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Tue, 13 Nov 2018 15:53:15 -0800 Subject: [PATCH 01/30] Fix the fix of the build of the bindings target. The bindings build was broken because fdb_c_performance_test failed to link with errors of the form: /tmp/ccym9LhK.o: In function `clearAll': /opt/foundationdb/bindings/c/test/performance_test.c:130: undefined reference to `fdb_transaction_clear_range' /tmp/ccym9LhK.o: In function `insertRange': /opt/foundationdb/bindings/c/test/performance_test.c:139: undefined reference to `fdb_transaction_set' /tmp/ccym9LhK.o: In function `singleKey': /opt/foundationdb/bindings/c/test/performance_test.c:540: undefined reference to `fdb_transaction_set' ... PR #901's e8f20e4 fixed this by adding `-shared` to the invocation line, and thus dynamically linking libfdb_c. Since dynamic linking produced a working executable, the correct set of libraries is being linked; the symbols can eventually be located, just not in the right order, because the linker proactively omits unneeded object files from static libraries. And unfortunately, our performance test framework likely expects to be able to copy the binary, and not have to worry about associated dynamically linked libraries, so a statically linked binary is preferred. The underlying cause of this link error is that the static library preceded the source code on the command line: /usr/bin/gcc -Werror -Wno-error=format -fPIC -DNO_INTELLISENSE -fvisibility=hidden -DNDEBUG=1 -Wreturn-type -fno-omit-frame-pointer -O2 -g -Llib -lfdb_c -lpthread -Ibindings/c -o bin/fdb_c_performance_test bindings/c/test/performance_test.c This comes from the line in the Makefile: @$(CC) $(CFLAGS) $(fdb_c_tests_LIBS) $(fdb_c_tests_HEADERS) -o $@ bindings/c/test/performance_test.c Because we pass `-lfdb_c` before `performance_test.c`, when the linker considers libfdb_c.a, it sees that no symbols from any of its object files are currently needed, and thus doesn't include them. When we then process performance_test.c, we suddenly need these symbols, but it's too late, as the linker only processes files left-to-right. Thus, we can resolve this problem by passing -lfdb_c after performance_test.c. Also of note is that we only seem to have this problem because the link line was crafted by hand instead of using link-wrapper.sh, which already does the right thing.
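As a sketch (flags abbreviated; the real command is assembled from the Makefile variables), the corrected invocation simply places the library arguments after the source file:

/usr/bin/gcc $(CFLAGS) -Ibindings/c -o bin/fdb_c_performance_test bindings/c/test/performance_test.c -Llib -lfdb_c

With this ordering, the linker has already recorded the undefined fdb_* references from performance_test.c by the time it scans libfdb_c.a, so it pulls in the object files that define them.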
--- bindings/c/local.mk | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bindings/c/local.mk b/bindings/c/local.mk index 28418d408b..36e295ce0e 100644 --- a/bindings/c/local.mk +++ b/bindings/c/local.mk @@ -24,7 +24,7 @@ fdb_c_CFLAGS := $(fdbclient_CFLAGS) fdb_c_LDFLAGS := $(fdbrpc_LDFLAGS) fdb_c_LIBS := lib/libfdbclient.a lib/libfdbrpc.a lib/libflow.a $(FDB_TLS_LIB) fdb_c_STATIC_LIBS := $(TLS_LIBS) -fdb_c_tests_LIBS := -shared -Llib -lfdb_c +fdb_c_tests_LIBS := -Llib -lfdb_c fdb_c_tests_HEADERS := -Ibindings/c CLEAN_TARGETS += fdb_c_tests_clean @@ -84,11 +84,11 @@ bindings/c/foundationdb/fdb_c_options.g.h: bin/vexillographer.exe fdbclient/vexi bin/fdb_c_performance_test: bindings/c/test/performance_test.c bindings/c/test/test.h fdb_c @echo "Compiling fdb_c_performance_test" - @$(CC) $(CFLAGS) $(fdb_c_tests_LIBS) $(fdb_c_tests_HEADERS) -o $@ bindings/c/test/performance_test.c + @$(CC) $(CFLAGS) $(fdb_c_tests_HEADERS) -o $@ bindings/c/test/performance_test.c $(fdb_c_tests_LIBS) bin/fdb_c_ryw_benchmark: bindings/c/test/ryw_benchmark.c bindings/c/test/test.h fdb_c @echo "Compiling fdb_c_ryw_benchmark" - @$(CC) $(CFLAGS) $(fdb_c_tests_LIBS) $(fdb_c_tests_HEADERS) -o $@ bindings/c/test/ryw_benchmark.c + @$(CC) $(CFLAGS) $(fdb_c_tests_HEADERS) -o $@ bindings/c/test/ryw_benchmark.c $(fdb_c_tests_LIBS) packages/fdb-c-tests-$(VERSION)-$(PLATFORM).tar.gz: bin/fdb_c_performance_test bin/fdb_c_ryw_benchmark @echo "Packaging $@" From ca6a6d49064ec711d20a557fda5a2795dab921ec Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Tue, 13 Nov 2018 16:18:43 -0800 Subject: [PATCH 02/30] Added BlobStoreEndpoint::bucketExists() to check for existence of a bucket, and createBucket() now uses this to avoid trying to create a bucket that already exists. 
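Note that the new check amounts to a plain HTTP HEAD probe of the bucket resource, treating 200 (exists) and 404 (does not exist) as the two expected responses. As an illustrative equivalent against a hypothetical endpoint and bucket:

curl -I http://blobstore.example.com/mybucket
# HTTP/1.1 200 OK        -> bucket exists
# HTTP/1.1 404 Not Found -> bucket does not exist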
--- fdbclient/BlobStore.actor.cpp | 23 ++++++++++++++--- fdbclient/BlobStore.h | 3 +++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/fdbclient/BlobStore.actor.cpp b/fdbclient/BlobStore.actor.cpp index 21681cf117..911947634a 100644 --- a/fdbclient/BlobStore.actor.cpp +++ b/fdbclient/BlobStore.actor.cpp @@ -225,6 +225,20 @@ std::string BlobStoreEndpoint::getResourceURL(std::string resource) { return r; } +ACTOR Future<bool> bucketExists_impl(Reference<BlobStoreEndpoint> b, std::string bucket) { + Void _ = wait(b->requestRateRead->getAllowance(1)); + + std::string resource = std::string("/") + bucket; + HTTP::Headers headers; + + Reference<HTTP::Response> r = wait(b->doRequest("HEAD", resource, headers, NULL, 0, {200, 404})); + return r->code == 200; +} + +Future<bool> BlobStoreEndpoint::bucketExists(std::string const &bucket) { + return bucketExists_impl(Reference<BlobStoreEndpoint>::addRef(this), bucket); +} + ACTOR Future<bool> objectExists_impl(Reference<BlobStoreEndpoint> b, std::string bucket, std::string object) { Void _ = wait(b->requestRateRead->getAllowance(1)); @@ -310,9 +324,12 @@ Future<Void> BlobStoreEndpoint::deleteRecursively(std::string const &bucket, std ACTOR Future<Void> createBucket_impl(Reference<BlobStoreEndpoint> b, std::string bucket) { Void _ = wait(b->requestRateWrite->getAllowance(1)); - std::string resource = std::string("/") + bucket; - HTTP::Headers headers; - Reference<HTTP::Response> r = wait(b->doRequest("PUT", resource, headers, NULL, 0, {200, 409})); + bool exists = wait(b->bucketExists(bucket)); + if(!exists) { + std::string resource = std::string("/") + bucket; + HTTP::Headers headers; + Reference<HTTP::Response> r = wait(b->doRequest("PUT", resource, headers, NULL, 0, {200, 409})); + } return Void(); } diff --git a/fdbclient/BlobStore.h b/fdbclient/BlobStore.h index 34cb5cd394..a2d3e4d33c 100644 --- a/fdbclient/BlobStore.h +++ b/fdbclient/BlobStore.h @@ -197,6 +197,9 @@ public: // Get a list of the files in a bucket, see listBucketStream for more argument detail. Future<ListResult> listBucket(std::string const &bucket, Optional<std::string> prefix = {}, Optional<char> delimiter = {}, int maxDepth = 0, std::function<bool(std::string const &)> recurseFilter = nullptr); + // Check if a bucket exists + Future<bool> bucketExists(std::string const &bucket); + // Check if an object exists in a bucket Future<bool> objectExists(std::string const &bucket, std::string const &object); From 37112299709082a6d5fe8637ffe3dec24da3d2c2 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Tue, 13 Nov 2018 17:20:00 -0800 Subject: [PATCH 03/30] Added release note for blobstore client compatibility fix. --- documentation/sphinx/source/release-notes.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index e1e2281bbb..4c1a72da2f 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -2,7 +2,7 @@ Release Notes ############# -6.0.15 +6.0.16 ====== Features @@ -68,6 +68,7 @@ Fixes * HTTP client used by backup-to-blobstore now correctly treats response header field names as case insensitive. [6.0.15] `(PR #904) <https://github.com/apple/foundationdb/pull/904>`_ * Blobstore REST client was not following the S3 API in several ways (bucket name, date, and response formats). [6.0.15] `(PR #914) <https://github.com/apple/foundationdb/pull/914>`_ * Data distribution could queue shard movements for restoring replication at a low priority. [6.0.15] `(PR #907) <https://github.com/apple/foundationdb/pull/907>`_ +* Blobstore REST client will no longer attempt to create a bucket that already exists.
[6.0.16] `(PR #923) <https://github.com/apple/foundationdb/pull/923>`_ Fixes only impacting 6.0.0+ --------------------------- From f7a400375b20a5731a6c077b178f4e2de8dacd03 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 14 Nov 2018 23:15:49 -0800 Subject: [PATCH 04/30] added documentation for configuring regions --- .../sphinx/source/command-line-interface.rst | 7 + documentation/sphinx/source/configuration.rst | 230 ++++++++++++++++++ 2 files changed, 237 insertions(+) diff --git a/documentation/sphinx/source/command-line-interface.rst b/documentation/sphinx/source/command-line-interface.rst index 605efe0c40..8dbcc7c124 100644 --- a/documentation/sphinx/source/command-line-interface.rst +++ b/documentation/sphinx/source/command-line-interface.rst @@ -101,6 +101,13 @@ Set the process using ``configure [proxies|resolvers|logs]=<n>``, where ``<n>`` For recommendations on appropriate values for process types in large clusters, see :ref:`guidelines-process-class-config`. +fileconfigure +------------- + +The ``fileconfigure`` command is an alternative to the ``configure`` command which changes the configuration of the database based on a json document. The command loads a JSON document from the provided file, and changes the database configuration to match the contents of the JSON document. The format should be the same as the value of the ``configuration`` entry in status JSON without ``excluded_servers`` or ``coordinators_count``. Its syntax is ``fileconfigure [new] <filename>``. + +The ``new`` option, if present, initializes a new database with the given configuration rather than changing the configuration of an existing one. + coordinators ------------ diff --git a/documentation/sphinx/source/configuration.rst b/documentation/sphinx/source/configuration.rst index 33e7a10830..d97f91aea6 100644 --- a/documentation/sphinx/source/configuration.rst +++ b/documentation/sphinx/source/configuration.rst @@ -516,6 +516,236 @@ When creating a partition for use with FoundationDB using the standard Linux fdi For an SSD with a single partition, the partition should typically begin at sector 2048 (512 byte sectors yields 1024 KiB alignment). +.. _configuration-configuring-regions: + +Configuring regions +=================== + +Regions configuration enables automatic failover between two datacenters, without adding WAN latency for commits, while still maintaining all the consistency properties FoundationDB provides. + +This is made possible by combining two features. The first is the ability to support asynchronous replication between two regions. Because we are not waiting for the commits to become durable in the remote region before reporting a commit as successful, it means the remote region will slightly lag behind the primary. + +This is similiar to ``fdbdr``, except that the asynchronous replication is done within a single cluster instead of between different FoundationDB clusters. + +The second feature is the ability to add a synchronous replica of the mutation log in a different data center. Because this datacenter is only holding a transient copy of the mutations being committed to the database, only a few FoundationDB processes are required to fulfill this role. + +The benefit of holding the mutation log external to the primary data center is that if the primary data center fails we will still have access to the most recent commits. This allows the remote replica to catch up to the primary. Once the remote replica has applied all the mutations it can start accepting new commits without suffering any data loss.
+ +An example configuration would be four total data centers, two on the east coast, two on the west coast, with a preference for fast write latencies from the west coast. One datacenter on each coast would be sized to store a full copy of the data. The second datacenter in each coast would only have a few FoundationDB processes. + +When everything is healthy, writes need to be made durable in both west coast data centers before a commit can succeed. If the data centers are close to each other, this can add as little as 2ms to commit latencies. Reads can be served from either region, and clients can get data from whichever region is closer. Getting a read version from the east coast region will still require communicating with a west coast datacenter. Clients can cache read versions if they can tolerate reading stale data to avoid waiting on read versions. + +If either west coast data center fails, the last few mutations will be propogated from the remaining west coast datacenter to the east coast. At this point we will start accepting commits on the east coast. Once the west coast comes back online, the system will automatically start copying all the data that was committed to the east coast back to the west coast replica. Once the west coast has caught up, the system will automatically switch back to accepting writes from the west coast again. + +In the event that the west coast has failed for long enough that we no longer have enough disk space to continue storing the mutation log, we can drop the replica completely. This decision is not automatic, and requires a manual change to the configuration. The database will then act as a single data center database until the west coast comes back online. Because we have dropped the west coast replica completely, to bring the west coast back online we will have to copy all the data between the regions. + +Region failover generally only requires a few seconds to complete. + +Specifying data centers ----------------------- + +To use region configurations all processes in the cluster need to specify what data center they are in. This can be done on the command line with either ``--locality_dcid`` or ``--datacenter_id``. This data center identifer is case sensitive. + +Clients should also specify their data center with the database option ``datacenter_id``. If a client does not specify their datacenter, they will use latency estimates to balance traffic between the two regions. This will result in about 5% of requests being served by the remote regions, so you will see large tail latencies on reads. + +Changing the region configuration --------------------------------- + +To change the region configuration, use the ``fileconfigure`` command in ``fdbcli``. For example:: + + user@host$ fdbcli + Using cluster file `/etc/foundationdb/fdb.cluster'. + + The database is available. + + Welcome to the fdbcli. For help, type `help'. + fdb> fileconfigure regions.json + Configuration changed. + + +Regions are configured in FoundationDB as a json document. For example:: + + "regions":[{ + "datacenters":[{ + "id":"WC1", + "priority":1, + "satellite":1 + }], + "satellite_redundancy_mode":"one_satellite_double", + "satellite_logs":2 + }] + +The ``regions`` object in the json document should be an array. Each element of the array describes the configuration of an individual region. + +Each region is described using an object that contains an array of ``datacenters``. Each region may also optionally provide a ``satellite_redundancy_mode`` and ``satellite_logs``.
+ +Each datacenter is described with an object that contains the ``id`` and ``priority`` of that data center. Data centers which hold a full replica of the data are referred to as primary data centers. Data centers that only store transaction logs are referred to as satellite data centers. To specify that a data center is a satellite, it needs to include ``"satellite" : 1``. The priorities of satellite data centers are only compared to other satellite data centers in the same region. The priorities of primary data centers are only compared to other primary data centers. + +.. warning:: In release 6.0, FoundationDB supports at most two regions. + +Each region can only have one primary data center. A negative priority for a data center denotes that the system should not recover the transaction subsystem in that data center. The region with the transaction subsystem is referred to as the active region. + +One primary data center must have a priority >= 0. The cluster will make the region with the highest priority the active region. If two data centers have equal priority the cluster will make one of them the active region arbirarily. + +The ``satellite_redundancy_mode`` is configured per region, and specifies how many copies of each mutation should be replicated to the satellite data centers. + +``one_satellite_single`` mode + + Keep a single copy of the mutation log in the satellite data center with the highest priority. If the highest priority satellite is unavailable it will put the transaction log in the satellite data center with the next highest priority. + +``one_satellite_double`` mode + + Keep two copies of the mutation log in the satellite data center with the highest priority. + +``one_satellite_triple`` mode + + Keep three copies of the mutation log in the satellite data center with the highest priority. + +``two_satellite_safe`` mode + + Keep two copies of the mutation log in each of the two satellite data centers with the highest priorities, for a total of four copies of each mutation. This mode will protect against the simultanous loss of both the primary and one of the satellite data centers. If only one satellite is available, it will fall back to only storing two copies of the mutation log in the remaining data center. + +``two_satellite_fast`` mode + + Keep two copies of the mutation log in each of the two satellite data centers with the highest priorities, for a total of four copies of each mutation. The proxies will only wait for one of the two satellite data centers to make the mutations durable before considering a commit successful. This will reduce tail latencies caused by network issues between data centers. If only one satellite is available, it will fall back to only storing two copies of the mutation log in the remaining data center. + +.. warning:: In release 6.0 this is implemented by waiting for all but 2 of the transaction logs. This means if you configure more than 4 satellite logs, it will still need to wait for replies from both data centers. + +The number of ``satellite_logs`` is also configured per region. It represents the desired number of transaction logs that should be recruited in the satellite data centers. The satellite transaction logs do slightly less work than the primary data center transaction logs. So while you should keep the ratio of logs to replicas roughly equal in the primary data center and the satellites, you may be able to balance performance with slightly fewer satellite transaction logs.
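As an illustrative sketch of the two-satellite modes described above (the datacenter ids and log count here are hypothetical, not from a real deployment), a region using ``two_satellite_safe`` with two satellite data centers could look like::

    "regions":[{
        "datacenters":[{
            "id":"WC1",
            "priority":1
        },{
            "id":"WC2",
            "priority":1,
            "satellite":1
        },{
            "id":"WC3",
            "priority":0,
            "satellite":1
        }],
        "satellite_redundancy_mode":"two_satellite_safe",
        "satellite_logs":4
    }]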
+ +Asymmetric configurations ------------------------- + +The fact that satellite policies are configured per region allows for asymmetric configurations. For example, you can have a three datacenter setup where you have two data centers on the west coast (WC1, WC2) and one data center on the east coast (EC1). We set the west coast region as our preferred active region by setting the priority of its primary data center higher than the east coast data center. The west coast region should have a satellite policy configured, so that when it is active we are making mutations durable in both west coast data centers. In the rare event that one of our west coast data centers has failed, we will fail over to the east coast data center. Because this region does not have a satellite data center, the mutations will only be made durable in one data center while the transaction subsytem is located here. However this is justifiable because the region will only be active if we have already lost a data center. + +This is the region configuration that implements the example:: + + "regions":[{ + "datacenters":[{ + "id":"WC1", + "priority":1 + },{ + "id":"WC2", + "priority":0, + "satellite":1 + }], + "satellite_redundancy_mode":"one_satellite_double" + },{ + "datacenters":[{ + "id":"EC1", + "priority":0 + }] + }] + +Changing the usable_regions configuration ----------------------------------------- + +The ``usable_regions`` configuration option determines the number of regions which have a replica of the database. + +.. warning:: In release 6.0 we only support values of 1 or 2, to match the maximum number of regions that can be defined in the ``regions`` json object. + +Increasing the ``usable_regions`` will start copying data from the active region to the remote region. Reducing the ``usable_regions`` will immediately drop the replicas stored in the remote region. During these changes, only one primary data center can have priority >= 0. This makes it unambiguous which region will lose its replica. + +Changing the log routers configuration -------------------------------------- + +FoundationDB is architected to copy every mutation between regions exactly once. This copying is done by a new role called the log router. When a mutation is committed, it will be randomly assigned one log router which will be responsible for copying it across the WAN. + +This log router will pull the mutation from exactly one of the transaction logs. This means a single socket will be involved in copying mutations across the WAN per log router. Because of this, if the latency between regions is large the bandwidth-delay product means that the number of log routers could limit the throughput at which mutations can be copied across the WAN. This can be mitigated by either configuring more log routers, or increasing the TCP window scale option. + +To keep the work evenly distributed on the transaction logs, the number of log routers should be a multiple of the number of transaction logs. + +The ``log_routers`` configuration option determines the number of log routers recruited in the remote region. + +Migrating a database to use a region configuration -------------------------------------------------- + +To configure an existing database to use a region configuration do the following steps:: + + 1. Ensure all processes have their dcid locality set on the command line. All processes should exist in the same data center.
If you are converting from a ``three_datacenter`` configuration, you will first need to configure down to using a single data center by changing the replication mode. Then exclude the machines in all data centers but the one that will become the initial active region. + + 2. Configure the region configuration. The data center with all the existing processes should have a non-negative priority. The region which will eventually store the remote replica should be added with a negative priority. + + 3. Add processes to the cluster in the remote region. These processes will not take data yet, but need to be added to the cluster. If they are added before the region configuration is set they will be assigned data like any other FoundationDB process, which will lead to high latencies. + + 4. Configure usable_regions=2. This will cause the cluster to start copying data between the regions. + + 5. Watch status and wait until data movement is complete. This will signal that the remote data center has a full replica of all of the data in the database. + + 6. Change the region configuration to have a non-negative priority for the primary data centers in both regions. This will enable automatic failover between regions. + +Handling data center failures ----------------------------- + +When a primary data center fails, the cluster will go into a degraded state. It will recover to the other region and continue accepting commits, however the mutations bound for the other side will build up on the transaction logs. Eventually, the disks on the transaction logs will fill up, so the database cannot be left in this condition indefinately. + +.. warning:: While a data center has failed, the maximum write throughput of the cluster will be roughly 1/3 of normal performance. This is because the transaction logs need to store all of the mutations being committed so that once the other data center comes back online it can replay history to catch back up. + +To drop the dead data center do the following steps:: + + 1. Configure the region configuration so that the dead data center has a negative priority. + + 2. Configure usable_regions=1. + +If you are running in a configuration without a satellite data center, or you have lost all machines in a region simultanously, the ``force_recovery_with_data_loss`` command from ``fdbcli`` allows you to force a recovery to the other region which will discard the portion of the mutation log which did not make it across the WAN. Once the database has recovered, immediately follow the previous steps to drop the dead region the normal way. + +.. warning:: In 6.0 the ``force_recovery_with_data_loss`` command from ``fdbcli`` can cause data inconsistencies if it is used when processes from both non-satellite data centers are still in the cluster. In general this command has not been tested to the same degree as the rest of the codebase, and should only be used in extreme emergencies. + +Region change safety -------------------- + +The steps described above for both adding and removing replicas are enforced by ``fdbcli``. The following are the specific conditions checked by ``fdbcli``:: + + * You cannot change the ``regions`` configuration while also changing ``usable_regions``. + + * You can only change ``usable_regions`` when exactly one region has priority >= 0. + + * When ``usable_regions`` > 1, all regions with priority >= 0 must have a full replica of the data. + + * All storage servers must be in one of the regions specified by the region configuration.
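As a sketch grounded in the schema above (the ids are hypothetical), the following configuration satisfies these conditions for changing ``usable_regions``, because exactly one region has priority >= 0 while the other region is held at a negative priority::

    "regions":[{
        "datacenters":[{
            "id":"WC1",
            "priority":1
        }]
    },{
        "datacenters":[{
            "id":"EC1",
            "priority":-1
        }]
    }]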
+ +Monitoring ---------- + +It is important to ensure the remote replica does not fall too far behind the active relica. To fail over between regions all of the mutations need to be flushed from the active replica to the remote replica. If the remote replica is too far behind, this can take a very long time. The version difference between the data centers is available in status json as ``datacenter_version_difference``. This number should be less than 5 million. A large data center version difference could indicate that you need more log routers. It could also be caused by network issues between the regions. If the difference gets too large the remote replica should be dropped, similar to a data center outage that goes on too long. + +Because of asymmetric write latencies in the two regions, it is important to route client traffic to the currently active region. The current active region is written in the system keyspace as the key ``\xff/primaryDatacenter``. Clients can read and watch this key after setting the ``read_system_keys`` transaction option. + +Choosing coordinators --------------------- + +Choosing coordinators for a multi-region configuration provides its own set of challenges. A majority of coordinators need to be alive for the cluster to be available. There are two common coordinator setups that allow a cluster to survive the simultaeous loss of a data center and one additional machine. + +The first is five coordinators in five different data centers. The second is nine total coordinators spread across three data centers. There is some additional benefit to spreading the cooridators across regions rather than data centers. This is because if an entire region fails, it is still possible to recover to the other region if you are willing to accept a small amount of data loss. However, if you have lost a majority of coordinators this becomes much more difficult. + +Additionally, if a data center fails and then the second data center in the region fails 30 seconds later, we can generally survive this scenario. We can survive because the second data center only needs to be alive long enough to copy the tail of the mutation log across the WAN. However if your coordinators are in this second data center, you will still experience an outage. + +These considerations mean that best practice is to put three coordinators in the main data centers of each region, and then put three additional coordinators in a third region. + +Comparison to other multiple data center configurations ------------------------------------------------------- + +Region configuration provides very similiar functionality to ``fdbdr``. + +If you are not using satellite data centers, the main benefit of a region configuration compared to ``fdbdr`` is that each data center is able to restore replication even after losing all copies of a key range. If we simultanously lose two storage servers in a double replicated cluster, with ``fdbdr`` we would be forced to fail over to the remote region. With region configuration the cluster will automatically copy the missing key range from the remote replica back to the primary data center. + +The main disadvantage of using a region configuration is that the total number of processes we can support in a single region is around half when compared against ``fdbdr``. This is because we have processes for both regions in the same cluster, and some singleton components like our failure monitor will have to do twice as much work.
In ``fdbdr`` we have two separate clusters for each region, so the total number of processes can scale to about twice as large as using a region configuration. + +Region configuration is better in almost all ways than the ``three_datacenter`` replication mode. Region configuration gives you the same ability to survive the loss of one data center, however we only need to store two full replicas of the database instead of three. Region configuration is also much more efficient with how it sends mutations across the WAN. The only reason to use ``three_datacenter`` replication is if you need low latency reads from all three locations. + +Known limitations ----------------- + +The 6.0 release still has a number of rough edges related to region configuration. This is a collection of all the issues that have been pointed out in the sections above. These issues should be significantly improved in future releases of FoundationDB:: + + * FoundationDB supports replicating data to at most two regions. + + * ``two_satellite_fast`` does not hide latency properly when configured with more than 4 satellite transaction logs. + + * While a data center has failed, the maximum write throughput of the cluster will be roughly 1/3 of normal performance. + + * ``force_recovery_with_data_loss``` can cause data inconsistencies if it is used when processes from both non-satellite data centers are still in the cluster. + .. _guidelines-process-class-config: Guidelines for setting process class From b8a3f0e623f07b1242a02f3a7b37d2b229a55246 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 14 Nov 2018 23:22:21 -0800 Subject: [PATCH 05/30] changed formatting --- documentation/sphinx/source/configuration.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/documentation/sphinx/source/configuration.rst b/documentation/sphinx/source/configuration.rst index d97f91aea6..021c233f3f 100644 --- a/documentation/sphinx/source/configuration.rst +++ b/documentation/sphinx/source/configuration.rst @@ -660,7 +660,7 @@ The ``log_routers`` configuration option determines the number of log routers re Migrating a database to use a region configuration -------------------------------------------------- -To configure an existing database to use a region configuration do the following steps:: +To configure an existing database to use a region configuration do the following steps: 1. Ensure all processes have their dcid locality set on the command line. All processes should exist in the same data center. If you are converting from a ``three_datacenter`` configuration, you will first need to configure down to using a single data center by changing the replication mode. Then exclude the machines in all data centers but the one that will become the initial active region. @@ -681,7 +681,7 @@ When a primary data center fails, the cluster will go into a degraded state. It .. warning:: While a data center has failed, the maximum write throughput of the cluster will be roughly 1/3 of normal performance. This is because the transaction logs need to store all of the mutations being committed so that once the other data center comes back online it can replay history to catch back up. -To drop the dead data center do the following steps:: +To drop the dead data center do the following steps: 1. Configure the region configuration so that the dead data center has a negative priority.
@@ -694,7 +694,7 @@ If you are running in a configuration without a satellite data center, or you ha Region change safety -------------------- -The steps described above for both adding and removing replicas are enforced by ``fdbcli``. The following are the specific conditions checked by ``fdbcli``:: +The steps described above for both adding and removing replicas are enforced by ``fdbcli``. The following are the specific conditions checked by ``fdbcli``: From 6433388c0d45dfe6b60a7d38cb2ecb4711a3e2bf Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 14 Nov 2018 23:24:27 -0800 Subject: [PATCH 06/30] one more formatting change --- documentation/sphinx/source/configuration.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/sphinx/source/configuration.rst b/documentation/sphinx/source/configuration.rst index 021c233f3f..e128c75cff 100644 --- a/documentation/sphinx/source/configuration.rst +++ b/documentation/sphinx/source/configuration.rst @@ -736,7 +736,7 @@ Region configuration is better in almost all ways than the ``three_datacenter`` Known limitations ----------------- -The 6.0 release still has a number of rough edges related to region configuration. This is a collection of all the issues that have been pointed out in the sections above. These issues should be significantly improved in future releases of FoundationDB:: +The 6.0 release still has a number of rough edges related to region configuration. This is a collection of all the issues that have been pointed out in the sections above. These issues should be significantly improved in future releases of FoundationDB: From ea300a150a7e001a754e1434447646aea8852aed Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 14 Nov 2018 23:31:58 -0800 Subject: [PATCH 07/30] fixed spelling mistakes --- documentation/sphinx/source/configuration.rst | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/documentation/sphinx/source/configuration.rst b/documentation/sphinx/source/configuration.rst index e128c75cff..da229fffdc 100644 --- a/documentation/sphinx/source/configuration.rst +++ b/documentation/sphinx/source/configuration.rst @@ -525,7 +525,7 @@ Regions configuration enables automatic failover between two datacenters, withou
@@ -535,7 +535,7 @@ An example configuration would be four total data centers, two on the east coast When everything is healthy, writes need to be made durable in both west coast data centers before a commit can succeed. If the data centers are close to each other, this can add as little as 2ms to commit latencies. Reads can be served from either region, and clients can get data from whichever region is closer. Getting a read version from the each coast region will still require communicating with a west coast datacenter. Clients can cache read versions if they can tolerate reading stale data to avoid waiting on read versions. -If either west coast data center fails, the last few mutations will be propogated from the remaining west coast datacenter to the east coast. At this point we will start accepting commits on the east coast. Once the west coast comes back online, the system will automatically start copying all the data that was committed to the east coast back to the west coast replica. Once the west coast has caught up, the system will automatically switch back to accepting writes from the west coast again. +If either west coast data center fails, the last few mutations will be propagated from the remaining west coast datacenter to the east coast. At this point we will start accepting commits on the east coast. Once the west coast comes back online, the system will automatically start copying all the data that was committed to the east coast back to the west coast replica. Once the west coast has caught up, the system will automatically switch back to accepting writes from the west coast again. In the event that the west coast has failed for long enough that we no longer have enough disk space to continue storing the mutation log, we can drop the replica completely. This decision is not automatic, and requires a manual change to the configuration. The database will then act as a single data center database until the west coast comes back online. Because we have dropped to west coast replica completely, to bring the west coast back online we will have to copy all the data between the regions. @@ -544,7 +544,7 @@ Region failover generally only requires a few seconds to complete. Specifying data centers ----------------------- -To use region configurations all processes in the cluster need to specify what data center they are in. This can be done on the command line with either ``--locality_dcid`` or ``--datacenter_id``. This data center identifer is case sensitive. +To use region configurations all processes in the cluster need to specify what data center they are in. This can be done on the command line with either ``--locality_dcid`` or ``--datacenter_id``. This data center identifier is case sensitive. Clients should also specify their data center with the database option ``datacenter_id``. If a client does not specify their datacenter, they will use latency estimates to balance traffic between the two regions. This will result in about 5% of requests being served by the remote regions, so you will see large tail latencies on reads. @@ -585,7 +585,7 @@ Each datacenter is described with an object that contains the ``id`` and ``prior Each region can only have one primary data center. A negative priority for a data center denotes that the system should not recover the transaction subsystem in that data center. The region with the transaction subsystem is referred to as the active region. -One primary data center must have a priority >= 0. 
The cluster will make the region with the highest priority the active region. If two data centers have equal priority the cluster will make one of them the active region arbirarily. +One primary data center must have a priority >= 0. The cluster will make the region with the highest priority the active region. If two data centers have equal priority the cluster will make one of them the active region arbitrarily. The ``satellite_redundancy_mode`` is configured per region, and specifies how many copies of each mutation should be replicated to the satellite data centers. @@ -603,7 +603,7 @@ The ``satellite_redundancy_mode`` is configured per region, and specifies how ma ``two_satellite_safe`` mode - Keep two copies of the mutation log in each of the two satellite data centers with the highest priorities, for a total of four copies of each mutation. This mode will protect against the simultanous loss of both the primary and one of the satellite data centers. If only one satellite is available, it will fall back to only storing two copies of the mutation log in the remaining data center. + Keep two copies of the mutation log in each of the two satellite data centers with the highest priorities, for a total of four copies of each mutation. This mode will protect against the simultaneous loss of both the primary and one of the satellite data centers. If only one satellite is available, it will fall back to only storing two copies of the mutation log in the remaining data center. ``two_satellite_fast`` mode @@ -616,7 +616,7 @@ Asymmetric configurations ------------------------- -The fact that satellite policies are configured per region allows for asymmetric configurations. For example, you can have a three datacenter setup where you have two data centers on the west coast (WC1, WC2) and one data center on the east coast (EC1). We set the west coast region as our preferred active region by setting the priority of its primary data center higher than the east coast data center. The west coast region should have a satellite policy configured, so that when it is active we are making mutations durable in both west coast data centers. In the rare event that one of our west coast data centers has failed, we will fail over to the east coast data center. Because this region does not have a satellite data center, the mutations will only be made durable in one data center while the transaction subsytem is located here. +The fact that satellite policies are configured per region allows for asymmetric configurations. For example, you can have a three datacenter setup where you have two data centers on the west coast (WC1, WC2) and one data center on the east coast (EC1). We set the west coast region as our preferred active region by setting the priority of its primary data center higher than the east coast data center. The west coast region should have a satellite policy configured, so that when it is active we are making mutations durable in both west coast data centers. In the rare event that one of our west coast data centers has failed, we will fail over to the east coast data center. Because this region does not have a satellite data center, the mutations will only be made durable in one data center while the transaction subsystem is located here.
However this is justifiable because the region will only be active if we have already lost a data center. @@ -677,7 +677,7 @@ To configure an existing database to use a region configuration do the following Handling data center failures ----------------------------- -When a primary data center fails, the cluster will go into a degraded state. It will recover to the other region and continue accepting commits, however the mutations bound for the other side will build up on the transaction logs. Eventually, the disks on the transaction logs will fill up, so the database cannot be left in this condition indefinately. +When a primary data center fails, the cluster will go into a degraded state. It will recover to the other region and continue accepting commits, however the mutations bound for the other side will build up on the transaction logs. Eventually, the disks on the transaction logs will fill up, so the database cannot be left in this condition indefinitely. .. warning:: While a data center has failed, the maximum write throughput of the cluster will be roughly 1/3 of normal performance. This is because the transaction logs need to store all of the mutations being committed so that once the other data center comes back online it can replay history to catch back up. @@ -687,7 +687,7 @@ To drop the dead data center do the following steps: 2. Configure usable_regions=1. -If you are running in a configuration without a satellite data center, or you have lost all machines in a region simultanously, the ``force_recovery_with_data_loss`` command from ``fdbcli`` allows you to force a recovery to the other region which will discard the portion of the mutation log which did not make it across the WAN. Once the database has recovered, immediately follow the previous steps to drop the dead region the normal way. +If you are running in a configuration without a satellite data center, or you have lost all machines in a region simultaneously, the ``force_recovery_with_data_loss`` command from ``fdbcli`` allows you to force a recovery to the other region which will discard the portion of the mutation log which did not make it across the WAN. Once the database has recovered, immediately follow the previous steps to drop the dead region the normal way. .. warning:: In 6.0 the ``force_recovery_with_data_loss`` command from ``fdbcli`` can cause data inconsistencies if it is used when processes from both non-satellite data centers are still in the cluster. In general this command has not been tested to the same degree as the rest of the codebase, and should only be used in extreme emergencies. @@ -707,7 +707,7 @@ The steps described above for both adding and removing replicas are enforced by Monitoring ---------- -It is important to ensure the remote replica does not fall too far behind the active relica. To fail over between regions all of the mutations need to be flushed from the active replica to the remote replica. If the remote replica is too far behind, this can take a very long time. The version difference between the data centers is available in status json as ``datacenter_version_difference``. This number should be less than 5 million. A large data center version difference could indicate that you need more log routers. It could also be caused by network issues between the regions. If the difference gets too large the remote replica should be dropped, similar to a data center outage that goes on too long.
+It is important to ensure the remote replica does not fall too far behind the active replica. To fail over between regions all of the mutations need to be flushed from the active replica to the remote replica. If the remote replica is too far behind, this can take a very long time. The version difference between the data centers is available in status json as ``datacenter_version_difference``. This number should be less than 5 million. A large data center version difference could indicate that you need more log routers. It could also be caused by network issues between the regions. If the difference gets too large the remote replica should be dropped, similar to a data center outage that goes on too long. -Because of asymmetric write latencies in the two regions, it is important to route client traffic to the currently active region. The current active region is written in the system keyspace as the key ``\xff/primaryDatacenter``. Clients can read and watch this key after setting the ``read_system_keys`` transaction option. +Because of asymmetric write latencies in the two regions, it is important to route client traffic to the currently active region. The current active region is written in the system key space as the key ``\xff/primaryDatacenter``. Clients can read and watch this key after setting the ``read_system_keys`` transaction option. Choosing coordinators --------------------- -Choosing coordinators for a multi-region configuration provides its own set of challenges. A majority of coordinators need to be alive for the cluster to be available. There are two common coordinator setups that allow a cluster to survive the simultaeous loss of a data center and one additional machine. +Choosing coordinators for a multi-region configuration provides its own set of challenges. A majority of coordinators need to be alive for the cluster to be available. There are two common coordinator setups that allow a cluster to survive the simultaneous loss of a data center and one additional machine. -The first is five coordinators in five different data centers. The second is nine total coordinators spread across three data centers. There is some additional benefit to spreading the cooridators across regions rather than data centers. This is because if an entire region fails, it is still possible to recover to the other region if you are willing to accept a small amount of data loss. However, if you have lost a majority of coordinators this becomes much more difficult. +The first is five coordinators in five different data centers. The second is nine total coordinators spread across three data centers. There is some additional benefit to spreading the coordinators across regions rather than data centers. This is because if an entire region fails, it is still possible to recover to the other region if you are willing to accept a small amount of data loss. However, if you have lost a majority of coordinators this becomes much more difficult. Additionally, if a data center fails and then the second data center in the region fails 30 seconds later, we can generally survive this scenario. We can survive because the second data center only needs to be alive long enough to copy the tail of the mutation log across the WAN. However if your coordinators are in this second data center, you will still experience an outage.
@@ -725,9 +725,9 @@ These considerations mean that best practice is to put three coordinators in the Comparison to other multiple data center configurations ------------------------------------------------------- -Region configuration provides very similiar functionality to ``fdbdr``. +Region configuration provides very similar functionality to ``fdbdr``. -If you are not using satellite data centers, the main benefit of a region configuration compared to ``fdbdr`` is that each data center is able to restore replication even after losing all copies of a key range. If we simultanously lose two storage servers in a double replicated cluster, with ``fdbdr`` we would be forced to fail over to the remote region. With region configuration the cluster will automatically copy the missing key range from the remote replica back to the primary data center. +If you are not using satellite data centers, the main benefit of a region configuration compared to ``fdbdr`` is that each data center is able to restore replication even after losing all copies of a key range. If we simultaneously lose two storage servers in a double replicated cluster, with ``fdbdr`` we would be forced to fail over to the remote region. With region configuration the cluster will automatically copy the missing key range from the remote replica back to the primary data center. The main disadvantage of using a region configuration is that the total number of processes we can support in a single region is around half when compared against ``fdbdr``. This is because we have processes for both regions in the same cluster, and some singleton components like our failure monitor will have to do twice as much work. In ``fdbdr`` we have two separate clusters for each region, so the total number of processes can scale to about twice as large as using a region configuration. @@ -744,7 +744,7 @@ The 6.0 release still has a number of rough edges related to region configuratio * While a data center has failed, the maximum write throughput of the cluster will be roughly 1/3 of normal performance. - * ``force_recovery_with_data_loss``` can cause data inconsistencies if it is used when processes from both non-satellite data centers are still in the cluster. + * ``force_recovery_with_data_loss`` can cause data inconsistencies if it is used when processes from both non-satellite data centers are still in the cluster. .. _guidelines-process-class-config: From 3bb17e69fd423cbabf3b58d78e5749fced9ed0a6 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 14 Nov 2018 23:34:20 -0800 Subject: [PATCH 08/30] added whitespace --- documentation/sphinx/source/command-line-interface.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/documentation/sphinx/source/command-line-interface.rst b/documentation/sphinx/source/command-line-interface.rst index 8dbcc7c124..df365ea7da 100644 --- a/documentation/sphinx/source/command-line-interface.rst +++ b/documentation/sphinx/source/command-line-interface.rst @@ -104,7 +104,9 @@ For recommendations on appropriate values for process types in large clusters, s fileconfigure ------------- -The ``fileconfigure`` command is an alternative to the ``configure`` command which changes the configuration of the database based on a json document. The command loads a JSON document from the provided file, and changes the database configuration to match the contents of the JSON document.
The format should be the same as the value of the ``configuration`` entry in status JSON without ``excluded_servers`` or ``coordinators_count``. Its syntax is ``fileconfigure [new] <filename>``. +The ``fileconfigure`` command is an alternative to the ``configure`` command which changes the configuration of the database based on a json document. The command loads a JSON document from the provided file, and changes the database configuration to match the contents of the JSON document. + +The format should be the same as the value of the ``configuration`` entry in status JSON without ``excluded_servers`` or ``coordinators_count``. Its syntax is ``fileconfigure [new] <filename>``. The ``new`` option, if present, initializes a new database with the given configuration rather than changing the configuration of an existing one. From 1f2223fcf58fe5a6d9a8c610802d680f84239db3 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Thu, 15 Nov 2018 02:15:25 -0800 Subject: [PATCH 09/30] Bug fix in backup expiration. After the range file scan, it was being asserted that the range files found have a version < expiration version, but this isn't guaranteed because the expiration version is likely shifted backward a bit after the file scan based on the log files found. --- fdbclient/BackupContainer.actor.cpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/fdbclient/BackupContainer.actor.cpp b/fdbclient/BackupContainer.actor.cpp index 79c378feb8..3220e6bbc0 100644 --- a/fdbclient/BackupContainer.actor.cpp +++ b/fdbclient/BackupContainer.actor.cpp @@ -587,6 +587,13 @@ public: scanBegin = expiredEnd.get(); } + TraceEvent("BackupContainerFileSystem") + .detail("ExpireEndVersion", expireEndVersion) + .detail("ScanBeginVersion", scanBegin) + .detail("CachedLogBegin", logBegin.orDefault(-1)) + .detail("CachedLogEnd", logEnd.orDefault(-1)) + .detail("CachedExpiredEnd", expiredEnd.orDefault(-1)); + // Get log files that contain any data at or before expireEndVersion state std::vector<LogFile> logs = wait(bc->listLogFiles(scanBegin, expireEndVersion - 1)); // Get range files up to and including expireEndVersion @@ -622,8 +629,12 @@ public: // Move filenames out of vector then destroy it to save memory for(auto const &f : ranges) { - ASSERT(f.version < expireEndVersion); - toDelete.push_back(std::move(f.fileName)); + // The file version must be checked here again because it is likely that expireEndVersion is in the middle of a log file, in which case + // after the log and range file listings are done (using the original expireEndVersion) the expireEndVersion will be moved back slightly + // to the begin version of the last log file found (which is also the first log to not be deleted) + if(f.version < expireEndVersion) { + toDelete.push_back(std::move(f.fileName)); + } } ranges.clear(); From 86f5139640d7d2276219429a6f8d221152ae6967 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Thu, 15 Nov 2018 09:42:15 -0800 Subject: [PATCH 10/30] added additional information to region documentation --- documentation/sphinx/source/configuration.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/documentation/sphinx/source/configuration.rst b/documentation/sphinx/source/configuration.rst index da229fffdc..e46d33823c 100644 --- a/documentation/sphinx/source/configuration.rst +++ b/documentation/sphinx/source/configuration.rst @@ -372,6 +372,8 @@ In addition to the more commonly used modes listed above, this version of Founda FoundationDB attempts to replicate data across two datacenters and will stay up with only two
available. Data is triple replicated. For maximum availability, you should use five coordination servers: two in two of the datacenters and one in the third datacenter. +.. warning:: ``three_datacenter`` mode is not compatible with region configuration. + Changing redundancy mode ------------------------ @@ -613,6 +615,8 @@ The ``satellite_redundancy_mode`` is configured per region, and specifies how ma The number of ``satellite_logs`` is also configured per region. It represents the desired number of transaction logs that should be recruited in the satellite data centers. The satellite transaction logs do slightly less work than the primary data center transaction logs. So while you should keep the ratio of logs to replicas roughly equal in the primary data center and the satellites, you may be able to balance performance with slightly fewer satellite transaction logs. +The number of replicas in each region is controlled by the redundancy level. For example, ``double`` mode will put 2 replicas in each region, for a total of 4 replicas. + Asymmetric configurations ------------------------- From c1306f94fd7395808ae6ac80915d2d7a226bd1f1 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Thu, 15 Nov 2018 13:14:25 -0800 Subject: [PATCH 11/30] =?UTF-8?q?replaced=20usage=20of=20=E2=80=9Cdata=20c?= =?UTF-8?q?enter=E2=80=9D=20with=20=E2=80=9Cdatacenter=E2=80=9D,=20and=20c?= =?UTF-8?q?larified=20the=20definition=20of=20a=20datacenter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- documentation/sphinx/source/configuration.rst | 96 ++++++++++--------- 1 file changed, 49 insertions(+), 47 deletions(-) diff --git a/documentation/sphinx/source/configuration.rst b/documentation/sphinx/source/configuration.rst index e46d33823c..74c7fb8a19 100644 --- a/documentation/sphinx/source/configuration.rst +++ b/documentation/sphinx/source/configuration.rst @@ -260,7 +260,7 @@ Contains default parameters for all fdbserver processes on this machine. These s * ``storage_memory``: Maximum memory used for data storage. This parameter is used *only* with memory storage engine, not the ssd storage engine. The default value is 1GiB. When specified without a unit, MB is assumed. Clusters will be restricted to using this amount of memory per process for purposes of data storage. Memory overhead associated with storing the data is counted against this total. If you increase the ``storage_memory``, you should also increase the ``memory`` parameter by the same amount. * ``locality_machineid``: Machine identifier key. All processes on a machine should share a unique id. By default, processes on a machine determine a unique id to share. This does not generally need to be set. * ``locality_zoneid``: Zone identifier key. Processes that share a zone id are considered non-unique for the purposes of data replication. If unset, defaults to machine id. -* ``locality_dcid``: Data center identifier key. All processes physically located in a data center should share the id. No default value. If you are depending on data center based replication this must be set on all processes. +* ``locality_dcid``: Datacenter identifier key. All processes physically located in a datacenter should share the id. No default value. If you are depending on datacenter based replication this must be set on all processes.
If you are depending on data hall based replication this must be set on all processes. * ``io_trust_seconds``: Time in seconds that a read or write operation is allowed to take before timing out with an error. If an operation times out, all future operations on that file will fail with an error as well. Only has an effect when using AsyncFileKAIO in Linux. If unset, defaults to 0 which means timeout is disabled. @@ -529,26 +529,28 @@ This is made possible by combining two features. The first is the ability to sup This is similar to ``fdbdr``, except that the asynchronous replication is done within a single cluster instead of between different FoundationDB clusters. -The second feature is the ability to add a synchronous replica of the mutation log in a different data center. Because this datacenter is only holding a transient copy of the mutations being committed to the database, only a few FoundationDB processes are required to fulfill this role. +The second feature is the ability to add a synchronous replica of the mutation log in a different datacenter. Because this datacenter is only holding a transient copy of the mutations being committed to the database, only a few FoundationDB processes are required to fulfill this role. -The benefit of holding the mutation log external to the primary data center is that if the primary data center fails we will still have access to the most recent commits. This allows the remote replica to catch up to the primary. Once the remote replica has applied all the mutations it can start accepting new commits without suffering any data loss. +The benefit of holding the mutation log external to the primary datacenter is that if the primary datacenter fails we will still have access to the most recent commits. This allows the remote replica to catch up to the primary. Once the remote replica has applied all the mutations it can start accepting new commits without suffering any data loss. -An example configuration would be four total data centers, two on the east coast, two on the west coast, with a preference for fast write latencies from the west coast. One datacenter on each coast would be sized to store a full copy of the data. The second datacenter in each coast would only have a few FoundationDB processes. +An example configuration would be four total datacenters, two on the east coast, two on the west coast, with a preference for fast write latencies from the west coast. One datacenter on each coast would be sized to store a full copy of the data. The second datacenter in each coast would only have a few FoundationDB processes. -When everything is healthy, writes need to be made durable in both west coast data centers before a commit can succeed. If the data centers are close to each other, this can add as little as 2ms to commit latencies. Reads can be served from either region, and clients can get data from whichever region is closer. Getting a read version from the each coast region will still require communicating with a west coast datacenter. Clients can cache read versions if they can tolerate reading stale data to avoid waiting on read versions. +When everything is healthy, writes need to be made durable in both west coast datacenters before a commit can succeed. If the datacenters are close to each other, this can add as little as 2ms to commit latencies. Reads can be served from either region, and clients can get data from whichever region is closer. 
Getting a read version from the each coast region will still require communicating with a west coast datacenter. Clients can cache read versions if they can tolerate reading stale data to avoid waiting on read versions. -If either west coast data center fails, the last few mutations will be propagated from the remaining west coast datacenter to the east coast. At this point we will start accepting commits on the east coast. Once the west coast comes back online, the system will automatically start copying all the data that was committed to the east coast back to the west coast replica. Once the west coast has caught up, the system will automatically switch back to accepting writes from the west coast again. +If either west coast datacenter fails, the last few mutations will be propagated from the remaining west coast datacenter to the east coast. At this point we will start accepting commits on the east coast. Once the west coast comes back online, the system will automatically start copying all the data that was committed to the east coast back to the west coast replica. Once the west coast has caught up, the system will automatically switch back to accepting writes from the west coast again. -In the event that the west coast has failed for long enough that we no longer have enough disk space to continue storing the mutation log, we can drop the replica completely. This decision is not automatic, and requires a manual change to the configuration. The database will then act as a single data center database until the west coast comes back online. Because we have dropped to west coast replica completely, to bring the west coast back online we will have to copy all the data between the regions. +In the event that the west coast has failed for long enough that we no longer have enough disk space to continue storing the mutation log, we can drop the replica completely. This decision is not automatic, and requires a manual change to the configuration. The database will then act as a single datacenter database until the west coast comes back online. Because we have dropped to west coast replica completely, to bring the west coast back online we will have to copy all the data between the regions. Region failover generally only requires a few seconds to complete. -Specifying data centers ------------------------ +Specifying datacenters +---------------------- -To use region configurations all processes in the cluster need to specify what data center they are in. This can be done on the command line with either ``--locality_dcid`` or ``--datacenter_id``. This data center identifier is case sensitive. +We use the term ``datacenter`` to denote unique locations that are failure independent from one another. Cloud providers generally expose this property of failure independence with Availability Zones. -Clients should also specify their data center with the database option ``datacenter_id``. If a client does not specify their datacenter, they will use latency estimates to balance traffic between the two regions. This will result in about 5% of requests being served by the remote regions, so you will see large tail latencies on reads. +To use region configurations all processes in the cluster need to specify what datacenter they are in. This can be done on the command line with either ``--locality_dcid`` or ``--datacenter_id``. This datacenter identifier is case sensitive. + +Clients should also specify their datacenter with the database option ``datacenter_id``. 
If a client does not specify their datacenter, they will use latency estimates to balance traffic between the two regions. This will result in about 5% of requests being served by the remote regions, so you will see large tail latencies on reads. Changing the region configuration --------------------------------- @@ -581,46 +583,46 @@ The ``regions`` object in the json document should be a array. Each element of t Each region is described using a object that contains an array of ``datacenters``. Each region may also optionally provide a ``satellite_redundancy_mode`` and ``satellite_logs``. -Each datacenter is described with an object that contains the ``id`` and ``priority`` of that data center. Data centers which hold a full replica of the data are referred to as primary data centers. Data centers that only store transaction logs are referred to as satellite data centers. To specify a data center is a satellite, it needs to include ``"satellite" : 1``. The priorities of satellite data centers are only compared to other satellites data centers in the same region. The priorities of primary data centers are only compared to other primary data centers. +Each datacenter is described with an object that contains the ``id`` and ``priority`` of that datacenter. Datacenters which hold a full replica of the data are referred to as primary datacenters. Datacenters that only store transaction logs are referred to as satellite datacenters. To specify a datacenter is a satellite, it needs to include ``"satellite" : 1``. The priorities of satellite datacenters are only compared to other satellites datacenters in the same region. The priorities of primary datacenters are only compared to other primary datacenters. .. warning:: In release 6.0, FoundationDB supports at most two regions. -Each region can only have one primary data center. A negative priority for a data center denotes that the system should not recover the transaction subsystem in that data center. The region with the transaction subsystem is referred to as the active region. +Each region can only have one primary datacenter. A negative priority for a datacenter denotes that the system should not recover the transaction subsystem in that datacenter. The region with the transaction subsystem is referred to as the active region. -One primary data center must have a priority >= 0. The cluster will make the region with the highest priority the active region. If two data centers have equal priority the cluster will make one of them the active region arbitrarily. +One primary datacenter must have a priority >= 0. The cluster will make the region with the highest priority the active region. If two datacenters have equal priority the cluster will make one of them the active region arbitrarily. -The ``satellite_redundancy_mode`` is configured per region, and specifies how many copies of each mutation should be replicated to the satellite data centers. +The ``satellite_redundancy_mode`` is configured per region, and specifies how many copies of each mutation should be replicated to the satellite datacenters. ``one_satellite_single`` mode - Keep a single copy of the mutation log in the satellite data center with the highest priority. If the highest priority satellite is unavailable it will put the transaction log in the satellite data center with the next highest priority. + Keep a single copy of the mutation log in the satellite datacenter with the highest priority. 
If the highest priority satellite is unavailable it will put the transaction log in the satellite datacenter with the next highest priority. ``one_satellite_double`` mode - Keep a two copies of the mutation log in the satellite data center with the highest priority. + Keep a two copies of the mutation log in the satellite datacenter with the highest priority. ``one_satellite_triple`` mode - Keep a three copies of the mutation log in the satellite data center with the highest priority. + Keep a three copies of the mutation log in the satellite datacenter with the highest priority. ``two_satellite_safe`` mode - Keep two copies of the mutation log in each of the two satellite data centers with the highest priorities, for a total of four copies of each mutation. This mode will protect against the simultaneous loss of both the primary and one of the satellite data centers. If only one satellite is available, it will fall back to only storing two copies of the mutation log in the remaining data center. + Keep two copies of the mutation log in each of the two satellite datacenters with the highest priorities, for a total of four copies of each mutation. This mode will protect against the simultaneous loss of both the primary and one of the satellite datacenters. If only one satellite is available, it will fall back to only storing two copies of the mutation log in the remaining datacenter. ``two_satellite_fast`` mode - Keep two copies of the mutation log in each of the two satellite data centers with the highest priorities, for a total of four copies of each mutation. The proxies will only wait for one of the two satellite data centers to make the mutations durable before considering a commit successful. This will reduce tail latencies caused by network issues between data centers. If only one satellite is available, it will fall back to only storing two copies of the mutation log in the remaining data center. + Keep two copies of the mutation log in each of the two satellite datacenters with the highest priorities, for a total of four copies of each mutation. The proxies will only wait for one of the two satellite datacenters to make the mutations durable before considering a commit successful. This will reduce tail latencies caused by network issues between datacenters. If only one satellite is available, it will fall back to only storing two copies of the mutation log in the remaining datacenter. -.. warning:: In release 6.0 this is implemented by waiting for all but 2 of the transaction logs. This means if you configure more than 4 satellite logs, it will still need to wait for replies from both data centers. +.. warning:: In release 6.0 this is implemented by waiting for all but 2 of the transaction logs. This means if you configure more than 4 satellite logs, it will still need to wait for replies from both datacenters. -The number of ``satellite_logs`` is also configured per region. It represents the desired number of transaction logs that should be recruited in the satellite data centers. The satellite transaction logs do slightly less work than the primary data center transaction logs. So while you should keep the ratio of logs to replicas roughly equal in the primary data center and the satellites, you may be able to balance performance with slightly less satellite transaction logs. +The number of ``satellite_logs`` is also configured per region. It represents the desired number of transaction logs that should be recruited in the satellite datacenters. 
The satellite transaction logs do slightly less work than the primary datacenter transaction logs. So while you should keep the ratio of logs to replicas roughly equal in the primary datacenter and the satellites, you may be able to balance performance with slightly less satellite transaction logs. The number of replicas in each region is controlled by redundancy level. For example ``double`` mode will put 2 replicas in each region, for a total of 4 replicas. Asymmetric configurations ------------------------- -The fact that satellite policies are configured per region allows for asymmetric configurations. For example, you can have a three datacenter setup where you have two data centers on the west coast (WC1, WC2) and one data center on the east coast (EC1). We set the west coast region as our preferred active region by setting the priority of its primary data center higher than the east coast data center. The west coast region should have a satellite policy configured, so that when it is active we are making mutations durable in both west coast data centers. In the rare event that one of our west coast data center have failed, we will fail over to the east coast data center. Because this region does not a satellite data center, the mutations will only be made durable in one data center while the transaction subsystem is located here. However this is justifiable because the region will only be active if we have already lost a data center. +The fact that satellite policies are configured per region allows for asymmetric configurations. For example, you can have a three datacenter setup where you have two datacenters on the west coast (WC1, WC2) and one datacenter on the east coast (EC1). We set the west coast region as our preferred active region by setting the priority of its primary datacenter higher than the east coast datacenter. The west coast region should have a satellite policy configured, so that when it is active we are making mutations durable in both west coast datacenters. In the rare event that one of our west coast datacenter have failed, we will fail over to the east coast datacenter. Because this region does not a satellite datacenter, the mutations will only be made durable in one datacenter while the transaction subsystem is located here. However this is justifiable because the region will only be active if we have already lost a datacenter. This is the region configuration that implements the example:: @@ -648,7 +650,7 @@ The ``usable_regions`` configuration option determines the number of regions whi .. warning:: In release 6.0 we only support values of 1 or 2, to match the maximum number of regions that can be defined in the ``regions`` json object. -Increasing the ``usable_regions`` will start copying data from the active region to the remote region. Reducing the ``usable_regions`` will immediately drop the replicas store in the remote region. During these changes, only one primary data center can have priority >= 0. This enforces exactly which region will lose its replica. +Increasing the ``usable_regions`` will start copying data from the active region to the remote region. Reducing the ``usable_regions`` will immediately drop the replicas store in the remote region. During these changes, only one primary datacenter can have priority >= 0. This enforces exactly which region will lose its replica. 
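As a hedged sketch of how such a configuration change can be applied (the file name ``regions.json`` is hypothetical, standing for a file that holds the JSON document described above), the region configuration is loaded through the ``fileconfigure`` command introduced earlier in this patch series::

    $ fdbcli
    fdb> fileconfigure regions.json

``fileconfigure`` reads the JSON document from the given file and changes the database configuration to match its contents, so the datacenter priority edits that control which region loses its replica can be staged in one file and applied together.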
Changing the log routers configuration -------------------------------------- @@ -666,34 +668,34 @@ Migrating a database to use a region configuration To configure an existing database to use a region configuration do the following steps: - 1. Ensure all processes have their dcid locality set on the command line. All processes should exist in the same data center. If you are converting from a ``three_datacenter`` configuration, you will first need to configure down to using a single data center by changing the replication mode. Then exclude the machines in all data centers but the one that will become the initial active region. + 1. Ensure all processes have their dcid locality set on the command line. All processes should exist in the same datacenter. If you are converting from a ``three_datacenter`` configuration, you will first need to configure down to using a single datacenter by changing the replication mode. Then exclude the machines in all datacenters but the one that will become the initial active region. - 2. Configure the region configuration. The data center with all the existing processes should have a non-negative priority. The region which will eventually store the remote replica should be added with a negative priority. + 2. Configure the region configuration. The datacenter with all the existing processes should have a non-negative priority. The region which will eventually store the remote replica should be added with a negative priority. 3. Add processes to the cluster in the remote region. These processes will not take data yet, but need to be added to the cluster. If they are added before the region configuration is set they will be assigned data like any other FoundationDB process, which will lead to high latencies. 4. Configure usable_regions=2. This will cause the cluster to start copying data between the regions. - 5. Watch status and wait until data movement is complete. This will mean signal that the remote data center has a full replica of all of the data in the database. + 5. Watch status and wait until data movement is complete. This will mean signal that the remote datacenter has a full replica of all of the data in the database. - 6. Change the region configuration to have a non-negative priority for the primary data centers in both regions. This will enable automatic failover between regions. + 6. Change the region configuration to have a non-negative priority for the primary datacenters in both regions. This will enable automatic failover between regions. -Handling data center failures ------------------------------ +Handling datacenter failures +---------------------------- -When a primary data center fails, the cluster will go into a degraded state. It will recover to the other region and continue accepting commits, however the mutations bound for the other side will build up on the transaction logs. Eventually, the disks on the transaction logs will fill up, so the database cannot be left in this condition indefinitely. +When a primary datacenter fails, the cluster will go into a degraded state. It will recover to the other region and continue accepting commits, however the mutations bound for the other side will build up on the transaction logs. Eventually, the disks on the transaction logs will fill up, so the database cannot be left in this condition indefinitely. -.. warning:: While a data center has failed, the maximum write throughput of the cluster will be roughly 1/3 of normal performance. 
This is because the transaction logs need to store all of the mutations being committed so that once the other data center comes back online it can replay history to catch back up. +.. warning:: While a datacenter has failed, the maximum write throughput of the cluster will be roughly 1/3 of normal performance. This is because the transaction logs need to store all of the mutations being committed so that once the other datacenter comes back online it can replay history to catch back up. -To drop the dead data center do the follow steps: +To drop the dead datacenter do the following steps: - 1. Configure the region configuration so that the dead data center has a negative priority. + 1. Configure the region configuration so that the dead datacenter has a negative priority. 2. Configure usable_regions=1. -If you are running in a configuration without a satellite data center, or you have lost all machines in a region simultaneously. The ``force_recovery_with_data_loss`` command from ``fdbcli`` allows you to force a recovery to the other region which will discard the portion of the mutation log which did not make it across the WAN. Once the database has recovered, immediately follow the previous steps to drop the dead region the normal way. +If you are running in a configuration without a satellite datacenter, or you have lost all machines in a region simultaneously. The ``force_recovery_with_data_loss`` command from ``fdbcli`` allows you to force a recovery to the other region which will discard the portion of the mutation log which did not make it across the WAN. Once the database has recovered, immediately follow the previous steps to drop the dead region the normal way. -.. warning:: In 6.0 the ``force_recovery_with_data_loss`` command from ``fdbcli`` can cause data inconsistencies if it is used when processes from both non-satellite data centers are still in the cluster. In general this command has not be tested to same degree as the rest of the codebase, and should only be used in extreme emergencies. +.. warning:: In 6.0 the ``force_recovery_with_data_loss`` command from ``fdbcli`` can cause data inconsistencies if it is used when processes from both non-satellite datacenters are still in the cluster. In general this command has not been tested to the same degree as the rest of the codebase, and should only be used in extreme emergencies. Region change safety -------------------- @@ -711,31 +713,31 @@ The steps described above for both adding and removing replicas are enforced by Monitoring ---------- -It is important to ensure the remote replica does not fall too far behind the active replica. To failover between regions all of the mutations need to be flushed from the active replica to the remote replica. If the remote replica is too far behind, this can take a very long time.
The version difference between the datacenters is available in status json as ``datacenter_version_difference``. This number should be less than 5 million. A large datacenter version difference could indicate that you need more log routers. It could also be caused by network issues between the regions. If the difference gets too large the remote replica should be dropped, similar to a datacenter outage that goes on too long. Because of asymmetric write latencies in the two regions, it is important to route client traffic to the currently active region. The current active region is written in the system key space as the key ``\xff/primaryDatacenter``. Clients can read and watch this key after setting the ``read_system_keys`` transaction option. Choosing coordinators --------------------- -Choosing coordinators for a multi-region configuration provides its own set of challenges. A majority of coordinators need to be alive for the cluster to be available. There are two common coordinators setups that allow a cluster to survive the simultaneous loss of a data center and one additional machine. +Choosing coordinators for a multi-region configuration provides its own set of challenges. A majority of coordinators need to be alive for the cluster to be available. There are two common coordinator setups that allow a cluster to survive the simultaneous loss of a datacenter and one additional machine. -The first is five coordinators in five different data centers. The second is nine total coordinators spread across three data centers. There is some additional benefit to spreading the coordinators across regions rather than data centers. This is because if an entire region fails, it is still possible to recover to the other region if you are willing to accept a small amount of data loss. However, if you have lost a majority of coordinators this becomes much more difficult. +The first is five coordinators in five different datacenters. The second is nine total coordinators spread across three datacenters. There is some additional benefit to spreading the coordinators across regions rather than datacenters. This is because if an entire region fails, it is still possible to recover to the other region if you are willing to accept a small amount of data loss. However, if you have lost a majority of coordinators this becomes much more difficult. -Additionally, if a data center fails and then the second data center in the region fails 30 seconds later, we can generally survive this scenario. We can survive because the second data center only needs to be alive long enough to copy the tail of the mutation log across the WAN. However if your coordinators are in this second data center, you will still experience an outage. +Additionally, if a datacenter fails and then the second datacenter in the region fails 30 seconds later, we can generally survive this scenario. We can survive because the second datacenter only needs to be alive long enough to copy the tail of the mutation log across the WAN. However if your coordinators are in this second datacenter, you will still experience an outage. -These considerations mean that best practice is to put three coordinators in the main data centers of each region, and then put three additional coordinators in a third region. +These considerations mean that best practice is to put three coordinators in the main datacenters of each region, and then put three additional coordinators in a third region.
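To make this placement concrete, one purely hypothetical nine-coordinator layout (all addresses are invented for illustration) puts three coordinators in each region's main datacenter and three more in a third region, set through ``fdbcli``'s ``coordinators`` command::

    fdb> coordinators 10.0.1.1:4500 10.0.1.2:4500 10.0.1.3:4500 10.1.1.1:4500 10.1.1.2:4500 10.1.1.3:4500 10.2.1.1:4500 10.2.1.2:4500 10.2.1.3:4500

With nine coordinators a majority is five, so losing an entire datacenter (three coordinators) plus one additional machine still leaves five alive, matching the failure scenario described above.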
-Comparison to other multiple data center configurations -------------------------------------------------------- +Comparison to other multiple datacenter configurations +------------------------------------------------------ Region configuration provides very similar functionality to ``fdbdr``. -If you are not using satellite data centers, the main benefit of a region configuration compared to ``fdbdr`` is that each data center is able to restore replication even after losing all copies of a key range. If we simultaneously lose two storage servers in a double replicated cluster, with ``fdbdr`` we would be forced to fail over to the remote region. With region configuration the cluster will automatically copy the missing key range from the remote replica back to the primary data center. +If you are not using satellite datacenters, the main benefit of a region configuration compared to ``fdbdr`` is that each datacenter is able to restore replication even after losing all copies of a key range. If we simultaneously lose two storage servers in a double replicated cluster, with ``fdbdr`` we would be forced to fail over to the remote region. With region configuration the cluster will automatically copy the missing key range from the remote replica back to the primary datacenter. The main disadvantage of using a region configuration is that the total number of processes we can support in a single region is around half when compared against ``fdbdr``. This is because we have processes for both regions in the same cluster, and some singleton components like our failure monitor will have to do twice as much work. In ``fdbdr`` we have two separate cluster for each region, so the total number of processes can scale to about twice as large as using a region configuration. -Region configuration is better in almost all ways than the ``three_datacenter`` replication mode. Region configuration gives you the same ability to survive the loss of one data center, however we only need to store two full replicas of the database instead of three. Region configuration is almost much more efficient with how it sends mutations across the WAN. The only reason to use ``three_datacenter`` replication is if you need low latency reads from all three locations. +Region configuration is better in almost all ways than the ``three_datacenter`` replication mode. Region configuration gives you the same ability to survive the loss of one datacenter, however we only need to store two full replicas of the database instead of three. Region configuration is almost much more efficient with how it sends mutations across the WAN. The only reason to use ``three_datacenter`` replication is if you need low latency reads from all three locations. Known limitations ----------------- @@ -746,9 +748,9 @@ The 6.0 release still has a number of rough edges related to region configuratio * ``two_satellite_fast`` does not hide latency properly when configured with more than 4 satellite transaction logs. - * While a data center has failed, the maximum write throughput of the cluster will be roughly 1/3 of normal performance. + * While a datacenter has failed, the maximum write throughput of the cluster will be roughly 1/3 of normal performance. - * ``force_recovery_with_data_loss`` can cause data inconsistencies if it is used when processes from both non-satellite data centers are still in the cluster. 
+ * ``force_recovery_with_data_loss`` can cause data inconsistencies if it is used when processes from both non-satellite datacenters are still in the cluster. .. _guidelines-process-class-config: From a681740f12297b6ad106e01647ab88dbbeb9f7b7 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Thu, 15 Nov 2018 17:09:06 -0800 Subject: [PATCH 12/30] Suggested changes to region configuration documentation. --- documentation/sphinx/source/configuration.rst | 84 +++++++++---------- 1 file changed, 40 insertions(+), 44 deletions(-) diff --git a/documentation/sphinx/source/configuration.rst b/documentation/sphinx/source/configuration.rst index 74c7fb8a19..124aa6984b 100644 --- a/documentation/sphinx/source/configuration.rst +++ b/documentation/sphinx/source/configuration.rst @@ -523,34 +523,30 @@ Configuring regions =================== +.. note:: In the following text, the term ``datacenter`` is used to denote unique locations that are failure independent from one another. Cloud providers generally expose this property of failure independence with Availability Zones. + Regions configuration enables automatic failover between two datacenters, without adding a WAN latency for commits, while still maintaining all the consistency properties FoundationDB provides. -This is made possible by combining two features. The first is the ability to support asynchronous replication between two regions. Because we are not waiting for the commits to become durable in the remote region before reporting a commit as successful, it means the remote region will slightly lag behind the primary. +This is made possible by combining two features. The first is asynchronous replication between two regions. Because FoundationDB does not wait for commits to become durable in the remote region before reporting a commit as successful, the remote region will slightly lag behind the primary. This is similar to ``fdbdr``, except that the asynchronous replication is done within a single cluster instead of between different FoundationDB clusters. -This is similar to ``fdbdr``, except that the asynchronous replication is done within a single cluster instead of between different FoundationDB clusters. +The second feature is the ability to add one or more synchronous replicas of the mutation log in a different datacenter. Because this datacenter is only holding a transient copy of the mutations being committed to the database, only a few FoundationDB processes are required to fulfill this role. If the primary datacenter fails, the external mutation log replicas will still allow access to the most recent commits. This allows the lagging remote replica to catch up to the primary. Once the remote replica has applied all the mutations, it can start accepting new commits without suffering any data loss. -The second feature is the ability to add a synchronous replica of the mutation log in a different datacenter. Because this datacenter is only holding a transient copy of the mutations being committed to the database, only a few FoundationDB processes are required to fulfill this role. +An example configuration would be four total datacenters, two on the east coast, two on the west coast, with a preference for fast write latencies from the west coast. One datacenter on each coast would be sized to store a full copy of the data. The second datacenter on each coast would only have a few FoundationDB processes.
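As a sketch only (the datacenter ids ``WC1``, ``WC2``, ``EC1``, and ``EC2`` are invented for this example, and the satellite settings shown are illustrative assumptions rather than recommendations), the four-datacenter layout above could be expressed with a ``regions`` document like::

    "regions":[{
        "datacenters":[{
            "id":"WC1",
            "priority":1
        },{
            "id":"WC2",
            "priority":0,
            "satellite":1
        }],
        "satellite_redundancy_mode":"one_satellite_double",
        "satellite_logs":2
    },{
        "datacenters":[{
            "id":"EC1",
            "priority":0
        },{
            "id":"EC2",
            "priority":0,
            "satellite":1
        }],
        "satellite_redundancy_mode":"one_satellite_double",
        "satellite_logs":2
    }]

Giving ``WC1`` a higher primary priority than ``EC1`` expresses the preference for fast write latencies from the west coast, while each coast's small second datacenter serves only as a satellite holding the mutation log.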
-The benefit of holding the mutation log external to the primary datacenter is that if the primary datacenter fails we will still have access to the most recent commits. This allows the remote replica to catch up to the primary. Once the remote replica has applied all the mutations it can start accepting new commits without suffering any data loss. +While everything is healthy, writes need to be made durable in both west coast datacenters before a commit can succeed. The geographic proximity of the two datacenters minimizes the additional commit latency. Reads can be served from either region, and clients can get data from whichever region is closer. Getting a read version from the east coast region will still require communicating with a west coast datacenter. Clients can cache read versions if they can tolerate reading stale data to avoid waiting on read versions. -An example configuration would be four total datacenters, two on the east coast, two on the west coast, with a preference for fast write latencies from the west coast. One datacenter on each coast would be sized to store a full copy of the data. The second datacenter in each coast would only have a few FoundationDB processes. +If either west coast datacenter fails, the last few mutations will be propagated from the remaining west coast datacenter to the east coast. At this point, FoundationDB will start accepting commits on the east coast. Once the west coast comes back online, the system will automatically start copying all the data that was committed to the east coast back to the west coast replica. Once the west coast has caught up, the system will automatically switch back to accepting writes from the west coast again.
- -In the event that the west coast has failed for long enough that we no longer have enough disk space to continue storing the mutation log, we can drop the replica completely. This decision is not automatic, and requires a manual change to the configuration. The database will then act as a single datacenter database until the west coast comes back online. Because we have dropped to west coast replica completely, to bring the west coast back online we will have to copy all the data between the regions. - -Region failover generally only requires a few seconds to complete. +If a region failover occurs, clients will generally only see a latency spike of a few seconds. Specifying datacenters ---------------------- -We use the term ``datacenter`` to denote unique locations that are failure independent from one another. Cloud providers generally expose this property of failure independence with Availability Zones. +To use region configurations all processes in the cluster need to specify in which datacenter they are located. This can be done on the command line with either ``--locality_dcid`` or ``--datacenter_id``. This datacenter identifier is case sensitive. -To use region configurations all processes in the cluster need to specify what datacenter they are in. This can be done on the command line with either ``--locality_dcid`` or ``--datacenter_id``. This datacenter identifier is case sensitive. - -Clients should also specify their datacenter with the database option ``datacenter_id``. If a client does not specify their datacenter, they will use latency estimates to balance traffic between the two regions. This will result in about 5% of requests being served by the remote regions, so you will see large tail latencies on reads. +Clients should also specify their datacenter with the database option ``datacenter_id``. If a client does not specify their datacenter, they will use latency estimates to balance traffic between the two regions. This will result in about 5% of requests being served by the remote regions, so reads will suffer from high tail latencies. Changing the region configuration --------------------------------- @@ -579,11 +575,11 @@ Regions are configured in FoundationDB as a json document. For example:: "satellite_logs":2 }] -The ``regions`` object in the json document should be a array. Each element of the array describes the configuration of an individual region. +The ``regions`` object in the json document should be an array. Each element of the array describes the configuration of an individual region. -Each region is described using a object that contains an array of ``datacenters``. Each region may also optionally provide a ``satellite_redundancy_mode`` and ``satellite_logs``. +Each region is described using an object that contains an array of ``datacenters``. Each region may also optionally provide a ``satellite_redundancy_mode`` and ``satellite_logs``. -Each datacenter is described with an object that contains the ``id`` and ``priority`` of that datacenter. Datacenters which hold a full replica of the data are referred to as primary datacenters. Datacenters that only store transaction logs are referred to as satellite datacenters. To specify a datacenter is a satellite, it needs to include ``"satellite" : 1``. The priorities of satellite datacenters are only compared to other satellites datacenters in the same region. The priorities of primary datacenters are only compared to other primary datacenters. 
+Each datacenter is described with an object that contains the ``id`` and ``priority`` of that datacenter. An ``id`` may be any unique alphanumeric string. Datacenters which hold a full replica of the data are referred to as primary datacenters. Datacenters that only store transaction logs are referred to as satellite datacenters. To specify that a datacenter is a satellite, it needs to include ``"satellite" : 1``. The priorities of satellite datacenters are only compared to other satellite datacenters in the same region. The priorities of primary datacenters are only compared to other primary datacenters. .. warning:: In release 6.0, FoundationDB supports at most two regions. @@ -595,15 +591,15 @@ The ``satellite_redundancy_mode`` is configured per region, and specifies how ma ``one_satellite_single`` mode - Keep a single copy of the mutation log in the satellite datacenter with the highest priority. If the highest priority satellite is unavailable it will put the transaction log in the satellite datacenter with the next highest priority. + Keep one copy of the mutation log in the satellite datacenter with the highest priority. If the highest priority satellite is unavailable it will put the transaction log in the satellite datacenter with the next highest priority. ``one_satellite_double`` mode - Keep a two copies of the mutation log in the satellite datacenter with the highest priority. + Keep two copies of the mutation log in the satellite datacenter with the highest priority. ``one_satellite_triple`` mode - Keep a three copies of the mutation log in the satellite datacenter with the highest priority. + Keep three copies of the mutation log in the satellite datacenter with the highest priority. ``two_satellite_safe`` mode @@ -611,18 +607,18 @@ ``two_satellite_fast`` mode - Keep two copies of the mutation log in each of the two satellite datacenters with the highest priorities, for a total of four copies of each mutation. The proxies will only wait for one of the two satellite datacenters to make the mutations durable before considering a commit successful. This will reduce tail latencies caused by network issues between datacenters. If only one satellite is available, it will fall back to only storing two copies of the mutation log in the remaining datacenter. + Keep two copies of the mutation log in each of the two satellite datacenters with the highest priorities, for a total of four copies of each mutation. FoundationDB will only synchronously wait for one of the two satellite datacenters to make the mutations durable before considering a commit successful. This will reduce tail latencies caused by network issues between datacenters. If only one satellite is available, it will fall back to only storing two copies of the mutation log in the remaining datacenter. -.. warning:: In release 6.0 this is implemented by waiting for all but 2 of the transaction logs. This means if you configure more than 4 satellite logs, it will still need to wait for replies from both datacenters. +.. warning:: In release 6.0 this is implemented by waiting for all but 2 of the transaction logs. If ``satellite_logs`` is set to more than 4, FoundationDB will still need to wait for replies from both datacenters. -The number of ``satellite_logs`` is also configured per region. It represents the desired number of transaction logs that should be recruited in the satellite datacenters.
The satellite transaction logs do slightly less work than the primary datacenter transaction logs. So while you should keep the ratio of logs to replicas roughly equal in the primary datacenter and the satellites, you may be able to balance performance with slightly less satellite transaction logs. +The number of ``satellite_logs`` is also configured per region. It represents the desired number of transaction logs that should be recruited in the satellite datacenters. The satellite transaction logs do slightly less work than the primary datacenter transaction logs. So while the ratio of logs to replicas should be kept roughly equal in the primary datacenter and the satellites, slightly fewer satellite transaction logs may be the optimal balance for performance. The number of replicas in each region is controlled by the redundancy level. For example, ``double`` mode will put 2 replicas in each region, for a total of 4 replicas. Asymmetric configurations ------------------------- -The fact that satellite policies are configured per region allows for asymmetric configurations. For example, you can have a three datacenter setup where you have two datacenters on the west coast (WC1, WC2) and one datacenter on the east coast (EC1). We set the west coast region as our preferred active region by setting the priority of its primary datacenter higher than the east coast datacenter. The west coast region should have a satellite policy configured, so that when it is active we are making mutations durable in both west coast datacenters. In the rare event that one of our west coast datacenter have failed, we will fail over to the east coast datacenter. Because this region does not a satellite datacenter, the mutations will only be made durable in one datacenter while the transaction subsystem is located here. However this is justifiable because the region will only be active if we have already lost a datacenter. +The fact that satellite policies are configured per region allows for asymmetric configurations. For example, FoundationDB can have a three datacenter setup where there are two datacenters on the west coast (WC1, WC2) and one datacenter on the east coast (EC1). The west coast region can be set as the preferred active region by setting the priority of its primary datacenter higher than the east coast datacenter. The west coast region should have a satellite policy configured, so that when it is active, FoundationDB is making mutations durable in both west coast datacenters. In the rare event that one of the west coast datacenters has failed, FoundationDB will fail over to the east coast datacenter. Because this region does not have a satellite datacenter, the mutations will only be made durable in one datacenter while the transaction subsystem is located here. However this is justifiable because the region will only be active if a datacenter has already been lost. This is the region configuration that implements the example:: @@ -648,16 +644,16 @@ Changing the usable_regions configuration The ``usable_regions`` configuration option determines the number of regions which have a replica of the database. -.. warning:: In release 6.0 we only support values of 1 or 2, to match the maximum number of regions that can be defined in the ``regions`` json object. +.. warning:: In release 6.0, ``usable_regions`` can only be configured to the values of ``1`` or ``2``, and a maximum of 2 regions can be defined in the ``regions`` json object.
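The migration and failure-handling steps described in the following sections drive these transitions with plain ``configure`` commands; a minimal sketch, assuming an ``fdbcli`` session against the cluster, is::

    fdb> configure usable_regions=2
    fdb> configure usable_regions=1

The first command starts copying data from the active region to the remote region; the second immediately drops the replicas in the remote region, so it should only be issued after the region being dropped has been given a negative priority.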
-Increasing the ``usable_regions`` will start copying data from the active region to the remote region. Reducing the ``usable_regions`` will immediately drop the replicas store in the remote region. During these changes, only one primary datacenter can have priority >= 0. This enforces exactly which region will lose its replica. +Increasing the ``usable_regions`` will start copying data from the active region to the remote region. Reducing the ``usable_regions`` will immediately drop the replicas in the remote region. During these changes, only one primary datacenter can have priority >= 0. This enforces exactly which region will lose its replica. Changing the log routers configuration -------------------------------------- -FoundationDB is architected to copy every mutation between regions exactly once. This copying is done by a new role called the log router. When a mutation is committed, it will be randomly assigned one log router which will be responsible for copying it across the WAN. +FoundationDB is architected to copy every mutation between regions exactly once. This copying is done by a new role called the log router. When a mutation is committed, it will be randomly assigned to one log router, which will be responsible for copying it across the WAN. -This log router will pull the mutation from exactly one of the transaction logs. This means a single socket will be involved in copying mutations across the WAN per log router. Because of this, if the latency between regions is large the bandwidth-delay product means that the number of log routers could limit the throughput at which mutations can be copied across the WAN. This can be mitigated by either configuring more log routers, or increasing the TCP window scale option. +This log router will pull the mutation from exactly one of the transaction logs. This means a single socket will be used to copy mutations across the WAN per log router. Because of this, if the latency between regions is large the bandwidth-delay product means that the number of log routers could limit the throughput at which mutations can be copied across the WAN. This can be mitigated by either configuring more log routers, or increasing the TCP window scale option. To keep the work evenly distributed on the transaction logs, the number of log routers should be a multiple of the number of transaction logs. @@ -666,41 +662,41 @@ The ``log_routers`` configuration option determines the number of log routers re Migrating a database to use a region configuration -------------------------------------------------- -To configure an existing database to use a region configuration do the following steps: +To configure an existing database to use regions, do the following steps: - 1. Ensure all processes have their dcid locality set on the command line. All processes should exist in the same datacenter. If you are converting from a ``three_datacenter`` configuration, you will first need to configure down to using a single datacenter by changing the replication mode. Then exclude the machines in all datacenters but the one that will become the initial active region. + 1. Ensure all processes have their dcid locality set on the command line. All processes should exist in the same datacenter. If converting from a ``three_datacenter`` configuration, first configure down to using a single datacenter by changing the replication mode. Then exclude the machines in all datacenters but the one that will become the initial active region. 2. Configure the region configuration.
The datacenter with all the existing processes should have a non-negative priority. The region which will eventually store the remote replica should be added with a negative priority. 3. Add processes to the cluster in the remote region. These processes will not take data yet, but need to be added to the cluster. If they are added before the region configuration is set they will be assigned data like any other FoundationDB process, which will lead to high latencies. - 4. Configure usable_regions=2. This will cause the cluster to start copying data between the regions. + 4. Configure ``usable_regions=2``. This will cause the cluster to start copying data between the regions. - 5. Watch status and wait until data movement is complete. This will mean signal that the remote datacenter has a full replica of all of the data in the database. + 5. Watch ``status`` and wait until data movement is complete. This will signal that the remote datacenter has a full replica of all of the data in the database. 6. Change the region configuration to have a non-negative priority for the primary datacenters in both regions. This will enable automatic failover between regions. Handling datacenter failures ---------------------------- -When a primary datacenter fails, the cluster will go into a degraded state. It will recover to the other region and continue accepting commits, however the mutations bound for the other side will build up on the transaction logs. Eventually, the disks on the transaction logs will fill up, so the database cannot be left in this condition indefinitely. +When a primary datacenter fails, the cluster will go into a degraded state. It will recover to the other region and continue accepting commits, however the mutations bound for the other side will build up on the transaction logs. Eventually, the disks on the primary's transaction logs will fill up, so the database cannot be left in this condition indefinitely. -.. warning:: While a datacenter has failed, the maximum write throughput of the cluster will be roughly 1/3 of normal performance. This is because the transaction logs need to store all of the mutations being committed so that once the other datacenter comes back online it can replay history to catch back up. +.. warning:: While a datacenter has failed, the maximum write throughput of the cluster will be roughly 1/3 of normal performance. This is because the transaction logs need to store all of the mutations being committed, so that once the other datacenter comes back online, it can replay history to catch back up. To drop the dead datacenter do the following steps: 1. Configure the region configuration so that the dead datacenter has a negative priority. - 2. Configure usable_regions=1. + 2. Configure ``usable_regions=1``. -If you are running in a configuration without a satellite datacenter, or you have lost all machines in a region simultaneously. The ``force_recovery_with_data_loss`` command from ``fdbcli`` allows you to force a recovery to the other region which will discard the portion of the mutation log which did not make it across the WAN. Once the database has recovered, immediately follow the previous steps to drop the dead region the normal way. +If you are running in a configuration without a satellite datacenter, or you have lost all machines in a region simultaneously, the ``force_recovery_with_data_loss`` command from ``fdbcli`` allows you to force a recovery to the other region.
This will discard the portion of the mutation log which did not make it across the WAN. Once the database has recovered, immediately follow the previous steps to drop the dead region the normal way. .. warning:: In 6.0 the ``force_recovery_with_data_loss`` command from ``fdbcli`` can cause data inconsistencies if it is used when processes from both non-satellite datacenters are still in the cluster. In general this command has not been tested to the same degree as the rest of the codebase, and should only be used in extreme emergencies. Region change safety -------------------- -The steps described above for both adding and removing replicas are enforced by ``fdbcli``. The follow are the specific conditions checked by ``fdbcli``: +The steps described above for both adding and removing replicas are enforced by ``fdbcli``. The following are the specific conditions checked by ``fdbcli``: * You cannot change the ``regions`` configuration while also changing ``usable_regions``. @@ -713,7 +709,7 @@ Monitoring ---------- -It is important to ensure the remote replica does not fall too far behind the active replica. To failover between regions all of the mutations need to be flushed from the active replica to the remote replica. If the remote replica is too far behind, this can take a very long time. The version difference between the datacenters is available in status json as ``datacenter_version_difference``. This number should be less than 5 million. A large datacenter version difference could indicate that you need more log routers. It could also be caused by network issues between the regions. If the difference gets too large the remote replica should be dropped, similar to a datacenter outage that goes on too long. +It is important to ensure the remote replica does not fall too far behind the active replica. To fail over between regions, all of the mutations need to be flushed from the active replica to the remote replica. If the remote replica is too far behind, this can take a very long time. The version difference between the datacenters is available in ``status json`` as ``datacenter_version_difference``. This number should be less than 5 million. A large datacenter version difference could indicate that more log routers are needed. It could also be caused by network issues between the regions. If the difference becomes too large, the remote replica should be dropped, similar to a datacenter outage that goes on too long. Because of asymmetric write latencies in the two regions, it is important to route client traffic to the currently active region. The current active region is written in the system key space as the key ``\xff/primaryDatacenter``. Clients can read and watch this key after setting the ``read_system_keys`` transaction option. Choosing coordinators --------------------- Choosing coordinators for a multi-region configuration provides its own set of challenges. A majority of coordinators need to be alive for the cluster to be available. There are two common coordinator setups that allow a cluster to survive the simultaneous loss of a datacenter and one additional machine. -The first is five coordinators in five different datacenters.
This is because if an entire region fails, it is still possible to recover to the other region if you are willing to accept a small amount of data loss. However, if you have lost a majority of coordinators this becomes much more difficult. +The first is five coordinators in five different datacenters. The second is nine total coordinators spread across three datacenters. There is some additional benefit to spreading the coordinators across regions rather than datacenters. This is because if an entire region fails, it is still possible to recover to the other region if you are willing to accept a small amount of data loss. However, if you have lost a majority of coordinators, this becomes much more difficult. -Additionally, if a datacenter fails and then the second datacenter in the region fails 30 seconds later, we can generally survive this scenario. We can survive because the second datacenter only needs to be alive long enough to copy the tail of the mutation log across the WAN. However if your coordinators are in this second datacenter, you will still experience an outage. +Additionally, if a datacenter fails and then the second datacenter in the region fails 30 seconds later, we can generally survive this scenario. The second datacenter only needs to be alive long enough to copy the tail of the mutation log across the WAN. However, if your coordinators are in this second datacenter, you will still experience an outage. -These considerations mean that best practice is to put three coordinators in the main datacenters of each region, and then put three additional coordinators in a third region. +These considerations mean that best practice is to put three coordinators in the main datacenters of each of the two regions, and then put three additional coordinators in a third region. Comparison to other multiple datacenter configurations ------------------------------------------------------ @@ -735,9 +731,9 @@ Region configuration provides very similar functionality to ``fdbdr``. If you are not using satellite datacenters, the main benefit of a region configuration compared to ``fdbdr`` is that each datacenter is able to restore replication even after losing all copies of a key range. If we simultaneously lose two storage servers in a double replicated cluster, with ``fdbdr`` we would be forced to fail over to the remote region. With region configuration the cluster will automatically copy the missing key range from the remote replica back to the primary datacenter. -The main disadvantage of using a region configuration is that the total number of processes we can support in a single region is around half when compared against ``fdbdr``. This is because we have processes for both regions in the same cluster, and some singleton components like our failure monitor will have to do twice as much work. In ``fdbdr`` we have two separate cluster for each region, so the total number of processes can scale to about twice as large as using a region configuration. +The main disadvantage of using a region configuration is that the total number of processes we can support in a single region is around half when compared against ``fdbdr``. This is because we have processes for both regions in the same cluster, and some singleton components like the failure monitor will have to do twice as much work. In ``fdbdr``, there are two separate clusters for each region, so the total number of processes can scale to about twice as large as using a region configuration.
-Region configuration is better in almost all ways than the ``three_datacenter`` replication mode. Region configuration gives you the same ability to survive the loss of one datacenter, however we only need to store two full replicas of the database instead of three. Region configuration is almost much more efficient with how it sends mutations across the WAN. The only reason to use ``three_datacenter`` replication is if you need low latency reads from all three locations. +Region configuration is better in almost all ways than the ``three_datacenter`` replication mode. Region configuration gives the same ability to survive the loss of one datacenter, however we only need to store two full replicas of the database instead of three. Region configuration is more efficient with how it sends mutations across the WAN. The only reason to use ``three_datacenter`` replication is if low latency reads from all three locations are required. Known limitations ----------------- From beabaf0c3ddf5ba6accd50b754df2bb16aa72065 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Thu, 15 Nov 2018 17:17:29 -0800 Subject: [PATCH 13/30] reverted release notes temporarily --- documentation/sphinx/source/release-notes.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 4c1a72da2f..e1e2281bbb 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -2,7 +2,7 @@ Release Notes ############# -6.0.16 +6.0.15 ====== Features @@ -68,7 +68,6 @@ Fixes * HTTP client used by backup-to-blobstore now correctly treats response header field names as case insensitive. [6.0.15] `(PR #904) `_ * Blobstore REST client was not following the S3 API in several ways (bucket name, date, and response formats). [6.0.15] `(PR #914) `_ * Data distribution could queue shard movements for restoring replication at a low priority. [6.0.15] `(PR #907) `_ -* Blobstore REST client will no longer attempt to create a bucket that already exists. [6.0.16] `(PR #923) `_ Fixes only impacting 6.0.0+ --------------------------- From ca313dadbfa63ed3f25e1e26517bd8133281f507 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Thu, 15 Nov 2018 18:23:18 -0800 Subject: [PATCH 14/30] re-added 6.0.16 release note --- documentation/sphinx/source/release-notes.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index e1e2281bbb..4c1a72da2f 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -2,7 +2,7 @@ Release Notes ############# -6.0.15 +6.0.16 ====== Features @@ -68,6 +68,7 @@ Fixes * HTTP client used by backup-to-blobstore now correctly treats response header field names as case insensitive. [6.0.15] `(PR #904) `_ * Blobstore REST client was not following the S3 API in several ways (bucket name, date, and response formats). [6.0.15] `(PR #914) `_ * Data distribution could queue shard movements for restoring replication at a low priority. [6.0.15] `(PR #907) `_ +* Blobstore REST client will no longer attempt to create a bucket that already exists.
[6.0.16] `(PR #923) `_ Fixes only impacting 6.0.0+ --------------------------- From 7bb1eee21cc71fed1eb954b7d7e068d84d5411e0 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Fri, 16 Nov 2018 16:26:25 -0800 Subject: [PATCH 15/30] added a few additional links in our documentation --- documentation/sphinx/source/administration.rst | 5 ++++- documentation/sphinx/source/technical-overview.rst | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/documentation/sphinx/source/administration.rst b/documentation/sphinx/source/administration.rst index 6762ad6f5a..ee69f9d8a2 100644 --- a/documentation/sphinx/source/administration.rst +++ b/documentation/sphinx/source/administration.rst @@ -9,15 +9,18 @@ Administration :hidden: :titlesonly: + configuration moving-a-cluster tls - + This document covers the administration of an existing FoundationDB cluster. We recommend you read this document before setting up a cluster for performance testing or production use. .. note:: In FoundationDB, a "cluster" refers to one or more FoundationDB processes spread across one or more physical machines that together host a FoundationDB database. To administer an externally accessible cluster, you need to understand basic system tasks. You should begin with how to :ref:`start and stop the database `. Next, you should review management of a cluster, including :ref:`adding ` and :ref:`removing ` machines, and monitoring :ref:`cluster status ` and the basic :ref:`server processes `. You should be familiar with :ref:`managing trace files ` and :ref:`other administrative concerns `. Finally, you should know how to :ref:`uninstall ` or :ref:`upgrade ` the database. +FoundationDB also provides a number of different :doc:`configuration ` options which you should know about when setting up a FoundationDB database. + .. _administration-running-foundationdb: Starting and stopping diff --git a/documentation/sphinx/source/technical-overview.rst b/documentation/sphinx/source/technical-overview.rst index 318afeda94..c5dbf29e37 100644 --- a/documentation/sphinx/source/technical-overview.rst +++ b/documentation/sphinx/source/technical-overview.rst @@ -26,6 +26,8 @@ These documents explain the engineering design of FoundationDB, with detailed in * :doc:`testing`: FoundationDB uses a combined regime of robust simulation, live performance testing, and hardware-based failure testing to meet exacting standards of correctness and performance. +* :doc:`kv-architecture` provides a description of every major role a process in FoundationDB can fulfill. + .. toctree:: :maxdepth: 1 :titlesonly: @@ -42,3 +44,4 @@ These documents explain the engineering design of FoundationDB, with detailed in fault-tolerance flow testing + kv-architecture From ec9410492dd3ccd13ebc94f3d37528ef9ab6b8ed Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Fri, 23 Nov 2018 05:23:56 -0800 Subject: [PATCH 16/30] Changed backup folder scheme to use a much smaller number of unique folders for kv range files, which will speed up list and expire operations. The old scheme is still readable but will no longer be written except in simulation in order to test backward compatibility. 
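To make the new layout concrete, here is a minimal standalone sketch of the path derivation (not part of the patch): the kvranges/snapshot.<beginVersion> root and the roughly-5,000-files-per-subfolder grouping mirror the snapshotFolderString() and writeRangeFile() changes below, while the rangeFilePath() helper, the <uid> and <blockSize> placeholders, and the plain snprintf/printf harness are illustrative stand-ins.

    #include <cstdio>
    #include <cstdint>
    #include <string>

    // Derive the folder-and-file path for one kv range file under the new scheme.
    // Each snapshot gets a root folder keyed by the version at which its execution
    // began, and files are grouped into numbered subfolders of ~5,000 entries each.
    std::string rangeFilePath(int64_t snapshotBeginVersion, int64_t fileCount, int64_t fileVersion) {
        char buf[256];
        snprintf(buf, sizeof(buf), "kvranges/snapshot.%018lld/%lld/range,%lld,<uid>,<blockSize>",
                 (long long)snapshotBeginVersion, (long long)(fileCount / 5000), (long long)fileVersion);
        return std::string(buf);
    }

    int main() {
        // The first file of a snapshot that began at version 160 lands in subfolder 0;
        // the 12,000th file of the same snapshot lands in subfolder 2.
        printf("%s\n", rangeFilePath(160, 0, 160).c_str());
        printf("%s\n", rangeFilePath(160, 12000, 300).c_str());
        return 0;
    }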
--- fdbclient/BackupAgent.h | 10 +++ fdbclient/BackupContainer.actor.cpp | 119 +++++++++++++++++++++++----- fdbclient/BackupContainer.h | 2 +- fdbclient/FileBackupAgent.actor.cpp | 25 +++++- flow/genericactors.actor.h | 10 +++ 5 files changed, 142 insertions(+), 24 deletions(-) diff --git a/fdbclient/BackupAgent.h b/fdbclient/BackupAgent.h index 247fc98799..caff559b2e 100644 --- a/fdbclient/BackupAgent.h +++ b/fdbclient/BackupAgent.h @@ -615,6 +615,15 @@ public: return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + // Number of kv range files that were both committed to persistent storage AND inserted into + // the snapshotRangeFileMap. Note that since insertions could replace 1 or more existing + // map entries this is not necessarily the number of entries currently in the map. + // This value exists to help with sizing of kv range folders for BackupContainers that + // require it. + KeyBackedBinaryValue snapshotRangeFileCount() { + return configSpace.pack(LiteralStringRef(__FUNCTION__)); + } + // Coalesced set of ranges already dispatched for writing. typedef KeyBackedMap RangeDispatchMapT; RangeDispatchMapT snapshotRangeDispatchMap() { @@ -671,6 +680,7 @@ public: copy.snapshotBeginVersion().set(tr, beginVersion.get()); copy.snapshotTargetEndVersion().set(tr, endVersion); + copy.snapshotRangeFileCount().set(tr, 0); return Void(); }); diff --git a/fdbclient/BackupContainer.actor.cpp b/fdbclient/BackupContainer.actor.cpp index 3220e6bbc0..76add48e1e 100644 --- a/fdbclient/BackupContainer.actor.cpp +++ b/fdbclient/BackupContainer.actor.cpp @@ -143,16 +143,36 @@ std::string BackupDescription::toString() const { /* BackupContainerFileSystem implements a backup container which stores files in a nested folder structure. * Inheritors must only defined methods for writing, reading, deleting, sizing, and listing files. * - * BackupInfo is stored as a JSON document at - * /info - * Snapshots are stored as JSON at file paths like - * /snapshots/snapshot,startVersion,endVersion,totalBytes - * Log and Range data files at file paths like - * /logs/.../log,startVersion,endVersion,blockSize - * /ranges/.../range,version,uid,blockSize + * Snapshot manifests (a complete set of files constituting a database snapshot for the backup's target ranges) + * are stored as JSON files at paths like + * /snapshots/snapshot,minVersion,maxVersion,totalBytes + * + * Key range files for snapshots are stored at paths like + * /kvranges/snapshot,startVersion/N/range,version,uid,blockSize + * where startVersion is the version at which the backup snapshot execution began and N is a number + * that is increased as key range files are generated over time (at varying rates) such that there + * are around 5,000 key range files in each folder. * - * Where ... is a multi level path which sorts lexically into version order and targets 10,000 or less - * entries in each folder (though a full speed snapshot could exceed this count at the innermost folder level) + * Note that startVersion will NOT correspond to the minVersion of a snapshot manifest because + * snapshot manifest min/max versions are based on the actual contained data and the first data + * file written will be after the start version of the snapshot's execution. + * + * Log files are at file paths like + * /logs/.../log,startVersion,endVersion,blockSize + * where ... is a multi level path which sorts lexically into version order and results in approximately 1 + * unique folder per day containing about 5,000 files. 
+ * + * BACKWARD COMPATIBILITY + * + * Prior to FDB version 6.0.16, key range files were stored using a different folder scheme. Newer versions + * still support this scheme for all restore and backup management operations but key range files generated + * by backup using version 6.0.16 or later use the scheme described above. + * + * The old format stored key range files at paths like + * /ranges/.../range,version,uid,blockSize + * where ... is a multi level path which sorts lexically into version order and results in up to approximately + * 900 unique folders per day. The number of files per folder depends on the configured snapshot rate and + * database size and will vary from 1 to around 5,000. */ class BackupContainerFileSystem : public IBackupContainer { public: @@ -166,8 +186,8 @@ public: virtual Future create() = 0; // Get a list of fileNames and their sizes in the container under the given path - // The implementation can (but does not have to) use the folder path filter to avoid traversing - // specific subpaths. + // Although not required, an implementation can avoid traversing unwanted subfolders + // by calling folderPathFilter(absoluteFolderPath) and checking for a false return value. typedef std::vector> FilesAndSizesT; virtual Future listFiles(std::string path = "", std::function folderPathFilter = nullptr) = 0; @@ -207,10 +227,24 @@ public: } // The innermost folder covers 100 seconds (1e8 versions). During a full speed backup it is possible, though very unlikely, to write about 10,000 snapshot range files during that time. - static std::string rangeVersionFolderString(Version v) { + static std::string old_rangeVersionFolderString(Version v) { return format("ranges/%s/", versionFolderString(v, 8).c_str()); } + // Get the root folder for a snapshot's data based on its begin version + static std::string snapshotFolderString(Version snapshotBeginVersion) { + return format("kvranges/snapshot.%018lld", snapshotBeginVersion); + } + + // Extract the snapshot begin version from a path + static Version extractSnapshotBeginVersion(std::string path) { + Version snapshotBeginVersion; + if(sscanf(path.c_str(), "kvranges/snapshot.%018lld", &snapshotBeginVersion) == 1) { + return snapshotBeginVersion; + } + return invalidVersion; + } + // The innermost folder covers 100,000 seconds (1e11 versions) which is 5,000 mutation log files at current settings. static std::string logVersionFolderString(Version v) { return format("logs/%s/", versionFolderString(v, 11).c_str()); @@ -220,8 +254,15 @@ public: return writeFile(logVersionFolderString(beginVersion) + format("log,%lld,%lld,%s,%d", beginVersion, endVersion, g_random->randomUniqueID().toString().c_str(), blockSize)); } - Future> writeRangeFile(Version version, int blockSize) { - return writeFile(rangeVersionFolderString(version) + format("range,%lld,%s,%d", version, g_random->randomUniqueID().toString().c_str(), blockSize)); + Future> writeRangeFile(Version snapshotBeginVersion, int snapshotFileCount, Version fileVersion, int blockSize) { + std::string fileName = format("range,%lld,%s,%d", fileVersion, g_random->randomUniqueID().toString().c_str(), blockSize); + + // In order to test backward compatibility in simulation, sometimes write to the old path format + if(BUGGIFY) { + return writeFile(old_rangeVersionFolderString(fileVersion) + fileName); + } + + return writeFile(snapshotFolderString(snapshotBeginVersion) + format("/%d/", snapshotFileCount / (BUGGIFY ?
1 : 5000)) + fileName); } static bool pathToRangeFile(RangeFile &out, std::string path, int64_t size) { @@ -265,6 +306,7 @@ public: // TODO: Do this more efficiently, as the range file list for a snapshot could potentially be hundreds of megabytes. ACTOR static Future> readKeyspaceSnapshot_impl(Reference bc, KeyspaceSnapshotFile snapshot) { // Read the range file list for the specified version range, and then index them by fileName. + // This is so we can verify that each of the files listed in the manifest file are also in the container at this time. std::vector files = wait(bc->listRangeFiles(snapshot.beginVersion, snapshot.endVersion)); state std::map rangeIndex; for(auto &f : files) @@ -386,11 +428,12 @@ public: }); } - // List range files, in sorted version order, which contain data at or between beginVersion and endVersion - Future> listRangeFiles(Version beginVersion = 0, Version endVersion = std::numeric_limits::max()) { + // List range files which contain data at or between beginVersion and endVersion + // NOTE: This reads the range file folder schema from FDB 6.0.15 and earlier and is provided for backward compatibility + Future> old_listRangeFiles(Version beginVersion, Version endVersion) { // Get the cleaned (without slashes) first and last folders that could contain relevant results. - std::string firstPath = cleanFolderString(rangeVersionFolderString(beginVersion)); - std::string lastPath = cleanFolderString(rangeVersionFolderString(endVersion)); + std::string firstPath = cleanFolderString(old_rangeVersionFolderString(beginVersion)); + std::string lastPath = cleanFolderString(old_rangeVersionFolderString(endVersion)); std::function pathFilter = [=](const std::string &folderPath) { // Remove slashes in the given folder path so that the '/' positions in the version folder string do not matter @@ -407,6 +450,38 @@ public: if(pathToRangeFile(rf, f.first, f.second) && rf.version >= beginVersion && rf.version <= endVersion) results.push_back(rf); } + return results; + }); + } + + // List range files which contain data at or between beginVersion and endVersion + // Note: The contents of each top level snapshot.N folder do not necessarily constitute a valid snapshot + // and therefore listing files is not how RestoreSets are obtained. 
+ // Note: Snapshots partially written using FDB versions prior to 6.0.16 will have some range files stored + // using the old folder scheme read by old_listRangeFiles + Future> listRangeFiles(Version beginVersion, Version endVersion) { + // Until the old folder scheme is no longer supported, read files stored using old folder scheme + Future> oldFiles = old_listRangeFiles(beginVersion, endVersion); + + // Define filter function (for listFiles() implementations that use it) to reject any folder + // starting after endVersion + std::function pathFilter = [=](std::string const &path) { + return extractSnapshotBeginVersion(path) > endVersion; + }; + + Future> newFiles = map(listFiles("kvranges/", pathFilter), [=](const FilesAndSizesT &files) { + std::vector results; + RangeFile rf; + for(auto &f : files) { + if(pathToRangeFile(rf, f.first, f.second) && rf.version >= beginVersion && rf.version <= endVersion) + results.push_back(rf); + } + return results; + }); + + return map(success(oldFiles) && success(newFiles), [=](Void _) { + std::vector results = std::move(newFiles.get()); + results.insert(results.end(), std::make_move_iterator(oldFiles.get().begin()), std::make_move_iterator(oldFiles.get().end())); std::sort(results.begin(), results.end()); return results; }); @@ -1403,9 +1478,11 @@ ACTOR Future testBackupContainer(std::string url) { state Reference log1 = wait(c->writeLogFile(100 + versionShift, 150 + versionShift, 10)); state Reference log2 = wait(c->writeLogFile(150 + versionShift, 300 + versionShift, 10)); - state Reference range1 = wait(c->writeRangeFile(160 + versionShift, 10)); - state Reference range2 = wait(c->writeRangeFile(300 + versionShift, 10)); - state Reference range3 = wait(c->writeRangeFile(310 + versionShift, 10)); + state Version snapshotBeginVersion1 = 160 + versionShift; + state Version snapshotBeginVersion2 = 310 + versionShift; + state Reference range1 = wait(c->writeRangeFile(snapshotBeginVersion1, 0, snapshotBeginVersion1, 10)); + state Reference range2 = wait(c->writeRangeFile(snapshotBeginVersion1, 1, 300 + versionShift, 10)); + state Reference range3 = wait(c->writeRangeFile(snapshotBeginVersion2, 0, snapshotBeginVersion2, 10)); Void _ = wait( writeAndVerifyFile(c, log1, 0) diff --git a/fdbclient/BackupContainer.h b/fdbclient/BackupContainer.h index 4a0d659528..58bc536d09 100644 --- a/fdbclient/BackupContainer.h +++ b/fdbclient/BackupContainer.h @@ -156,7 +156,7 @@ public: // Open a log file or range file for writing virtual Future> writeLogFile(Version beginVersion, Version endVersion, int blockSize) = 0; - virtual Future> writeRangeFile(Version version, int blockSize) = 0; + virtual Future> writeRangeFile(Version snapshotBeginVersion, int snapshotFileCount, Version fileVersion, int blockSize) = 0; // Write a KeyspaceSnapshotFile of range file names representing a full non overlapping // snapshot of the key ranges this backup is targeting. diff --git a/fdbclient/FileBackupAgent.actor.cpp b/fdbclient/FileBackupAgent.actor.cpp index abcf1c1703..cfff72a833 100644 --- a/fdbclient/FileBackupAgent.actor.cpp +++ b/fdbclient/FileBackupAgent.actor.cpp @@ -1002,6 +1002,7 @@ namespace fileBackup { // Update the range bytes written in the backup config backup.rangeBytesWritten().atomicOp(tr, file->size(), MutationRef::AddValue); + backup.snapshotRangeFileCount().atomicOp(tr, 1, MutationRef::AddValue); // See if there is already a file for this key which has an earlier begin, update the map if not. 
Optional s = wait(backup.snapshotRangeFileMap().get(tr, range.end)); @@ -1127,11 +1128,31 @@ namespace fileBackup { if(done) return Void(); - // Start writing a new file + // Start writing a new file after verifying this task should keep running as of a new read version (which must be >= outVersion) outVersion = values.second; // block size must be at least large enough for 3 max size keys and 2 max size values + overhead so 250k conservatively. state int blockSize = BUGGIFY ? g_random->randomInt(250e3, 4e6) : CLIENT_KNOBS->BACKUP_RANGEFILE_BLOCK_SIZE; - Reference f = wait(bc->writeRangeFile(outVersion, blockSize)); + state Version snapshotBeginVersion; + state int64_t snapshotRangeFileCount; + + state Reference tr(new ReadYourWritesTransaction(cx)); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + Void _ = wait(taskBucket->keepRunning(tr, task) + && storeOrThrow(backup.snapshotBeginVersion().get(tr), snapshotBeginVersion) + && storeOrThrow(backup.snapshotRangeFileCount().get(tr), snapshotRangeFileCount) + ); + + break; + } catch(Error &e) { + Void _ = wait(tr->onError(e)); + } + } + + Reference f = wait(bc->writeRangeFile(snapshotBeginVersion, snapshotRangeFileCount, outVersion, blockSize)); outFile = f; // Initialize range file writer and write begin key diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 5c83edbdae..88f57411d6 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -296,6 +296,16 @@ Future store(Future what, T &out) { return map(what, [&out](T const &v) { out = v; return Void(); }); } +template +Future storeOrThrow(Future> what, T &out, Error e = key_not_found()) { + return map(what, [&out,e](Optional const &o) { + if(!o.present()) + throw e; + out = o.get(); + return Void(); + }); +} + //Waits for a future to be ready, and then applies an asynchronous function to it. ACTOR template()(fake()).getValue() )> Future mapAsync(Future what, F actorFunc) From aa648daabf1422a8831a4ccbb5b7952707603d95 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Fri, 23 Nov 2018 12:49:10 -0800 Subject: [PATCH 17/30] Compile fix for linux, can't make a move iterator from a const container reference. --- fdbclient/BackupContainer.actor.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdbclient/BackupContainer.actor.cpp b/fdbclient/BackupContainer.actor.cpp index 76add48e1e..a9fbb4bda6 100644 --- a/fdbclient/BackupContainer.actor.cpp +++ b/fdbclient/BackupContainer.actor.cpp @@ -481,7 +481,8 @@ public: return map(success(oldFiles) && success(newFiles), [=](Void _) { std::vector results = std::move(newFiles.get()); - results.insert(results.end(), std::make_move_iterator(oldFiles.get().begin()), std::make_move_iterator(oldFiles.get().end())); + std::vector oldResults = std::move(oldFiles.get()); + results.insert(results.end(), std::make_move_iterator(oldResults.begin()), std::make_move_iterator(oldResults.end())); std::sort(results.begin(), results.end()); return results; }); From 0610a19e4d6e879b22c12f86137544886437e7bc Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Sat, 24 Nov 2018 17:24:54 -0800 Subject: [PATCH 18/30] Rewrote backup container unit test to use randomness to cover a wider variety of data patterns and edge cases. 
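One pattern worth calling out in the rewrite: a short list of hand-picked edge-case file sizes is consumed first, after which a default size is used, so the interesting cases are guaranteed coverage no matter what the random schedule does. A self-contained sketch follows (chooseFileSize(), the seed sizes, and the 1000-byte default mirror the test code below; the printf harness is illustrative):

    #include <cstdio>
    #include <vector>

    // Pop a hand-seeded edge-case size while any remain; afterwards use a default.
    // This guarantees the interesting sizes (such as an empty file or sizes near a
    // block boundary) are exercised at least once regardless of the random schedule.
    int chooseFileSize(std::vector<int> &sizes) {
        int size = 1000; // default once the seed list is exhausted
        if (!sizes.empty()) {
            size = sizes.back();
            sizes.pop_back();
        }
        return size;
    }

    int main() {
        std::vector<int> sizes = {0, 10000000, 5000005}; // seeded edge cases from the test
        for (int i = 0; i < 5; ++i)
            printf("file %d -> %d bytes\n", i, chooseFileSize(sizes));
        return 0;
    }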
--- fdbclient/BackupContainer.actor.cpp | 170 +++++++++++++++++----------- 1 file changed, 106 insertions(+), 64 deletions(-) diff --git a/fdbclient/BackupContainer.actor.cpp b/fdbclient/BackupContainer.actor.cpp index a9fbb4bda6..e42bd94c04 100644 --- a/fdbclient/BackupContainer.actor.cpp +++ b/fdbclient/BackupContainer.actor.cpp @@ -258,7 +258,7 @@ public: std::string fileName = format("range,%lld,%s,%d", fileVersion, g_random->randomUniqueID().toString().c_str(), blockSize); // In order to test backward compatibility in simulation, sometimes write to the old path format - if(BUGGIFY) { + if(g_network->isSimulated() && g_random->coinflip()) { return writeFile(old_rangeVersionFolderString(fileVersion) + fileName); } @@ -1438,6 +1438,15 @@ ACTOR Future> timeKeeperEpochsFromVersion(Version v, Reference return found.first + (v - found.second) / CLIENT_KNOBS->CORE_VERSIONSPERSECOND; } +int chooseFileSize(std::vector &sizes) { + int size = 1000; + if(!sizes.empty()) { + size = sizes.back(); + sizes.pop_back(); + } + return size; +} + ACTOR Future writeAndVerifyFile(Reference c, Reference f, int size) { state Standalone content; if(size > 0) { @@ -1460,6 +1469,12 @@ ACTOR Future writeAndVerifyFile(Reference c, ReferencerandomInt64(1, CLIENT_KNOBS->CORE_VERSIONSPERSECOND); + return v + increment; +} + ACTOR Future testBackupContainer(std::string url) { printf("BackupContainerTest URL %s\n", url.c_str()); @@ -1475,88 +1490,115 @@ ACTOR Future testBackupContainer(std::string url) { Void _ = wait(c->create()); - state int64_t versionShift = g_random->randomInt64(0, std::numeric_limits::max() - 500); + state std::vector> writes; + state std::map> snapshots; + state std::map snapshotSizes; + state int nRangeFiles = 0; + state std::map logs; + state Version v = g_random->randomInt64(0, std::numeric_limits::max() / 2); - state Reference log1 = wait(c->writeLogFile(100 + versionShift, 150 + versionShift, 10)); - state Reference log2 = wait(c->writeLogFile(150 + versionShift, 300 + versionShift, 10)); - state Version snapshotBeginVersion1 = 160 + versionShift; - state Version snapshotBeginVersion2 = 310 + versionShift; - state Reference range1 = wait(c->writeRangeFile(snapshotBeginVersion1, 0, snapshotBeginVersion1, 10)); - state Reference range2 = wait(c->writeRangeFile(snapshotBeginVersion1, 1, 300 + versionShift, 10)); - state Reference range3 = wait(c->writeRangeFile(snapshotBeginVersion2, 0, snapshotBeginVersion2, 10)); + // List of sizes to use to test edge cases on underlying file implementations + state std::vector fileSizes = {0, 10000000, 5000005}; - Void _ = wait( - writeAndVerifyFile(c, log1, 0) - && writeAndVerifyFile(c, log2, g_random->randomInt(0, 10000000)) - && writeAndVerifyFile(c, range1, g_random->randomInt(0, 1000)) - && writeAndVerifyFile(c, range2, g_random->randomInt(0, 100000)) - && writeAndVerifyFile(c, range3, g_random->randomInt(0, 3000000)) - ); + loop { + state Version logStart = v; + state int kvfiles = g_random->randomInt(0, 3); - Void _ = wait( - c->writeKeyspaceSnapshotFile({range1->getFileName(), range2->getFileName()}, range1->size() + range2->size()) - && c->writeKeyspaceSnapshotFile({range3->getFileName()}, range3->size()) - ); + while(kvfiles > 0) { + if(snapshots.empty()) { + snapshots[v] = {}; + snapshotSizes[v] = 0; + if(g_random->coinflip()) { + v = nextVersion(v); + } + } + Reference range = wait(c->writeRangeFile(snapshots.rbegin()->first, 0, v, 10)); + ++nRangeFiles; + v = nextVersion(v); + snapshots.rbegin()->second.push_back(range->getFileName()); - 
printf("Checking file list dump\n"); - FullBackupListing listing = wait(c->dumpFileList()); - ASSERT(listing.logs.size() == 2); - ASSERT(listing.ranges.size() == 3); - ASSERT(listing.snapshots.size() == 2); + int size = chooseFileSize(fileSizes); + snapshotSizes.rbegin()->second += size; + writes.push_back(writeAndVerifyFile(c, range, size)); + + if(g_random->random01() < .2) { + writes.push_back(c->writeKeyspaceSnapshotFile(snapshots.rbegin()->second, snapshotSizes.rbegin()->second)); + snapshots[v] = {}; + snapshotSizes[v] = 0; + break; + } + + --kvfiles; + } + + if(logStart == v || g_random->coinflip()) { + v = nextVersion(v); + } + state Reference log = wait(c->writeLogFile(logStart, v, 10)); + logs[logStart] = log->getFileName(); + int size = chooseFileSize(fileSizes); + writes.push_back(writeAndVerifyFile(c, log, size)); + + // Randomly stop after a snapshot has finished and all manually seeded file sizes have been used. + if(fileSizes.empty() && !snapshots.empty() && snapshots.rbegin()->second.empty() && g_random->random01() < .2) { + snapshots.erase(snapshots.rbegin()->first); + break; + } + } + + Void _ = wait(waitForAll(writes)); + + state FullBackupListing listing = wait(c->dumpFileList()); + ASSERT(listing.ranges.size() == nRangeFiles); + ASSERT(listing.logs.size() == logs.size()); + ASSERT(listing.snapshots.size() == snapshots.size()); state BackupDescription desc = wait(c->describeBackup()); - printf("Backup Description 1\n%s", desc.toString().c_str()); + printf("\n%s\n", desc.toString().c_str()); - ASSERT(desc.maxRestorableVersion.present()); - Optional rest = wait(c->getRestoreSet(desc.maxRestorableVersion.get())); - ASSERT(rest.present()); - ASSERT(rest.get().logs.size() == 0); - ASSERT(rest.get().ranges.size() == 1); + // Do a series of expirations and verify resulting state + state int i = 0; + for(; i < listing.snapshots.size(); ++i) { + // Ensure we can still restore to the latest version + Optional rest = wait(c->getRestoreSet(desc.maxRestorableVersion.get())); + ASSERT(rest.present()); - Optional rest = wait(c->getRestoreSet(150 + versionShift)); - ASSERT(!rest.present()); + // Ensure we can restore to the end version of snapshot i + Optional rest = wait(c->getRestoreSet(listing.snapshots[i].endVersion)); + ASSERT(rest.present()); - Optional rest = wait(c->getRestoreSet(300 + versionShift)); - ASSERT(rest.present()); - ASSERT(rest.get().logs.size() == 1); - ASSERT(rest.get().ranges.size() == 2); + // Test expiring to the end of this snapshot + state Version expireVersion = listing.snapshots[i].endVersion; - printf("Expire 1\n"); - Void _ = wait(c->expireData(100 + versionShift)); - BackupDescription d = wait(c->describeBackup()); - printf("Backup Description 2\n%s", d.toString().c_str()); - ASSERT(d.minLogBegin == 100 + versionShift); - ASSERT(d.maxRestorableVersion == desc.maxRestorableVersion); + // Expire everything up to but not including the snapshot end version + printf("EXPIRE TO %lld\n", expireVersion); + state Future f = c->expireData(expireVersion); + Void _ = wait(ready(f)); - printf("Expire 2\n"); - Void _ = wait(c->expireData(101 + versionShift)); - BackupDescription d = wait(c->describeBackup()); - printf("Backup Description 3\n%s", d.toString().c_str()); - ASSERT(d.minLogBegin == 100 + versionShift); - ASSERT(d.maxRestorableVersion == desc.maxRestorableVersion); + // If there is an error, it must be backup_cannot_expire and we have to be on the last snapshot + if(f.isError()) { + ASSERT(f.getError().code() == error_code_backup_cannot_expire); + 
ASSERT(i == listing.snapshots.size() - 1); + Void _ = wait(c->expireData(expireVersion, true)); + } - printf("Expire 3\n"); - Void _ = wait(c->expireData(300 + versionShift)); - BackupDescription d = wait(c->describeBackup()); - printf("Backup Description 4\n%s", d.toString().c_str()); - ASSERT(d.minLogBegin.present()); - ASSERT(d.snapshots.size() == desc.snapshots.size()); - ASSERT(d.maxRestorableVersion == desc.maxRestorableVersion); - - printf("Expire 4\n"); - Void _ = wait(c->expireData(301 + versionShift, true)); - BackupDescription d = wait(c->describeBackup()); - printf("Backup Description 4\n%s", d.toString().c_str()); - ASSERT(d.snapshots.size() == 1); - ASSERT(!d.minLogBegin.present()); + BackupDescription d = wait(c->describeBackup()); + printf("\n%s\n", d.toString().c_str()); + } + printf("DELETING\n"); Void _ = wait(c->deleteContainer()); BackupDescription d = wait(c->describeBackup()); - printf("Backup Description 5\n%s", d.toString().c_str()); + printf("\n%s\n", d.toString().c_str()); ASSERT(d.snapshots.size() == 0); ASSERT(!d.minLogBegin.present()); + FullBackupListing empty = wait(c->dumpFileList()); + ASSERT(empty.ranges.size() == 0); + ASSERT(empty.logs.size() == 0); + ASSERT(empty.snapshots.size() == 0); + printf("BackupContainerTest URL=%s PASSED.\n", url.c_str()); return Void(); From 3d68d6b9949fbef410cd06050000a2e6340e3a70 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Sat, 24 Nov 2018 18:41:39 -0800 Subject: [PATCH 19/30] Bug fix, clarified a comment. --- fdbclient/BackupContainer.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbclient/BackupContainer.actor.cpp b/fdbclient/BackupContainer.actor.cpp index e42bd94c04..ab046308aa 100644 --- a/fdbclient/BackupContainer.actor.cpp +++ b/fdbclient/BackupContainer.actor.cpp @@ -454,7 +454,7 @@ public: }); } - // List range files which contain data at or between beginVersion and endVersion + // List range files, sorted in version order, which contain data at or between beginVersion and endVersion // Note: The contents of each top level snapshot.N folder do not necessarily constitute a valid snapshot // and therefore listing files is not how RestoreSets are obtained. // Note: Snapshots partially written using FDB versions prior to 6.0.16 will have some range files stored @@ -466,7 +466,7 @@ public: // Define filter function (for listFiles() implementations that use it) to reject any folder // starting after endVersion std::function pathFilter = [=](std::string const &path) { - return extractSnapshotBeginVersion(path) > endVersion; + return extractSnapshotBeginVersion(path) <= endVersion; }; Future> newFiles = map(listFiles("kvranges/", pathFilter), [=](const FilesAndSizesT &files) { From 8697194d8ef616130cb6fbb320d8c971ac004026 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Mon, 26 Nov 2018 10:49:15 -0800 Subject: [PATCH 20/30] Fix error name for code 2021 (no_commit_version) in documentation --- documentation/sphinx/source/api-error-codes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/sphinx/source/api-error-codes.rst b/documentation/sphinx/source/api-error-codes.rst index f0d2554f02..4e0cad202b 100644 --- a/documentation/sphinx/source/api-error-codes.rst +++ b/documentation/sphinx/source/api-error-codes.rst @@ -100,7 +100,7 @@ FoundationDB may return the following error codes from API functions. 
If you nee +-----------------------------------------------+-----+--------------------------------------------------------------------------------+ | transaction_invalid_version | 2020| Transaction does not have a valid commit version | +-----------------------------------------------+-----+--------------------------------------------------------------------------------+ -| transaction_read_only | 2021| Transaction is read-only and therefore does not have a commit version | +| no_commit_version | 2021| Transaction is read-only and therefore does not have a commit version | +-----------------------------------------------+-----+--------------------------------------------------------------------------------+ | environment_variable_network_option_failed | 2022| Environment variable network option could not be set | +-----------------------------------------------+-----+--------------------------------------------------------------------------------+ From 512c00d304999d06f81c4fc70e3a05340fb3fd8c Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Mon, 26 Nov 2018 11:01:10 -0800 Subject: [PATCH 21/30] added dump token trace events for storage server interfaces after rollbacks --- fdbserver/worker.actor.cpp | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index b016804810..15a1cb699a 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -356,12 +356,25 @@ ACTOR Future storageServerRollbackRebooter( Future prevStorageServer TraceEvent("StorageServerRequestedReboot", id); - StorageServerInterface ssi; - ssi.uniqueID = id; - ssi.locality = locality; - ssi.initEndpoints(); + StorageServerInterface recruited; + recruited.uniqueID = id; + recruited.locality = locality; + recruited.initEndpoints(); - prevStorageServer = storageServer( store, ssi, db, folder, Promise() ); + DUMPTOKEN(recruited.getVersion); + DUMPTOKEN(recruited.getValue); + DUMPTOKEN(recruited.getKey); + DUMPTOKEN(recruited.getKeyValues); + DUMPTOKEN(recruited.getShardState); + DUMPTOKEN(recruited.waitMetrics); + DUMPTOKEN(recruited.splitMetrics); + DUMPTOKEN(recruited.getPhysicalMetrics); + DUMPTOKEN(recruited.waitFailure); + DUMPTOKEN(recruited.getQueuingMetrics); + DUMPTOKEN(recruited.getKeyValueStoreType); + DUMPTOKEN(recruited.watchValue); + + prevStorageServer = storageServer( store, recruited, db, folder, Promise() ); prevStorageServer = handleIOErrors(prevStorageServer, store, id, store->onClosed()); } } From 530b5e3763e9ec676c26009077c735fe262df391 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Mon, 26 Nov 2018 15:17:17 -0800 Subject: [PATCH 22/30] fix: do not track txsPopVersions unless there are remote logs to pop from --- fdbserver/MasterProxyServer.actor.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 1f1117a04a..f31764f964 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -206,6 +206,7 @@ struct ProxyCommitData { std::map tag_popped; Deque> txsPopVersions; Version lastTxsPop; + bool popRemoteTxs; //The tag related to a storage server rarely change, so we keep a vector of tags for each key range to be slightly more CPU efficient. //When a tag related to a storage server does change, we empty out all of these vectors to signify they must be repopulated. 
@@ -228,12 +229,12 @@ struct ProxyCommitData { ProxyCommitData(UID dbgid, MasterInterface master, RequestStream getConsistentReadVersion, Version recoveryTransactionVersion, RequestStream commit, Reference> db, bool firstProxy) : dbgid(dbgid), stats(dbgid, &version, &committedVersion, &commitBatchesMemBytesCount), master(master), - logAdapter(NULL), txnStateStore(NULL), + logAdapter(NULL), txnStateStore(NULL), popRemoteTxs(false), committedVersion(recoveryTransactionVersion), version(0), minKnownCommittedVersion(0), lastVersionTime(0), commitVersionRequestNumber(1), mostRecentProcessedRequestNumber(0), getConsistentReadVersion(getConsistentReadVersion), commit(commit), lastCoalesceTime(0), - localCommitBatchesStarted(0), locked(false), commitBatchInterval(SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_INTERVAL_MIN), - firstProxy(firstProxy), cx(openDBOnServer(db, TaskDefaultEndpoint, true, true)), + localCommitBatchesStarted(0), locked(false), commitBatchInterval(SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_INTERVAL_MIN), + firstProxy(firstProxy), cx(openDBOnServer(db, TaskDefaultEndpoint, true, true)), singleKeyMutationEvent(LiteralStringRef("SingleKeyMutation")), commitBatchesMemBytesCount(0), lastTxsPop(0) {} }; @@ -915,7 +916,7 @@ ACTOR Future commitBatch( } Void _ = wait(yield()); - if(!self->txsPopVersions.size() || msg.popTo > self->txsPopVersions.back().second) { + if( self->popRemoteTxs && msg.popTo > ( self->txsPopVersions.size() ? self->txsPopVersions.back().second : self->lastTxsPop ) ) { if(self->txsPopVersions.size() >= SERVER_KNOBS->MAX_TXS_POP_VERSION_HISTORY) { TraceEvent(SevWarnAlways, "DiscardingTxsPopHistory").suppressFor(1.0); self->txsPopVersions.pop_front(); @@ -1297,6 +1298,7 @@ ACTOR Future monitorRemoteCommitted(ProxyCommitData* self, ReferenceonChange()); continue; } + self->popRemoteTxs = true; state Future onChange = db->onChange(); loop { From d1e6c81d99b9c893b18d3303d841552827a2eefb Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Mon, 26 Nov 2018 15:31:40 -0800 Subject: [PATCH 23/30] The binding tester wouldn't align keys that had nan in them. Also, the ruby tester generated a few keys with byte strings instead of unicode. --- bindings/bindingtester/__init__.py | 23 +++++++++++++++++++++- bindings/bindingtester/bindingtester.py | 2 +- bindings/ruby/tests/directory_extension.rb | 8 ++++---- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/bindings/bindingtester/__init__.py b/bindings/bindingtester/__init__.py index 753f220272..c11e936975 100644 --- a/bindings/bindingtester/__init__.py +++ b/bindings/bindingtester/__init__.py @@ -18,6 +18,7 @@ # limitations under the License. 
# +import math import sys import os @@ -61,11 +62,31 @@ class Result: def key(self, specification): return self.key_tuple[specification.key_start_index:] + @staticmethod + def elements_equal(el1, el2): + if type(el1) != type(el2): + return False + + if isinstance(el1, tuple): + return Result.tuples_match(el1, el2) + + if isinstance(el1, float) and math.isnan(el1): + return math.isnan(el2) + + return el1 == el2 + + @staticmethod + def tuples_match(t1, t2): + if len(t1) != len(t2): + return False + + return all([Result.elements_equal(x,y) for x,y in zip(t1, t2)]) + def matches_key(self, rhs, specification): if not isinstance(rhs, Result): return False - return self.key(specification) == rhs.key(specification) + return Result.tuples_match(self.key(specification), rhs.key(specification)) def matches(self, rhs, specification): if not self.matches_key(rhs, specification): diff --git a/bindings/bindingtester/bindingtester.py b/bindings/bindingtester/bindingtester.py index 0b563ed0da..0f4eb45e69 100755 --- a/bindings/bindingtester/bindingtester.py +++ b/bindings/bindingtester/bindingtester.py @@ -98,7 +98,7 @@ class ResultSet(object): # If these results aren't using sequence numbers, then we match two results based on whether they share the same key else: min_key = min([r.key(self.specification) for r in results.values()]) - results = {i: r for i, r in results.items() if r.key(self.specification) == min_key} + results = {i: r for i, r in results.items() if Result.tuples_match(r.key(self.specification), min_key)} # Increment the indices for those testers which produced a result in this iteration for i in results.keys(): diff --git a/bindings/ruby/tests/directory_extension.rb b/bindings/ruby/tests/directory_extension.rb index e985ab7e89..1f6c3e7bbc 100644 --- a/bindings/ruby/tests/directory_extension.rb +++ b/bindings/ruby/tests/directory_extension.rb @@ -157,10 +157,10 @@ module DirectoryExtension exists = directory.exists?(inst.tr) children = exists ? directory.list(inst.tr) : [] log_subspace = FDB::Subspace.new([@dir_index], inst.wait_and_pop) - inst.tr[log_subspace['path']] = FDB::Tuple.pack(directory.path) - inst.tr[log_subspace['layer']] = FDB::Tuple.pack([directory.layer]) - inst.tr[log_subspace['exists']] = FDB::Tuple.pack([exists ? 1 : 0]) - inst.tr[log_subspace['children']] = FDB::Tuple.pack(children) + inst.tr[log_subspace['path'.encode('utf-8')]] = FDB::Tuple.pack(directory.path) + inst.tr[log_subspace['layer'.encode('utf-8')]] = FDB::Tuple.pack([directory.layer]) + inst.tr[log_subspace['exists'.encode('utf-8')]] = FDB::Tuple.pack([exists ? 1 : 0]) + inst.tr[log_subspace['children'.encode('utf-8')]] = FDB::Tuple.pack(children) elsif inst.op == 'DIRECTORY_STRIP_PREFIX' str = inst.wait_and_pop throw "String #{str} does not start with raw prefix #{directory.key}" if !str.start_with?(directory.key) From 32f434b2eeee3ffb1280bc740625756f2dfad099 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Mon, 26 Nov 2018 19:53:46 -0800 Subject: [PATCH 24/30] Bug fix, dns resolution would throw an error if any of the results were IPv6 addresses, which could happen depending on the host networking configuration. 
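The essence of the fix, as a minimal sketch assuming the 2018-era Boost.Asio resolver API (synchronous resolve() here for brevity, where the patch itself uses async_resolve): each endpoint is checked with is_v4() and IPv6 results are simply skipped, instead of formatting every endpoint into a string and handing it to NetworkAddress::parse(), which only understands IPv4.

    #include <boost/asio.hpp>
    #include <cstdio>

    using boost::asio::ip::tcp;

    int main() {
        boost::asio::io_service ios;
        tcp::resolver resolver(ios);
        boost::system::error_code ec;
        tcp::resolver::iterator iter = resolver.resolve(tcp::resolver::query("localhost", "80"), ec);
        tcp::resolver::iterator end;
        for (; !ec && iter != end; ++iter) {
            boost::asio::ip::address addr = iter->endpoint().address();
            if (addr.is_v4()) // IPv6 results are skipped rather than breaking the whole lookup
                printf("%s:%d\n", addr.to_string().c_str(), (int)iter->endpoint().port());
        }
        return 0;
    }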
--- flow/Net2.actor.cpp | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp index 1b8da44908..2bcaf35625 100644 --- a/flow/Net2.actor.cpp +++ b/flow/Net2.actor.cpp @@ -835,11 +835,12 @@ Future< Reference > Net2::connect( NetworkAddress toAddr, std::stri } ACTOR static Future> resolveTCPEndpoint_impl( Net2 *self, std::string host, std::string service) { - Promise> result; + Promise> promise; + state Future> result = promise.getFuture(); self->tcpResolver.async_resolve(tcp::resolver::query(host, service), [=](const boost::system::error_code &ec, tcp::resolver::iterator iter) { if(ec) { - result.sendError(lookup_failed()); + promise.sendError(lookup_failed()); return; } @@ -847,18 +848,26 @@ ACTOR static Future> resolveTCPEndpoint_impl( Net2 * tcp::resolver::iterator end; while(iter != end) { - // The easiest way to get an ip:port formatted endpoint with this interface is with a string stream because - // endpoint::to_string doesn't exist but operator<< does. - std::stringstream s; - s << iter->endpoint(); - addrs.push_back(NetworkAddress::parse(s.str())); + auto endpoint = iter->endpoint(); + // Currently only ipv4 is supported by NetworkAddress + auto addr = endpoint.address(); + if(addr.is_v4()) { + addrs.push_back(NetworkAddress(addr.to_v4().to_ulong(), endpoint.port())); + } ++iter; } - result.send(addrs); + + if(addrs.empty()) { + promise.sendError(lookup_failed()); + } + else { + promise.send(addrs); + } }); - std::vector addresses = wait(result.getFuture()); - return addresses; + Void _ = wait(ready(result)); + + return result.get(); } Future> Net2::resolveTCPEndpoint( std::string host, std::string service) { From b91b26ef75e4c48c00fea3354fd7e2276e0ac10f Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Mon, 26 Nov 2018 20:02:03 -0800 Subject: [PATCH 25/30] Attempt at workaround of a rare issue where long running backup processes reach a state where DNS resolution requests always time out but other processes on the same host can still resolve successfully. In case this was somehow caused by a bad boost tcp_resolver state, each request now uses a unique tcp_resolver instance. 
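The shape of the workaround, sketched as a hypothetical synchronous harness rather than the flow-based actor below: the resolver is a local constructed per request, so whatever internal state it accumulates is discarded with it instead of poisoning every later lookup through a shared member. resolveOnce() and the io_service-driving calls are illustrative; the cancel() before destruction matches the patch.

    #include <boost/asio.hpp>
    #include <cstdio>
    #include <string>

    using boost::asio::ip::tcp;

    void resolveOnce(boost::asio::io_service &ios, const std::string &host, const std::string &service) {
        tcp::resolver resolver(ios); // fresh per-call instance, not a long-lived member
        resolver.async_resolve(tcp::resolver::query(host, service),
            [](const boost::system::error_code &ec, tcp::resolver::iterator iter) {
                for (tcp::resolver::iterator end; !ec && iter != end; ++iter)
                    printf("%s\n", iter->endpoint().address().to_string().c_str());
            });
        ios.run();         // drive just this request to completion
        ios.reset();       // permit run() to be called again for the next request
        resolver.cancel(); // as in the patch: cancel before the resolver is destroyed
    }

    int main() {
        boost::asio::io_service ios;
        resolveOnce(ios, "localhost", "80");
        return 0;
    }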
--- flow/Net2.actor.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp index 2bcaf35625..0b108ccf38 100644 --- a/flow/Net2.actor.cpp +++ b/flow/Net2.actor.cpp @@ -158,7 +158,6 @@ public: ASIOReactor reactor; INetworkConnections *network; // initially this, but can be changed - tcp::resolver tcpResolver; int64_t tsc_begin, tsc_end; double taskBegin; @@ -478,7 +477,6 @@ Net2::Net2(NetworkAddress localAddress, bool useThreadPool, bool useMetrics) : useThreadPool(useThreadPool), network(this), reactor(this), - tcpResolver(reactor.ios), stopped(false), tasksIssued(0), // Until run() is called, yield() will always yield @@ -835,10 +833,11 @@ Future< Reference > Net2::connect( NetworkAddress toAddr, std::stri } ACTOR static Future> resolveTCPEndpoint_impl( Net2 *self, std::string host, std::string service) { + state tcp::resolver tcpResolver(self->reactor.ios); Promise> promise; state Future> result = promise.getFuture(); - self->tcpResolver.async_resolve(tcp::resolver::query(host, service), [=](const boost::system::error_code &ec, tcp::resolver::iterator iter) { + tcpResolver.async_resolve(tcp::resolver::query(host, service), [=](const boost::system::error_code &ec, tcp::resolver::iterator iter) { if(ec) { promise.sendError(lookup_failed()); return; @@ -866,6 +865,7 @@ ACTOR static Future> resolveTCPEndpoint_impl( Net2 * }); Void _ = wait(ready(result)); + tcpResolver.cancel(); return result.get(); } From b6d9763469d53e7489860c733a897f5cfbc05700 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 27 Nov 2018 13:10:14 -0800 Subject: [PATCH 26/30] updated release notes for 6.0.16 --- documentation/sphinx/source/release-notes.rst | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 4c1a72da2f..38a9ccd91d 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -5,6 +5,21 @@ Release Notes 6.0.16 ====== +Performance +----------- + +* Added a new backup folder scheme which results in far fewer kv range folders. `(PR #939) `_ + +Fixes +----- + +* Blobstore REST client attempted to create buckets that already existed. `(PR #923) `_ +* DNS would fail if IPv6 responses were received. `(PR #945) `_ +* Backup expiration would occasionally fail due to an incorrect assert. `(PR #926) `_ + +6.0.15 +====== + Features -------- @@ -68,7 +83,6 @@ Fixes * HTTP client used by backup-to-blobstore now correctly treats response header field names as case insensitive. [6.0.15] `(PR #904) `_ * Blobstore REST client was not following the S3 API in several ways (bucket name, date, and response formats). [6.0.15] `(PR #914) `_ * Data distribution could queue shard movements for restoring replication at a low priority. [6.0.15] `(PR #907) `_ -* Blobstore REST client will no longer attempt to create a bucket that already exists. 
[6.0.16] `(PR #923) `_ Fixes only impacting 6.0.0+ --------------------------- From e948aadba3001829446835686bbd8ec8fa835d27 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 27 Nov 2018 13:21:37 -0800 Subject: [PATCH 27/30] update installer WIX GUID following release --- packaging/msi/FDBInstaller.wxs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packaging/msi/FDBInstaller.wxs b/packaging/msi/FDBInstaller.wxs index 71db7fcb11..b95c57f8bf 100644 --- a/packaging/msi/FDBInstaller.wxs +++ b/packaging/msi/FDBInstaller.wxs @@ -32,7 +32,7 @@ Date: Tue, 27 Nov 2018 13:23:32 -0800 Subject: [PATCH 28/30] updated downloads for 6.0.16 --- documentation/sphinx/source/downloads.rst | 24 +++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/documentation/sphinx/source/downloads.rst b/documentation/sphinx/source/downloads.rst index 988b78e7b5..48dba89938 100644 --- a/documentation/sphinx/source/downloads.rst +++ b/documentation/sphinx/source/downloads.rst @@ -10,38 +10,38 @@ macOS The macOS installation package is supported on macOS 10.7+. It includes the client and (optionally) the server. -* `FoundationDB-6.0.15.pkg `_ +* `FoundationDB-6.0.16.pkg `_ Ubuntu ------ The Ubuntu packages are supported on 64-bit Ubuntu 12.04+, but beware of the Linux kernel bug in Ubuntu 12.x. -* `foundationdb-clients-6.0.15-1_amd64.deb `_ -* `foundationdb-server-6.0.15-1_amd64.deb `_ (depends on the clients package) +* `foundationdb-clients-6.0.16-1_amd64.deb `_ +* `foundationdb-server-6.0.16-1_amd64.deb `_ (depends on the clients package) RHEL/CentOS EL6 --------------- The RHEL/CentOS EL6 packages are supported on 64-bit RHEL/CentOS 6.x. -* `foundationdb-clients-6.0.15-1.el6.x86_64.rpm `_ -* `foundationdb-server-6.0.15-1.el6.x86_64.rpm `_ (depends on the clients package) +* `foundationdb-clients-6.0.16-1.el6.x86_64.rpm `_ +* `foundationdb-server-6.0.16-1.el6.x86_64.rpm `_ (depends on the clients package) RHEL/CentOS EL7 --------------- The RHEL/CentOS EL7 packages are supported on 64-bit RHEL/CentOS 7.x. -* `foundationdb-clients-6.0.15-1.el7.x86_64.rpm `_ -* `foundationdb-server-6.0.15-1.el7.x86_64.rpm `_ (depends on the clients package) +* `foundationdb-clients-6.0.16-1.el7.x86_64.rpm `_ +* `foundationdb-server-6.0.16-1.el7.x86_64.rpm `_ (depends on the clients package) Windows ------- The Windows installer is supported on 64-bit Windows XP and later. It includes the client and (optionally) the server. 
-* `foundationdb-6.0.15-x64.msi `_ +* `foundationdb-6.0.16-x64.msi `_ API Language Bindings ===================== @@ -58,18 +58,18 @@ On macOS and Windows, the FoundationDB Python API bindings are installed as part If you need to use the FoundationDB Python API from other Python installations or paths, download the Python package: -* `foundationdb-6.0.15.tar.gz `_ +* `foundationdb-6.0.16.tar.gz `_ Ruby 1.9.3/2.0.0+ ----------------- -* `fdb-6.0.15.gem `_ +* `fdb-6.0.16.gem `_ Java 8+ ------- -* `fdb-java-6.0.15.jar `_ -* `fdb-java-6.0.15-javadoc.jar `_ +* `fdb-java-6.0.16.jar `_ +* `fdb-java-6.0.16-javadoc.jar `_ Go 1.1+ ------- From 4a1c1ad860624bf774f25d121c6eb06b38c7fd74 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 27 Nov 2018 14:29:52 -0800 Subject: [PATCH 29/30] update versions target to 6.0.17 --- versions.target | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/versions.target b/versions.target index 2ac042b444..66ff809c50 100644 --- a/versions.target +++ b/versions.target @@ -1,7 +1,7 @@ - 6.0.16 + 6.0.17 6.0 From 84e07de16175db83b268bd92793db03555df089a Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 27 Nov 2018 14:29:52 -0800 Subject: [PATCH 30/30] update installer WIX GUID following release --- packaging/msi/FDBInstaller.wxs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packaging/msi/FDBInstaller.wxs b/packaging/msi/FDBInstaller.wxs index b95c57f8bf..7f3f5aa7bb 100644 --- a/packaging/msi/FDBInstaller.wxs +++ b/packaging/msi/FDBInstaller.wxs @@ -32,7 +32,7 @@