Merge branch 'master' into mengxu/fr-code-improvement-PR

This commit is contained in:
Meng Xu 2020-04-29 21:07:33 -07:00
commit a0d67cac16
110 changed files with 5302 additions and 4155 deletions

View File

@ -206,37 +206,3 @@ will automatically find it and build with TLS support.
If you installed WIX before running `cmake` you should find the
`FDBInstaller.msi` in your build directory under `packaging/msi`.
## Makefile (Deprecated - all users should transition to using cmake)
#### MacOS
1. Check out this repo on your Mac.
1. Install the Xcode command-line tools.
1. Download version 1.67.0 of [Boost](https://sourceforge.net/projects/boost/files/boost/1.67.0/).
1. Set the `BOOSTDIR` environment variable to the location containing this boost installation.
1. Install [Mono](http://www.mono-project.com/download/stable/).
1. Install a [JDK](http://www.oracle.com/technetwork/java/javase/downloads/index.html). FoundationDB currently builds with Java 8.
1. Navigate to the directory where you checked out the foundationdb repo.
1. Run `make`.
#### Linux
1. Install [Docker](https://www.docker.com/).
1. Check out the foundationdb repo.
1. Run the docker image interactively with [Docker Run](https://docs.docker.com/engine/reference/run/#general-form), and with the directory containing the foundationdb repo mounted via [Docker Mounts](https://docs.docker.com/storage/volumes/).
```shell
docker run -it -v '/local/dir/path/foundationdb:/docker/dir/path/foundationdb' foundationdb/foundationdb-build:latest
```
1. Run `$ scl enable devtoolset-8 python27 rh-python36 rh-ruby24 -- bash` within the running container. This enables a more modern compiler, which is required to build FoundationDB.
1. Navigate to the container's mounted directory which contains the foundationdb repo.
```shell
cd /docker/dir/path/foundationdb
```
1. Run `make`.
This will build the fdbserver binary and the python bindings. If you want to build our other bindings, you will need to install a runtime for the language whose binding you want to build. Each binding has an `.mk` file which provides specific targets for that binding.

View File

@ -99,6 +99,8 @@ function(build_go_package)
endif()
add_custom_command(OUTPUT ${outfile}
COMMAND ${CMAKE_COMMAND} -E env ${go_env}
${GO_EXECUTABLE} get -d ${GO_IMPORT_PATH}/${BGP_PATH} &&
${CMAKE_COMMAND} -E env ${go_env}
${GO_EXECUTABLE} install ${GO_IMPORT_PATH}/${BGP_PATH}
DEPENDS ${fdb_options_file}
COMMENT "Building ${BGP_NAME}")

View File

@ -27,7 +27,7 @@ package fdb
import "C"
import (
"sync"
"runtime"
)
// Database is a handle to a FoundationDB database. Database is a lightweight
@ -74,14 +74,13 @@ func (d Database) CreateTransaction() (Transaction, error) {
return Transaction{}, Error{int(err)}
}
t := &transaction{outt, d, sync.Once{}}
t := &transaction{outt, d}
runtime.SetFinalizer(t, (*transaction).destroy)
return Transaction{t}, nil
}
func retryable(t Transaction, wrapped func() (interface{}, error), onError func(Error) FutureNil) (ret interface{}, e error) {
defer t.Close()
func retryable(wrapped func() (interface{}, error), onError func(Error) FutureNil) (ret interface{}, e error) {
for {
ret, e = wrapped()
@ -141,7 +140,7 @@ func (d Database) Transact(f func(Transaction) (interface{}, error)) (interface{
return
}
return retryable(tr, wrapped, tr.OnError)
return retryable(wrapped, tr.OnError)
}
// ReadTransact runs a caller-provided function inside a retry loop, providing
@ -181,7 +180,7 @@ func (d Database) ReadTransact(f func(ReadTransaction) (interface{}, error)) (in
return
}
return retryable(tr, wrapped, tr.OnError)
return retryable(wrapped, tr.OnError)
}
// Options returns a DatabaseOptions instance suitable for setting options

View File

@ -417,7 +417,6 @@ func (dl directoryLayer) subdirNames(rtr fdb.ReadTransaction, node subspace.Subs
rr := rtr.GetRange(sd, fdb.RangeOptions{})
ri := rr.Iterator()
defer ri.Close()
var ret []string
@ -443,7 +442,6 @@ func (dl directoryLayer) subdirNodes(tr fdb.Transaction, node subspace.Subspace)
rr := tr.GetRange(sd, fdb.RangeOptions{})
ri := rr.Iterator()
defer ri.Close()
var ret []subspace.Subspace

View File

@ -246,7 +246,6 @@ func ExampleRangeIterator() {
rr := tr.GetRange(fdb.KeyRange{fdb.Key(""), fdb.Key{0xFF}}, fdb.RangeOptions{})
ri := rr.Iterator()
defer ri.Close()
// Advance will return true until the iterator is exhausted
for ri.Advance() {

View File

@ -39,6 +39,7 @@ package fdb
import "C"
import (
"runtime"
"sync"
"unsafe"
)
@ -74,7 +75,9 @@ type future struct {
}
func newFuture(ptr *C.FDBFuture) *future {
return &future{ptr}
f := &future{ptr}
runtime.SetFinalizer(f, func(f *future) { C.fdb_future_destroy(f.ptr) })
return f
}
// Note: This function guarantees the callback will be executed **at most once**.
@ -97,14 +100,17 @@ func fdb_future_block_until_ready(f *C.FDBFuture) {
}
func (f *future) BlockUntilReady() {
defer runtime.KeepAlive(f)
fdb_future_block_until_ready(f.ptr)
}
func (f *future) IsReady() bool {
defer runtime.KeepAlive(f)
return C.fdb_future_is_ready(f.ptr) != 0
}
func (f *future) Cancel() {
defer runtime.KeepAlive(f)
C.fdb_future_cancel(f.ptr)
}
@ -136,7 +142,7 @@ type futureByteSlice struct {
func (f *futureByteSlice) Get() ([]byte, error) {
f.o.Do(func() {
defer C.fdb_future_destroy(f.ptr)
defer runtime.KeepAlive(f.future)
var present C.fdb_bool_t
var value *C.uint8_t
@ -150,14 +156,10 @@ func (f *futureByteSlice) Get() ([]byte, error) {
}
if present != 0 {
// Copy the native `value` into a Go byte slice so the underlying
// native Future can be freed. This avoids the need for finalizers.
valueDestination := make([]byte, length)
valueSource := C.GoBytes(unsafe.Pointer(value), length)
copy(valueDestination, valueSource)
f.v = valueDestination
f.v = C.GoBytes(unsafe.Pointer(value), length)
}
C.fdb_future_release_memory(f.ptr)
})
return f.v, f.e
@ -197,7 +199,7 @@ type futureKey struct {
func (f *futureKey) Get() (Key, error) {
f.o.Do(func() {
defer C.fdb_future_destroy(f.ptr)
defer runtime.KeepAlive(f.future)
var value *C.uint8_t
var length C.int
@ -209,11 +211,8 @@ func (f *futureKey) Get() (Key, error) {
return
}
keySource := C.GoBytes(unsafe.Pointer(value), length)
keyDestination := make([]byte, length)
copy(keyDestination, keySource)
f.k = keyDestination
f.k = C.GoBytes(unsafe.Pointer(value), length)
C.fdb_future_release_memory(f.ptr)
})
return f.k, f.e
@ -246,21 +245,17 @@ type FutureNil interface {
type futureNil struct {
*future
o sync.Once
e error
}
func (f *futureNil) Get() error {
f.o.Do(func() {
defer C.fdb_future_destroy(f.ptr)
defer runtime.KeepAlive(f.future)
f.BlockUntilReady()
if err := C.fdb_future_get_error(f.ptr); err != 0 {
f.e = Error{int(err)}
return Error{int(err)}
}
})
return f.e
return nil
}
func (f *futureNil) MustGet() {
@ -273,6 +268,7 @@ type futureKeyValueArray struct {
*future
}
//go:nocheckptr
func stringRefToSlice(ptr unsafe.Pointer) []byte {
size := *((*C.int)(unsafe.Pointer(uintptr(ptr) + 8)))
@ -286,6 +282,8 @@ func stringRefToSlice(ptr unsafe.Pointer) []byte {
}
func (f *futureKeyValueArray) Get() ([]KeyValue, bool, error) {
defer runtime.KeepAlive(f.future)
f.BlockUntilReady()
var kvs *C.FDBKeyValue
@ -296,42 +294,13 @@ func (f *futureKeyValueArray) Get() ([]KeyValue, bool, error) {
return nil, false, Error{int(err)}
}
// To minimize the number of individual allocations, we first calculate the
// final size used by all keys and values returned from this iteration,
// then perform one larger allocation and slice within it.
poolSize := 0
for i := 0; i < int(count); i++ {
kvptr := unsafe.Pointer(uintptr(unsafe.Pointer(kvs)) + uintptr(i*24))
poolSize += len(stringRefToSlice(kvptr))
poolSize += len(stringRefToSlice(unsafe.Pointer(uintptr(kvptr) + 12)))
}
poolOffset := 0
pool := make([]byte, poolSize)
ret := make([]KeyValue, int(count))
for i := 0; i < int(count); i++ {
kvptr := unsafe.Pointer(uintptr(unsafe.Pointer(kvs)) + uintptr(i*24))
keySource := stringRefToSlice(kvptr)
valueSource := stringRefToSlice(unsafe.Pointer(uintptr(kvptr) + 12))
keyDestination := pool[poolOffset : poolOffset+len(keySource)]
poolOffset += len(keySource)
valueDestination := pool[poolOffset : poolOffset+len(valueSource)]
poolOffset += len(valueSource)
copy(keyDestination, keySource)
copy(valueDestination, valueSource)
ret[i] = KeyValue{
Key: keyDestination,
Value: valueDestination,
}
ret[i].Key = stringRefToSlice(kvptr)
ret[i].Value = stringRefToSlice(unsafe.Pointer(uintptr(kvptr) + 12))
}
return ret, (more != 0), nil
@ -356,28 +325,19 @@ type FutureInt64 interface {
type futureInt64 struct {
*future
o sync.Once
e error
v int64
}
func (f *futureInt64) Get() (int64, error) {
f.o.Do(func() {
defer C.fdb_future_destroy(f.ptr)
defer runtime.KeepAlive(f.future)
f.BlockUntilReady()
var ver C.int64_t
if err := C.fdb_future_get_int64(f.ptr, &ver); err != 0 {
f.v = 0
f.e = Error{int(err)}
return
return 0, Error{int(err)}
}
f.v = int64(ver)
})
return f.v, f.e
return int64(ver), nil
}
func (f *futureInt64) MustGet() int64 {
@ -408,14 +368,10 @@ type FutureStringSlice interface {
type futureStringSlice struct {
*future
o sync.Once
e error
v []string
}
func (f *futureStringSlice) Get() ([]string, error) {
f.o.Do(func() {
defer C.fdb_future_destroy(f.ptr)
defer runtime.KeepAlive(f.future)
f.BlockUntilReady()
@ -423,25 +379,16 @@ func (f *futureStringSlice) Get() ([]string, error) {
var count C.int
if err := C.fdb_future_get_string_array(f.ptr, (***C.char)(unsafe.Pointer(&strings)), &count); err != 0 {
f.e = Error{int(err)}
return
return nil, Error{int(err)}
}
ret := make([]string, int(count))
for i := 0; i < int(count); i++ {
source := C.GoString((*C.char)(*(**C.char)(unsafe.Pointer(uintptr(unsafe.Pointer(strings)) + uintptr(i*8)))))
destination := make([]byte, len(source))
copy(destination, source)
ret[i] = string(destination)
ret[i] = C.GoString((*C.char)(*(**C.char)(unsafe.Pointer(uintptr(unsafe.Pointer(strings)) + uintptr(i*8)))))
}
f.v = ret
})
return f.v, f.e
return ret, nil
}
func (f *futureStringSlice) MustGet() []string {

View File

@ -304,7 +304,7 @@ func (o DatabaseOptions) SetTransactionTimeout(param int64) error {
return o.setOpt(500, int64ToBytes(param))
}
// Set a timeout in milliseconds which, when elapsed, will cause a transaction automatically to be cancelled. This sets the ``retry_limit`` option of each transaction created by this database. See the transaction option description for more information.
// Set a maximum number of retries after which additional calls to ``onError`` will throw the most recently seen error code. This sets the ``retry_limit`` option of each transaction created by this database. See the transaction option description for more information.
//
// Parameter: number of times to retry
func (o DatabaseOptions) SetTransactionRetryLimit(param int64) error {
@ -330,7 +330,7 @@ func (o DatabaseOptions) SetTransactionCausalReadRisky() error {
return o.setOpt(504, nil)
}
// Addresses returned by get_addresses_for_key include the port when enabled. As of api version 700, this option is enabled by default and setting this has no effect.
// Addresses returned by get_addresses_for_key include the port when enabled. As of api version 630, this option is enabled by default and setting this has no effect.
func (o DatabaseOptions) SetTransactionIncludePortInAddress() error {
return o.setOpt(505, nil)
}
@ -350,7 +350,7 @@ func (o TransactionOptions) SetCausalReadDisable() error {
return o.setOpt(21, nil)
}
// Addresses returned by get_addresses_for_key include the port when enabled. As of api version 700, this option is enabled by default and setting this has no effect.
// Addresses returned by get_addresses_for_key include the port when enabled. As of api version 630, this option is enabled by default and setting this has no effect.
func (o TransactionOptions) SetIncludePortInAddress() error {
return o.setOpt(23, nil)
}
@ -429,7 +429,7 @@ func (o TransactionOptions) SetDebugTransactionIdentifier(param string) error {
return o.setOpt(403, []byte(param))
}
// Enables tracing for this transaction and logs results to the client trace logs. The DEBUG_TRANSACTION_IDENTIFIER option must be set before using this option, and client trace logging must be enabled and to get log output.
// Enables tracing for this transaction and logs results to the client trace logs. The DEBUG_TRANSACTION_IDENTIFIER option must be set before using this option, and client trace logging must be enabled to get log output.
func (o TransactionOptions) SetLogTransaction() error {
return o.setOpt(404, nil)
}
@ -479,7 +479,7 @@ func (o TransactionOptions) SetSnapshotRywDisable() error {
return o.setOpt(601, nil)
}
// The transaction can read and write to locked databases, and is resposible for checking that it took the lock.
// The transaction can read and write to locked databases, and is responsible for checking that it took the lock.
func (o TransactionOptions) SetLockAware() error {
return o.setOpt(700, nil)
}

View File

@ -28,7 +28,6 @@ import "C"
import (
"fmt"
"sync"
)
// KeyValue represents a single key-value pair in the database.
@ -141,7 +140,6 @@ func (rr RangeResult) GetSliceWithError() ([]KeyValue, error) {
var ret []KeyValue
ri := rr.Iterator()
defer ri.Close()
if rr.options.Limit != 0 {
ri.options.Mode = StreamingModeExact
@ -209,18 +207,6 @@ type RangeIterator struct {
index int
err error
snapshot bool
o sync.Once
}
// Close releases the underlying native resources for all the `KeyValue`s
// ever returned by this iterator. The `KeyValue`s themselves are copied
// before they're returned, so they are still safe to use after calling
// this function. This is instended to be called with `defer` inside
// your transaction function.
func (ri *RangeIterator) Close() {
ri.o.Do(func() {
C.fdb_future_destroy(ri.f.ptr)
})
}
// Advance attempts to advance the iterator to the next key-value pair. Advance

View File

@ -25,7 +25,6 @@ package fdb
// #define FDB_API_VERSION 630
// #include <foundationdb/fdb_c.h>
import "C"
import "sync"
// A ReadTransaction can asynchronously read from a FoundationDB
// database. Transaction and Snapshot both satisfy the ReadTransaction
@ -71,7 +70,6 @@ type Transaction struct {
type transaction struct {
ptr *C.FDBTransaction
db Database
o sync.Once
}
// TransactionOptions is a handle with which to set options that affect a
@ -87,18 +85,16 @@ func (opt TransactionOptions) setOpt(code int, param []byte) error {
}, param)
}
func (t *transaction) destroy() {
C.fdb_transaction_destroy(t.ptr)
}
// GetDatabase returns a handle to the database with which this transaction is
// interacting.
func (t Transaction) GetDatabase() Database {
return t.transaction.db
}
func (t Transaction) Close() {
t.o.Do(func() {
C.fdb_transaction_destroy(t.ptr)
})
}
// Transact executes the caller-provided function, passing it the Transaction
// receiver object.
//
@ -410,6 +406,9 @@ func (t *transaction) getApproximateSize() FutureInt64 {
}
}
// Returns a future that is the approximate transaction size so far in this
// transaction, which is the summation of the estimated size of mutations,
// read conflict ranges, and write conflict ranges.
func (t Transaction) GetApproximateSize() FutureInt64 {
return t.getApproximateSize()
}

bindings/python/LICENSE Normal file
View File

@ -0,0 +1,207 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-------------------------------------------------------------------------------
SOFTWARE DISTRIBUTED WITH FOUNDATIONDB:
The FoundationDB software includes a number of subcomponents with separate
copyright notices and license terms - please see the file ACKNOWLEDGEMENTS.
-------------------------------------------------------------------------------

View File

@ -20,7 +20,7 @@ cd ${tmpdir}
echo
cat <<EOF >> Dockerfile
FROM foundationdb/foundationdb-build:latest
FROM foundationdb/foundationdb-dev:0.11.1
RUN yum install -y sudo
RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
RUN groupadd -g 1100 sudo
@ -64,13 +64,19 @@ then
ccache_args=\$args
fi
if [ -t 1 ] ; then
TERMINAL_ARGS=-it `# Run in interactive mode and simulate a TTY`
else
TERMINAL_ARGS=-i `# Run in interactive mode`
fi
sudo docker run --rm `# delete (temporary) image after return` \\
-it `# Run in interactive mode and simulate a TTY` \\
\${TERMINAL_ARGS} \\
--privileged=true `# Run in privileged mode ` \\
--cap-add=SYS_PTRACE \\
--security-opt seccomp=unconfined \\
-v "${HOME}:${HOME}" `# Mount home directory` \\
-w="\$(pwd)" \\
\${ccache_args} \\
${image} "\$@"
EOF
@ -87,6 +93,7 @@ then
echo -e "\tThis can cause problems with some scripts (like fdb-clangd)"
fi
chmod +x $HOME/bin/fdb-dev
chmod +x $HOME/bin/clangd
echo "To start the dev docker image run $HOME/bin/fdb-dev"
echo "$HOME/bin/clangd can be used for IDE integration"
echo "You can edit these files but be aware that this script will overwrite your changes if you rerun it"

View File

@ -87,6 +87,9 @@ function(add_fdb_test)
if (NOT "${ADD_FDB_TEST_TEST_NAME}" STREQUAL "")
set(test_name ${ADD_FDB_TEST_TEST_NAME})
endif()
if((NOT test_name MATCHES "${TEST_INCLUDE}") OR (test_name MATCHES "${TEST_EXCLUDE}"))
return()
endif()
math(EXPR test_idx "${CURRENT_TEST_INDEX} + ${NUM_TEST_FILES}")
set(CURRENT_TEST_INDEX "${test_idx}" PARENT_SCOPE)
# set(<var> <value> PARENT_SCOPE) doesn't set the
@ -160,8 +163,6 @@ function(create_test_package)
string(SUBSTRING ${file} ${base_length} -1 rel_out_file)
set(out_file ${CMAKE_BINARY_DIR}/packages/tests/${rel_out_file})
list(APPEND out_files ${out_file})
get_filename_component(test_dir ${out_file} DIRECTORY)
file(MAKE_DIRECTORY packages/tests/${test_dir})
add_custom_command(
OUTPUT ${out_file}
DEPENDS ${file}

View File

@ -0,0 +1,53 @@
include(CheckCXXCompilerFlag)
function(env_set var_name default_value type docstring)
set(val ${default_value})
if(DEFINED ENV{${var_name}})
set(val $ENV{${var_name}})
endif()
set(${var_name} ${val} CACHE ${type} "${docstring}")
endfunction()
function(default_linker var_name)
if(APPLE)
set("${var_name}" "DEFAULT" PARENT_SCOPE)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
find_program(lld_path ld.lld "Path to LLD - is only used to determine default linker")
if(lld_path)
set("${var_name}" "LLD" PARENT_SCOPE)
else()
set("${var_name}" "DEFAULT" PARENT_SCOPE)
endif()
else()
set("${var_name}" "DEFAULT" PARENT_SCOPE)
endif()
endfunction()
function(use_libcxx out)
if(APPLE OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set("${out}" ON PARENT_SCOPE)
else()
set("${out}" OFF PARENT_SCOPE)
endif()
endfunction()
function(static_link_libcxx out)
if(APPLE)
set("${out}" OFF PARENT_SCOPE)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
default_linker(linker)
if(NOT linker STREQUAL "LLD")
set("${out}" OFF PARENT_SCOPE)
return()
endif()
find_library(libcxx_a libc++.a)
find_library(libcxx_abi libc++abi.a)
if(libcxx_a AND libcxx_abi)
set("${out}" ON PARENT_SCOPE)
else()
set("${out}" OFF PARENT_SCOPE)
endif()
else()
set("${out}" ON PARENT_SCOPE)
endif()
endfunction()

View File

@ -1,25 +1,23 @@
function(env_set var_name default_value type docstring)
set(val ${default_value})
if(DEFINED ENV{${var_name}})
set(val $ENV{${var_name}})
endif()
set(${var_name} ${val} CACHE ${type} "${docstring}")
endfunction()
include(CompilerChecks)
set(USE_GPERFTOOLS OFF CACHE BOOL "Use gperfools for profiling")
env_set(USE_GPERFTOOLS OFF BOOL "Use gperfools for profiling")
env_set(USE_VALGRIND OFF BOOL "Compile for valgrind usage")
set(USE_VALGRIND_FOR_CTEST ${USE_VALGRIND} CACHE BOOL "Use valgrind for ctest")
set(ALLOC_INSTRUMENTATION OFF CACHE BOOL "Instrument alloc")
set(WITH_UNDODB OFF CACHE BOOL "Use rr or undodb")
set(USE_ASAN OFF CACHE BOOL "Compile with address sanitizer")
set(USE_UBSAN OFF CACHE BOOL "Compile with undefined behavior sanitizer")
set(FDB_RELEASE OFF CACHE BOOL "This is a building of a final release")
env_set(USE_LD "DEFAULT" STRING "The linker to use for building: can be LD (system default, default choice), BFD, GOLD, or LLD")
env_set(USE_LIBCXX OFF BOOL "Use libc++")
env_set(USE_VALGRIND_FOR_CTEST ${USE_VALGRIND} BOOL "Use valgrind for ctest")
env_set(ALLOC_INSTRUMENTATION OFF BOOL "Instrument alloc")
env_set(WITH_UNDODB OFF BOOL "Use rr or undodb")
env_set(USE_ASAN OFF BOOL "Compile with address sanitizer")
env_set(USE_UBSAN OFF BOOL "Compile with undefined behavior sanitizer")
env_set(FDB_RELEASE OFF BOOL "This is a building of a final release")
env_set(USE_CCACHE OFF BOOL "Use ccache for compilation if available")
set(RELATIVE_DEBUG_PATHS OFF CACHE BOOL "Use relative file paths in debug info")
set(STATIC_LINK_LIBCXX ON CACHE BOOL "Statically link libstdcpp/libc++")
set(USE_WERROR OFF CACHE BOOL "Compile with -Werror. Recommended for local development and CI.")
env_set(RELATIVE_DEBUG_PATHS OFF BOOL "Use relative file paths in debug info")
env_set(USE_WERROR OFF BOOL "Compile with -Werror. Recommended for local development and CI.")
default_linker(_use_ld)
env_set(USE_LD "${_use_ld}" STRING
"The linker to use for building: can be LD (system default and same as DEFAULT), BFD, GOLD, or LLD - will be LLD for Clang if available, DEFAULT otherwise")
use_libcxx(_use_libcxx)
env_set(USE_LIBCXX "${_use_libcxx}" BOOL "Use libc++")
static_link_libcxx(_static_link_libcxx)
env_set(STATIC_LINK_LIBCXX "${_static_link_libcxx}" BOOL "Statically link libstdcpp/libc++")
if(USE_LIBCXX AND STATIC_LINK_LIBCXX AND NOT USE_LD STREQUAL "LLD")
message(FATAL_ERROR "Unsupported configuration: STATIC_LINK_LIBCXX with libc+++ only works if USE_LD=LLD")

View File

@ -185,12 +185,12 @@ function(add_flow_target)
if(WIN32)
add_custom_command(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${generated}"
COMMAND $<TARGET_FILE:actorcompiler> "${CMAKE_CURRENT_SOURCE_DIR}/${src}" "${CMAKE_CURRENT_BINARY_DIR}/${generated}" ${actor_compiler_flags}
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${src}"
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${src}" ${actor_exe}
COMMENT "Compile actor: ${src}")
else()
add_custom_command(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${generated}"
COMMAND ${MONO_EXECUTABLE} ${actor_exe} "${CMAKE_CURRENT_SOURCE_DIR}/${src}" "${CMAKE_CURRENT_BINARY_DIR}/${generated}" ${actor_compiler_flags} > /dev/null
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${src}"
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${src}" ${actor_exe}
COMMENT "Compile actor: ${src}")
endif()
else()

View File

@ -131,9 +131,9 @@ set(install_destination_for_log_el6 "var/log/foundationdb")
set(install_destination_for_log_el7 "var/log/foundationdb")
set(install_destination_for_log_pm "")
set(install_destination_for_data_tgz "lib/foundationdb")
set(install_destination_for_data_deb "var/lib/foundationdb")
set(install_destination_for_data_el6 "var/lib/foundationdb")
set(install_destination_for_data_el7 "var/lib/foundationdb")
set(install_destination_for_data_deb "var/lib/foundationdb/data")
set(install_destination_for_data_el6 "var/lib/foundationdb/data")
set(install_destination_for_data_el7 "var/lib/foundationdb/data")
set(install_destination_for_data_pm "")
set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated")

View File

@ -0,0 +1,336 @@
# The New FDB Backup System: Requirements & Design
Github tracking issue: https://github.com/apple/foundationdb/issues/1003
## Purpose and Audience
The purpose of this document is to capture the functional requirements and propose a high-level design for the implementation of the new backup system in FoundationDB. The intended audience for this document includes:
* **FDB users** - Users can understand what the changes in the new backup system are, especially how to start a backup using the new backup system. Restore for the new backup is handled by the [Performant Restore System](https://github.com/apple/foundationdb/issues/1049).
* **SREs and Support** - can understand the high-level architecture and know the requirements, including the metrics, tooling, and documentation, to ensure that the new FDB backup can be supported.
* **Developers** - can learn why this feature is needed, what it does, and how it is to be implemented. The hope is that this document becomes the starting point for any developer wishing to understand or be involved in the related aspects of FDB.
## Functional Requirements
As an essential component of a database system, backup and restore is a commonly used technique for disaster recovery, reliability, audit, and compliance purposes. The current FDB backup system consumes about half of the cluster's write bandwidth, causes write skew among storage servers, increases storage space usage, and triggers data balancing. The new backup system aims to double the cluster's write bandwidth for *HA clusters* (old DR clusters still need the old-style backup system).
## Background
The FDB backup system continuously scans the database's key-value space and saves key-value pairs and mutations at versions into range files and log files in blob storage. Specifically, mutation logs are generated at the Proxy and are written to transaction logs along with regular mutations. In production clusters like CK clusters, the backup system is always on, which means each mutation is written twice to transaction logs, consuming about half of the write bandwidth and about 40% of Proxy CPU time.
The design of the old backup system is [here](https://github.com/apple/foundationdb/blob/master/design/backup.md), and the data format of range files and mutation files is [here](https://github.com/apple/foundationdb/blob/master/design/backup-dataFormat.md). The technical overview of FDB is [here](https://github.com/apple/foundationdb/wiki/Technical-Overview-of-the-Database). FDB recovery is described in this [doc](https://github.com/apple/foundationdb/blob/master/design/recovery-internals.md).
## Terminology
* **Blob storage**: Blob storage is an object store for unstructured data. Backup files are encoded in binary format and saved in blob storage, e.g., Amazon S3.
* **Version**: FDB continuously generates increasing numbers as versions and uses versions to decide mutation ordering. Version numbers typically advance by one million per second. To restore a FDB cluster to a specified date and time, the restore system first converts the date and time to the corresponding version number and restores the cluster to that version (see the worked example after this list).
* **Epoch**: A generation of FDB's transaction system. After a component of the transaction system fails, FDB automatically initiates a recovery and restores the system in a new healthy generation, which is called an epoch.
* **Backup worker**: A new role added to the FDB cluster that is responsible for pulling mutations from transaction logs and saving them to blob storage.
* **Tag**: A tag is a short address for a mutation's destination, which includes a locality (`int8_t`, representing the data center ID; a negative number denotes a special system locality) and an ID (`int16_t`). The idea is that a tag is a small data structure that consumes fewer bytes than an IP address or a storage server's UID (16 bytes each), since tags are associated with each mutation and are stored both in memory and on disk.
* **Tag partitioned log system**: FDB's write-ahead log is a tag partitioned log system, where each mutation is assigned a number of tags.
* **Log router tag**: A special system tag, e.g., `-2:0`, where the locality `-2` means log router tag and `0` is the ID. When attached to a mutation, this tag originally means the mutation should be sent to a remote log router. In the new backup system, we reuse this tag for backup workers to receive all mutations in a number of partitioned streams.
* **Restorable version**: The version that a backup can be restored to. A version `v` is a restorable version if the entire key-space and all mutations in the version range `[v1, v)` are recorded in backup files.
* **Node**: A node is a machine or a process in a cluster.
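To make the version arithmetic concrete (illustrative numbers only, derived from the typical one-million-versions-per-second rate above): restoring to a point in time 30 minutes after a known reference version `v0` means targeting roughly `v0 + 30 * 60 * 1,000,000 = v0 + 1,800,000,000`.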
## Detailed Feature Requirements
Feature priorities: Features 1, 2, 3, 4, and 5 are must-have; Feature 6 is nice to have.
1. **Write bandwidth reduction by half**: removes the requirement to generate backup mutations at the Proxy, thus reducing TLog write bandwidth usage by half and significantly improving Proxy CPU usage;
2. **Correctness**: The restored database must be consistent: each *restored* state (i.e., key-value pair) at a version `v` must match the original state at version `v`.
3. **Performance**: The backup system should be performant, mostly measured as a small CPU overhead on transaction logs and backup workers. The version lag on backup workers is an indicator of performance.
4. **Fault-tolerant**: The backup system should be fault-tolerant to node failures in the FDB cluster.
5. **Restore ready**: Backups produced by the new system should be restorable by the Performant Restore System. As a fallback for the new performant restore system, we can convert new backup logs into the format of old backup logs, thus enabling restore of the new backup with the existing old restore system.
6. **Backward compatibility**: The new backup system should allow both old-style backup and DR (FDB 6.2 and below) to be performed, as well as support the new backup in FDB 6.3 and above.
## Security and Privacy Requirements
**Security**: The backup system's components are assumed to be trusted components, because they run on the nodes of a FDB cluster. Transmission from the cluster to the blob store is through SSL connections. Blob credentials are passed in via the “fdbserver” command line.
**Privacy**: Backup data is stored in the blob store with appropriate access control. A data retention policy can be set with the “fdbbackup” tool to delete older backup data.
## Operational and Maintainability Requirements
This section discusses changes that may need to be identified or accounted for on the back-end in order to support the feature from a monitoring or management perspective.
### Tooling / Front-End
A workflow is needed for DBAs to start, pause, resume, and abort the new type of backups. The difference from the old type of backups should be only a flag change when starting the backup. The FDB cluster then generates backups as specified by the flag.
A command line tool, `fdbconvert`, has been written to convert new backup logs into the format of old backup logs. Thus, if the new restore system has issues, we can still restore the new backup with the existing old restore system.
**Deployment instructions for tooling development**
* A new stateless role, “`Backup Worker`” (or “`BW`” for short), is introduced in a FDB cluster. The number of BW processes is based on the number of log routers (usually they are the same). If there are no log routers, the number of transaction logs is used. Note that occasionally the cluster may recruit more backup workers for version ranges in the old epoch. Since these version ranges are small, the resource requirements for these short-lived backup workers are very small.
* As in the old backup system, backup agents need to be started for saving snapshot files to blob storage. In contrast, backup workers in the new backup system, running in the primary DC, are responsible for saving mutation logs to blob storage.
* Backup workers' memory should be large enough to hold tens of seconds' worth of mutation data from TLogs. The memory requirement can be calculated as `WriteThroughput * BufferPeriod / partitions + SafetyMargin`, where `WriteThroughput` is the aggregated TLog write bandwidth and `partitions` is the number of log router tags (see the worked example after this list).
* A new process class, “backup”, is defined for backup workers.
* How to start a backup of the new type, e.g.:
```
fdbbackup start -C fdb.cluster -p -d blob_url
```
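For illustration only (these figures are assumptions, not measurements): with an aggregated TLog write throughput of 200 MB/s, a 10-second buffer period, and 8 log router tags, each backup worker would need roughly `200 MB/s * 10 s / 8 = 250 MB` of buffer memory, plus the safety margin.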
### KPIs and Health
The solution must provide at least the following KPIs:
* How fast (MB/s) the transaction logs commit writes (already exists);
* How much backup data has been processed;
* An estimation of backup delay;
### Customer Care
The feature does not require any specific customer care awareness or interaction.
### Roll-out
The feature must follow the usual roll-out process. It needs to coexist with the existing backup system, and we will periodically restore clusters to test its correctness. Only after we gain enough confidence will we deprecate the existing backup system.
Note the new backup system is designed for HA clusters. Existing DR clusters still use the old backup system. Thus, the roll-out of the new backup system is only for HA clusters.
### Quota
This feature requires a blob storage for saving all log files. The blob storage must have enough:
* disk capacity for all backup data;
* write bandwidth for uploading backup data;
* file count for backup data: the new backup system stores partitioned mutation logs, so a several-fold increase of the file count is expected.
## Success Criteria
* Write bandwidth reduction meets the expectation: TLog write bandwidth is reduced by half;
* New backup workflow is available to SREs;
* Continuous backup and restore should be performed to validate the restore.
# Design
**One sentence summary**: the new backup system introduces a new role, backup worker, to pull mutations from transaction logs and save them, thus removing the burden of saving mutation logs into the database.
The old backup system writes the mutation log to the database itself, thus doubling the write bandwidth usage. Backup agents later fetch mutation logs from the database, upload them to blob storage, and then remove the mutation logs from the database.
This project saves the mutation log to blob storage directly from the FDB cluster, which should almost double the database's write bandwidth when backup is enabled. In FDB, every mutation already has exactly one log router tag, so the idea of the new system is to back up data for each log router tag individually (i.e., saving mutation logs into multiple partitioned logs). At restore time, these partitioned mutation logs are combined to form a continuous mutation log stream.
## Design choices
**Design question 1**: Should backup workers be recruited as part of the log system or not?
There are two design alternatives:
1. Backup worker is external to the log system. In other words, backup workers survive master recoveries. Thus, backup workers are recruited and monitored by the cluster controller.
   1. The advantage is that a failure of a backup worker does not cause a master recovery.
   2. The disadvantage is that backup workers need to monitor master recoveries, especially configuration changes. Because the number of log routers can change after a recovery, we might need to recruit more backup workers after an increase and pause or shut down backup workers after a decrease, which complicates the recruitment logic; or we might need to change the mapping of tags to backup workers, which is also complex. A further complication is that backup workers need to constantly monitor master recoveries and be very careful about the version boundary between two consecutive epochs, because the number of tags may change.
2. Backup worker is recruited during master recovery as part of the log system. The master recruits a fixed number of backup workers, i.e., the same number as log routers.
   1. The advantage is that recruitment and the mapping from backup workers to LogRouter tags are simple, i.e., one tag per worker.
   2. The disadvantage is that backup workers are tied to master recovery -- a failure of a backup worker results in a master recovery, and a master recovery stops old backup workers and starts new ones.
**Decision**: We choose the second approach for the simplicity of the recruiting process and of the mapping from LogRouter tags to backup workers.
**Design question 2**: Place of backup workers on the primary or remote Data Center (DC)?
Placing backup workers on the primary side has the advantage of supporting any deployment configurations (single DC, multi DC).
Placing backup workers on the remote side is desirable to reduce the workload on the primary DC's transaction logs. Since log routers on the remote side are already pulling mutations from the primary DC, backup workers can simply pull from these log routers.
**Decision**: We choose to recruit backup workers on the primary DC, because not all clusters are configured with multiple DCs and the backup system needs to support all types of deployment.
## Design Assumptions
The design proposed below is based upon the following assumptions:
* The blob system has enough write bandwidth and storage space for backup workers to save log files.
* The FDB cluster has enough stateless processes to run as backup workers, and these processes have enough memory capacity to buffer tens of seconds of commit data.
## Design Challenges
The requirement of the new backup system raises several design challenges:
1. Correctness of the new backup files. Backup files must be complete and accurate to capture all data, otherwise we end up with corrupted data in the backup. The challenge here is to make sure no mutation is missing, even when the FDB cluster experiences failures and has to perform recovery.
2. Testing of the new backup system. How can we test the new backup system when there is no restore system available? We need to verify backup files are correct without performing a full restore.
## System components
**Backup Worker**: This is a new role introduced in the new backup system. A backup worker is a `fdbserver` process running inside a FDB cluster, responsible for pulling mutations from transaction logs and saving the mutations to blob storage.
**Master**: The master is responsible for coordinating the transition of the FDB transaction sub-system from one generation to the next. In particular, the master recruits backup workers during the recovery.
**Transaction Logs (TLogs)**: The transaction logs make mutations durable to disk for fast commit latencies. The logs receive commits from the proxy in version order, and only respond to the proxy once the data has been written and fsync'ed to an append only mutation log on disk. Storage servers retrieve mutations from TLogs. Once the storage servers have persisted mutations, storage servers then pop the mutations from the TLogs.
**Proxy**: The proxies are responsible for providing read versions, committing transactions, and tracking the storage servers responsible for each range of keys. In the old backup system, Proxies are responsible for grouping mutations into backup mutations and writing them to the database.
## System overview
From an end-to-end perspective, the new backup system works in the following steps:
1. Operators issue a new backup request via `fdbbackup` command line tool;
2. FDB cluster receives the request and registers the request in the database (internal `TaskBucket` and system keys);
3. Backup workers monitor changes to system keys, register the request in their own internal queues, and start logging mutations for the requested key range; at the same time, backup agents (scheduled by `TaskBucket`) start taking snapshots of key ranges in the database;
4. Periodically, backup workers upload mutations to the requested blob storage, and save the progress into the database;
5. The backup is restorable when backup workers have saved versions that are larger than the complete snapshot's end version; the backup is stopped if a stop-on-restorable flag is set in the request.
The new backup has five major components: 1) backup workers; 2) recruitment of backup workers; 3) extension of the tag partitioned log system to support pseudo tags; 4) integration with the existing `TaskBucket`-based backup command interface; and 5) integration with the Performant Restore System.
### Backup workers
The backup worker is a new role introduced in the new backup system. A backup worker is responsible for pulling mutations from transaction logs and saving them to blob storage. Internally, a backup worker maintains a message buffer, which keeps mutations pulled from transaction logs that have not yet been saved to blob storage. Periodically, the backup worker parses the mutations in the message buffer, extracts the mutations that are within user-specified key ranges, and then uploads the mutation data to blob storage. After the data is saved, the backup worker removes these messages from its internal buffer and saves its progress in the database, so that after a failure, a new backup worker can start from the previously saved version (see the sketch below).
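The per-worker cycle described above can be sketched as follows. This is a minimal C++ illustration with hypothetical names (`inBackupRanges`, `uploadToBlob`, `persistProgress` are stand-ins, not the actual API); the real implementation is a Flow actor inside `fdbserver`.
```
// Illustrative sketch only; all helper names are hypothetical stubs.
#include <cstdint>
#include <string>
#include <vector>

using Version = int64_t;
struct Mutation { Version version; std::string key, value; };

bool inBackupRanges(const std::string& key) { return true; }            // stub: accept all keys
void uploadToBlob(const std::vector<Mutation>& batch, Version upTo) {}  // stub
void persistProgress(Version upTo) {}  // stub: transactionally saved in the database

struct BackupWorkerSketch {
    std::vector<Mutation> buffer;  // pulled from TLogs, not yet uploaded
    Version savedVersion = 0;      // a successor resumes from here after a failure

    void saveCycle() {
        if (buffer.empty()) return;
        // 1. Keep only mutations within the user-specified backup key ranges.
        std::vector<Mutation> batch;
        for (const Mutation& m : buffer)
            if (inBackupRanges(m.key)) batch.push_back(m);
        // 2. Upload, persist progress, then drop the buffered messages.
        Version upTo = buffer.back().version;
        uploadToBlob(batch, upTo);
        persistProgress(upTo);
        buffer.clear();
        savedVersion = upTo;
    }
};
```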
A backup worker has two modes of operation: *no-op* mode and *working* mode. When there is no active backup in the cluster, a backup worker operates in no-op mode, in which it simply obtains the recently committed version from Proxies and then pops mutations from transaction logs. After operators submit a new backup request to the cluster, backup workers transition into working mode, pulling mutations from transaction logs and saving the mutation data to blob storage.
In working mode, the popping of backup workers needs to follow a strictly increasing version order. For the same tag, there can be multiple backup workers, each responsible for a different epoch. These backup workers must coordinate their popping order; otherwise the backup can miss some mutation data. This coordination among backup workers is achieved by deferring the popping of a later epoch and only allowing the oldest epoch to pop first. After the oldest epoch has finished, its backup workers notify the master, which then advances the oldest backup epoch so that the next epoch can proceed with popping.
A subtle issue for a displaced backup worker (i.e., one displaced because a new epoch begins) is that its last pop can cause missing version ranges in mutation logs. This is because the transaction for saving the progress may be delayed during recovery. As a result, the master may have already recruited a new backup worker for the old epoch, starting at the previously saved progress version. If the saving transaction then succeeds and the old worker pops mutations that the new backup worker is supposed to save, data goes missing from the new backup worker's log. The solution to this problem can be: 1) the old backup worker aborts immediately after learning it is displaced, thus not trying to save its progress; or 2) the old backup worker skips its last pop, since the next epoch will pop versions larger than its progress. Because the second approach avoids duplicating work in the new epoch, we choose the second approach (see the sketch below).
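A sketch of the chosen fix, with hypothetical names (`popTLogs` and `onProgressSaved` are illustrative stand-ins for the real machinery):
```
// Illustrative sketch of approach 2: a displaced worker skips its final pop.
#include <cstdint>
using Version = int64_t;

void popTLogs(Version upTo) {}  // hypothetical stand-in for the real pop RPC

// Called once this worker's progress-saving transaction has resolved.
void onProgressSaved(bool displaced, Version progress) {
    if (displaced) {
        // A successor for this epoch may already be recruited starting at an
        // older saved version; popping now could delete mutations that the
        // successor still needs. The next epoch will pop past `progress`.
        return;
    }
    popTLogs(progress);  // safe: no other worker depends on these versions
}
```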
Finally, multiple concurrent backups are supported. Each backup worker keeps track of current backup jobs and saves mutations to corresponding backup containers for the same batch of mutations.
### Recruitment of Backup workers
Backup workers are recruited during master recovery as part of the log system. The master recruits a fixed number of backup workers, one for each log router tag. During the recruiting process, the master sends each backup worker an initialization request of the following form:
```
struct InitializeBackupRequest {
UID reqId;
LogEpoch epoch; // epoch this worker is recruited
LogEpoch backupEpoch; // epoch that this worker actually works on
Tag routerTag;
Version startVersion;
Optional<Version> endVersion; // Only present for unfinished old epoch
ReplyPromise<struct BackupInterface> reply;
… // additional methods elided
};
```
Note we need two epochs here: one for the epoch in which the worker is recruited and one for the epoch it backs up. The recruited epoch is the epoch of the log system, which a backup worker uses to detect whether the log system has moved on to a newer epoch. If so, the worker should save its progress and immediately exit. The `backupEpoch` is used for saving progress. The `backupEpoch` is usually the same as the epoch in which the worker is recruited. However, it can be an earlier epoch than the recruiting epoch, signifying that the worker is responsible for data in that earlier epoch. In this case, when the worker is done and exits, the master should not treat its departure as a trigger for recovery. This is solved by the following protocol:
1. The backup worker finishes its work, including saving progress to the key-value store and uploading to cloud storage, and then sends a `BackupWorkerDoneRequest` to the master;
2. The master receives the request, removes the worker from its log system, and updates the oldest backing-up epoch, `oldestBackupEpoch`;
3. The master sends a reply message to the backup worker and registers the new log system with the cluster controller;
4. The backup worker exits after receiving the reply. Other backup workers in the system get the new log system from the cluster controller. If a backup worker's `backupEpoch` is equal to `oldestBackupEpoch`, then the worker may start popping from TLogs.
Note `oldestBackupEpoch` is introduced to prevent a backup worker for a newer epoch from popping when there are backup workers for older epochs. Otherwise, these older backup workers may lose data.
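The gating implied by steps 2 and 4 can be sketched as follows (illustrative names only; the real bookkeeping lives in the master and the log system):
```
// Illustrative sketch: only workers whose backupEpoch equals the oldest
// backing-up epoch are allowed to pop from TLogs.
#include <cstdint>
using LogEpoch = int64_t;

bool mayPop(LogEpoch backupEpoch, LogEpoch oldestBackupEpoch) {
    return backupEpoch == oldestBackupEpoch;
}

struct MasterSketch {
    LogEpoch oldestBackupEpoch = 0;
    // Invoked when a worker sends BackupWorkerDoneRequest (step 1 above).
    void onBackupWorkerDone(LogEpoch workerEpoch, LogEpoch nextOldestEpoch) {
        if (workerEpoch == oldestBackupEpoch)
            oldestBackupEpoch = nextOldestEpoch;  // step 2: advance the oldest epoch
    }
};
```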
### Extension of tag partitioned log system to support pseudo tags
The tag partitioned log system is modeled like a FIFO queue, where Proxies push mutations into the queue and Storage Servers or Log Routers pop mutations from it. Specifically, consumers of the tag partitioned log system use two operations, `peek` and `pop`, to read mutations for a given tag and to pop mutations from the queue. Because Proxies assign each mutation a unique log router tag, the backup system reuses this tag to obtain the whole mutation stream. As a result, each log router tag now has two consumers: a log router and a backup worker.
To support multiple consumers of the log router tag, `peek` and `pop` have been extended to support pseudo tags. In other words, each log router tag can be mapped to multiple pseudo tags. Log routers and backup workers still `peek` mutations with the log router tag, but `pop` with different pseudo tags. Only after both pseudo tags are popped can TLogs pop the mutations from their internal queue (see the sketch below).
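A sketch of this double-consumer bookkeeping, under the assumption (from the text above) that a TLog may only discard a router tag's data once every pseudo tag has popped it; all names are illustrative:
```
// Illustrative sketch: a TLog frees a router tag's data only up to the
// minimum version popped across that tag's pseudo tags.
#include <algorithm>
#include <cstdint>
#include <limits>
#include <map>

using Version = int64_t;

struct RouterTagState {
    std::map<int, Version> pseudoPopped;  // pseudo tag id -> popped version

    void pop(int pseudoTag, Version v) {
        Version& p = pseudoPopped[pseudoTag];
        p = std::max(p, v);
    }

    Version effectivePop() const {
        if (pseudoPopped.empty()) return 0;  // nothing popped yet
        Version v = std::numeric_limits<Version>::max();
        for (const auto& kv : pseudoPopped)
            v = std::min(v, kv.second);
        return v;  // the TLog can discard mutations strictly below this version
    }
};
```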
Note the introduction of pseudo tags opens the possibility for more usage scenarios. For instance, a change stream can be implemented with a pseudo tag, where the new consumer can look at each mutation and emit mutations on specified key ranges.
### Integration with existing taskbucket based backup command interface
We strive to keep the operational interface the same as in the old backup system. That is, the new backup is initiated by the client as before, with an additional flag. The FDB cluster receives the backup request, sees the flag being set, and uses the new system for generating mutation logs.
By default, backup workers are not enabled in the system. When operators submit a new backup request for the first time, the database performs a configuration change (`backup_worker_enabled:=1`) that enables backup workers.
The operator's backup request can indicate whether an old backup or a new backup is used. This is a command line option (i.e., `-p` or `--partitioned_log`) in the `fdbbackup` command. A backup request of the new type is started in the following steps:
1. Operators use `fdbbackup` tool to write the backup range to a system key, i.e., `\xff\x02/backupStarted`.
2. All backup workers monitor the key `\xff\x02/backupStarted`, see the change, and start logging mutations.
3. After all backup workers have started, the `fdbbackup` tool initiates the backup of all or specified key ranges by issuing a transaction `Ts`.
Compared to the old backup system, steps 1 and 2 above are new and are only triggered if the client requests the new type of backup. The purpose is to allow backup workers to function as no-ops when there are no ongoing backups. However, the backup workers should still continuously pop their corresponding tags; otherwise, mutations will accumulate in the TLogs. To know the version to pop, backup workers can obtain the read version from any proxy. Because the read version must be a committed version, popping to this version is safe.
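A sketch of one iteration of this worker loop, with hypothetical helper functions standing in for the real key watch and proxy/TLog RPCs:

```cpp
#include <cstdint>
#include <cstdio>

using Version = int64_t;

// Hypothetical helpers; the real worker watches \xff\x02/backupStarted and
// talks to proxies and TLogs over flow RPC.
bool backupStartedKeySet() { return false; }           // current value of the key
Version readVersionFromAnyProxy() { return 42000000; } // any committed version
void saveMutationsUpTo(Version v) { std::printf("upload logs up to %lld\n", (long long)v); }
void popLogRouterTag(Version v) { std::printf("pop tag to %lld\n", (long long)v); }

int main() {
    // One iteration of the worker's main loop, sketched.
    if (backupStartedKeySet()) {
        Version v = readVersionFromAnyProxy();
        saveMutationsUpTo(v); // normal path: upload mutation logs, then pop
        popLogRouterTag(v);
    } else {
        // No-op path: nothing to upload, but still pop so TLogs can discard
        // data. A read version is committed, so popping to it is safe.
        popLogRouterTag(readVersionFromAnyProxy());
    }
    return 0;
}
```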
**Backup Submission Protocol**
Protocol for `submitBackup()` to ensure that all backup workers of the current epoch have started logging mutations:
1. After the `submitBackup()` call, the task bucket (i.e., `StartFullBackupTaskFunc`) starts by creating a `BackupConfig` object in the system key space.
2. Each backup worker monitors the `\xff\x02/backupStarted` key and notices the new backup job. Then the backup worker inserts the new job into its internal queue and writes to the `startedBackupWorkers` key in the `BackupConfig` object if the worker's `backupEpoch` is the current epoch. Among these workers, the worker with Log Router Tag `-2:0` monitors the `startedBackupWorkers` key and sets the `allWorkerStarted` key after all workers have updated the `startedBackupWorkers` key.
3. The task bucket watches changes to the `allWorkerStarted` key and declares the job submission successful.
This protocol replaces an earlier, abandoned one, in which the `startedBackupWorkers` key was set after all backup workers had saved logs with versions larger than the version of the `submitBackup()` call. That protocol fails if there is already a backup job and some backup worker does not notice the change to the `\xff\x02/backupStarted` key. As a result, the worker saves versions larger than the new job's start version, but into the old backup container; thus the new container misses some mutations.
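As an illustration, the leader's check in step 2 can be sketched as a simple set comparison; the worker-ID sets are hypothetical stand-ins for the registrations stored under `startedBackupWorkers`:

```cpp
#include <cstdint>
#include <set>

// Sketch of the leader's check (the worker with tag -2:0): allWorkerStarted
// is set only once every current-epoch worker has registered itself under
// startedBackupWorkers in the BackupConfig.
bool allWorkersStarted(const std::set<uint64_t>& currentEpochWorkers,
                       const std::set<uint64_t>& startedBackupWorkers) {
    for (uint64_t w : currentEpochWorkers)
        if (!startedBackupWorkers.count(w)) return false;
    return true;
}

int main() {
    std::set<uint64_t> recruited = { 1, 2, 3 };
    std::set<uint64_t> started = { 1, 3 };
    bool notYet = !allWorkersStarted(recruited, started); // worker 2 missing
    started.insert(2);
    return (notYet && allWorkersStarted(recruited, started)) ? 0 : 1;
}
```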
**Protocol for Determining Whether a Backup Is Restorable**
1. Each backup worker independently logs mutations to a backup container and updates its progress in the system key space.
2. The worker with Log Router Tag `-2:0` of the current epoch monitors all workers' progress. If the oldest backup epoch is the current epoch (i.e., there are no backup workers for any old epochs, and thus no version ranges are missing before this epoch), this worker updates the `latestBackupWorkerSavedVersion` key in the `BackupConfig` object with the minimum saved version among workers (see the sketch after this list).
3. The client calls `describeBackup()`, which eventually calls `getLatestRestorableVersion` to read the value of the `latestBackupWorkerSavedVersion` key. If this version is larger than the first snapshot's end version, then the backup is restorable.
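A minimal sketch of step 2's rule, under the assumption that each worker's progress is visible as a `(backupEpoch, savedVersion)` pair; the types are hypothetical:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

using Version = int64_t;
using Epoch = int64_t;

// Hypothetical per-worker progress records. latestBackupWorkerSavedVersion
// only advances when no older epoch is still being backed up, and then it
// takes the minimum saved version across all workers.
struct WorkerProgress { Epoch backupEpoch; Version savedVersion; };

Version restorableVersion(const std::vector<WorkerProgress>& workers, Epoch currentEpoch) {
    Version minSaved = INT64_MAX;
    for (const auto& w : workers) {
        if (w.backupEpoch < currentEpoch) return -1; // older epoch pending: do not advance
        minSaved = std::min(minSaved, w.savedVersion);
    }
    return minSaved;
}

int main() {
    std::vector<WorkerProgress> ws = { { 7, 900 }, { 7, 850 }, { 7, 1000 } };
    return restorableVersion(ws, 7) == 850 ? 0 : 1; // minimum saved version wins
}
```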
**Pause and Resume Backups**
The command line for pausing or resuming backups remains the same, but the implementation for the new backup system is different from the old one. This is because in the old backup system, both mutation logs and range files are handled by `TaskBucket`, an asynchronous task scheduling framework that stores its state in the FDB database. Thus, the old backup system simply pauses or resumes the `TaskBucket`. In the new backup system, mutation logs are generated by backup workers, so the pause or resume command needs to tell all backup workers to pause or resume pulling mutations from TLogs. Specifically,
1. The operator issues a pause or resume request that updates both the `TaskBucket` and the `\xff\x02/backupPaused` key.
2. Each backup worker monitors the `\xff\x02/backupPaused` key and notices the change. Then the backup worker pauses or resumes pulling from TLogs (sketched below).
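A toy sketch of the worker-side behavior, with the watch on `\xff\x02/backupPaused` reduced to a hypothetical callback that mirrors the key's value into a local flag:

```cpp
#include <atomic>
#include <cstdio>

// Toy model: the key watch is reduced to a callback that flips a local flag
// checked by the pull loop.
std::atomic<bool> paused{ false };

void onBackupPausedKeyChanged(bool keyValue) { paused = keyValue; }

void pullLoopIteration() {
    if (paused) return; // paused: stop pulling, keep all saved state
    std::printf("pull mutations from TLogs\n");
}

int main() {
    pullLoopIteration();            // pulls
    onBackupPausedKeyChanged(true); // operator paused the backup
    pullLoopIteration();            // no-op while paused
    return 0;
}
```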
**Backup Container Changes**
* Partitioned mutation logs are stored in the `plogs/XXXX/XXXX` directory, and their names are in the format `log,[startVersion],[endVersion],[UID],[N-of-M],[blockSize]`, where `M` is the total number of partitions and `N` ranges from `0` to `M - 1` (a parsing sketch follows this list). In contrast, old mutation logs are stored in the `logs/XXXX/XXXX` directory and are named differently.
* To restore a version range, all partitioned logs for the range need to be available. The restore process should read all partitioned logs and combine mutations from different logs into one mutation stream, ordered by the `(commit_version, subsequence)` pair. It is guaranteed that all mutations form a total order. Note that in the old backup files there is no subsequence number, as each version's mutations are serialized in order in one file.
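A sketch of parsing this file name format; the struct is hypothetical, and a real implementation would live in the `BackupContainer` code:

```cpp
#include <cstdint>
#include <cstdio>
#include <string>

// Hypothetical parser for the name format
// log,[startVersion],[endVersion],[UID],[N-of-M],[blockSize].
struct PartitionedLogFile {
    int64_t startVersion = 0, endVersion = 0; // endVersion is exclusive
    std::string uid;
    int partition = 0, totalPartitions = 0;   // N in 0..M-1, and M
    int blockSize = 0;
};

bool parseLogFileName(const std::string& name, PartitionedLogFile& out) {
    char uid[64] = {};
    long long b = 0, e = 0;
    int n = 0, m = 0, bs = 0;
    if (std::sscanf(name.c_str(), "log,%lld,%lld,%63[^,],%d-of-%d,%d",
                    &b, &e, uid, &n, &m, &bs) != 6)
        return false;
    out.startVersion = b;
    out.endVersion = e;
    out.uid = uid;
    out.partition = n;
    out.totalPartitions = m;
    out.blockSize = bs;
    return true;
}

int main() {
    PartitionedLogFile f;
    bool ok = parseLogFileName(
        "log,332850851,332938927,7be23c0a3e80df8ab1530fa76fa66980,1-of-4,1048576", f);
    return (ok && f.partition == 1 && f.totalPartitions == 4) ? 0 : 1;
}
```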
### Integration with the [Performant Restore System](https://github.com/apple/foundationdb/issues/1049)
As discussed above, the new backup system splits mutation logs into multiple partitions. Thus, the restore process must verify that the backup files are continuous for all partitions within the restore's version range. This is possible because each log file name carries its partition number and the total number of partitions.
Once the restore system verifies that the version range is continuous, it needs to filter out duplicated version ranges among different log files (both the log continuity analysis and the dedup logic are implemented in the `BackupContainer` abstraction). A given version range may be stored in **multiple** mutation log files. This can happen because a recruited backup worker can upload mutation files successfully, but fail to save its progress before another recovery happens. As a result, the new epoch tries to back up this version range again, producing the same version ranges (though the file names are different).
Finally, the restore system loads the same version's mutations from all partitions, and then merges these mutations in the order of their subsequence numbers before they are applied on the restore cluster. Note that mutations in the old backup system lack subsequence numbers, so restoring old backups needs to assign subsequence numbers to mutations.
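A minimal sketch of the merge step, assuming each partition's mutations are already decoded into `(version, subsequence, payload)` records; since the design guarantees a total order, a simple sort by `(commit_version, subsequence)` suffices:

```cpp
#include <algorithm>
#include <cstdint>
#include <string>
#include <tuple>
#include <vector>

using Version = int64_t;

// Mutations from all partitions of the same version range are merged into
// one stream ordered by (commit version, subsequence), the order Proxies
// assigned at commit time. The payload is opaque for this sketch.
struct VersionedMutation {
    Version version;
    int32_t subsequence;
    std::string mutation;
    bool operator<(const VersionedMutation& r) const {
        return std::tie(version, subsequence) < std::tie(r.version, r.subsequence);
    }
};

std::vector<VersionedMutation> mergePartitions(std::vector<std::vector<VersionedMutation>> parts) {
    std::vector<VersionedMutation> all;
    for (auto& p : parts) all.insert(all.end(), p.begin(), p.end());
    std::sort(all.begin(), all.end()); // the design guarantees a total order
    return all;
}

int main() {
    auto merged = mergePartitions({ { { 100, 1, "a=1" } },
                                    { { 100, 0, "b=2" }, { 101, 0, "a=2" } } });
    return (merged[0].subsequence == 0 && merged[2].version == 101) ? 0 : 1;
}
```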
## Ordered and Complete Guarantee of Mutation Logs
The backup system must generate log files from which the restore system can apply all the mutations of the backed-up cluster in the same order, exactly once.
**Ordering guarantee**. To maintain the ordering of mutations, each mutation is stored with its commit version and a subsequence number, both assigned by Proxies during commit. The restore system can load all mutations and derive a total order among them.
**Completeness guarantee**. All mutations must be saved in log files; we cannot allow any mutations to be missing from the backup. This is guaranteed by the fault tolerance discussed below. Essentially, all backup workers checkpoint their progress in the database. After a recovery, the new master reads previous checkpoints and recruits new backup workers for any missing version ranges.
## Backup File Format
The old backup file format is documented [here](https://github.com/apple/foundationdb/blob/release-6.2/design/backup-dataFormat.md). We can't use this file format, because our backup files are created per log router tag. When there is more than one log router (almost always the case), the mutations in one transaction can be given different log router tags. As a result, for the same version, mutations are distributed across many files. Another subtle issue is that there can be two mutations (e.g., `a = 1` and `a = 2` in a transaction) that are given two different tags. We have to preserve the order of these two mutations in the restore process. Even though the order is saved in the subsequence number of a version, we still need to merge mutations from multiple files and apply them in the correct order.
In the new backup system, a mutation log file is named `log,[startVersion],[endVersion],[UID],[N-of-M],[blockSize]`, where `startVersion` is inclusive and `endVersion` is *not* inclusive, e.g., `log,332850851,332938927,7be23c0a3e80df8ab1530fa76fa66980,1-of-4,1048576`. With the information from all file names, the restore process can find all files for a version range, i.e., versions that intersect with the range, across all log router tags. `M` is the total number of tags, and `N` ranges from `0` to `M - 1`. Note that a tag ID is not required in the old backup file names, since all mutations for a version are included in one file.
Each file's content is a list of fixed-size blocks. Each block contains a sequence of mutations, where each mutation consists of a serialized `Version`, an `int32_t` subsequence number, and an `int32_t` mutation length (all three numbers in big endian), followed by the `Mutation` itself. The `Mutation` is of the format `type|kLen|vLen|Key|Value`, where `type` is the mutation type (e.g., `Set` or `Clear`), `kLen` and `vLen` are the lengths of the key and value, and `Key` and `Value` are the serialized key and value of the mutation. The padding at the end of the block consists of `0xFF` bytes.
```
<BlockHeader>
<Version_1><Subseq_1><Mutation1_len><Mutation1>
<Version_2><Subseq_2><Mutation2_len><Mutation2>
…
<Padding>
```
Note that big endianness for the version is required, because `0xFF` is used as padding to indicate the block end: a little-endian number could easily be mistaken for the end, whereas a big-endian version almost guarantees that its first byte is not `0xFF` (it should always be `0x00`).
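A standalone decoder sketch for this block format; the block header is assumed to have been consumed already, the record layout follows the description above, and the helper names are hypothetical:

```cpp
#include <cstdint>
#include <vector>

// Each record is a big-endian 64-bit version, a big-endian 32-bit
// subsequence, a big-endian 32-bit length, then `len` bytes of serialized
// mutation; 0xFF bytes pad the tail. Because a big-endian version starts
// with 0x00, a leading 0xFF unambiguously marks the padding.
struct Record {
    uint64_t version;
    uint32_t subseq;
    std::vector<uint8_t> mutation;
};

static uint64_t be64(const uint8_t* p) {
    uint64_t v = 0;
    for (int i = 0; i < 8; i++) v = (v << 8) | p[i];
    return v;
}
static uint32_t be32(const uint8_t* p) {
    uint32_t v = 0;
    for (int i = 0; i < 4; i++) v = (v << 8) | p[i];
    return v;
}

std::vector<Record> decodeBlock(const std::vector<uint8_t>& block) {
    std::vector<Record> out;
    size_t i = 0;
    while (i + 16 <= block.size() && block[i] != 0xFF) { // 0xFF => padding
        Record r;
        r.version = be64(&block[i]);
        r.subseq = be32(&block[i + 8]);
        uint32_t len = be32(&block[i + 12]);
        i += 16;
        if (i + len > block.size()) break; // corrupt block in a real decoder
        r.mutation.assign(block.begin() + i, block.begin() + i + len);
        i += len;
        out.push_back(std::move(r));
    }
    return out;
}

int main() {
    // One record (version 5, subsequence 0, 2-byte mutation), then padding.
    std::vector<uint8_t> block = { 0, 0, 0, 0, 0, 0, 0, 5,  0, 0, 0, 0,
                                   0, 0, 0, 2,  'k', 'v',  0xFF, 0xFF };
    auto recs = decodeBlock(block);
    return (recs.size() == 1 && recs[0].version == 5) ? 0 : 1;
}
```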
## Performance Optimization
### Future Optimizations
Add a metadata file describing the backup files:
* The number of mutations;
* The number of atomic operations;
* The key range and version range of mutations in each backup file.
This information can be used to optimize the restore process. For instance, the number of mutations can be used to make better load-balancing decisions; if there are no atomic operations, the restore can apply mutations in a backward fashion, skipping mutations with earlier versions.
## Fault Tolerance
The failure of a backup worker triggers a master recovery. After the recovery, the new master recruits a new set of backup workers. Among them, a new backup worker shall continue the work of the failed backup worker from the previous epoch.
The interesting part is the handling of old epochs, since the backup workers for an old epoch are in the “displaced” state and should exit. The basic idea is that we need a set of backup workers for the data left over from the old epochs. To figure out what data has not been backed up yet, the master first loads the saved backup progress data `<Worker_UID, LogEpoch, SavedVersion, Tag, TotalTags>` from the database, and then computes, for each epoch, which version ranges have not been backed up. For each such version range and tag, the master recruits a worker to resume the backup for that version range and tag. Note that this worker has a different worker UID from the worker in the original epoch. As a result, for a given epoch and tag, there might be multiple progress entries, since these workers were recruited at different epochs.
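A minimal sketch of this recovery-time computation, assuming the per-epoch version boundaries are known from the log system; the types are hypothetical simplifications of the saved progress entries:

```cpp
#include <algorithm>
#include <cstdint>
#include <map>
#include <utility>
#include <vector>

using Version = int64_t;
using Epoch = int64_t;

// Hypothetical simplifications of the saved progress entries and of the
// per-epoch version boundaries known from the log system.
struct Progress { Epoch epoch; int tag; Version savedVersion; };
struct EpochRange { Version begin, end; int totalTags; };

// For each (epoch, tag), compute the first version still missing; the master
// recruits one new worker per entry whose version is below the epoch's end.
std::map<std::pair<Epoch, int>, Version> resumeVersions(
        const std::map<Epoch, EpochRange>& epochs, const std::vector<Progress>& progress) {
    std::map<std::pair<Epoch, int>, Version> resume;
    for (const auto& [epoch, r] : epochs)
        for (int tag = 0; tag < r.totalTags; tag++)
            resume[{ epoch, tag }] = r.begin; // no progress yet: start at epoch begin
    for (const auto& p : progress) {
        auto it = resume.find({ p.epoch, p.tag });
        if (it != resume.end()) it->second = std::max(it->second, p.savedVersion + 1);
    }
    return resume;
}

int main() {
    std::map<Epoch, EpochRange> epochs = { { 5, { 100, 200, 2 } } };
    auto resume = resumeVersions(epochs, { { 5, 0, 149 } });
    // Tag 0 resumes at version 150; tag 1 has no progress and resumes at 100.
    return (resume[{ 5, 0 }] == 150 && resume[{ 5, 1 }] == 100) ? 0 : 1;
}
```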
## KPIs and Metrics
The backup system emits the following metrics:
* How much backup data has been processed: the backup command line tool `fdbbackup` can show the status of a backup, including the size of mutation logs (`LogBytes written`) and snapshots (`RangeBytes written`). By taking two consecutive backup statuses, the backup speed can be estimated as `(2nd_LogBytes - 1st_LogBytes) / interval`.
* An estimation of backup delay: each backup worker emits `BackupWorkerMetrics` trace events every 5 seconds, which include `SavedVersion`, `MinKnownCommittedVersion`, and `MsgQ`. The backup delay can be estimated as `(MinKnownCommittedVersion - SavedVersion) / 1,000,000` seconds, i.e., the difference between a worker's saved version and the current committed version, divided by 1,000,000 versions per second (see the arithmetic sketch after this list). `MsgQ` is the size of the backup worker's in-memory message queue.
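These two estimates, worked through on made-up sample numbers:

```cpp
#include <cstdint>
#include <cstdio>

// Worked example with made-up sample numbers, per the formulas above.
int main() {
    int64_t logBytes1 = 10000000, logBytes2 = 70000000; // two status samples
    double intervalSec = 30.0;
    double backupSpeed = (logBytes2 - logBytes1) / intervalSec; // bytes/second

    int64_t minKnownCommittedVersion = 500000000, savedVersion = 488000000;
    double delaySec = (minKnownCommittedVersion - savedVersion) / 1e6;

    std::printf("speed=%.0f B/s, delay=%.1f s\n", backupSpeed, delaySec);
    return delaySec == 12.0 ? 0 : 1; // 12 million versions behind = ~12 seconds
}
```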
## Controlling Properties
System operators can control the following backup properties:
* **Backup key ranges**: The non-overlapping key ranges that will be backed up to blob storage.
* **Blob URL**: The root path in blob storage that hosts all backup files.
* **Performance knobs**: The knobs that control performance, e.g.:
  * The backup interval (knob `BACKUP_UPLOAD_DELAY`) for saving mutation logs to blob storage.
## Testing
The feature will be tested both in simulation and in real clusters:
* New test cases have been added to the test folder in FDB. The nightly correctness (i.e., simulation) tests will test the correctness of both backup and restore.
* Tests will be added to continuously back up a cluster with the new backup system and restore the backup, to ensure restore works on real clusters. During periods of active backup, the cluster should have better write performance than with the old backup system.
* Tests should also be conducted with production data. This ensures backup data is restorable and catches potential bugs in backup and restore. Such tests are preferably conducted regularly, e.g., weekly per cluster.
Before the restore system is available, the testing strategy for backup files is to keep the old backup system running, so that both new and old backup files are generated. Then both types of log files are decoded and compared against each other. The new backup files are considered correct if their content matches the content of the old log files.

View File

@ -176,6 +176,9 @@
.. |transaction-get-committed-version-blurb| replace::
Gets the version number at which a successful commit modified the database. This must be called only after the successful (non-error) completion of a call to |commit-func| on this Transaction, or the behavior is undefined. Read-only transactions do not modify the database when committed and will have a committed version of -1. Keep in mind that a transaction which reads keys and then sets them to their current values may be optimized to a read-only transaction.
.. |transaction-get-approximate-size-blurb| replace::
Gets the approximate transaction size so far, which is the summation of the estimated size of mutations, read conflict ranges, and write conflict ranges.
.. |transaction-get-versionstamp-blurb| replace::
Returns a future which will contain the versionstamp which was used by any versionstamp operations in this transaction. This function must be called before a call to |commit-func| on this Transaction. The future will be ready only after the successful completion of a call to |commit-func| on this Transaction. Read-only transactions do not modify the database when committed and will result in the future completing with an error. Keep in mind that a transaction which reads keys and then sets them to their current values may be optimized to a read-only transaction.

View File

@ -805,6 +805,13 @@ Transaction misc functions
.. _api-python-transaction-options:
Transaction misc functions
--------------------------
.. method:: Transaction.get_approximate_size()
|transaction-get-approximate-size-blurb|. Returns a :class:`FutureInt64`.
Transaction options
-------------------

View File

@ -736,7 +736,7 @@ Most applications should use the read version that FoundationDB determines autom
|infrequent| |transaction-get-committed-version-blurb|
.. method:: Transaction.get_verionstamp() -> String
.. method:: Transaction.get_versionstamp() -> String
|infrequent| |transaction-get-versionstamp-blurb|
@ -747,6 +747,10 @@ Transaction misc functions
Get the estimated byte size of the given key range. Returns a :class:`Int64Future`.
.. method:: Transaction.get_approximate_size() -> Int64Future
|transaction-get-approximate-size-blurb|. Returns a :class:`Int64Future`.
Transaction options
-------------------

View File

@ -167,6 +167,11 @@ getversion
The ``getversion`` command fetches the current read version of the cluster or currently running transaction.
advanceversion
--------------
Forces the cluster to recover at the specified version. If the specified version is larger than the current version of the cluster, the cluster version is advanced to the specified version via a forced recovery.
help
----

View File

@ -10,38 +10,38 @@ macOS
The macOS installation package is supported on macOS 10.7+. It includes the client and (optionally) the server.
* `FoundationDB-6.2.19.pkg <https://www.foundationdb.org/downloads/6.2.19/macOS/installers/FoundationDB-6.2.19.pkg>`_
* `FoundationDB-6.2.20.pkg <https://www.foundationdb.org/downloads/6.2.20/macOS/installers/FoundationDB-6.2.20.pkg>`_
Ubuntu
------
The Ubuntu packages are supported on 64-bit Ubuntu 12.04+, but beware of the Linux kernel bug in Ubuntu 12.x.
* `foundationdb-clients-6.2.19-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.19/ubuntu/installers/foundationdb-clients_6.2.19-1_amd64.deb>`_
* `foundationdb-server-6.2.19-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.19/ubuntu/installers/foundationdb-server_6.2.19-1_amd64.deb>`_ (depends on the clients package)
* `foundationdb-clients-6.2.20-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.20/ubuntu/installers/foundationdb-clients_6.2.20-1_amd64.deb>`_
* `foundationdb-server-6.2.20-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.20/ubuntu/installers/foundationdb-server_6.2.20-1_amd64.deb>`_ (depends on the clients package)
RHEL/CentOS EL6
---------------
The RHEL/CentOS EL6 packages are supported on 64-bit RHEL/CentOS 6.x.
* `foundationdb-clients-6.2.19-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.19/rhel6/installers/foundationdb-clients-6.2.19-1.el6.x86_64.rpm>`_
* `foundationdb-server-6.2.19-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.19/rhel6/installers/foundationdb-server-6.2.19-1.el6.x86_64.rpm>`_ (depends on the clients package)
* `foundationdb-clients-6.2.20-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.20/rhel6/installers/foundationdb-clients-6.2.20-1.el6.x86_64.rpm>`_
* `foundationdb-server-6.2.20-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.20/rhel6/installers/foundationdb-server-6.2.20-1.el6.x86_64.rpm>`_ (depends on the clients package)
RHEL/CentOS EL7
---------------
The RHEL/CentOS EL7 packages are supported on 64-bit RHEL/CentOS 7.x.
* `foundationdb-clients-6.2.19-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.19/rhel7/installers/foundationdb-clients-6.2.19-1.el7.x86_64.rpm>`_
* `foundationdb-server-6.2.19-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.19/rhel7/installers/foundationdb-server-6.2.19-1.el7.x86_64.rpm>`_ (depends on the clients package)
* `foundationdb-clients-6.2.20-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.20/rhel7/installers/foundationdb-clients-6.2.20-1.el7.x86_64.rpm>`_
* `foundationdb-server-6.2.20-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.20/rhel7/installers/foundationdb-server-6.2.20-1.el7.x86_64.rpm>`_ (depends on the clients package)
Windows
-------
The Windows installer is supported on 64-bit Windows XP and later. It includes the client and (optionally) the server.
* `foundationdb-6.2.19-x64.msi <https://www.foundationdb.org/downloads/6.2.19/windows/installers/foundationdb-6.2.19-x64.msi>`_
* `foundationdb-6.2.20-x64.msi <https://www.foundationdb.org/downloads/6.2.20/windows/installers/foundationdb-6.2.20-x64.msi>`_
API Language Bindings
=====================
@ -58,18 +58,18 @@ On macOS and Windows, the FoundationDB Python API bindings are installed as part
If you need to use the FoundationDB Python API from other Python installations or paths, use the Python package manager ``pip`` (``pip install foundationdb``) or download the Python package:
* `foundationdb-6.2.19.tar.gz <https://www.foundationdb.org/downloads/6.2.19/bindings/python/foundationdb-6.2.19.tar.gz>`_
* `foundationdb-6.2.20.tar.gz <https://www.foundationdb.org/downloads/6.2.20/bindings/python/foundationdb-6.2.20.tar.gz>`_
Ruby 1.9.3/2.0.0+
-----------------
* `fdb-6.2.19.gem <https://www.foundationdb.org/downloads/6.2.19/bindings/ruby/fdb-6.2.19.gem>`_
* `fdb-6.2.20.gem <https://www.foundationdb.org/downloads/6.2.20/bindings/ruby/fdb-6.2.20.gem>`_
Java 8+
-------
* `fdb-java-6.2.19.jar <https://www.foundationdb.org/downloads/6.2.19/bindings/java/fdb-java-6.2.19.jar>`_
* `fdb-java-6.2.19-javadoc.jar <https://www.foundationdb.org/downloads/6.2.19/bindings/java/fdb-java-6.2.19-javadoc.jar>`_
* `fdb-java-6.2.20.jar <https://www.foundationdb.org/downloads/6.2.20/bindings/java/fdb-java-6.2.20.jar>`_
* `fdb-java-6.2.20-javadoc.jar <https://www.foundationdb.org/downloads/6.2.20/bindings/java/fdb-java-6.2.20-javadoc.jar>`_
Go 1.11+
--------

View File

@ -2,7 +2,7 @@
Release Notes
#############
7.0.0
6.3.0
=====
Features
@ -28,9 +28,6 @@ Bindings
* Java: Introduced ``keyAfter`` utility function that can be used to create the immediate next key for a given byte array. `(PR #2458) <https://github.com/apple/foundationdb/pull/2458>`_
* C: The ``FDBKeyValue`` struct's ``key`` and ``value`` members have changed type from ``void*`` to ``uint8_t*``. `(PR #2622) <https://github.com/apple/foundationdb/pull/2622>`_
* Deprecated ``enable_slow_task_profiling`` transaction option and replaced it with ``enable_run_loop_profiling``. `(PR #2608) <https://github.com/apple/foundationdb/pull/2608>`_
* Go: Added a ``Close`` function to ``RangeIterator`` which **must** be called to free resources returned from ``Transaction.GetRange``. `(PR #1910) <https://github.com/apple/foundationdb/pull/1910>`_.
* Go: Finalizers are no longer used to clean up native resources. ``Future`` results are now copied from the native heap to the Go heap, and native resources are freed immediately. `(PR #1910) <https://github.com/apple/foundationdb/pull/1910>`_.
Other Changes
-------------

View File

@ -45,11 +45,11 @@ if(NOT OPEN_FOR_IDE)
symlink_files(
LOCATION packages/bin
SOURCE fdbbackup
TARGETS fdbdr dr_agent backup_agent fdbrestore)
TARGETS fdbdr dr_agent backup_agent fdbrestore fastrestore_agent)
symlink_files(
LOCATION bin
SOURCE fdbbackup
TARGETS fdbdr dr_agent backup_agent fdbrestore)
TARGETS fdbdr dr_agent backup_agent fdbrestore fastrestore_agent)
endif()
if (GPERFTOOLS_FOUND)

View File

@ -373,17 +373,6 @@ struct LogFileWriter {
return wr.toValue();
}
// Return a block of contiguous padding bytes, growing if needed.
static Value makePadding(int size) {
static Value pad;
if (pad.size() < size) {
pad = makeString(size);
memset(mutateString(pad), '\xff', pad.size());
}
return pad.substr(0, size);
}
// Start a new block if needed, then write the key and value
ACTOR static Future<Void> writeKV_impl(LogFileWriter* self, Key k, Value v) {
// If key and value do not fit in this block, end it and start a new one
@ -392,7 +381,7 @@ struct LogFileWriter {
// Write padding if needed
int bytesLeft = self->blockEnd - self->file->size();
if (bytesLeft > 0) {
state Value paddingFFs = makePadding(bytesLeft);
state Value paddingFFs = fileBackup::makePadding(bytesLeft);
wait(self->file->append(paddingFFs.begin(), bytesLeft));
}

View File

@ -2192,8 +2192,7 @@ ACTOR Future<Void> runRestore(Database db, std::string originalClusterFile, std:
// Fast restore agent that kicks off the restore: send restore requests to restore workers.
ACTOR Future<Void> runFastRestoreAgent(Database db, std::string tagName, std::string container,
Standalone<VectorRef<KeyRangeRef>> ranges, Version dbVersion,
bool performRestore, bool verbose, bool waitForDone, std::string addPrefix,
std::string removePrefix) {
bool performRestore, bool verbose, bool waitForDone) {
try {
state FileBackupAgent backupAgent;
state Version restoreVersion = invalidVersion;
@ -2219,9 +2218,26 @@ ACTOR Future<Void> runFastRestoreAgent(Database db, std::string tagName, std::st
dbVersion = desc.maxRestorableVersion.get();
TraceEvent("FastRestoreAgent").detail("TargetRestoreVersion", dbVersion);
}
Version _restoreVersion = wait(fastRestore(db, KeyRef(tagName), KeyRef(container), waitForDone, dbVersion,
verbose, range, KeyRef(addPrefix), KeyRef(removePrefix)));
restoreVersion = _restoreVersion;
state UID randomUID = deterministicRandom()->randomUniqueID();
TraceEvent("FastRestoreAgent")
.detail("SubmitRestoreRequests", ranges.size())
.detail("RestoreUID", randomUID);
wait(backupAgent.submitParallelRestore(db, KeyRef(tagName), ranges, KeyRef(container), dbVersion, true,
randomUID));
if (waitForDone) {
// Wait for parallel restore to finish and unlock DB after that
TraceEvent("FastRestoreAgent").detail("BackupAndParallelRestore", "WaitForRestoreToFinish");
wait(backupAgent.parallelRestoreFinish(db, randomUID));
TraceEvent("FastRestoreAgent").detail("BackupAndParallelRestore", "RestoreFinished");
} else {
TraceEvent("FastRestoreAgent")
.detail("RestoreUID", randomUID)
.detail("OperationGuide", "Manually unlock DB when restore finishes");
printf("WARNING: DB will be in locked state after restore. Need UID:%s to unlock DB\n",
randomUID.toString().c_str());
}
restoreVersion = dbVersion;
} else {
state Reference<IBackupContainer> bc = IBackupContainer::openContainer(container);
state BackupDescription description = wait(bc->describeBackup());
@ -3740,7 +3756,7 @@ int main(int argc, char* argv[]) {
switch (restoreType) {
case RESTORE_START:
f = stopAfter(runFastRestoreAgent(db, tagName, restoreContainer, backupKeys, restoreVersion, !dryRun,
!quietDisplay, waitForDone, addPrefix, removePrefix));
!quietDisplay, waitForDone));
break;
case RESTORE_WAIT:
printf("[TODO][ERROR] FastRestore does not support RESTORE_WAIT yet!\n");
@ -3887,102 +3903,3 @@ int main(int argc, char* argv[]) {
flushAndExit(status);
}
//------Restore Agent: Kick off the restore by sending the restore requests
ACTOR static Future<FileBackupAgent::ERestoreState> waitFastRestore(Database cx, Key tagName, bool verbose) {
// We should wait on all restore to finish before proceeds
TraceEvent("FastRestore").detail("Progress", "WaitForRestoreToFinish");
state ReadYourWritesTransaction tr(cx);
state Future<Void> fRestoreRequestDone;
state bool restoreRequestDone = false;
loop {
try {
tr.reset();
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
// In case restoreRequestDoneKey is already set before we set watch on it
Optional<Value> restoreRequestDoneKeyValue = wait(tr.get(restoreRequestDoneKey));
if (restoreRequestDoneKeyValue.present()) {
restoreRequestDone = true;
tr.clear(restoreRequestDoneKey);
wait(tr.commit());
break;
} else if (!restoreRequestDone) {
fRestoreRequestDone = tr.watch(restoreRequestDoneKey);
wait(tr.commit());
wait(fRestoreRequestDone);
} else {
break;
}
} catch (Error& e) {
wait(tr.onError(e));
}
}
TraceEvent("FastRestore").detail("Progress", "RestoreFinished");
return FileBackupAgent::ERestoreState::COMPLETED;
}
ACTOR static Future<Version> _fastRestore(Database cx, Key tagName, Key url, bool waitForComplete,
Version targetVersion, bool verbose, KeyRange range, Key addPrefix,
Key removePrefix) {
state Reference<IBackupContainer> bc = IBackupContainer::openContainer(url.toString());
state BackupDescription desc = wait(bc->describeBackup());
wait(desc.resolveVersionTimes(cx));
if (targetVersion == invalidVersion && desc.maxRestorableVersion.present())
targetVersion = desc.maxRestorableVersion.get();
Optional<RestorableFileSet> restoreSet = wait(bc->getRestoreSet(targetVersion));
TraceEvent("FastRestore").detail("BackupDesc", desc.toString()).detail("TargetVersion", targetVersion);
if (!restoreSet.present()) {
TraceEvent(SevWarn, "FileBackupAgentRestoreNotPossible")
.detail("BackupContainer", bc->getURL())
.detail("TargetVersion", targetVersion);
throw restore_invalid_version();
}
// NOTE: The restore agent makes sure we only support 1 restore range for each restore request for now!
// The simulation test did test restoring multiple restore ranges in one restore request though.
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
state int restoreIndex = 0;
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
Standalone<StringRef> restoreTag(tagName.toString() + "_" + std::to_string(restoreIndex));
bool locked = true;
struct RestoreRequest restoreRequest(restoreIndex, restoreTag, KeyRef(bc->getURL()), true, targetVersion,
true, range, Key(), Key(), locked,
deterministicRandom()->randomUniqueID());
tr->set(restoreRequestKeyFor(restoreRequest.index), restoreRequestValue(restoreRequest));
// backupRanges.size = 1 because we only support restoring 1 range in real mode for now
tr->set(restoreRequestTriggerKey, restoreRequestTriggerValue(deterministicRandom()->randomUniqueID(),1));
wait(tr->commit()); // Trigger fast restore
break;
} catch (Error& e) {
if (e.code() != error_code_restore_duplicate_tag) {
wait(tr->onError(e));
}
}
}
if (waitForComplete) {
FileBackupAgent::ERestoreState finalState = wait(waitFastRestore(cx, tagName, verbose));
if (finalState != FileBackupAgent::ERestoreState::COMPLETED) throw restore_error();
}
return targetVersion;
}
ACTOR Future<Version> fastRestore(Database cx, Standalone<StringRef> tagName, Standalone<StringRef> url,
bool waitForComplete, long targetVersion, bool verbose, Standalone<KeyRangeRef> range,
Standalone<StringRef> addPrefix, Standalone<StringRef> removePrefix) {
Version result =
wait(_fastRestore(cx, tagName, url, waitForComplete, targetVersion, verbose, range, addPrefix, removePrefix));
return result;
}

View File

@ -525,6 +525,11 @@ void initHelp() {
helpMap["getversion"] =
CommandHelp("getversion", "Fetch the current read version",
"Displays the current read version of the database or currently running transaction.");
helpMap["advanceversion"] = CommandHelp(
"advanceversion <VERSION>", "Force the cluster to recover at the specified version",
"Forces the cluster to recover at the specified version. If the specified version is larger than the current "
"version of the cluster, the cluster version is advanced "
"to the specified version via a forced recovery.");
helpMap["reset"] = CommandHelp(
"reset",
"reset the current transaction",
@ -3217,6 +3222,23 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
continue;
}
if (tokencmp(tokens[0], "advanceversion")) {
if (tokens.size() != 2) {
printUsage(tokens[0]);
is_error = true;
} else {
Version v;
int n = 0;
if (sscanf(tokens[1].toString().c_str(), "%ld%n", &v, &n) != 1 || n != tokens[1].size()) {
printUsage(tokens[0]);
is_error = true;
} else {
wait(makeInterruptable(advanceVersion(db, v)));
}
}
continue;
}
if (tokencmp(tokens[0], "kill")) {
getTransaction(db, tr, options, intrans);
if (tokens.size() == 1) {

View File

@ -278,7 +278,7 @@ public:
// parallel restore
Future<Void> parallelRestoreFinish(Database cx, UID randomUID);
Future<Void> submitParallelRestore(Database cx, Key backupTag, Standalone<VectorRef<KeyRangeRef>> backupRanges,
KeyRef bcUrl, Version targetVersion, bool lockDB, UID randomUID);
Key bcUrl, Version targetVersion, bool lockDB, UID randomUID);
Future<Void> atomicParallelRestore(Database cx, Key tagName, Standalone<VectorRef<KeyRangeRef>> ranges,
Key addPrefix, Key removePrefix);
@ -893,10 +893,6 @@ public:
}
};
ACTOR Future<Version> fastRestore(Database cx, Standalone<StringRef> tagName, Standalone<StringRef> url,
bool waitForComplete, long targetVersion, bool verbose, Standalone<KeyRangeRef> range,
Standalone<StringRef> addPrefix, Standalone<StringRef> removePrefix);
// Helper class for reading restore data from a buffer and throwing the right errors.
struct StringRefReader {
StringRefReader(StringRef s = StringRef(), Error e = Error()) : rptr(s.begin()), end(s.end()), failure_error(e) {}
@ -937,6 +933,9 @@ struct StringRefReader {
namespace fileBackup {
ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<IAsyncFile> file, int64_t offset,
int len);
// Return a block of contiguous padding bytes "\0xff" for backup files, growing if needed.
Value makePadding(int size);
}
#include "flow/unactorcompiler.h"

View File

@ -1343,6 +1343,7 @@ public:
Standalone<VectorRef<KeyValueRef>> blockData = wait(fileBackup::decodeRangeFileBlock(inFile, j, len));
if (!beginKeySet) {
beginKey = blockData.front().key;
beginKeySet = true;
}
endKey = blockData.back().key;
}
@ -2096,6 +2097,8 @@ ACTOR Future<Optional<int64_t>> timeKeeperEpochsFromVersion(Version v, Reference
return found.first + (v - found.second) / CLIENT_KNOBS->CORE_VERSIONSPERSECOND;
}
namespace backup_test {
int chooseFileSize(std::vector<int> &sizes) {
int size = 1000;
if(!sizes.empty()) {
@ -2133,7 +2136,30 @@ Version nextVersion(Version v) {
return v + increment;
}
ACTOR Future<Void> testBackupContainer(std::string url) {
// Write a snapshot file with only begin & end key
ACTOR static Future<Void> testWriteSnapshotFile(Reference<IBackupFile> file, Key begin, Key end, uint32_t blockSize) {
ASSERT(blockSize > 3 * sizeof(uint32_t) + begin.size() + end.size());
uint32_t fileVersion = BACKUP_AGENT_SNAPSHOT_FILE_VERSION;
// write Header
wait(file->append((uint8_t*)&fileVersion, sizeof(fileVersion)));
// write begin key length and key
wait(file->appendStringRefWithLen(begin));
// write end key length and key
wait(file->appendStringRefWithLen(end));
int bytesLeft = blockSize - file->size();
if (bytesLeft > 0) {
Value paddings = fileBackup::makePadding(bytesLeft);
wait(file->append(paddings.begin(), bytesLeft));
}
wait(file->finish());
return Void();
}
ACTOR static Future<Void> testBackupContainer(std::string url) {
printf("BackupContainerTest URL %s\n", url.c_str());
state Reference<IBackupContainer> c = IBackupContainer::openContainer(url);
@ -2162,6 +2188,9 @@ ACTOR Future<Void> testBackupContainer(std::string url) {
loop {
state Version logStart = v;
state int kvfiles = deterministicRandom()->randomInt(0, 3);
state Key begin = LiteralStringRef("");
state Key end = LiteralStringRef("");
state int blockSize = 3 * sizeof(uint32_t) + begin.size() + end.size() + 8;
while(kvfiles > 0) {
if(snapshots.empty()) {
@ -2172,15 +2201,17 @@ ACTOR Future<Void> testBackupContainer(std::string url) {
v = nextVersion(v);
}
}
Reference<IBackupFile> range = wait(c->writeRangeFile(snapshots.rbegin()->first, 0, v, 10));
Reference<IBackupFile> range = wait(c->writeRangeFile(snapshots.rbegin()->first, 0, v, blockSize));
++nRangeFiles;
v = nextVersion(v);
snapshots.rbegin()->second.push_back(range->getFileName());
snapshotBeginEndKeys.rbegin()->second.emplace_back(LiteralStringRef(""), LiteralStringRef(""));
snapshotBeginEndKeys.rbegin()->second.emplace_back(begin, end);
int size = chooseFileSize(fileSizes);
snapshotSizes.rbegin()->second += size;
writes.push_back(writeAndVerifyFile(c, range, size));
// Write in actual range file format, instead of random data.
// writes.push_back(writeAndVerifyFile(c, range, size));
wait(testWriteSnapshotFile(range, begin, end, blockSize));
if(deterministicRandom()->random01() < .2) {
writes.push_back(c->writeKeyspaceSnapshotFile(
@ -2377,3 +2408,5 @@ TEST_CASE("/backup/continuous") {
return Void();
}
} // namespace backup_test

View File

@ -68,6 +68,9 @@ static const uint32_t BACKUP_AGENT_MLOG_VERSION = 2001;
// Mutation log version written by BackupWorker
static const uint32_t PARTITIONED_MLOG_VERSION = 4110;
// Snapshot file version written by FileBackupAgent
static const uint32_t BACKUP_AGENT_SNAPSHOT_FILE_VERSION = 1001;
struct LogFile {
Version beginVersion;
Version endVersion;
@ -108,12 +111,6 @@ struct RangeFile {
std::string fileName;
int64_t fileSize;
RangeFile() {}
RangeFile(Version v, uint32_t bSize, std::string name, int64_t size)
: version(v), blockSize(bSize), fileName(name), fileSize(size) {}
RangeFile(const RangeFile& f)
: version(f.version), blockSize(f.blockSize), fileName(f.fileName), fileSize(f.fileSize) {}
// Order by version, break ties with name
bool operator< (const RangeFile &rhs) const {
return version == rhs.version ? fileName < rhs.fileName : version < rhs.version;

View File

@ -494,11 +494,16 @@ Optional<ValueRef> DatabaseConfiguration::get( KeyRef key ) const {
}
}
bool DatabaseConfiguration::isExcludedServer( NetworkAddress a ) const {
return get( encodeExcludedServersKey( AddressExclusion(a.ip, a.port) ) ).present() ||
get( encodeExcludedServersKey( AddressExclusion(a.ip) ) ).present() ||
get( encodeFailedServersKey( AddressExclusion(a.ip, a.port) ) ).present() ||
get( encodeFailedServersKey( AddressExclusion(a.ip) ) ).present();
bool DatabaseConfiguration::isExcludedServer( NetworkAddressList a ) const {
return get( encodeExcludedServersKey( AddressExclusion(a.address.ip, a.address.port) ) ).present() ||
get( encodeExcludedServersKey( AddressExclusion(a.address.ip) ) ).present() ||
get( encodeFailedServersKey( AddressExclusion(a.address.ip, a.address.port) ) ).present() ||
get( encodeFailedServersKey( AddressExclusion(a.address.ip) ) ).present() ||
( a.secondaryAddress.present() && (
get( encodeExcludedServersKey( AddressExclusion(a.secondaryAddress.get().ip, a.secondaryAddress.get().port) ) ).present() ||
get( encodeExcludedServersKey( AddressExclusion(a.secondaryAddress.get().ip) ) ).present() ||
get( encodeFailedServersKey( AddressExclusion(a.secondaryAddress.get().ip, a.secondaryAddress.get().port) ) ).present() ||
get( encodeFailedServersKey( AddressExclusion(a.secondaryAddress.get().ip) ) ).present() ) );
}
std::set<AddressExclusion> DatabaseConfiguration::getExcludedServers() const {
const_cast<DatabaseConfiguration*>(this)->makeConfigurationImmutable();

View File

@ -187,7 +187,7 @@ struct DatabaseConfiguration {
std::vector<RegionInfo> regions;
// Excluded servers (no state should be here)
bool isExcludedServer( NetworkAddress ) const;
bool isExcludedServer( NetworkAddressList ) const;
std::set<AddressExclusion> getExcludedServers() const;
int32_t getDesiredProxies() const { if(masterProxyCount == -1) return autoMasterProxyCount; return masterProxyCount; }

View File

@ -284,6 +284,7 @@ struct KeyRangeRef {
force_inline void serialize(Ar& ar) {
serializer(ar, const_cast<KeyRef&>(begin), const_cast<KeyRef&>(end));
if( begin > end ) {
TraceEvent("InvertedRange").detail("Begin", begin).detail("End", end);
throw inverted_range();
};
}

View File

@ -461,7 +461,8 @@ namespace fileBackup {
// then the space after the final key to the next 1MB boundary would
// just be padding anyway.
struct RangeFileWriter {
RangeFileWriter(Reference<IBackupFile> file = Reference<IBackupFile>(), int blockSize = 0) : file(file), blockSize(blockSize), blockEnd(0), fileVersion(1001) {}
RangeFileWriter(Reference<IBackupFile> file = Reference<IBackupFile>(), int blockSize = 0)
: file(file), blockSize(blockSize), blockEnd(0), fileVersion(BACKUP_AGENT_SNAPSHOT_FILE_VERSION) {}
// Handles the first block and internal blocks. Ends current block if needed.
// The final flag is used in simulation to pad the file's final block to a whole block size
@ -557,8 +558,8 @@ namespace fileBackup {
state StringRefReader reader(buf, restore_corrupted_data());
try {
// Read header, currently only decoding version 1001
if(reader.consume<int32_t>() != 1001)
// Read header, currently only decoding BACKUP_AGENT_SNAPSHOT_FILE_VERSION
if(reader.consume<int32_t>() != BACKUP_AGENT_SNAPSHOT_FILE_VERSION)
throw restore_unsupported_file_version();
// Read begin key, if this fails then block was invalid.
@ -2406,6 +2407,7 @@ namespace fileBackup {
state bool backupWorkerEnabled = dbConfig.backupWorkerEnabled;
if (!backupWorkerEnabled) {
wait(success(changeConfig(cx, "backup_worker_enabled:=1", true)));
backupWorkerEnabled = true;
}
// Set the "backupStartedKey" and wait for all backup worker started
@ -3626,8 +3628,32 @@ public:
}
ACTOR static Future<Void> submitParallelRestore(Database cx, Key backupTag,
Standalone<VectorRef<KeyRangeRef>> backupRanges, KeyRef bcUrl,
Standalone<VectorRef<KeyRangeRef>> backupRanges, Key bcUrl,
Version targetVersion, bool lockDB, UID randomUID) {
// Sanity check backup is valid
state Reference<IBackupContainer> bc = IBackupContainer::openContainer(bcUrl.toString());
state BackupDescription desc = wait(bc->describeBackup());
wait(desc.resolveVersionTimes(cx));
if (targetVersion == invalidVersion && desc.maxRestorableVersion.present()) {
targetVersion = desc.maxRestorableVersion.get();
TraceEvent(SevWarn, "FastRestoreSubmitRestoreRequestWithInvalidTargetVersion")
.detail("OverrideTargetVersion", targetVersion);
}
Optional<RestorableFileSet> restoreSet = wait(bc->getRestoreSet(targetVersion));
if (!restoreSet.present()) {
TraceEvent(SevWarn, "FileBackupAgentRestoreNotPossible")
.detail("BackupContainer", bc->getURL())
.detail("TargetVersion", targetVersion);
throw restore_invalid_version();
}
TraceEvent("FastRestoreSubmitRestoreRequest")
.detail("BackupDesc", desc.toString())
.detail("TargetVersion", targetVersion);
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
state int restoreIndex = 0;
state int numTries = 0;
@ -4606,7 +4632,7 @@ Future<Void> FileBackupAgent::parallelRestoreFinish(Database cx, UID randomUID)
}
Future<Void> FileBackupAgent::submitParallelRestore(Database cx, Key backupTag,
Standalone<VectorRef<KeyRangeRef>> backupRanges, KeyRef bcUrl,
Standalone<VectorRef<KeyRangeRef>> backupRanges, Key bcUrl,
Version targetVersion, bool lockDB, UID randomUID) {
return FileBackupAgentImpl::submitParallelRestore(cx, backupTag, backupRanges, bcUrl, targetVersion, lockDB,
randomUID);

View File

@ -1803,6 +1803,26 @@ ACTOR Future<Void> checkDatabaseLock( Reference<ReadYourWritesTransaction> tr, U
return Void();
}
ACTOR Future<Void> advanceVersion(Database cx, Version v) {
state Transaction tr(cx);
loop {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
Version rv = wait(tr.getReadVersion());
if (rv <= v) {
tr.set(minRequiredCommitVersionKey, BinaryWriter::toValue(v + 1, Unversioned()));
wait(tr.commit());
} else {
printf("Current read version is %ld\n", rv);
return Void();
}
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
ACTOR Future<Void> forceRecovery( Reference<ClusterConnectionFile> clusterFile, Key dcId ) {
state Reference<AsyncVar<Optional<ClusterInterface>>> clusterInterface(new AsyncVar<Optional<ClusterInterface>>);
state Future<Void> leaderMon = monitorLeader<ClusterInterface>(clusterFile, clusterInterface);

View File

@ -178,6 +178,8 @@ ACTOR Future<Void> unlockDatabase( Database cx, UID id );
ACTOR Future<Void> checkDatabaseLock( Transaction* tr, UID id );
ACTOR Future<Void> checkDatabaseLock( Reference<ReadYourWritesTransaction> tr, UID id );
ACTOR Future<Void> advanceVersion(Database cx, Version v);
ACTOR Future<int> setDDMode( Database cx, int mode );
ACTOR Future<Void> forceRecovery( Reference<ClusterConnectionFile> clusterFile, Standalone<StringRef> dcId );

View File

@ -278,11 +278,12 @@ struct TxnStateRequest {
VectorRef<KeyValueRef> data;
Sequence sequence;
bool last;
std::vector<Endpoint> broadcastInfo;
ReplyPromise<Void> reply;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, data, sequence, last, reply, arena);
serializer(ar, data, sequence, last, broadcastInfo, reply, arena);
}
};

View File

@ -76,16 +76,7 @@ TLSConfig tlsConfig(TLSEndpointType::CLIENT);
NetworkOptions::NetworkOptions()
: localAddress(""), clusterFile(""), traceDirectory(Optional<std::string>()),
traceRollSize(TRACE_DEFAULT_ROLL_SIZE), traceMaxLogsSize(TRACE_DEFAULT_MAX_LOGS_SIZE), traceLogGroup("default"),
traceFormat("xml"), traceClockSource("now"), runLoopProfilingEnabled(false) {
Standalone<VectorRef<ClientVersionRef>> defaultSupportedVersions;
StringRef sourceVersion = StringRef((const uint8_t*)getSourceVersion(), strlen(getSourceVersion()));
std::string protocolVersionString = format("%llx", currentProtocolVersion.version());
defaultSupportedVersions.push_back_deep(defaultSupportedVersions.arena(), ClientVersionRef(LiteralStringRef(FDB_VT_VERSION), sourceVersion, protocolVersionString));
supportedVersions = ReferencedObject<Standalone<VectorRef<ClientVersionRef>>>::from(defaultSupportedVersions);
}
traceFormat("xml"), traceClockSource("now"), runLoopProfilingEnabled(false), supportedVersions(new ReferencedObject<Standalone<VectorRef<ClientVersionRef>>>()) {}
static const Key CLIENT_LATENCY_INFO_PREFIX = LiteralStringRef("client_latency/");
static const Key CLIENT_LATENCY_INFO_CTR_PREFIX = LiteralStringRef("client_latency_counter/");
@ -1045,7 +1036,10 @@ void setupNetwork(uint64_t transportId, bool useMetrics) {
if (!networkOptions.logClientInfo.present())
networkOptions.logClientInfo = true;
TLS::DisableOpenSSLAtExitHandler();
g_network = newNet2(tlsConfig, false, useMetrics || networkOptions.traceDirectory.present());
g_network->addStopCallback( Net2FileSystem::stop );
g_network->addStopCallback( TLS::DestroyOpenSSLGlobalState );
FlowTransport::createInstance(true, transportId);
Net2FileSystem::newFileSystem();
}

View File

@ -1229,8 +1229,8 @@ Future< Optional<Value> > ReadYourWritesTransaction::get( const Key& key, bool s
return Optional<Value>();
}
// special key space are only allowed to query if both begin and end start with \xff\xff
if (key.startsWith(specialKeys.begin))
// special key space are only allowed to query if both begin and end are in \xff\xff, \xff\xff\xff
if (specialKeys.contains(key))
return getDatabase()->specialKeySpace->get(Reference<ReadYourWritesTransaction>::addRef(this), key);
if(checkUsedDuringCommit()) {
@ -1284,8 +1284,8 @@ Future< Standalone<RangeResultRef> > ReadYourWritesTransaction::getRange(
}
}
// special key space are only allowed to query if both begin and end start with \xff\xff
if (begin.getKey().startsWith(specialKeys.begin) && end.getKey().startsWith(specialKeys.begin))
// special key space are only allowed to query if both begin and end are in \xff\xff, \xff\xff\xff
if (specialKeys.contains(begin.getKey()) && specialKeys.contains(end.getKey()))
return getDatabase()->specialKeySpace->getRange(Reference<ReadYourWritesTransaction>::addRef(this), begin, end,
limits, reverse);

View File

@ -74,6 +74,7 @@ struct StorageServerInterface {
explicit StorageServerInterface(UID uid) : uniqueID( uid ) {}
StorageServerInterface() : uniqueID( deterministicRandom()->randomUniqueID() ) {}
NetworkAddress address() const { return getValue.getEndpoint().getPrimaryAddress(); }
NetworkAddress stableAddress() const { return getValue.getEndpoint().getStableAddress(); }
Optional<NetworkAddress> secondaryAddress() const { return getValue.getEndpoint().addresses.secondaryAddress; }
UID id() const { return uniqueID; }
std::string toString() const { return id().shortString(); }
@ -394,12 +395,14 @@ struct GetStorageMetricsReply {
StorageMetrics available;
StorageMetrics capacity;
double bytesInputRate;
int64_t versionLag;
double lastUpdate;
GetStorageMetricsReply() : bytesInputRate(0) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, load, available, capacity, bytesInputRate);
serializer(ar, load, available, capacity, bytesInputRate, versionLag, lastUpdate);
}
};

View File

@ -52,6 +52,10 @@ public:
}
}
static void stop() {
eio_set_max_parallel(0);
}
static bool should_poll() { return want_poll; }
static bool lock_fd( int fd ) {

View File

@ -39,6 +39,8 @@ class AsyncFileWinASIO : public IAsyncFile, public ReferenceCounted<AsyncFileWin
public:
static void init() {}
static void stop() {}
static bool should_poll() { return false; }
// FIXME: This implementation isn't actually asynchronous - it just does operations synchronously!

View File

@ -13,6 +13,7 @@ set(FDBRPC_SRCS
FlowTransport.actor.cpp
genericactors.actor.h
genericactors.actor.cpp
HealthMonitor.actor.cpp
IAsyncFile.actor.cpp
LoadBalance.actor.h
Locality.cpp

View File

@ -21,53 +21,55 @@
#include "fdbrpc/FailureMonitor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
ACTOR Future<Void> waitForStateEqual( IFailureMonitor* monitor, Endpoint endpoint, FailureStatus status ) {
ACTOR Future<Void> waitForStateEqual(IFailureMonitor* monitor, Endpoint endpoint, FailureStatus status) {
loop {
Future<Void> change = monitor->onStateChanged(endpoint);
if (monitor->getState(endpoint) == status)
return Void();
wait( change );
if (monitor->getState(endpoint) == status) return Void();
wait(change);
}
}
ACTOR Future<Void> waitForContinuousFailure( IFailureMonitor* monitor, Endpoint endpoint, double sustainedFailureDuration, double slope ) {
ACTOR Future<Void> waitForContinuousFailure(IFailureMonitor* monitor, Endpoint endpoint,
double sustainedFailureDuration, double slope) {
state double startT = now();
loop {
wait( monitor->onFailed( endpoint ) );
if(monitor->permanentlyFailed(endpoint))
return Void();
wait(monitor->onFailed(endpoint));
if (monitor->permanentlyFailed(endpoint)) return Void();
// X == sustainedFailureDuration + slope * (now()-startT+X)
double waitDelay = (sustainedFailureDuration + slope * (now()-startT)) / (1-slope);
double waitDelay = (sustainedFailureDuration + slope * (now() - startT)) / (1 - slope);
//SOMEDAY: if we know that this process is a server or client we can tune this optimization better
if(waitDelay < std::min(FLOW_KNOBS->CLIENT_REQUEST_INTERVAL, FLOW_KNOBS->SERVER_REQUEST_INTERVAL)) //We will not get a failure monitoring update in this amount of time, so there is no point in waiting for changes
// SOMEDAY: if we know that this process is a server or client we can tune this optimization better
if (waitDelay <
std::min(FLOW_KNOBS->CLIENT_REQUEST_INTERVAL,
FLOW_KNOBS->SERVER_REQUEST_INTERVAL)) // We will not get a failure monitoring update in this amount
// of time, so there is no point in waiting for changes
waitDelay = 0;
choose {
when (wait( monitor->onStateEqual( endpoint, FailureStatus(false) ) )) {} // SOMEDAY: Use onStateChanged() for efficiency
when (wait( delay(waitDelay) )) {
return Void();
}
when(wait(monitor->onStateEqual(endpoint, FailureStatus(false)))) {
} // SOMEDAY: Use onStateChanged() for efficiency
when(wait(delay(waitDelay))) { return Void(); }
}
}
}
Future<Void> IFailureMonitor::onStateEqual( Endpoint const& endpoint, FailureStatus status ) {
if ( status == getState(endpoint) ) return Void();
Future<Void> IFailureMonitor::onStateEqual(Endpoint const& endpoint, FailureStatus status) {
if (status == getState(endpoint)) return Void();
return waitForStateEqual(this, endpoint, status);
}
Future<Void> IFailureMonitor::onFailedFor( Endpoint const& endpoint, double sustainedFailureDuration, double slope ) {
ASSERT( slope < 1.0 );
return waitForContinuousFailure( this, endpoint, sustainedFailureDuration, slope );
Future<Void> IFailureMonitor::onFailedFor(Endpoint const& endpoint, double sustainedFailureDuration, double slope) {
ASSERT(slope < 1.0);
return waitForContinuousFailure(this, endpoint, sustainedFailureDuration, slope);
}
void SimpleFailureMonitor::setStatus( NetworkAddress const& address, FailureStatus const& status ) {
void SimpleFailureMonitor::setStatus(NetworkAddress const& address, FailureStatus const& status) {
//if (status.failed)
// printf("On machine '%s': Machine '%s' is failed\n", g_network->getLocalAddress().toString().c_str(), address.toString().c_str());
//printf("%s.setState(%s, %s) %p\n", g_network->getLocalAddress().toString(), address.toString(), status.failed ? "FAILED" : "OK", this);
//addressStatus.set( address, status );
// if (status.failed)
// printf("On machine '%s': Machine '%s' is failed\n", g_network->getLocalAddress().toString().c_str(),
// address.toString().c_str()); printf("%s.setState(%s, %s) %p\n", g_network->getLocalAddress().toString(),
// address.toString(), status.failed ? "FAILED" : "OK", this); addressStatus.set( address, status );
// onStateChanged() will be waiting on endpointKnownFailed only where it is false, so if the address status
// for an endpoint that is waited on changes, the waiter sees its failure status change
@ -96,22 +98,29 @@ void SimpleFailureMonitor::setStatus( NetworkAddress const& address, FailureStat
}
}
void SimpleFailureMonitor::endpointNotFound( Endpoint const& endpoint ) {
void SimpleFailureMonitor::endpointNotFound(Endpoint const& endpoint) {
// SOMEDAY: Expiration (this "leaks" memory)
if(endpoint.token.first() == -1) {
TraceEvent("WellKnownEndpointNotFound").suppressFor(1.0).detail("Address", endpoint.getPrimaryAddress()).detail("TokenFirst", endpoint.token.first()).detail("TokenSecond", endpoint.token.second());
if (endpoint.token.first() == -1) {
TraceEvent("WellKnownEndpointNotFound")
.suppressFor(1.0)
.detail("Address", endpoint.getPrimaryAddress())
.detail("TokenFirst", endpoint.token.first())
.detail("TokenSecond", endpoint.token.second());
return;
}
TraceEvent("EndpointNotFound").suppressFor(1.0).detail("Address", endpoint.getPrimaryAddress()).detail("Token", endpoint.token);
endpointKnownFailed.set( endpoint, true );
TraceEvent("EndpointNotFound")
.suppressFor(1.0)
.detail("Address", endpoint.getPrimaryAddress())
.detail("Token", endpoint.token);
endpointKnownFailed.set(endpoint, true);
}
void SimpleFailureMonitor::notifyDisconnect( NetworkAddress const& address ) {
void SimpleFailureMonitor::notifyDisconnect(NetworkAddress const& address) {
//TraceEvent("NotifyDisconnect").detail("Address", address);
endpointKnownFailed.triggerRange( Endpoint({address}, UID()), Endpoint({address}, UID(-1,-1)) );
endpointKnownFailed.triggerRange(Endpoint({ address }, UID()), Endpoint({ address }, UID(-1, -1)));
}
Future<Void> SimpleFailureMonitor::onDisconnectOrFailure( Endpoint const& endpoint ) {
Future<Void> SimpleFailureMonitor::onDisconnectOrFailure(Endpoint const& endpoint) {
// If the endpoint or address is already failed, return right away
auto i = addressStatus.find(endpoint.getPrimaryAddress());
if (i == addressStatus.end() || i->second.isFailed() || endpointKnownFailed.get(endpoint)) {
@ -120,12 +129,12 @@ Future<Void> SimpleFailureMonitor::onDisconnectOrFailure( Endpoint const& endpoi
}
// Return when the endpoint is triggered, which means that either the endpoint has become known failed, or the
// address has changed state (and since it was previously not failed, it must now be failed), or notifyDisconnect()
// has been called.
// address has changed state (and since it was previously not failed, it must now be failed), or
// notifyDisconnect() has been called.
return endpointKnownFailed.onChange(endpoint);
}
Future<Void> SimpleFailureMonitor::onStateChanged( Endpoint const& endpoint ) {
Future<Void> SimpleFailureMonitor::onStateChanged(Endpoint const& endpoint) {
// Wait on endpointKnownFailed if it is false, to pick up both endpointNotFound errors (which set it to true)
// and changes to addressStatus (which trigger a range). Don't wait on endpointKnownFailed if it is true, because
// failure status for that endpoint can never change (and we could be spuriously triggered by setStatus)
@ -137,36 +146,42 @@ Future<Void> SimpleFailureMonitor::onStateChanged( Endpoint const& endpoint ) {
return endpointKnownFailed.onChange(endpoint);
}
FailureStatus SimpleFailureMonitor::getState( Endpoint const& endpoint ) {
FailureStatus SimpleFailureMonitor::getState(Endpoint const& endpoint) {
if (endpointKnownFailed.get(endpoint))
return FailureStatus(true);
else {
auto a = addressStatus.find(endpoint.getPrimaryAddress());
if (a == addressStatus.end()) return FailureStatus();
else return a->second;
//printf("%s.getState(%s) = %s %p\n", g_network->getLocalAddress().toString(), endpoint.address.toString(), a.failed ? "FAILED" : "OK", this);
if (a == addressStatus.end())
return FailureStatus();
else
return a->second;
// printf("%s.getState(%s) = %s %p\n", g_network->getLocalAddress().toString(), endpoint.address.toString(),
// a.failed ? "FAILED" : "OK", this);
}
}
FailureStatus SimpleFailureMonitor::getState( NetworkAddress const& address ) {
FailureStatus SimpleFailureMonitor::getState(NetworkAddress const& address) {
auto a = addressStatus.find(address);
if (a == addressStatus.end()) return FailureStatus();
else return a->second;
if (a == addressStatus.end())
return FailureStatus();
else
return a->second;
}
bool SimpleFailureMonitor::onlyEndpointFailed( Endpoint const& endpoint ) {
if(!endpointKnownFailed.get(endpoint))
return false;
bool SimpleFailureMonitor::onlyEndpointFailed(Endpoint const& endpoint) {
if (!endpointKnownFailed.get(endpoint)) return false;
auto a = addressStatus.find(endpoint.getPrimaryAddress());
if (a == addressStatus.end()) return true;
else return !a->second.failed;
if (a == addressStatus.end())
return true;
else
return !a->second.failed;
}
bool SimpleFailureMonitor::permanentlyFailed( Endpoint const& endpoint ) {
bool SimpleFailureMonitor::permanentlyFailed(Endpoint const& endpoint) {
return endpointKnownFailed.get(endpoint);
}
void SimpleFailureMonitor::reset() {
addressStatus = std::unordered_map< NetworkAddress, FailureStatus >();
addressStatus = std::unordered_map<NetworkAddress, FailureStatus>();
endpointKnownFailed.resetNoWaiting();
}

View File

@ -76,8 +76,8 @@ struct FailureStatus {
bool isFailed() const { return failed; }
bool isAvailable() const { return !failed; }
bool operator == (FailureStatus const& r) const { return failed == r.failed; }
bool operator != (FailureStatus const& r) const { return failed != r.failed; }
bool operator==(FailureStatus const& r) const { return failed == r.failed; }
bool operator!=(FailureStatus const& r) const { return failed != r.failed; }
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, failed);
@ -87,43 +87,43 @@ struct FailureStatus {
class IFailureMonitor {
public:
// Returns the currently known status for the endpoint
virtual FailureStatus getState( Endpoint const& endpoint ) = 0;
virtual FailureStatus getState(Endpoint const& endpoint) = 0;
// Returns the currently known status for the address
virtual FailureStatus getState( NetworkAddress const& address ) = 0;
virtual FailureStatus getState(NetworkAddress const& address) = 0;
// Only use this function when the endpoint is known to be failed
virtual void endpointNotFound( Endpoint const& ) = 0;
virtual void endpointNotFound(Endpoint const&) = 0;
// The next time the known status for the endpoint changes, returns the new status.
virtual Future<Void> onStateChanged( Endpoint const& endpoint ) = 0;
virtual Future<Void> onStateChanged(Endpoint const& endpoint) = 0;
// Returns when onFailed(endpoint) || transport().onDisconnect( endpoint.getPrimaryAddress() ), but more efficiently
virtual Future<Void> onDisconnectOrFailure( Endpoint const& endpoint ) = 0;
virtual Future<Void> onDisconnectOrFailure(Endpoint const& endpoint) = 0;
// Returns true if the endpoint is failed but the address of the endpoint is not failed.
virtual bool onlyEndpointFailed( Endpoint const& endpoint ) = 0;
virtual bool onlyEndpointFailed(Endpoint const& endpoint) = 0;
// Returns true if the endpoint will never become available.
virtual bool permanentlyFailed( Endpoint const& endpoint ) = 0;
virtual bool permanentlyFailed(Endpoint const& endpoint) = 0;
// Called by FlowTransport when a connection closes and a prior request or reply might be lost
virtual void notifyDisconnect( NetworkAddress const& ) = 0;
virtual void notifyDisconnect(NetworkAddress const&) = 0;
// Called to update the failure status of network address directly when running client.
virtual void setStatus(NetworkAddress const& address, FailureStatus const& status) = 0;
// Returns when the known status of endpoint is next equal to status. Returns immediately
// if appropriate.
Future<Void> onStateEqual( Endpoint const& endpoint, FailureStatus status );
Future<Void> onStateEqual(Endpoint const& endpoint, FailureStatus status);
// Returns when the status of the given endpoint is next considered "failed"
Future<Void> onFailed( Endpoint const& endpoint ) {
return onStateEqual( endpoint, FailureStatus() );
}
Future<Void> onFailed(Endpoint const& endpoint) { return onStateEqual(endpoint, FailureStatus()); }
// Returns when the status of the given endpoint has continuously been "failed" for sustainedFailureDuration + (elapsedTime*sustainedFailureSlope)
Future<Void> onFailedFor( Endpoint const& endpoint, double sustainedFailureDuration, double sustainedFailureSlope = 0.0 );
// Returns when the status of the given endpoint has continuously been "failed" for sustainedFailureDuration +
// (elapsedTime*sustainedFailureSlope)
Future<Void> onFailedFor(Endpoint const& endpoint, double sustainedFailureDuration,
double sustainedFailureSlope = 0.0);
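
Since the threshold grows with elapsed time, a failure that is continuous from the moment of the call resolves at the fixed point T = sustainedFailureDuration + sustainedFailureSlope * T, i.e. T = duration / (1 - slope) for slope < 1. A small sketch of that arithmetic (illustrative only; the real actor re-evaluates as the status flips back and forth):

```cpp
#include <cassert>
#include <cstdio>

// Time at which onFailedFor would fire if the endpoint is continuously failed
// from t = 0: solve T = duration + slope * T  =>  T = duration / (1 - slope).
double sustainedFailureFireTime(double duration, double slope) {
    assert(slope < 1.0); // slope >= 1 would mean the threshold outruns time
    return duration / (1.0 - slope);
}

int main() {
    // e.g. duration = 1.0s, slope = 0.1  =>  fires at ~1.111s
    std::printf("%.3f\n", sustainedFailureFireTime(1.0, 0.1));
}
```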
// Returns the failure monitor that the calling machine should use
static IFailureMonitor& failureMonitor() {
@ -137,22 +137,23 @@ public:
class SimpleFailureMonitor : public IFailureMonitor {
public:
SimpleFailureMonitor() : endpointKnownFailed() { }
void setStatus( NetworkAddress const& address, FailureStatus const& status );
void endpointNotFound( Endpoint const& );
virtual void notifyDisconnect( NetworkAddress const& );
SimpleFailureMonitor() : endpointKnownFailed() {}
void setStatus(NetworkAddress const& address, FailureStatus const& status);
void endpointNotFound(Endpoint const&);
virtual void notifyDisconnect(NetworkAddress const&);
virtual Future<Void> onStateChanged( Endpoint const& endpoint );
virtual FailureStatus getState( Endpoint const& endpoint );
virtual FailureStatus getState( NetworkAddress const& address );
virtual Future<Void> onDisconnectOrFailure( Endpoint const& endpoint );
virtual bool onlyEndpointFailed( Endpoint const& endpoint );
virtual bool permanentlyFailed( Endpoint const& endpoint );
virtual Future<Void> onStateChanged(Endpoint const& endpoint);
virtual FailureStatus getState(Endpoint const& endpoint);
virtual FailureStatus getState(NetworkAddress const& address);
virtual Future<Void> onDisconnectOrFailure(Endpoint const& endpoint);
virtual bool onlyEndpointFailed(Endpoint const& endpoint);
virtual bool permanentlyFailed(Endpoint const& endpoint);
void reset();
private:
std::unordered_map< NetworkAddress, FailureStatus > addressStatus;
YieldedAsyncMap< Endpoint, bool > endpointKnownFailed;
std::unordered_map<NetworkAddress, FailureStatus> addressStatus;
YieldedAsyncMap<Endpoint, bool> endpointKnownFailed;
friend class OnStateChangedActorActor;
};
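
SimpleFailureMonitor's two maps compose in a fixed order: a known-failed endpoint reports failed regardless of its address, and an address with no recorded status defaults to available. A standalone sketch of that precedence (plain C++ stand-ins, not the actor types):

```cpp
#include <set>
#include <string>
#include <unordered_map>

struct Status { bool failed = false; };

// Endpoint status: endpointKnownFailed wins; otherwise fall back to the
// address map; an unknown address is treated as available (default Status).
Status getState(const std::set<std::string>& endpointKnownFailed,
                const std::unordered_map<std::string, Status>& addressStatus,
                const std::string& endpoint, const std::string& address) {
    if (endpointKnownFailed.count(endpoint)) return Status{true};
    auto it = addressStatus.find(address);
    return it == addressStatus.end() ? Status{} : it->second;
}
```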

View File

@ -236,6 +236,7 @@ struct YieldMockNetwork : INetwork, ReferenceCounted<YieldMockNetwork> {
virtual double now() { return baseNetwork->now(); }
virtual double timer() { return baseNetwork->timer(); }
virtual void stop() { return baseNetwork->stop(); }
virtual void addStopCallback( std::function<void()> fn ) { ASSERT(false); return; }
virtual bool isSimulated() const { return baseNetwork->isSimulated(); }
virtual void onMainThread(Promise<Void>&& signal, TaskPriority taskID) { return baseNetwork->onMainThread(std::move(signal), taskID); }
bool isOnMainThread() const override { return baseNetwork->isOnMainThread(); }

View File

@ -28,6 +28,7 @@
#include "flow/crc32c.h"
#include "fdbrpc/fdbrpc.h"
#include "fdbrpc/FailureMonitor.h"
#include "fdbrpc/HealthMonitor.h"
#include "fdbrpc/genericactors.actor.h"
#include "fdbrpc/simulator.h"
#include "flow/ActorCollection.h"
@ -189,6 +190,7 @@ public:
std::vector<Future<Void>> listeners;
std::unordered_map<NetworkAddress, Reference<struct Peer>> peers;
std::unordered_map<NetworkAddress, std::pair<double, double>> closedPeers;
HealthMonitor healthMonitor;
Reference<AsyncVar<bool>> degraded;
bool warnAlwaysForLargePacket;
@ -206,6 +208,7 @@ public:
Int64MetricHandle countConnClosedWithoutError;
std::map<NetworkAddress, std::pair<uint64_t, double>> incompatiblePeers;
AsyncTrigger incompatiblePeersChanged;
uint32_t numIncompatibleConnections;
std::map<uint64_t, double> multiVersionConnections;
double lastIncompatibleMessage;
@ -295,7 +298,7 @@ static ReliablePacket* sendPacket( TransportData* self, Reference<Peer> peer, IS
ACTOR Future<Void> connectionMonitor( Reference<Peer> peer ) {
state Endpoint remotePingEndpoint({ peer->destination }, WLTOKEN_PING_PACKET);
loop {
if (!FlowTransport::transport().isClient() && !peer->destination.isPublic() && peer->compatible) {
if (!FlowTransport::isClient() && !peer->destination.isPublic() && peer->compatible) {
// Don't send ping messages to clients unless necessary. Instead monitor incoming client pings.
// We skip this block for incompatible clients because pings from the server would trigger
// peer->resetPing and prevent 'connection_failed' from firing on ping timeout.
@ -324,7 +327,7 @@ ACTOR Future<Void> connectionMonitor( Reference<Peer> peer ) {
(peer->lastDataPacketSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_UNREFERENCED_CLOSE_DELAY)) {
// TODO: What about when peerReferences == -1?
throw connection_unreferenced();
} else if (FlowTransport::transport().isClient() && peer->compatible && peer->destination.isPublic() &&
} else if (FlowTransport::isClient() && peer->compatible && peer->destination.isPublic() &&
(peer->lastConnectTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT) &&
(peer->lastDataPacketSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT)) {
// First condition is necessary because we may get here if we are a server.
@ -396,80 +399,133 @@ ACTOR Future<Void> connectionWriter( Reference<Peer> self, Reference<IConnection
}
}
ACTOR Future<Void> delayedHealthUpdate(NetworkAddress address) {
state double start = now();
state bool delayed = false;
loop {
if (FLOW_KNOBS->HEALTH_MONITOR_MARK_FAILED_UNSTABLE_CONNECTIONS &&
FlowTransport::transport().healthMonitor()->tooManyConnectionsClosed(address) && address.isPublic()) {
if (!delayed) {
TraceEvent("TooManyConnectionsClosedMarkFailed")
.detail("Dest", address)
.detail("StartTime", start)
.detail("ClosedCount", FlowTransport::transport().healthMonitor()->closedConnectionsCount(address));
IFailureMonitor::failureMonitor().setStatus(address, FailureStatus(true));
}
delayed = true;
wait(delayJittered(FLOW_KNOBS->MAX_RECONNECTION_TIME * 2.0));
} else {
if (delayed) {
TraceEvent("TooManyConnectionsClosedMarkAvailable")
.detail("Dest", address)
.detail("StartTime", start)
.detail("TimeElapsed", now() - start)
.detail("ClosedCount", FlowTransport::transport().healthMonitor()->closedConnectionsCount(address));
}
IFailureMonitor::failureMonitor().setStatus(address, FailureStatus(false));
break;
}
}
return Void();
}
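
delayedHealthUpdate debounces flapping peers: while the HealthMonitor reports too many recent closes for an address, it is held failed and re-checked after a long jittered delay; as soon as the window drains, it is marked available and the actor exits. A synchronous sketch of just the decision step (names are illustrative; FDB's version is an actor with wall-clock delays):

```cpp
#include <functional>

// One pass of the health-update loop: returns true once the address has been
// (re)marked available. closedCount and markStatus stand in for the
// HealthMonitor query and IFailureMonitor::setStatus.
bool healthUpdateStep(int closedCount, int maxClosed, bool& markedFailed,
                      const std::function<void(bool /*failed*/)>& markStatus) {
    if (closedCount > maxClosed) {
        if (!markedFailed) markStatus(true); // trace + mark failed once
        markedFailed = true;
        return false; // caller sleeps ~2 * MAX_RECONNECTION_TIME and retries
    }
    markStatus(false); // window drained: peer is usable again
    return true;
}
```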
ACTOR Future<Void> connectionKeeper( Reference<Peer> self,
Reference<IConnection> conn = Reference<IConnection>(),
Future<Void> reader = Void()) {
TraceEvent(SevDebug, "ConnectionKeeper", conn ? conn->getDebugID() : UID())
.detail("PeerAddr", self->destination)
.detail("ConnSet", (bool)conn);
ASSERT_WE_THINK(FlowTransport::transport().getLocalAddress() != self->destination);
state Optional<double> firstConnFailedTime = Optional<double>();
state int retryConnect = false;
loop {
try {
state Future<Void> delayedHealthUpdateF = Future<Void>();
if (!conn) { // Always, except for the first loop with an incoming connection
self->outgoingConnectionIdle = true;
// Wait until there is something to send.
while (self->unsent.empty()) {
if (self->destination.isPublic() &&
IFailureMonitor::failureMonitor().getState(self->destination).isFailed()) {
break;
// Override the wait if we are in a failed state, so the failure monitoring status gets updated.
Future<Void> retryConnectF = Never();
if (retryConnect) {
retryConnectF = IFailureMonitor::failureMonitor().getState(self->destination).isAvailable()
? delay(FLOW_KNOBS->FAILURE_DETECTION_DELAY)
: delay(FLOW_KNOBS->SERVER_REQUEST_INTERVAL);
}
wait (self->dataToSend.onTrigger());
choose {
when(wait(self->dataToSend.onTrigger())) {}
when(wait(retryConnectF)) { break; }
}
}
ASSERT( self->destination.isPublic() );
ASSERT(self->destination.isPublic());
self->outgoingConnectionIdle = false;
wait(delayJittered(
std::max(0.0, self->lastConnectTime + self->reconnectionDelay -
now()))); // Don't connect() to the same peer more than once per 2 sec
self->lastConnectTime = now();
TraceEvent("ConnectingTo", conn ? conn->getDebugID() : UID()).suppressFor(1.0).detail("PeerAddr", self->destination);
TraceEvent("ConnectingTo", conn ? conn->getDebugID() : UID())
.suppressFor(1.0)
.detail("PeerAddr", self->destination)
.detail("PeerReferences", self->peerReferences)
.detail("FailureStatus", IFailureMonitor::failureMonitor().getState(self->destination).isAvailable()
? "OK"
: "FAILED");
try {
choose {
when( Reference<IConnection> _conn = wait( INetworkConnections::net()->connect(self->destination) ) ) {
when(Reference<IConnection> _conn =
wait(INetworkConnections::net()->connect(self->destination))) {
conn = _conn;
wait(conn->connectHandshake());
IFailureMonitor::failureMonitor().setStatus(self->destination, FailureStatus(false));
if (self->unsent.empty()) {
delayedHealthUpdateF = delayedHealthUpdate(self->destination);
choose {
when(wait(delayedHealthUpdateF)) {
conn->close();
conn = Reference<IConnection>();
retryConnect = false;
continue;
} else {
}
when(wait(self->dataToSend.onTrigger())) {}
}
}
TraceEvent("ConnectionExchangingConnectPacket", conn->getDebugID())
.suppressFor(1.0)
.detail("PeerAddr", self->destination);
self->prependConnectPacket();
}
reader = connectionReader( self->transport, conn, self, Promise<Reference<Peer>>());
reader = connectionReader(self->transport, conn, self, Promise<Reference<Peer>>());
}
when( wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT ) ) ) {
throw connection_failed();
}
}
} catch( Error &e ) {
if(e.code() != error_code_connection_failed) {
} catch (Error& e) {
if (e.code() != error_code_connection_failed) {
throw;
}
TraceEvent("ConnectionTimedOut", conn ? conn->getDebugID() : UID())
.suppressFor(1.0)
.detail("PeerAddr", self->destination);
IFailureMonitor::failureMonitor().setStatus(
self->destination, FailureStatus(e.code() == error_code_connection_failed));
throw;
}
} else {
IFailureMonitor::failureMonitor().setStatus(self->destination, FailureStatus(false));
self->outgoingConnectionIdle = false;
}
firstConnFailedTime.reset();
try {
self->transport->countConnEstablished++;
wait( connectionWriter( self, conn ) || reader || connectionMonitor(self) );
if (!delayedHealthUpdateF.isValid())
delayedHealthUpdateF = delayedHealthUpdate(self->destination);
wait(connectionWriter(self, conn) || reader || connectionMonitor(self));
} catch (Error& e) {
if (e.code() == error_code_connection_failed || e.code() == error_code_actor_cancelled ||
e.code() == error_code_connection_unreferenced ||
@ -483,6 +539,7 @@ ACTOR Future<Void> connectionKeeper( Reference<Peer> self,
ASSERT( false );
} catch (Error& e) {
delayedHealthUpdateF.cancel();
if(now() - self->lastConnectTime > FLOW_KNOBS->RECONNECTION_RESET_TIME) {
self->reconnectionDelay = FLOW_KNOBS->INITIAL_RECONNECTION_TIME;
} else {
@ -499,6 +556,18 @@ ACTOR Future<Void> connectionKeeper( Reference<Peer> self,
firstConnFailedTime = now();
}
// Don't immediately mark the connection as failed. To stay close to the earlier behaviour of centralized
// failure monitoring, wait until the connection has stayed failed for FLOW_KNOBS->FAILURE_DETECTION_DELAY.
retryConnect = self->destination.isPublic() && e.code() == error_code_connection_failed;
if (e.code() == error_code_connection_failed) {
if (!self->destination.isPublic()) {
// Can't connect back to non-public addresses.
IFailureMonitor::failureMonitor().setStatus(self->destination, FailureStatus(true));
} else if (now() - firstConnFailedTime.get() > FLOW_KNOBS->FAILURE_DETECTION_DELAY) {
IFailureMonitor::failureMonitor().setStatus(self->destination, FailureStatus(true));
}
}
self->discardUnreliablePackets();
reader = Future<Void>();
bool ok = e.code() == error_code_connection_failed || e.code() == error_code_actor_cancelled ||
@ -521,7 +590,7 @@ ACTOR Future<Void> connectionKeeper( Reference<Peer> self,
if(self->destination.isPublic()
&& IFailureMonitor::failureMonitor().getState(self->destination).isAvailable()
&& !FlowTransport::transport().isClient())
&& !FlowTransport::isClient())
{
auto& it = self->transport->closedPeers[self->destination];
if(now() - it.second > FLOW_KNOBS->TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY) {
@ -536,6 +605,10 @@ ACTOR Future<Void> connectionKeeper( Reference<Peer> self,
}
if (conn) {
if (self->destination.isPublic() && e.code() == error_code_connection_failed) {
FlowTransport::transport().healthMonitor()->reportPeerClosed(self->destination);
}
conn->close();
conn = Reference<IConnection>();
}
@ -556,6 +629,14 @@ ACTOR Future<Void> connectionKeeper( Reference<Peer> self,
}
}
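
The catch block above implements backoff-with-reset: if the last successful connect is older than RECONNECTION_RESET_TIME the delay snaps back to the initial value, otherwise it grows up to a cap. A sketch of that policy (the growth factor and the concrete values are illustrative assumptions, not the real knob values):

```cpp
#include <algorithm>

struct ReconnectKnobs {
    double initial = 0.05;   // INITIAL_RECONNECTION_TIME (illustrative value)
    double max = 0.5;        // MAX_RECONNECTION_TIME (illustrative value)
    double growth = 2.0;     // growth factor (illustrative)
    double resetAfter = 5.0; // RECONNECTION_RESET_TIME (illustrative value)
};

// Called on connection failure: long-lived outages keep the capped delay,
// while a failure after a long healthy period restarts from `initial`.
double nextReconnectionDelay(double currentDelay, double now,
                             double lastConnectTime, const ReconnectKnobs& k) {
    if (now - lastConnectTime > k.resetAfter) return k.initial;
    return std::min(k.max, currentDelay * k.growth);
}
```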
Peer::Peer(TransportData* transport, NetworkAddress const& destination)
: transport(transport), destination(destination), outgoingConnectionIdle(true), lastConnectTime(0.0),
reconnectionDelay(FLOW_KNOBS->INITIAL_RECONNECTION_TIME), compatible(true), outstandingReplies(0),
incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0), lastDataPacketSentTime(now()) {
IFailureMonitor::failureMonitor().setStatus(destination, FailureStatus(false));
}
void Peer::send(PacketBuffer* pb, ReliablePacket* rp, bool firstUnsent) {
unsent.setWriteBuffer(pb);
if (rp) reliable.insert(rp);
@ -662,6 +743,9 @@ ACTOR static void deliver(TransportData* self, Endpoint destination, ArenaReader
} catch (Error& e) {
g_currentDeliveryPeerAddress = {NetworkAddress()};
TraceEvent(SevError, "ReceiverError").error(e).detail("Token", destination.token.toString()).detail("Peer", destination.getPrimaryAddress());
if(!FlowTransport::isClient()) {
flushAndExit(FDB_EXIT_ERROR);
}
throw;
}
} else if (destination.token.first() & TOKEN_STREAM_FLAG) {
@ -1023,7 +1107,7 @@ Reference<Peer> TransportData::getOrOpenPeer( NetworkAddress const& address, boo
auto peer = getPeer(address);
if(!peer) {
peer = Reference<Peer>( new Peer(this, address) );
if(startConnectionKeeper) {
if(startConnectionKeeper && !isLocalAddress(address)) {
peer->connect = connectionKeeper(peer);
}
peers[address] = peer;
@ -1039,10 +1123,14 @@ bool TransportData::isLocalAddress(const NetworkAddress& address) const {
ACTOR static Future<Void> multiVersionCleanupWorker( TransportData* self ) {
loop {
wait(delay(FLOW_KNOBS->CONNECTION_CLEANUP_DELAY));
bool foundIncompatible = false;
for(auto it = self->incompatiblePeers.begin(); it != self->incompatiblePeers.end();) {
if( self->multiVersionConnections.count(it->second.first) ) {
it = self->incompatiblePeers.erase(it);
} else {
if( now() - it->second.second > FLOW_KNOBS->INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING ) {
foundIncompatible = true;
}
it++;
}
}
@ -1054,6 +1142,10 @@ ACTOR static Future<Void> multiVersionCleanupWorker( TransportData* self ) {
it++;
}
}
if(foundIncompatible) {
self->incompatiblePeersChanged.trigger();
}
}
}
@ -1084,6 +1176,10 @@ std::map<NetworkAddress, std::pair<uint64_t, double>>* FlowTransport::getIncompa
return &self->incompatiblePeers;
}
Future<Void> FlowTransport::onIncompatibleChanged() {
return self->incompatiblePeersChanged.onTrigger();
}
Future<Void> FlowTransport::bind( NetworkAddress publicAddress, NetworkAddress listenAddress ) {
ASSERT( publicAddress.isPublic() );
if(self->localAddresses.address == NetworkAddress()) {
@ -1107,9 +1203,7 @@ void FlowTransport::addPeerReference(const Endpoint& endpoint, bool isStream) {
return;
Reference<Peer> peer = self->getOrOpenPeer(endpoint.getPrimaryAddress());
if(peer->peerReferences == -1) {
IFailureMonitor::failureMonitor().setStatus(endpoint.getPrimaryAddress(), FailureStatus(false));
if (peer->peerReferences == -1) {
peer->peerReferences = 1;
} else {
peer->peerReferences++;
@ -1173,7 +1267,8 @@ static void sendLocal( TransportData* self, ISerializeSource const& what, const
deliver(self, destination, ArenaReader(copy.arena(), copy, AssumeVersion(currentProtocolVersion)), false);
}
static ReliablePacket* sendPacket( TransportData* self, Reference<Peer> peer, ISerializeSource const& what, const Endpoint& destination, bool reliable ) {
static ReliablePacket* sendPacket(TransportData* self, Reference<Peer> peer, ISerializeSource const& what,
const Endpoint& destination, bool reliable) {
const bool checksumEnabled = !destination.getPrimaryAddress().isTLS();
++self->countPacketsGenerated;
@ -1315,4 +1410,15 @@ void FlowTransport::createInstance(bool isClient, uint64_t transportId) {
g_network->setGlobal(INetwork::enFlowTransport, (flowGlobalType) new FlowTransport(transportId));
g_network->setGlobal(INetwork::enNetworkAddressFunc, (flowGlobalType) &FlowTransport::getGlobalLocalAddress);
g_network->setGlobal(INetwork::enNetworkAddressesFunc, (flowGlobalType) &FlowTransport::getGlobalLocalAddresses);
// Mark ourselves as available in FailureMonitor
const auto& localAddresses = FlowTransport::transport().getLocalAddresses();
IFailureMonitor::failureMonitor().setStatus(localAddresses.address, FailureStatus(false));
if (localAddresses.secondaryAddress.present()) {
IFailureMonitor::failureMonitor().setStatus(localAddresses.secondaryAddress.get(), FailureStatus(false));
}
}
HealthMonitor* FlowTransport::healthMonitor() {
return &self->healthMonitor;
}

View File

@ -23,6 +23,7 @@
#pragma once
#include <algorithm>
#include "fdbrpc/HealthMonitor.h"
#include "flow/genericactors.actor.h"
#include "flow/network.h"
#include "flow/FileIdentifier.h"
@ -44,7 +45,9 @@ public:
}
void choosePrimaryAddress() {
if(addresses.secondaryAddress.present() && !g_network->getLocalAddresses().secondaryAddress.present() && (addresses.address.isTLS() != g_network->getLocalAddresses().address.isTLS())) {
if(addresses.secondaryAddress.present() &&
((!g_network->getLocalAddresses().secondaryAddress.present() && (addresses.address.isTLS() != g_network->getLocalAddresses().address.isTLS())) ||
(g_network->getLocalAddresses().secondaryAddress.present() && !addresses.address.isTLS()))) {
std::swap(addresses.address, addresses.secondaryAddress.get());
}
}
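
The updated predicate prefers, in order: a peer address whose TLS-ness matches our single local address, or the peer's TLS address outright when we listen on both. A truth-table sketch of the swap condition (plain bools standing in for the NetworkAddress fields):

```cpp
// Should the endpoint's primary and secondary addresses be swapped?
// - With one local address: pick the peer address with matching TLS-ness.
// - With two local addresses (plain + TLS): prefer the peer's TLS address.
bool shouldSwapToSecondary(bool peerHasSecondary, bool peerPrimaryIsTLS,
                           bool localHasSecondary, bool localPrimaryIsTLS) {
    if (!peerHasSecondary) return false;
    if (!localHasSecondary) return peerPrimaryIsTLS != localPrimaryIsTLS;
    return !peerPrimaryIsTLS;
}
```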
@ -58,6 +61,10 @@ public:
return addresses.address;
}
NetworkAddress getStableAddress() const {
return addresses.getTLSAddress();
}
bool operator == (Endpoint const& r) const {
return getPrimaryAddress() == r.getPrimaryAddress() && token == r.token;
}
@ -123,10 +130,7 @@ struct Peer : public ReferenceCounted<Peer> {
double lastDataPacketSentTime;
int outstandingReplies;
explicit Peer(TransportData* transport, NetworkAddress const& destination)
: transport(transport), destination(destination), outgoingConnectionIdle(true), lastConnectTime(0.0),
reconnectionDelay(FLOW_KNOBS->INITIAL_RECONNECTION_TIME), compatible(true), outstandingReplies(0),
incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0), lastDataPacketSentTime(now()) {}
explicit Peer(TransportData* transport, NetworkAddress const& destination);
void send(PacketBuffer* pb, ReliablePacket* rp, bool firstUnsent);
@ -164,6 +168,9 @@ public:
std::map<NetworkAddress, std::pair<uint64_t, double>>* getIncompatiblePeers();
// Returns the set of all peers that have attempted to connect, but have incompatible protocol versions
Future<Void> onIncompatibleChanged();
// Returns when getIncompatiblePeers has at least one peer which is incompatible.
void addPeerReference(const Endpoint&, bool isStream);
// Signal that a peer connection is being used, even if no messages are currently being sent to the peer
@ -205,6 +212,8 @@ public:
Endpoint loadedEndpoint(const UID& token);
HealthMonitor* healthMonitor();
private:
class TransportData* self;
};

View File

@ -0,0 +1,51 @@
/*
* HealthMonitor.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbrpc/FailureMonitor.h"
#include "fdbrpc/FlowTransport.h"
#include "fdbrpc/HealthMonitor.h"
void HealthMonitor::reportPeerClosed(const NetworkAddress& peerAddress) {
purgeOutdatedHistory();
peerClosedHistory.push_back(std::make_pair(now(), peerAddress));
peerClosedNum[peerAddress] += 1;
}
void HealthMonitor::purgeOutdatedHistory() {
// Trim expired entries from the front. (Range-iterating the deque while
// popping from it would invalidate the iterators, so loop on front() instead.)
while (!peerClosedHistory.empty() &&
peerClosedHistory.front().first < now() - FLOW_KNOBS->HEALTH_MONITOR_CLIENT_REQUEST_INTERVAL_SECS) {
peerClosedNum[peerClosedHistory.front().second] -= 1;
ASSERT(peerClosedNum[peerClosedHistory.front().second] >= 0);
peerClosedHistory.pop_front();
}
}
bool HealthMonitor::tooManyConnectionsClosed(const NetworkAddress& peerAddress) {
purgeOutdatedHistory();
return peerClosedNum[peerAddress] > FLOW_KNOBS->HEALTH_MONITOR_CONNECTION_MAX_CLOSED;
}
int HealthMonitor::closedConnectionsCount(const NetworkAddress& peerAddress) {
purgeOutdatedHistory();
return peerClosedNum[peerAddress];
}
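
HealthMonitor is a time-windowed counter: a deque of (timestamp, address) events plus a per-address tally, with purge-on-read trimming anything older than the window. A self-contained equivalent (std::chrono in place of flow's now(); the window length stands in for the knob):

```cpp
#include <chrono>
#include <deque>
#include <string>
#include <unordered_map>
#include <utility>

class WindowedCloseCounter {
public:
    explicit WindowedCloseCounter(double windowSecs) : window_(windowSecs) {}

    void reportPeerClosed(const std::string& addr) {
        purge();
        history_.emplace_back(nowSecs(), addr);
        counts_[addr] += 1;
    }
    int closedConnectionsCount(const std::string& addr) {
        purge();
        auto it = counts_.find(addr);
        return it == counts_.end() ? 0 : it->second;
    }

private:
    static double nowSecs() {
        using namespace std::chrono;
        return duration<double>(steady_clock::now().time_since_epoch()).count();
    }
    // Drop events that fell out of the window; counts stay in sync.
    void purge() {
        const double cutoff = nowSecs() - window_;
        while (!history_.empty() && history_.front().first < cutoff) {
            counts_[history_.front().second] -= 1;
            history_.pop_front();
        }
    }
    double window_;
    std::deque<std::pair<double, std::string>> history_;
    std::unordered_map<std::string, int> counts_;
};
```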

fdbrpc/HealthMonitor.h (new file, 41 lines)
View File

@ -0,0 +1,41 @@
/*
* HealthMonitor.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBRPC_HEALTH_MONITOR_H
#define FDBRPC_HEALTH_MONITOR_H
#include <deque>
#include <unordered_map>
#include <flow/flow.h>
class HealthMonitor {
public:
void reportPeerClosed(const NetworkAddress& peerAddress);
bool tooManyConnectionsClosed(const NetworkAddress& peerAddress);
int closedConnectionsCount(const NetworkAddress& peerAddress);
private:
void purgeOutdatedHistory();
std::deque<std::pair<double, NetworkAddress>> peerClosedHistory;
std::unordered_map<NetworkAddress, int> peerClosedNum;
};
#endif // FDBRPC_HEALTH_MONITOR_H

View File

@ -115,3 +115,7 @@ Net2FileSystem::Net2FileSystem(double ioTimeout, std::string fileSystemPath)
}
#endif
}
void Net2FileSystem::stop() {
Net2AsyncFile::stop();
}

View File

@ -36,6 +36,7 @@ public:
virtual Future< std::time_t > lastWriteTime( std::string filename );
//void init();
static void stop();
Net2FileSystem(double ioTimeout=0.0, std::string fileSystemPath = "");

View File

@ -871,7 +871,12 @@ public:
return emptyConfig;
}
virtual void stop() { isStopped = true; }
virtual void stop() {
isStopped = true;
}
virtual void addStopCallback( std::function<void()> fn ) {
stopCallbacks.emplace_back(std::move(fn));
}
virtual bool isSimulated() const { return true; }
struct SimThreadArgs {
@ -995,6 +1000,9 @@ public:
}
self->currentProcess = callingMachine;
self->net2->stop();
for ( auto& fn : self->stopCallbacks ) {
fn();
}
return Void();
}
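
addStopCallback gives subsystems (here Net2FileSystem) a hook that runs when the network loop stops, in registration order. A minimal sketch of the registry on its own (illustrative, outside the INetwork hierarchy):

```cpp
#include <functional>
#include <utility>
#include <vector>

class StopCallbacks {
public:
    void add(std::function<void()> fn) { fns_.push_back(std::move(fn)); }
    // Invoked once by stop(): run every registered hook in order.
    void runAll() {
        for (auto& fn : fns_) fn();
    }
private:
    std::vector<std::function<void()>> fns_;
};
```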
@ -1615,6 +1623,7 @@ public:
// Not letting currentProcess be NULL eliminates some annoying special cases
currentProcess = new ProcessInfo("NoMachine", LocalityData(Optional<Standalone<StringRef>>(), StringRef(), StringRef(), StringRef()), ProcessClass(), {NetworkAddress()}, this, "", "");
g_network = net2 = newNet2(TLSConfig(), false, true);
g_network->addStopCallback( Net2FileSystem::stop );
Net2FileSystem::newFileSystem();
check_yield(TaskPriority::Zero);
}
@ -1713,6 +1722,8 @@ public:
//tasks is guarded by ISimulator::mutex
std::priority_queue<Task, std::vector<Task>> tasks;
std::vector<std::function<void()>> stopCallbacks;
//Sim2Net network;
INetwork *net2;

View File

@ -68,7 +68,6 @@ struct BackupData {
const UID myId;
const Tag tag; // LogRouter tag for this worker, i.e., (-2, i)
const int totalTags; // Total log router tags
// Backup request's commit version. Mutations are logged at some version after this.
const Version startVersion; // This worker's start version
const Optional<Version> endVersion; // old epoch's end version (inclusive), or empty for current epoch
const LogEpoch recruitedEpoch; // current epoch whose tLogs are receiving mutations
@ -209,8 +208,12 @@ struct BackupData {
}
BackupData* self = nullptr;
// Backup request's commit version. Mutations are logged at some version after this.
Version startVersion = invalidVersion;
// The last mutation log's saved version (not inclusive), i.e., next log's begin version.
Version lastSavedVersion = invalidVersion;
Future<Optional<Reference<IBackupContainer>>> container;
Future<Optional<std::vector<KeyRange>>> ranges; // Key ranges of this backup
Future<Void> updateWorker;
@ -568,17 +571,6 @@ ACTOR Future<Void> saveProgress(BackupData* self, Version backupVersion) {
}
}
// Return a block of contiguous padding bytes, growing if needed.
static Value makePadding(int size) {
static Value pad;
if (pad.size() < size) {
pad = makeString(size);
memset(mutateString(pad), '\xff', pad.size());
}
return pad.substr(0, size);
}
// Write a mutation to a log file. Note the mutation can be different from
// message.message for clear mutations.
ACTOR Future<Void> addMutation(Reference<IBackupFile> logFile, VersionedMessage message, StringRef mutation,
@ -599,7 +591,7 @@ ACTOR Future<Void> addMutation(Reference<IBackupFile> logFile, VersionedMessage
// Write padding if needed
const int bytesLeft = *blockEnd - logFile->size();
if (bytesLeft > 0) {
state Value paddingFFs = makePadding(bytesLeft);
state Value paddingFFs = fileBackup::makePadding(bytesLeft);
wait(logFile->append(paddingFFs.begin(), bytesLeft));
}
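
The padding keeps each mutation block aligned: before starting a new block, the writer fills the remainder of the current one with 0xFF bytes so that `*blockEnd` always lands on a block-size multiple. A small sketch of the arithmetic (a hypothetical helper, not the backup file API):

```cpp
#include <cstdint>

// Bytes of 0xFF padding needed to take a file of `fileSize` bytes up to the
// current block boundary `blockEnd` (a multiple of the block size). E.g. with
// blockEnd = 1048576 (1 MiB) and fileSize = 1048000, pad 576 bytes.
int64_t paddingBytes(int64_t blockEnd, int64_t fileSize) {
    const int64_t bytesLeft = blockEnd - fileSize;
    return bytesLeft > 0 ? bytesLeft : 0;
}
```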
@ -762,6 +754,10 @@ ACTOR Future<Void> uploadData(BackupData* self) {
state int numMsg = 0;
Version lastPopVersion = popVersion;
// index of last version's end position in self->messages
int lastVersionIndex = 0;
Version lastVersion = invalidVersion;
if (self->messages.empty()) {
// Even though messages is empty, we still want to advance popVersion.
if (!self->endVersion.present()) {
@ -770,18 +766,30 @@ ACTOR Future<Void> uploadData(BackupData* self) {
} else {
for (const auto& message : self->messages) {
// message may be prefetched in peek; uncommitted message should not be uploaded.
if (message.getVersion() > self->maxPopVersion()) break;
popVersion = std::max(popVersion, message.getVersion());
const Version version = message.getVersion();
if (version > self->maxPopVersion()) break;
if (version > popVersion) {
lastVersionIndex = numMsg;
lastVersion = popVersion;
popVersion = version;
}
numMsg++;
}
}
if (self->pullFinished()) {
popVersion = self->endVersion.get();
} else {
// Make sure the file is saved on a version boundary.
popVersion = lastVersion;
numMsg = lastVersionIndex;
}
if (((numMsg > 0 || popVersion > lastPopVersion) && self->pulling) || self->pullFinished()) {
TraceEvent("BackupWorkerSave", self->myId)
.detail("Version", popVersion)
.detail("LastPopVersion", lastPopVersion)
.detail("Pulling", self->pulling)
.detail("SavedVersion", self->savedVersion)
.detail("NumMsg", numMsg)
.detail("MsgQ", self->messages.size());
// save an empty file for old epochs so that log file versions are continuous
wait(saveMutationsToFile(self, popVersion, numMsg));
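
The new bookkeeping guarantees a log file never ends mid-version: while scanning, the worker remembers the message index where the previous version ended, and unless pulling has finished it rolls popVersion/numMsg back to that boundary. A standalone sketch of the scan (a vector of versions standing in for self->messages; -1 plays the role of invalidVersion):

```cpp
#include <cstdint>
#include <utility>
#include <vector>

// Scan committed message versions (non-decreasing) and return
// (popVersion, numMsg) such that numMsg covers only *complete* versions:
// messages sharing the final version are excluded unless pullFinished.
std::pair<int64_t, int> versionBoundary(const std::vector<int64_t>& versions,
                                        int64_t maxPopVersion, bool pullFinished,
                                        int64_t endVersion) {
    int64_t popVersion = -1, lastVersion = -1;
    int numMsg = 0, lastVersionIndex = 0;
    for (int64_t v : versions) {
        if (v > maxPopVersion) break; // uncommitted: never uploaded
        if (v > popVersion) {         // crossed a version boundary
            lastVersionIndex = numMsg;
            lastVersion = popVersion;
            popVersion = v;
        }
        numMsg++;
    }
    if (pullFinished) return {endVersion, numMsg};
    return {lastVersion, lastVersionIndex}; // drop the trailing partial version
}
```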

View File

@ -6,7 +6,6 @@ set(FDBSERVER_SRCS
BackupProgress.actor.h
BackupWorker.actor.cpp
ClusterController.actor.cpp
ClusterRecruitmentInterface.h
ConflictSet.h
CoordinatedState.actor.cpp
CoordinatedState.h

View File

@ -36,7 +36,6 @@
#include "fdbserver/LeaderElection.h"
#include "fdbserver/LogSystemConfig.h"
#include "fdbserver/WaitFailure.h"
#include "fdbserver/ClusterRecruitmentInterface.h"
#include "fdbserver/RatekeeperInterface.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/Status.h"
@ -63,14 +62,15 @@ struct WorkerInfo : NonCopyable {
Future<Void> haltRatekeeper;
Future<Void> haltDistributor;
Optional<uint16_t> storageCacheInfo;
Standalone<VectorRef<StringRef>> issues;
WorkerInfo() : gen(-1), reboots(0), priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown) {}
WorkerInfo( Future<Void> watcher, ReplyPromise<RegisterWorkerReply> reply, Generation gen, WorkerInterface interf, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, bool degraded ) :
watcher(watcher), reply(reply), gen(gen), reboots(0), initialClass(initialClass), priorityInfo(priorityInfo), details(interf, processClass, degraded) {}
WorkerInfo( Future<Void> watcher, ReplyPromise<RegisterWorkerReply> reply, Generation gen, WorkerInterface interf, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, bool degraded, Standalone<VectorRef<StringRef>> issues ) :
watcher(watcher), reply(reply), gen(gen), reboots(0), initialClass(initialClass), priorityInfo(priorityInfo), details(interf, processClass, degraded), issues(issues) {}
WorkerInfo( WorkerInfo&& r ) BOOST_NOEXCEPT : watcher(std::move(r.watcher)), reply(std::move(r.reply)), gen(r.gen),
reboots(r.reboots), initialClass(r.initialClass), priorityInfo(r.priorityInfo), details(std::move(r.details)),
haltRatekeeper(r.haltRatekeeper), haltDistributor(r.haltDistributor), storageCacheInfo(r.storageCacheInfo) {}
haltRatekeeper(r.haltRatekeeper), haltDistributor(r.haltDistributor), storageCacheInfo(r.storageCacheInfo), issues(r.issues) {}
void operator=( WorkerInfo&& r ) BOOST_NOEXCEPT {
watcher = std::move(r.watcher);
reply = std::move(r.reply);
@ -82,6 +82,7 @@ struct WorkerInfo : NonCopyable {
haltRatekeeper = r.haltRatekeeper;
haltDistributor = r.haltDistributor;
storageCacheInfo = r.storageCacheInfo;
issues = r.issues;
}
};
@ -98,13 +99,11 @@ class ClusterControllerData {
public:
struct DBInfo {
Reference<AsyncVar<ClientDBInfo>> clientInfo;
Reference<AsyncVar<CachedSerialization<ServerDBInfo>>> serverInfo;
CachedSerialization<ServerDBInfo> serverInfoMasterOnly;
std::set<NetworkAddress> requiredAddresses;
ProcessIssuesMap workersWithIssues;
Reference<AsyncVar<ServerDBInfo>> serverInfo;
std::map<NetworkAddress, double> incompatibleConnections;
AsyncTrigger forceMasterFailure;
int64_t masterRegistrationCount;
int64_t dbInfoCount;
bool recoveryStalled;
bool forceRecovery;
DatabaseConfiguration config; // Asynchronously updated via master registration
@ -117,42 +116,36 @@ public:
std::map<NetworkAddress, std::pair<double, OpenDatabaseRequest>> clientStatus;
DBInfo() : masterRegistrationCount(0), recoveryStalled(false), forceRecovery(false), unfinishedRecoveries(0), logGenerations(0), cachePopulated(false),
clientInfo( new AsyncVar<ClientDBInfo>( ClientDBInfo() ) ),
serverInfo( new AsyncVar<CachedSerialization<ServerDBInfo>>( CachedSerialization<ServerDBInfo>() ) ),
clientInfo( new AsyncVar<ClientDBInfo>( ClientDBInfo() ) ), dbInfoCount(0),
serverInfo( new AsyncVar<ServerDBInfo>( ServerDBInfo() ) ),
db( DatabaseContext::create( clientInfo, Future<Void>(), LocalityData(), true, TaskPriority::DefaultEndpoint, true ) ) // SOMEDAY: Locality!
{
}
void addRequiredAddresses(const std::vector<WorkerInterface>& interfaces) {
for(auto& it : interfaces) {
requiredAddresses.insert(it.address());
}
}
void setDistributor(const DataDistributorInterface& interf) {
CachedSerialization<ServerDBInfo> newInfoCache = serverInfo->get();
auto& newInfo = newInfoCache.mutate();
auto newInfo = serverInfo->get();
newInfo.id = deterministicRandom()->randomUniqueID();
newInfo.infoGeneration = ++dbInfoCount;
newInfo.distributor = interf;
serverInfo->set( newInfoCache );
serverInfo->set( newInfo );
}
void setRatekeeper(const RatekeeperInterface& interf) {
CachedSerialization<ServerDBInfo> newInfoCache = serverInfo->get();
auto& newInfo = newInfoCache.mutate();
auto newInfo = serverInfo->get();
newInfo.id = deterministicRandom()->randomUniqueID();
newInfo.infoGeneration = ++dbInfoCount;
newInfo.ratekeeper = interf;
serverInfo->set( newInfoCache );
serverInfo->set( newInfo );
}
void setStorageCache(uint16_t id, const StorageServerInterface& interf) {
CachedSerialization<ServerDBInfo> newInfoCache = serverInfo->get();
auto& newInfo = newInfoCache.mutate();
auto newInfo = serverInfo->get();
bool found = false;
for(auto& it : newInfo.storageCaches) {
if(it.first == id) {
if(it.second != interf) {
newInfo.id = deterministicRandom()->randomUniqueID();
newInfo.infoGeneration = ++dbInfoCount;
it.second = interf;
}
found = true;
@ -161,36 +154,36 @@ public:
}
if(!found) {
newInfo.id = deterministicRandom()->randomUniqueID();
newInfo.infoGeneration = ++dbInfoCount;
newInfo.storageCaches.push_back(std::make_pair(id, interf));
}
serverInfo->set( newInfoCache );
serverInfo->set( newInfo );
}
void clearInterf(ProcessClass::ClassType t) {
CachedSerialization<ServerDBInfo> newInfoCache = serverInfo->get();
auto& newInfo = newInfoCache.mutate();
auto newInfo = serverInfo->get();
newInfo.id = deterministicRandom()->randomUniqueID();
newInfo.infoGeneration = ++dbInfoCount;
if (t == ProcessClass::DataDistributorClass) {
newInfo.distributor = Optional<DataDistributorInterface>();
} else if (t == ProcessClass::RatekeeperClass) {
newInfo.ratekeeper = Optional<RatekeeperInterface>();
}
serverInfo->set( newInfoCache );
serverInfo->set( newInfo );
}
void clearStorageCache(uint16_t id) {
CachedSerialization<ServerDBInfo> newInfoCache = serverInfo->get();
auto& newInfo = newInfoCache.mutate();
auto newInfo = serverInfo->get();
for(auto it = newInfo.storageCaches.begin(); it != newInfo.storageCaches.end(); ++it) {
if(it->first == id) {
newInfo.id = deterministicRandom()->randomUniqueID();
newInfo.infoGeneration = ++dbInfoCount;
newInfo.storageCaches.erase(it);
break;
}
}
serverInfo->set( newInfoCache );
serverInfo->set( newInfo );
}
};
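
Every setter above follows the same copy–bump–publish pattern on the plain ServerDBInfo: copy the current value, assign a fresh id and the next infoGeneration, then set it back so watchers wake. A minimal sketch of that idiom over a toy AsyncVar (condition_variable in place of flow futures; the field names mirror the code above but the types are stand-ins):

```cpp
#include <condition_variable>
#include <cstdint>
#include <mutex>

struct ServerInfo {
    uint64_t id = 0;
    int64_t infoGeneration = 0;
    int distributor = -1; // stand-in for Optional<DataDistributorInterface>
};

// Toy AsyncVar<T>: set() publishes a new value and wakes watchers.
class AsyncServerInfo {
public:
    ServerInfo get() { std::lock_guard<std::mutex> g(mu_); return v_; }
    void set(const ServerInfo& v) {
        { std::lock_guard<std::mutex> g(mu_); v_ = v; }
        cv_.notify_all();
    }
private:
    std::mutex mu_;
    std::condition_variable cv_;
    ServerInfo v_;
};

// The copy-bump-publish idiom used by setDistributor and friends.
void setDistributor(AsyncServerInfo& var, int64_t& dbInfoCount, int interf) {
    ServerInfo newInfo = var.get(); // copy the current value
    newInfo.id += 1;                // a fresh random UID in the real code
    newInfo.infoGeneration = ++dbInfoCount;
    newInfo.distributor = interf;
    var.set(newInfo);               // watchers observe the change
}
```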
struct UpdateWorkerList {
@ -256,8 +249,8 @@ public:
}
bool isLongLivedStateless( Optional<Key> const& processId ) {
return (db.serverInfo->get().read().distributor.present() && db.serverInfo->get().read().distributor.get().locality.processId() == processId) ||
(db.serverInfo->get().read().ratekeeper.present() && db.serverInfo->get().read().ratekeeper.get().locality.processId() == processId);
return (db.serverInfo->get().distributor.present() && db.serverInfo->get().distributor.get().locality.processId() == processId) ||
(db.serverInfo->get().ratekeeper.present() && db.serverInfo->get().ratekeeper.get().locality.processId() == processId);
}
WorkerDetails getStorageWorker( RecruitStorageRequest const& req ) {
@ -270,6 +263,7 @@ public:
!excludedMachines.count(it.second.details.interf.locality.zoneId()) &&
( includeDCs.size() == 0 || includeDCs.count(it.second.details.interf.locality.dcId()) ) &&
!addressExcluded(excludedAddresses, it.second.details.interf.address()) &&
( !it.second.details.interf.secondaryAddress().present() || !addressExcluded(excludedAddresses, it.second.details.interf.secondaryAddress().get()) ) &&
it.second.details.processClass.machineClassFitness( ProcessClass::Storage ) <= ProcessClass::UnsetFit ) {
return it.second.details;
}
@ -306,7 +300,7 @@ public:
for( auto& it : id_worker ) {
auto fitness = it.second.details.processClass.machineClassFitness( ProcessClass::Storage );
if( workerAvailable(it.second, false) && !conf.isExcludedServer(it.second.details.interf.address()) && fitness != ProcessClass::NeverAssign && ( !dcId.present() || it.second.details.interf.locality.dcId()==dcId.get() ) ) {
if( workerAvailable(it.second, false) && !conf.isExcludedServer(it.second.details.interf.addresses()) && fitness != ProcessClass::NeverAssign && ( !dcId.present() || it.second.details.interf.locality.dcId()==dcId.get() ) ) {
fitness_workers[ fitness ].push_back(it.second.details);
}
}
@ -351,7 +345,7 @@ public:
for( auto& it : id_worker ) {
if (std::find(exclusionWorkerIds.begin(), exclusionWorkerIds.end(), it.second.details.interf.id()) == exclusionWorkerIds.end()) {
auto fitness = it.second.details.processClass.machineClassFitness(ProcessClass::TLog);
if (workerAvailable(it.second, checkStable) && !conf.isExcludedServer(it.second.details.interf.address()) && fitness != ProcessClass::NeverAssign && (!dcIds.size() || dcIds.count(it.second.details.interf.locality.dcId()))) {
if (workerAvailable(it.second, checkStable) && !conf.isExcludedServer(it.second.details.interf.addresses()) && fitness != ProcessClass::NeverAssign && (!dcIds.size() || dcIds.count(it.second.details.interf.locality.dcId()))) {
fitness_workers[std::make_pair(fitness, it.second.details.degraded)].push_back(it.second.details);
}
else {
@ -507,7 +501,7 @@ public:
for( auto& it : id_worker ) {
auto fitness = it.second.details.processClass.machineClassFitness( role );
if(conf.isExcludedServer(it.second.details.interf.address())) {
if(conf.isExcludedServer(it.second.details.interf.addresses())) {
fitness = std::max(fitness, ProcessClass::ExcludeFit);
}
if( workerAvailable(it.second, checkStable) && fitness < unacceptableFitness && it.second.details.interf.locality.dcId()==dcId ) {
@ -545,7 +539,7 @@ public:
for( auto& it : id_worker ) {
auto fitness = it.second.details.processClass.machineClassFitness( role );
if( workerAvailable(it.second, checkStable) && !conf.isExcludedServer(it.second.details.interf.address()) && it.second.details.interf.locality.dcId() == dcId &&
if( workerAvailable(it.second, checkStable) && !conf.isExcludedServer(it.second.details.interf.addresses()) && it.second.details.interf.locality.dcId() == dcId &&
( !minWorker.present() || ( it.second.details.interf.id() != minWorker.get().worker.interf.id() && ( fitness < minWorker.get().fitness || (fitness == minWorker.get().fitness && id_used[it.first] <= minWorker.get().used ) ) ) ) ) {
if (isLongLivedStateless(it.first)) {
fitness_workers[ std::make_pair(fitness, id_used[it.first]) ].second.push_back(it.second.details);
@ -664,7 +658,7 @@ public:
std::set<Optional<Standalone<StringRef>>> getDatacenters( DatabaseConfiguration const& conf, bool checkStable = false ) {
std::set<Optional<Standalone<StringRef>>> result;
for( auto& it : id_worker )
if( workerAvailable( it.second, checkStable ) && !conf.isExcludedServer( it.second.details.interf.address() ) )
if( workerAvailable( it.second, checkStable ) && !conf.isExcludedServer( it.second.details.interf.addresses() ) )
result.insert(it.second.details.interf.locality.dcId());
return result;
}
@ -984,7 +978,7 @@ public:
}
void checkRecoveryStalled() {
if( (db.serverInfo->get().read().recoveryState == RecoveryState::RECRUITING || db.serverInfo->get().read().recoveryState == RecoveryState::ACCEPTING_COMMITS || db.serverInfo->get().read().recoveryState == RecoveryState::ALL_LOGS_RECRUITED) && db.recoveryStalled ) {
if( (db.serverInfo->get().recoveryState == RecoveryState::RECRUITING || db.serverInfo->get().recoveryState == RecoveryState::ACCEPTING_COMMITS || db.serverInfo->get().recoveryState == RecoveryState::ALL_LOGS_RECRUITED) && db.recoveryStalled ) {
if (db.config.regions.size() > 1) {
auto regions = db.config.regions;
if(clusterControllerDcId.get() == regions[0].dcId) {
@ -998,7 +992,7 @@ public:
//FIXME: determine when to fail the cluster controller when a primaryDC has not been set
bool betterMasterExists() {
const ServerDBInfo dbi = db.serverInfo->get().read();
const ServerDBInfo dbi = db.serverInfo->get();
if(dbi.recoveryState < RecoveryState::ACCEPTING_COMMITS) {
return false;
@ -1094,7 +1088,7 @@ public:
// Check master fitness. Don't return false if the master is excluded: when all processes are excluded, we still need a master for recovery.
ProcessClass::Fitness oldMasterFit = masterWorker->second.details.processClass.machineClassFitness( ProcessClass::Master );
if(db.config.isExcludedServer(dbi.master.address())) {
if(db.config.isExcludedServer(dbi.master.addresses())) {
oldMasterFit = std::max(oldMasterFit, ProcessClass::ExcludeFit);
}
@ -1102,7 +1096,7 @@ public:
id_used[clusterControllerProcessId]++;
WorkerFitnessInfo mworker = getWorkerForRoleInDatacenter(clusterControllerDcId, ProcessClass::Master, ProcessClass::NeverAssign, db.config, id_used, true);
auto newMasterFit = mworker.worker.processClass.machineClassFitness( ProcessClass::Master );
if(db.config.isExcludedServer(mworker.worker.interf.address())) {
if(db.config.isExcludedServer(mworker.worker.interf.addresses())) {
newMasterFit = std::max(newMasterFit, ProcessClass::ExcludeFit);
}
@ -1263,7 +1257,7 @@ public:
ASSERT(masterProcessId.present());
if (processId == masterProcessId) return false;
auto& dbInfo = db.serverInfo->get().read();
auto& dbInfo = db.serverInfo->get();
for (const auto& tlogset : dbInfo.logSystemConfig.tLogs) {
for (const auto& tlog: tlogset.tLogs) {
if (tlog.present() && tlog.interf().locality.processId() == processId) return true;
@ -1293,7 +1287,7 @@ public:
std::map<Optional<Standalone<StringRef>>, int> idUsed;
updateKnownIds(&idUsed);
auto& dbInfo = db.serverInfo->get().read();
auto& dbInfo = db.serverInfo->get();
for (const auto& tlogset : dbInfo.logSystemConfig.tLogs) {
for (const auto& tlog: tlogset.tLogs) {
if (tlog.present()) {
@ -1331,6 +1325,9 @@ public:
UpdateWorkerList updateWorkerList;
Future<Void> outstandingRequestChecker;
Future<Void> outstandingRemoteRequestChecker;
AsyncTrigger updateDBInfo;
std::set<Endpoint> updateDBInfoEndpoints;
std::set<Endpoint> removedDBInfoEndpoints;
DBInfo db;
Database cx;
@ -1351,7 +1348,6 @@ public:
Counter getWorkersRequests;
Counter getClientWorkersRequests;
Counter registerMasterRequests;
Counter getServerDBInfoRequests;
Counter statusRequests;
Counter failureMonitoringRequests;
@ -1370,18 +1366,18 @@ public:
getWorkersRequests("GetWorkersRequests", clusterControllerMetrics),
getClientWorkersRequests("GetClientWorkersRequests", clusterControllerMetrics),
registerMasterRequests("RegisterMasterRequests", clusterControllerMetrics),
getServerDBInfoRequests("GetServerDBInfoRequests", clusterControllerMetrics),
statusRequests("StatusRequests", clusterControllerMetrics),
failureMonitoringRequests("FailureMonitoringRequests", clusterControllerMetrics),
serversFailed("ServersFailed", clusterControllerMetrics),
serversUnfailed("ServersUnfailed", clusterControllerMetrics)
{
auto& serverInfo = db.serverInfoMasterOnly.mutate();
auto serverInfo = ServerDBInfo();
serverInfo.id = deterministicRandom()->randomUniqueID();
serverInfo.infoGeneration = ++db.dbInfoCount;
serverInfo.masterLifetime.ccID = id;
serverInfo.clusterInterface = ccInterface;
serverInfo.myLocality = locality;
db.serverInfo->set( db.serverInfoMasterOnly );
db.serverInfo->set( serverInfo );
cx = openDBOnServer(db.serverInfo, TaskPriority::DefaultEndpoint, true, true);
}
@ -1416,7 +1412,7 @@ ACTOR Future<Void> clusterWatchDatabase( ClusterControllerData* cluster, Cluster
continue;
}
RecruitMasterRequest rmq;
rmq.lifetime = db->serverInfo->get().read().masterLifetime;
rmq.lifetime = db->serverInfo->get().masterLifetime;
rmq.forceRecovery = db->forceRecovery;
cluster->masterProcessId = masterWorker.worker.interf.locality.processId();
@ -1436,22 +1432,20 @@ ACTOR Future<Void> clusterWatchDatabase( ClusterControllerData* cluster, Cluster
db->masterRegistrationCount = 0;
db->recoveryStalled = false;
db->serverInfoMasterOnly = CachedSerialization<ServerDBInfo>();
auto& dbInfo = db->serverInfoMasterOnly.mutate();
auto dbInfo = ServerDBInfo();
dbInfo.master = iMaster;
dbInfo.id = deterministicRandom()->randomUniqueID();
dbInfo.masterLifetime = db->serverInfo->get().read().masterLifetime;
dbInfo.infoGeneration = ++db->dbInfoCount;
dbInfo.masterLifetime = db->serverInfo->get().masterLifetime;
++dbInfo.masterLifetime;
dbInfo.clusterInterface = db->serverInfo->get().read().clusterInterface;
dbInfo.distributor = db->serverInfo->get().read().distributor;
dbInfo.ratekeeper = db->serverInfo->get().read().ratekeeper;
dbInfo.storageCaches = db->serverInfo->get().read().storageCaches;
dbInfo.latencyBandConfig = db->serverInfo->get().read().latencyBandConfig;
dbInfo.clusterInterface = db->serverInfo->get().clusterInterface;
dbInfo.distributor = db->serverInfo->get().distributor;
dbInfo.ratekeeper = db->serverInfo->get().ratekeeper;
dbInfo.storageCaches = db->serverInfo->get().storageCaches;
dbInfo.latencyBandConfig = db->serverInfo->get().latencyBandConfig;
TraceEvent("CCWDB", cluster->id).detail("Lifetime", dbInfo.masterLifetime.toString()).detail("ChangeID", dbInfo.id);
db->requiredAddresses.clear();
db->serverInfo->set( db->serverInfoMasterOnly );
db->serverInfo->set( dbInfo );
state Future<Void> spinDelay = delay(SERVER_KNOBS->MASTER_SPIN_DELAY); // Don't retry master recovery more than once per second, but don't delay the "first" recovery after more than a second of normal operation
@ -1486,30 +1480,14 @@ ACTOR Future<Void> clusterWatchDatabase( ClusterControllerData* cluster, Cluster
}
ACTOR Future<Void> clusterGetServerInfo(ClusterControllerData::DBInfo* db, UID knownServerInfoID,
Standalone<VectorRef<StringRef>> issues,
std::vector<NetworkAddress> incompatiblePeers,
ReplyPromise<CachedSerialization<ServerDBInfo>> reply) {
state Optional<UID> issueID;
state bool useMasterOnly = false;
setIssues(db->workersWithIssues, reply.getEndpoint().getPrimaryAddress(), issues, issueID);
for(auto it : incompatiblePeers) {
db->incompatibleConnections[it] = now() + SERVER_KNOBS->INCOMPATIBLE_PEERS_LOGGING_INTERVAL;
}
loop {
useMasterOnly = db->serverInfo->get().read().recoveryState < RecoveryState::ACCEPTING_COMMITS && !db->requiredAddresses.count(reply.getEndpoint().getPrimaryAddress());
if((useMasterOnly ? db->serverInfoMasterOnly.read().id : db->serverInfo->get().read().id) != knownServerInfoID) {
break;
}
ReplyPromise<ServerDBInfo> reply) {
while(db->serverInfo->get().id == knownServerInfoID) {
choose {
when (wait( yieldedFuture(db->serverInfo->onChange()) )) {}
when (wait( delayJittered( 300 ) )) { break; } // The server might be long gone!
}
}
removeIssues(db->workersWithIssues, reply.getEndpoint().getPrimaryAddress(), issueID);
reply.send( useMasterOnly ? db->serverInfoMasterOnly : db->serverInfo->get() );
reply.send( db->serverInfo->get() );
return Void();
}
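
clusterGetServerInfo is now a plain long poll: the worker sends the id it already has, and the cluster controller replies only when its copy differs, or after a jittered ~300s timeout so replies to long-gone clients don't pile up. A sketch of the same shape with standard threads (the timeout is kept from the code above; everything else is illustrative):

```cpp
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <mutex>

struct DBInfoServer {
    std::mutex mu;
    std::condition_variable changed;
    uint64_t currentId = 1;

    // Block until the stored id differs from what the client already knows,
    // or ~300s elapse (the client might be long gone); return a snapshot.
    uint64_t getServerInfo(uint64_t knownId) {
        std::unique_lock<std::mutex> lk(mu);
        changed.wait_for(lk, std::chrono::seconds(300),
                         [&] { return currentId != knownId; });
        return currentId;
    }
    void publish(uint64_t newId) {
        { std::lock_guard<std::mutex> g(mu); currentId = newId; }
        changed.notify_all();
    }
};
```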
@ -1535,12 +1513,6 @@ void checkOutstandingRecruitmentRequests( ClusterControllerData* self ) {
RecruitFromConfigurationRequest& req = self->outstandingRecruitmentRequests[i];
try {
RecruitFromConfigurationReply rep = self->findWorkersForConfiguration( req );
self->db.addRequiredAddresses(rep.oldLogRouters);
self->db.addRequiredAddresses(rep.proxies);
self->db.addRequiredAddresses(rep.resolvers);
self->db.addRequiredAddresses(rep.satelliteTLogs);
self->db.addRequiredAddresses(rep.tLogs);
self->db.serverInfo->trigger();
req.reply.send( rep );
swapAndPop( &self->outstandingRecruitmentRequests, i-- );
} catch (Error& e) {
@ -1559,9 +1531,6 @@ void checkOutstandingRemoteRecruitmentRequests( ClusterControllerData* self ) {
RecruitRemoteFromConfigurationRequest& req = self->outstandingRemoteRecruitmentRequests[i];
try {
RecruitRemoteFromConfigurationReply rep = self->findRemoteWorkersForConfiguration( req );
self->db.addRequiredAddresses(rep.remoteTLogs);
self->db.addRequiredAddresses(rep.logRouters);
self->db.serverInfo->trigger();
req.reply.send( rep );
swapAndPop( &self->outstandingRemoteRecruitmentRequests, i-- );
} catch (Error& e) {
@ -1609,7 +1578,7 @@ void checkOutstandingStorageRequests( ClusterControllerData* self ) {
}
void checkBetterDDOrRK(ClusterControllerData* self) {
if (!self->masterProcessId.present() || self->db.serverInfo->get().read().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
if (!self->masterProcessId.present() || self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
return;
}
@ -1628,11 +1597,11 @@ void checkBetterDDOrRK(ClusterControllerData* self) {
newDDWorker = self->id_worker[self->masterProcessId.get()].details;
}
auto bestFitnessForRK = newRKWorker.processClass.machineClassFitness(ProcessClass::Ratekeeper);
if(self->db.config.isExcludedServer(newRKWorker.interf.address())) {
if(self->db.config.isExcludedServer(newRKWorker.interf.addresses())) {
bestFitnessForRK = std::max(bestFitnessForRK, ProcessClass::ExcludeFit);
}
auto bestFitnessForDD = newDDWorker.processClass.machineClassFitness(ProcessClass::DataDistributor);
if(self->db.config.isExcludedServer(newDDWorker.interf.address())) {
if(self->db.config.isExcludedServer(newDDWorker.interf.addresses())) {
bestFitnessForDD = std::max(bestFitnessForDD, ProcessClass::ExcludeFit);
}
//TraceEvent("CheckBetterDDorRKNewRecruits", self->id).detail("MasterProcessId", self->masterProcessId)
@ -1641,7 +1610,7 @@ void checkBetterDDOrRK(ClusterControllerData* self) {
Optional<Standalone<StringRef>> currentRKProcessId;
Optional<Standalone<StringRef>> currentDDProcessId;
auto& db = self->db.serverInfo->get().read();
auto& db = self->db.serverInfo->get();
bool ratekeeperHealthy = false;
if (db.ratekeeper.present() && self->id_worker.count(db.ratekeeper.get().locality.processId()) &&
(!self->recruitingRatekeeperID.present() || (self->recruitingRatekeeperID.get() == db.ratekeeper.get().id()))) {
@ -1700,7 +1669,7 @@ ACTOR Future<Void> doCheckOutstandingRequests( ClusterControllerData* self ) {
self->checkRecoveryStalled();
if (self->betterMasterExists()) {
self->db.forceMasterFailure.trigger();
TraceEvent("MasterRegistrationKill", self->id).detail("MasterId", self->db.serverInfo->get().read().master.id());
TraceEvent("MasterRegistrationKill", self->id).detail("MasterId", self->db.serverInfo->get().master.id());
}
} catch( Error &e ) {
if(e.code() != error_code_no_more_servers) {
@ -1757,12 +1726,14 @@ ACTOR Future<Void> rebootAndCheck( ClusterControllerData* cluster, Optional<Stan
return Void();
}
ACTOR Future<Void> workerAvailabilityWatch( WorkerInterface worker, ProcessClass startingClass, ClusterControllerData* cluster ) {
ACTOR Future<Void> workerAvailabilityWatch( WorkerInterface worker, ProcessClass startingClass, ClusterControllerData* cluster) {
state Future<Void> failed =
(worker.address() == g_network->getLocalAddress() || startingClass.classType() == ProcessClass::TesterClass)
? Never()
: waitFailureClient(worker.waitFailure, SERVER_KNOBS->WORKER_FAILURE_TIME);
cluster->updateWorkerList.set( worker.locality.processId(), ProcessData(worker.locality, startingClass, worker.address()) );
cluster->updateWorkerList.set( worker.locality.processId(), ProcessData(worker.locality, startingClass, worker.stableAddress()) );
cluster->updateDBInfoEndpoints.insert(worker.updateServerDBInfo.getEndpoint());
cluster->updateDBInfo.trigger();
// This switching avoids a race where the worker can be added to id_worker map after the workerAvailabilityWatch fails for the worker.
wait(delay(0));
@ -1801,6 +1772,7 @@ ACTOR Future<Void> workerAvailabilityWatch( WorkerInterface worker, ProcessClass
if (worker.locality.processId() == cluster->masterProcessId) {
cluster->masterProcessId = Optional<Key>();
}
cluster->removedDBInfoEndpoints.insert(worker.updateServerDBInfo.getEndpoint());
cluster->id_worker.erase( worker.locality.processId() );
cluster->updateWorkerList.set( worker.locality.processId(), Optional<ProcessData>() );
return Void();
@ -1996,12 +1968,6 @@ ACTOR Future<Void> clusterRecruitFromConfiguration( ClusterControllerData* self,
loop {
try {
auto rep = self->findWorkersForConfiguration( req );
self->db.addRequiredAddresses(rep.oldLogRouters);
self->db.addRequiredAddresses(rep.proxies);
self->db.addRequiredAddresses(rep.resolvers);
self->db.addRequiredAddresses(rep.satelliteTLogs);
self->db.addRequiredAddresses(rep.tLogs);
self->db.serverInfo->trigger();
req.reply.send( rep );
return Void();
} catch (Error& e) {
@ -2027,9 +1993,6 @@ ACTOR Future<Void> clusterRecruitRemoteFromConfiguration( ClusterControllerData*
loop {
try {
RecruitRemoteFromConfigurationReply rep = self->findRemoteWorkersForConfiguration( req );
self->db.addRequiredAddresses(rep.remoteTLogs);
self->db.addRequiredAddresses(rep.logRouters);
self->db.serverInfo->trigger();
req.reply.send( rep );
return Void();
} catch (Error& e) {
@ -2066,8 +2029,8 @@ void clusterRegisterMaster( ClusterControllerData* self, RegisterMasterRequest c
//make sure the request comes from an active database
auto db = &self->db;
if ( db->serverInfo->get().read().master.id() != req.id || req.registrationCount <= db->masterRegistrationCount ) {
TraceEvent("MasterRegistrationNotFound", self->id).detail("MasterId", req.id).detail("ExistingId", db->serverInfo->get().read().master.id()).detail("RegCount", req.registrationCount).detail("ExistingRegCount", db->masterRegistrationCount);
if ( db->serverInfo->get().master.id() != req.id || req.registrationCount <= db->masterRegistrationCount ) {
TraceEvent("MasterRegistrationNotFound", self->id).detail("MasterId", req.id).detail("ExistingId", db->serverInfo->get().master.id()).detail("RegCount", req.registrationCount).detail("ExistingRegCount", db->masterRegistrationCount);
return;
}
@ -2088,7 +2051,7 @@ void clusterRegisterMaster( ClusterControllerData* self, RegisterMasterRequest c
self->gotFullyRecoveredConfig = true;
db->fullyRecoveredConfig = req.configuration.get();
for ( auto& it : self->id_worker ) {
bool isExcludedFromConfig = db->fullyRecoveredConfig.isExcludedServer(it.second.details.interf.address());
bool isExcludedFromConfig = db->fullyRecoveredConfig.isExcludedServer(it.second.details.interf.addresses());
if ( it.second.priorityInfo.isExcluded != isExcludedFromConfig ) {
it.second.priorityInfo.isExcluded = isExcludedFromConfig;
if( !it.second.reply.isSet() ) {
@ -2100,8 +2063,7 @@ void clusterRegisterMaster( ClusterControllerData* self, RegisterMasterRequest c
}
bool isChanged = false;
auto cachedInfo = self->db.serverInfo->get();
auto& dbInfo = cachedInfo.mutate();
auto dbInfo = self->db.serverInfo->get();
if (dbInfo.recoveryState != req.recoveryState) {
dbInfo.recoveryState = req.recoveryState;
@ -2142,7 +2104,8 @@ void clusterRegisterMaster( ClusterControllerData* self, RegisterMasterRequest c
if( isChanged ) {
dbInfo.id = deterministicRandom()->randomUniqueID();
self->db.serverInfo->set( cachedInfo );
dbInfo.infoGeneration = ++self->db.dbInfoCount;
self->db.serverInfo->set( dbInfo );
}
checkOutstandingRequests(self);
@ -2155,6 +2118,11 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) {
ClusterControllerPriorityInfo newPriorityInfo = req.priorityInfo;
newPriorityInfo.processClassFitness = newProcessClass.machineClassFitness(ProcessClass::ClusterController);
for(auto it : req.incompatiblePeers) {
self->db.incompatibleConnections[it] = now() + SERVER_KNOBS->INCOMPATIBLE_PEERS_LOGGING_INTERVAL;
}
self->removedDBInfoEndpoints.erase(w.updateServerDBInfo.getEndpoint());
if(info == self->id_worker.end()) {
TraceEvent("ClusterControllerActualWorkers", self->id).detail("WorkerId",w.id()).detail("ProcessId", w.locality.processId()).detail("ZoneId", w.locality.zoneId()).detail("DataHall", w.locality.dataHallId()).detail("PClass", req.processClass.toString()).detail("Workers", self->id_worker.size());
self->goodRecruitmentTime = lowPriorityDelay(SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY);
@ -2194,13 +2162,13 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) {
}
if ( self->gotFullyRecoveredConfig ) {
newPriorityInfo.isExcluded = self->db.fullyRecoveredConfig.isExcludedServer(w.address());
newPriorityInfo.isExcluded = self->db.fullyRecoveredConfig.isExcludedServer(w.addresses());
}
}
if( info == self->id_worker.end() ) {
self->id_worker[w.locality.processId()] = WorkerInfo( workerAvailabilityWatch( w, newProcessClass, self ), req.reply, req.generation, w, req.initialClass, newProcessClass, newPriorityInfo, req.degraded );
if (!self->masterProcessId.present() && w.locality.processId() == self->db.serverInfo->get().read().master.locality.processId()) {
self->id_worker[w.locality.processId()] = WorkerInfo( workerAvailabilityWatch( w, newProcessClass, self ), req.reply, req.generation, w, req.initialClass, newProcessClass, newPriorityInfo, req.degraded, req.issues );
if (!self->masterProcessId.present() && w.locality.processId() == self->db.serverInfo->get().master.locality.processId()) {
self->masterProcessId = w.locality.processId();
}
checkOutstandingRequests( self );
@ -2214,8 +2182,10 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) {
info->second.initialClass = req.initialClass;
info->second.details.degraded = req.degraded;
info->second.gen = req.generation;
info->second.issues = req.issues;
if(info->second.details.interf.id() != w.id()) {
self->removedDBInfoEndpoints.insert(info->second.details.interf.updateServerDBInfo.getEndpoint());
info->second.details.interf = w;
info->second.watcher = workerAvailabilityWatch( w, newProcessClass, self );
}
@ -2224,7 +2194,7 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) {
TEST(true); // Received an old worker registration request.
}
if (req.distributorInterf.present() && !self->db.serverInfo->get().read().distributor.present() &&
if (req.distributorInterf.present() && !self->db.serverInfo->get().distributor.present() &&
self->clusterControllerDcId == req.distributorInterf.get().locality.dcId() &&
!self->recruitingDistributor) {
const DataDistributorInterface& di = req.distributorInterf.get();
@ -2244,7 +2214,7 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) {
req.ratekeeperInterf.get().haltRatekeeper.getReply(HaltRatekeeperRequest(self->id)));
} else if (!self->recruitingRatekeeperID.present()) {
const RatekeeperInterface& rki = req.ratekeeperInterf.get();
const auto& ratekeeper = self->db.serverInfo->get().read().ratekeeper;
const auto& ratekeeper = self->db.serverInfo->get().ratekeeper;
TraceEvent("CCRegisterRatekeeper", self->id).detail("RKID", rki.id());
if (ratekeeper.present() && ratekeeper.get().id() != rki.id() && self->id_worker.count(ratekeeper.get().locality.processId())) {
TraceEvent("CCHaltPreviousRatekeeper", self->id).detail("RKID", ratekeeper.get().id())
@ -2425,8 +2395,14 @@ ACTOR Future<Void> statusServer(FutureStream< StatusRequest> requests,
// Get status but trap errors to send back to client.
vector<WorkerDetails> workers;
for(auto& it : self->id_worker)
std::vector<ProcessIssues> workerIssues;
for(auto& it : self->id_worker) {
workers.push_back(it.second.details);
if(it.second.issues.size()) {
workerIssues.push_back(ProcessIssues(it.second.details.interf.address(), it.second.issues));
}
}
std::vector<NetworkAddress> incompatibleConnections;
for(auto it = self->db.incompatibleConnections.begin(); it != self->db.incompatibleConnections.end();) {
@ -2438,7 +2414,7 @@ ACTOR Future<Void> statusServer(FutureStream< StatusRequest> requests,
}
}
state ErrorOr<StatusReply> result = wait(errorOr(clusterGetStatus(self->db.serverInfo, self->cx, workers, self->db.workersWithIssues, &self->db.clientStatus, coordinators, incompatibleConnections, self->datacenterVersionDifference)));
state ErrorOr<StatusReply> result = wait(errorOr(clusterGetStatus(self->db.serverInfo, self->cx, workers, workerIssues, &self->db.clientStatus, coordinators, incompatibleConnections, self->datacenterVersionDifference)));
if (result.isError() && result.getError().code() == error_code_actor_cancelled)
throw result.getError();
@ -2565,13 +2541,13 @@ ACTOR Future<Void> monitorServerInfoConfig(ClusterControllerData::DBInfo* db) {
config = LatencyBandConfig::parse(configVal.get());
}
auto cachedInfo = db->serverInfo->get();
auto& serverInfo = cachedInfo.mutate();
auto serverInfo = db->serverInfo->get();
if(config != serverInfo.latencyBandConfig) {
TraceEvent("LatencyBandConfigChanged").detail("Present", config.present());
serverInfo.id = deterministicRandom()->randomUniqueID();
serverInfo.infoGeneration = ++db->dbInfoCount;
serverInfo.latencyBandConfig = config;
db->serverInfo->set(cachedInfo);
db->serverInfo->set(serverInfo);
}
state Future<Void> configChangeFuture = tr.watch(latencyBandConfigKey);
@ -2799,7 +2775,7 @@ ACTOR Future<Void> updateDatacenterVersionDifference( ClusterControllerData *sel
state double lastLogTime = 0;
loop {
self->versionDifferenceUpdated = false;
if(self->db.serverInfo->get().read().recoveryState >= RecoveryState::ACCEPTING_COMMITS && self->db.config.usableRegions == 1) {
if(self->db.serverInfo->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS && self->db.config.usableRegions == 1) {
bool oldDifferenceTooLarge = !self->versionDifferenceUpdated || self->datacenterVersionDifference >= SERVER_KNOBS->MAX_VERSION_DIFFERENCE;
self->versionDifferenceUpdated = true;
self->datacenterVersionDifference = 0;
@ -2814,8 +2790,8 @@ ACTOR Future<Void> updateDatacenterVersionDifference( ClusterControllerData *sel
state Optional<TLogInterface> primaryLog;
state Optional<TLogInterface> remoteLog;
if(self->db.serverInfo->get().read().recoveryState >= RecoveryState::ALL_LOGS_RECRUITED) {
for(auto& logSet : self->db.serverInfo->get().read().logSystemConfig.tLogs) {
if(self->db.serverInfo->get().recoveryState >= RecoveryState::ALL_LOGS_RECRUITED) {
for(auto& logSet : self->db.serverInfo->get().logSystemConfig.tLogs) {
if(logSet.isLocal && logSet.locality != tagLocalitySatellite) {
for(auto& tLog : logSet.tLogs) {
if(tLog.present()) {
@ -2916,12 +2892,12 @@ ACTOR Future<DataDistributorInterface> startDataDistributor( ClusterControllerDa
TraceEvent("CCStartDataDistributor", self->id);
loop {
try {
state bool no_distributor = !self->db.serverInfo->get().read().distributor.present();
while (!self->masterProcessId.present() || self->masterProcessId != self->db.serverInfo->get().read().master.locality.processId() || self->db.serverInfo->get().read().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
state bool no_distributor = !self->db.serverInfo->get().distributor.present();
while (!self->masterProcessId.present() || self->masterProcessId != self->db.serverInfo->get().master.locality.processId() || self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
wait(self->db.serverInfo->onChange() || delay(SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY));
}
if (no_distributor && self->db.serverInfo->get().read().distributor.present()) {
return self->db.serverInfo->get().read().distributor.get();
if (no_distributor && self->db.serverInfo->get().distributor.present()) {
return self->db.serverInfo->get().distributor.get();
}
std::map<Optional<Standalone<StringRef>>, int> id_used = self->getUsedIds();
@ -2951,15 +2927,15 @@ ACTOR Future<DataDistributorInterface> startDataDistributor( ClusterControllerDa
}
ACTOR Future<Void> monitorDataDistributor(ClusterControllerData *self) {
while(self->db.serverInfo->get().read().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
while(self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
wait(self->db.serverInfo->onChange());
}
loop {
if ( self->db.serverInfo->get().read().distributor.present() ) {
wait( waitFailureClient( self->db.serverInfo->get().read().distributor.get().waitFailure, SERVER_KNOBS->DD_FAILURE_TIME ) );
if ( self->db.serverInfo->get().distributor.present() ) {
wait( waitFailureClient( self->db.serverInfo->get().distributor.get().waitFailure, SERVER_KNOBS->DD_FAILURE_TIME ) );
TraceEvent("CCDataDistributorDied", self->id)
.detail("DistributorId", self->db.serverInfo->get().read().distributor.get().id());
.detail("DistributorId", self->db.serverInfo->get().distributor.get().id());
self->db.clearInterf(ProcessClass::DataDistributorClass);
} else {
self->recruitingDistributor = true;
@ -2976,11 +2952,11 @@ ACTOR Future<Void> startRatekeeper(ClusterControllerData *self) {
TraceEvent("CCStartRatekeeper", self->id);
loop {
try {
state bool no_ratekeeper = !self->db.serverInfo->get().read().ratekeeper.present();
while (!self->masterProcessId.present() || self->masterProcessId != self->db.serverInfo->get().read().master.locality.processId() || self->db.serverInfo->get().read().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
state bool no_ratekeeper = !self->db.serverInfo->get().ratekeeper.present();
while (!self->masterProcessId.present() || self->masterProcessId != self->db.serverInfo->get().master.locality.processId() || self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
wait(self->db.serverInfo->onChange() || delay(SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY));
}
if (no_ratekeeper && self->db.serverInfo->get().read().ratekeeper.present()) {
if (no_ratekeeper && self->db.serverInfo->get().ratekeeper.present()) {
// Existing ratekeeper registers while waiting, so skip.
return Void();
}
@ -3000,7 +2976,7 @@ ACTOR Future<Void> startRatekeeper(ClusterControllerData *self) {
if (interf.present()) {
self->recruitRatekeeper.set(false);
self->recruitingRatekeeperID = interf.get().id();
const auto& ratekeeper = self->db.serverInfo->get().read().ratekeeper;
const auto& ratekeeper = self->db.serverInfo->get().ratekeeper;
TraceEvent("CCRatekeeperRecruited", self->id).detail("Addr", worker.interf.address()).detail("RKID", interf.get().id());
if (ratekeeper.present() && ratekeeper.get().id() != interf.get().id() && self->id_worker.count(ratekeeper.get().locality.processId())) {
TraceEvent("CCHaltRatekeeperAfterRecruit", self->id).detail("RKID", ratekeeper.get().id())
@ -3025,16 +3001,16 @@ ACTOR Future<Void> startRatekeeper(ClusterControllerData *self) {
}
ACTOR Future<Void> monitorRatekeeper(ClusterControllerData *self) {
while(self->db.serverInfo->get().read().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
while(self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
wait(self->db.serverInfo->onChange());
}
loop {
if ( self->db.serverInfo->get().read().ratekeeper.present() && !self->recruitRatekeeper.get() ) {
if ( self->db.serverInfo->get().ratekeeper.present() && !self->recruitRatekeeper.get() ) {
choose {
when(wait(waitFailureClient( self->db.serverInfo->get().read().ratekeeper.get().waitFailure, SERVER_KNOBS->RATEKEEPER_FAILURE_TIME ))) {
when(wait(waitFailureClient( self->db.serverInfo->get().ratekeeper.get().waitFailure, SERVER_KNOBS->RATEKEEPER_FAILURE_TIME ))) {
TraceEvent("CCRatekeeperDied", self->id)
.detail("RKID", self->db.serverInfo->get().read().ratekeeper.get().id());
.detail("RKID", self->db.serverInfo->get().ratekeeper.get().id());
self->db.clearInterf(ProcessClass::RatekeeperClass);
}
when(wait(self->recruitRatekeeper.onChange())) {}
@ -3045,6 +3021,54 @@ ACTOR Future<Void> monitorRatekeeper(ClusterControllerData *self) {
}
}
ACTOR Future<Void> dbInfoUpdater( ClusterControllerData* self ) {
state Future<Void> dbInfoChange = self->db.serverInfo->onChange();
state Future<Void> updateDBInfo = self->updateDBInfo.onTrigger();
loop {
choose {
when(wait(updateDBInfo)) {
wait(delay(SERVER_KNOBS->DBINFO_BATCH_DELAY) || dbInfoChange);
}
when(wait(dbInfoChange)) {}
}
UpdateServerDBInfoRequest req;
if(dbInfoChange.isReady()) {
for(auto &it : self->id_worker) {
req.broadcastInfo.push_back(it.second.details.interf.updateServerDBInfo.getEndpoint());
}
} else {
for(auto it : self->removedDBInfoEndpoints) {
self->updateDBInfoEndpoints.erase(it);
}
req.broadcastInfo = std::vector<Endpoint>(self->updateDBInfoEndpoints.begin(), self->updateDBInfoEndpoints.end());
}
self->updateDBInfoEndpoints.clear();
self->removedDBInfoEndpoints.clear();
dbInfoChange = self->db.serverInfo->onChange();
updateDBInfo = self->updateDBInfo.onTrigger();
req.serializedDbInfo = BinaryWriter::toValue(self->db.serverInfo->get(), AssumeVersion(currentProtocolVersion));
TraceEvent("DBInfoStartBroadcast", self->id);
choose {
when(std::vector<Endpoint> notUpdated = wait( broadcastDBInfoRequest(req, SERVER_KNOBS->DBINFO_SEND_AMOUNT, Optional<Endpoint>(), false) )) {
TraceEvent("DBInfoFinishBroadcast", self->id);
for(auto &it : notUpdated) {
TraceEvent("DBInfoNotUpdated", self->id).detail("Addr", it.getPrimaryAddress());
}
if(notUpdated.size()) {
self->updateDBInfoEndpoints.insert(notUpdated.begin(), notUpdated.end());
self->updateDBInfo.trigger();
}
}
when(wait(dbInfoChange)) {}
}
}
}
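A sketch of the fan-out this actor appears to rely on: `broadcastDBInfoRequest` is called with `DBINFO_SEND_AMOUNT`, suggesting the cluster controller contacts only that many workers directly and asks each to relay to its share of the remaining endpoints. A hypothetical helper, with plain ints standing in for Endpoints:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Sketch: split the recipient list among `sendAmount` direct targets; each
// direct target would then be asked to forward the dbinfo to the rest of its shard.
std::vector<std::vector<int>> planBroadcast(const std::vector<int>& endpoints, size_t sendAmount) {
	if (endpoints.empty() || sendAmount == 0) return {};
	std::vector<std::vector<int>> shards(std::min(sendAmount, endpoints.size()));
	for (size_t i = 0; i < endpoints.size(); i++)
		shards[i % shards.size()].push_back(endpoints[i]);
	return shards; // shards[k].front() is contacted directly and relays to the tail of shards[k]
}
```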
ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf, Future<Void> leaderFail, ServerCoordinators coordinators, LocalityData locality ) {
state ClusterControllerData self( interf, locality );
state Future<Void> coordinationPingDelay = delay( SERVER_KNOBS->WORKER_COORDINATION_PING_DELAY );
@ -3066,6 +3090,7 @@ ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf,
self.addActor.send( monitorDataDistributor(&self) );
self.addActor.send( monitorRatekeeper(&self) );
self.addActor.send( monitorStorageCache(&self) );
self.addActor.send( dbInfoUpdater(&self) );
self.addActor.send( traceCounters("ClusterControllerMetrics", self.id, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &self.clusterControllerMetrics, self.id.toString() + "/ClusterControllerMetrics") );
//printf("%s: I am the cluster controller\n", g_network->getLocalAddress().toString().c_str());
@ -3103,7 +3128,7 @@ ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf,
vector<WorkerDetails> workers;
for(auto& it : self.id_worker) {
if ( (req.flags & GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY) && self.db.config.isExcludedServer(it.second.details.interf.address()) ) {
if ( (req.flags & GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY) && self.db.config.isExcludedServer(it.second.details.interf.addresses()) ) {
continue;
}
@ -3138,9 +3163,7 @@ ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf,
clusterRegisterMaster( &self, req );
}
when( GetServerDBInfoRequest req = waitNext( interf.getServerDBInfo.getFuture() ) ) {
++self.getServerDBInfoRequests;
self.addActor.send(
clusterGetServerInfo(&self.db, req.knownServerInfoID, req.issues, req.incompatiblePeers, req.reply));
self.addActor.send( clusterGetServerInfo(&self.db, req.knownServerInfoID, req.reply) );
}
when( wait( leaderFail ) ) {
// We are no longer the leader if this has changed.

View File

@ -1,263 +0,0 @@
/*
* ClusterRecruitmentInterface.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBSERVER_CLUSTERRECRUITMENTINTERFACE_H
#define FDBSERVER_CLUSTERRECRUITMENTINTERFACE_H
#pragma once
#include <vector>
#include "fdbclient/ClusterInterface.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/MasterProxyInterface.h"
#include "fdbclient/DatabaseConfiguration.h"
#include "fdbserver/BackupInterface.h"
#include "fdbserver/DataDistributorInterface.h"
#include "fdbserver/MasterInterface.h"
#include "fdbserver/RecoveryState.h"
#include "fdbserver/TLogInterface.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/Knobs.h"
// This interface and its serialization depend on slicing, since the client will deserialize only the first part of this structure
struct ClusterControllerFullInterface {
constexpr static FileIdentifier file_identifier =
ClusterControllerClientInterface::file_identifier;
ClusterInterface clientInterface;
RequestStream< struct RecruitFromConfigurationRequest > recruitFromConfiguration;
RequestStream< struct RecruitRemoteFromConfigurationRequest > recruitRemoteFromConfiguration;
RequestStream< struct RecruitStorageRequest > recruitStorage;
RequestStream< struct RegisterWorkerRequest > registerWorker;
RequestStream< struct GetWorkersRequest > getWorkers;
RequestStream< struct RegisterMasterRequest > registerMaster;
RequestStream< struct GetServerDBInfoRequest > getServerDBInfo;
UID id() const { return clientInterface.id(); }
bool operator == (ClusterControllerFullInterface const& r) const { return id() == r.id(); }
bool operator != (ClusterControllerFullInterface const& r) const { return id() != r.id(); }
bool hasMessage() {
return clientInterface.hasMessage() ||
recruitFromConfiguration.getFuture().isReady() ||
recruitRemoteFromConfiguration.getFuture().isReady() ||
recruitStorage.getFuture().isReady() ||
registerWorker.getFuture().isReady() ||
getWorkers.getFuture().isReady() ||
registerMaster.getFuture().isReady() ||
getServerDBInfo.getFuture().isReady();
}
void initEndpoints() {
clientInterface.initEndpoints();
recruitFromConfiguration.getEndpoint( TaskPriority::ClusterControllerRecruit );
recruitRemoteFromConfiguration.getEndpoint( TaskPriority::ClusterControllerRecruit );
recruitStorage.getEndpoint( TaskPriority::ClusterController );
registerWorker.getEndpoint( TaskPriority::ClusterControllerWorker );
getWorkers.getEndpoint( TaskPriority::ClusterController );
registerMaster.getEndpoint( TaskPriority::ClusterControllerRegister );
getServerDBInfo.getEndpoint( TaskPriority::ClusterController );
}
template <class Ar>
void serialize(Ar& ar) {
if constexpr (!is_fb_function<Ar>) {
ASSERT(ar.protocolVersion().isValid());
}
serializer(ar, clientInterface, recruitFromConfiguration, recruitRemoteFromConfiguration, recruitStorage,
registerWorker, getWorkers, registerMaster, getServerDBInfo);
}
};
struct RecruitFromConfigurationReply {
constexpr static FileIdentifier file_identifier = 2224085;
std::vector<WorkerInterface> backupWorkers;
std::vector<WorkerInterface> tLogs;
std::vector<WorkerInterface> satelliteTLogs;
std::vector<WorkerInterface> proxies;
std::vector<WorkerInterface> resolvers;
std::vector<WorkerInterface> storageServers;
std::vector<WorkerInterface> oldLogRouters;
Optional<Key> dcId;
bool satelliteFallback;
RecruitFromConfigurationReply() : satelliteFallback(false) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, tLogs, satelliteTLogs, proxies, resolvers, storageServers, oldLogRouters, dcId,
satelliteFallback, backupWorkers);
}
};
struct RecruitFromConfigurationRequest {
constexpr static FileIdentifier file_identifier = 2023046;
DatabaseConfiguration configuration;
bool recruitSeedServers;
int maxOldLogRouters;
ReplyPromise< struct RecruitFromConfigurationReply > reply;
RecruitFromConfigurationRequest() {}
explicit RecruitFromConfigurationRequest(DatabaseConfiguration const& configuration, bool recruitSeedServers, int maxOldLogRouters)
: configuration(configuration), recruitSeedServers(recruitSeedServers), maxOldLogRouters(maxOldLogRouters) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, configuration, recruitSeedServers, maxOldLogRouters, reply);
}
};
struct RecruitRemoteFromConfigurationReply {
constexpr static FileIdentifier file_identifier = 9091392;
std::vector<WorkerInterface> remoteTLogs;
std::vector<WorkerInterface> logRouters;
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, remoteTLogs, logRouters);
}
};
struct RecruitRemoteFromConfigurationRequest {
constexpr static FileIdentifier file_identifier = 3235995;
DatabaseConfiguration configuration;
Optional<Key> dcId;
int logRouterCount;
std::vector<UID> exclusionWorkerIds;
ReplyPromise< struct RecruitRemoteFromConfigurationReply > reply;
RecruitRemoteFromConfigurationRequest() {}
RecruitRemoteFromConfigurationRequest(DatabaseConfiguration const& configuration, Optional<Key> const& dcId, int logRouterCount, const std::vector<UID> &exclusionWorkerIds) : configuration(configuration), dcId(dcId), logRouterCount(logRouterCount), exclusionWorkerIds(exclusionWorkerIds){}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, configuration, dcId, logRouterCount, exclusionWorkerIds, reply);
}
};
struct RecruitStorageReply {
constexpr static FileIdentifier file_identifier = 15877089;
WorkerInterface worker;
ProcessClass processClass;
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, worker, processClass);
}
};
struct RecruitStorageRequest {
constexpr static FileIdentifier file_identifier = 905920;
std::vector<Optional<Standalone<StringRef>>> excludeMachines; //< Don't recruit any of these machines
std::vector<AddressExclusion> excludeAddresses; //< Don't recruit any of these addresses
std::vector<Optional<Standalone<StringRef>>> includeDCs;
bool criticalRecruitment; //< True if machine classes are to be ignored
ReplyPromise< RecruitStorageReply > reply;
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, excludeMachines, excludeAddresses, includeDCs, criticalRecruitment, reply);
}
};
struct RegisterWorkerReply {
constexpr static FileIdentifier file_identifier = 16475696;
ProcessClass processClass;
ClusterControllerPriorityInfo priorityInfo;
Optional<uint16_t> storageCache;
RegisterWorkerReply() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown) {}
RegisterWorkerReply(ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Optional<uint16_t> storageCache) : processClass(processClass), priorityInfo(priorityInfo), storageCache(storageCache) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, processClass, priorityInfo, storageCache);
}
};
struct RegisterWorkerRequest {
constexpr static FileIdentifier file_identifier = 14332605;
WorkerInterface wi;
ProcessClass initialClass;
ProcessClass processClass;
ClusterControllerPriorityInfo priorityInfo;
Generation generation;
Optional<DataDistributorInterface> distributorInterf;
Optional<RatekeeperInterface> ratekeeperInterf;
Optional<std::pair<uint16_t,StorageServerInterface>> storageCacheInterf;
ReplyPromise<RegisterWorkerReply> reply;
bool degraded;
RegisterWorkerRequest() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown), degraded(false) {}
RegisterWorkerRequest(WorkerInterface wi, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Generation generation, Optional<DataDistributorInterface> ddInterf, Optional<RatekeeperInterface> rkInterf, Optional<std::pair<uint16_t,StorageServerInterface>> storageCacheInterf, bool degraded) :
wi(wi), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo), generation(generation), distributorInterf(ddInterf), ratekeeperInterf(rkInterf), storageCacheInterf(storageCacheInterf), degraded(degraded) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, wi, initialClass, processClass, priorityInfo, generation, distributorInterf, ratekeeperInterf, storageCacheInterf, reply, degraded);
}
};
struct GetWorkersRequest {
constexpr static FileIdentifier file_identifier = 1254174;
enum { TESTER_CLASS_ONLY = 0x1, NON_EXCLUDED_PROCESSES_ONLY = 0x2 };
int flags;
ReplyPromise<vector<WorkerDetails>> reply;
GetWorkersRequest() : flags(0) {}
explicit GetWorkersRequest(int fl) : flags(fl) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, flags, reply);
}
};
struct RegisterMasterRequest {
constexpr static FileIdentifier file_identifier = 10773445;
UID id;
LocalityData mi;
LogSystemConfig logSystemConfig;
std::vector<MasterProxyInterface> proxies;
std::vector<ResolverInterface> resolvers;
DBRecoveryCount recoveryCount;
int64_t registrationCount;
Optional<DatabaseConfiguration> configuration;
std::vector<UID> priorCommittedLogServers;
RecoveryState recoveryState;
bool recoveryStalled;
ReplyPromise<Void> reply;
RegisterMasterRequest() : logSystemConfig(0) {}
template <class Ar>
void serialize(Ar& ar) {
if constexpr (!is_fb_function<Ar>) {
ASSERT(ar.protocolVersion().isValid());
}
serializer(ar, id, mi, logSystemConfig, proxies, resolvers, recoveryCount, registrationCount, configuration,
priorCommittedLogServers, recoveryState, recoveryStalled, reply);
}
};
#include "fdbserver/ServerDBInfo.h" // include order hack
#endif

View File

@ -216,7 +216,9 @@ ACTOR Future<Void> openDatabase(ClientData* db, int* clientCount, Reference<Asyn
++(*clientCount);
hasConnectedClients->set(true);
if(req.supportedVersions.size() > 0) {
db->clientStatusInfoMap[req.reply.getEndpoint().getPrimaryAddress()] = ClientStatusInfo(req.traceLogGroup, req.supportedVersions, req.issues);
}
while (db->clientInfo->get().read().id == req.knownClientInfoID && !db->clientInfo->get().read().forward.present()) {
choose {
@ -225,7 +227,9 @@ ACTOR Future<Void> openDatabase(ClientData* db, int* clientCount, Reference<Asyn
}
}
if(req.supportedVersions.size() > 0) {
db->clientStatusInfoMap.erase(req.reply.getEndpoint().getPrimaryAddress());
}
req.reply.send( db->clientInfo->get() );
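The insert-before-wait / erase-after-wait pair above keeps `clientStatusInfoMap` populated exactly while a client's long poll is outstanding. In ordinary C++ this shape would usually be a scope guard; a sketch for contrast (not how the flow code does it, since a cancelled actor never executes the code after its current wait):

```cpp
#include <map>
#include <string>
#include <utility>

// Sketch: RAII registration so the entry is removed on every exit path.
class ScopedStatus {
	std::map<std::string, std::string>& m;
	std::string key;
public:
	ScopedStatus(std::map<std::string, std::string>& statusMap, std::string k, std::string info)
	  : m(statusMap), key(std::move(k)) { m[key] = std::move(info); }
	~ScopedStatus() { m.erase(key); }
	ScopedStatus(const ScopedStatus&) = delete;
	ScopedStatus& operator=(const ScopedStatus&) = delete;
};
```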

View File

@ -23,6 +23,7 @@
#include "fdbclient/SystemData.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/RunTransaction.actor.h"
#include "fdbrpc/Replication.h"
#include "fdbserver/DataDistribution.actor.h"
#include "fdbserver/FDBExecHelper.actor.h"
@ -45,8 +46,10 @@ class TCMachineTeamInfo;
ACTOR Future<Void> checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self);
ACTOR Future<Void> removeWrongStoreType(DDTeamCollection* self);
struct TCServerInfo : public ReferenceCounted<TCServerInfo> {
UID id;
DDTeamCollection* collection;
StorageServerInterface lastKnownInterface;
ProcessClass lastKnownClass;
vector<Reference<TCTeamInfo>> teams;
@ -63,13 +66,14 @@ struct TCServerInfo : public ReferenceCounted<TCServerInfo> {
LocalityEntry localityEntry;
Promise<Void> updated;
AsyncVar<bool> wrongStoreTypeToRemove;
AsyncVar<bool> ssVersionTooFarBehind;
// A storage server's StoreType does not change.
// To change storeType for an ip:port, we destroy the old one and create a new one.
KeyValueStoreType storeType; // Storage engine type
TCServerInfo(StorageServerInterface ssi, ProcessClass processClass, bool inDesiredDC,
TCServerInfo(StorageServerInterface ssi, DDTeamCollection* collection, ProcessClass processClass, bool inDesiredDC,
Reference<LocalitySet> storageServerSet)
: id(ssi.id()), lastKnownInterface(ssi), lastKnownClass(processClass), dataInFlightToServer(0),
: id(ssi.id()), collection(collection), lastKnownInterface(ssi), lastKnownClass(processClass), dataInFlightToServer(0),
onInterfaceChanged(interfaceChanged.getFuture()), onRemoved(removed.getFuture()), inDesiredDC(inDesiredDC),
storeType(KeyValueStoreType::END) {
localityEntry = ((LocalityMap<UID>*) storageServerSet.getPtr())->add(ssi.locality, &id);
@ -80,6 +84,7 @@ struct TCServerInfo : public ReferenceCounted<TCServerInfo> {
// If a storage server does not report its storeType, it will be tracked by the failure monitor and removed.
return (storeType == configStoreType || storeType == KeyValueStoreType::END);
}
~TCServerInfo();
};
struct TCMachineInfo : public ReferenceCounted<TCMachineInfo> {
@ -109,51 +114,7 @@ struct TCMachineInfo : public ReferenceCounted<TCMachineInfo> {
}
};
ACTOR Future<Void> updateServerMetrics( TCServerInfo *server ) {
state StorageServerInterface ssi = server->lastKnownInterface;
state Future<ErrorOr<GetStorageMetricsReply>> metricsRequest = ssi.getStorageMetrics.tryGetReply( GetStorageMetricsRequest(), TaskPriority::DataDistributionLaunch );
state Future<Void> resetRequest = Never();
state Future<std::pair<StorageServerInterface, ProcessClass>> interfaceChanged( server->onInterfaceChanged );
state Future<Void> serverRemoved( server->onRemoved );
loop {
choose {
when( ErrorOr<GetStorageMetricsReply> rep = wait( metricsRequest ) ) {
if( rep.present() ) {
server->serverMetrics = rep;
if(server->updated.canBeSet()) {
server->updated.send(Void());
}
return Void();
}
metricsRequest = Never();
resetRequest = delay( SERVER_KNOBS->METRIC_DELAY, TaskPriority::DataDistributionLaunch );
}
when( std::pair<StorageServerInterface,ProcessClass> _ssi = wait( interfaceChanged ) ) {
ssi = _ssi.first;
interfaceChanged = server->onInterfaceChanged;
resetRequest = Void();
}
when( wait( serverRemoved ) ) {
return Void();
}
when( wait( resetRequest ) ) { //To prevent a tight spin loop
if(IFailureMonitor::failureMonitor().getState(ssi.getStorageMetrics.getEndpoint()).isFailed()) {
resetRequest = IFailureMonitor::failureMonitor().onStateEqual(ssi.getStorageMetrics.getEndpoint(), FailureStatus(false));
}
else {
resetRequest = Never();
metricsRequest = ssi.getStorageMetrics.tryGetReply( GetStorageMetricsRequest(), TaskPriority::DataDistributionLaunch );
}
}
}
}
}
ACTOR Future<Void> updateServerMetrics( Reference<TCServerInfo> server ) {
wait( updateServerMetrics( server.getPtr() ) );
return Void();
}
ACTOR Future<Void> updateServerMetrics( Reference<TCServerInfo> server);
// TeamCollection's machine team information
class TCMachineTeamInfo : public ReferenceCounted<TCMachineTeamInfo> {
@ -596,6 +557,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
int64_t unhealthyServers;
std::map<int,int> priority_teams;
std::map<UID, Reference<TCServerInfo>> server_info;
std::map<Key, int> lagging_zones; // zone to number of storage servers lagging
AsyncVar<bool> disableFailingLaggingServers;
// machine_info has all machines info; key must be unique across processes on the same machine
std::map<Standalone<StringRef>, Reference<TCMachineInfo>> machine_info;
@ -721,6 +684,23 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
teamBuilder.cancel();
}
void addLaggingStorageServer(Key zoneId) {
lagging_zones[zoneId]++;
if (lagging_zones.size() > std::max(1, configuration.storageTeamSize - 1) && !disableFailingLaggingServers.get())
disableFailingLaggingServers.set(true);
}
void removeLaggingStorageServer(Key zoneId) {
auto iter = lagging_zones.find(zoneId);
ASSERT(iter != lagging_zones.end());
iter->second--;
ASSERT(iter->second >= 0);
if (iter->second == 0)
lagging_zones.erase(iter);
if (lagging_zones.size() <= std::max(1, configuration.storageTeamSize - 1) && disableFailingLaggingServers.get())
disableFailingLaggingServers.set(false);
}
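A worked example of the `std::max(1, configuration.storageTeamSize - 1)` threshold used by the two helpers above, as a standalone sketch assuming triple replication:

```cpp
#include <algorithm>
#include <cassert>
#include <set>
#include <string>

int main() {
	const int storageTeamSize = 3;                              // triple replication
	const size_t threshold = std::max(1, storageTeamSize - 1); // = 2 zones
	std::set<std::string> laggingZones = { "z1", "z2" };
	assert(!(laggingZones.size() > threshold)); // two lagging zones: keep failing laggards
	laggingZones.insert("z3");
	assert(laggingZones.size() > threshold);    // three zones: lag is widespread, stop failing
	return 0;
}
```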
ACTOR static Future<Void> logOnCompletion( Future<Void> signal, DDTeamCollection* self ) {
wait(signal);
wait(delay(SERVER_KNOBS->LOG_ON_COMPLETION_DELAY, TaskPriority::DataDistribution));
@ -1040,7 +1020,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
TraceEvent(SevWarnAlways, "MissingLocality")
.detail("Server", i->first.uniqueID)
.detail("Locality", i->first.locality.toString());
auto addr = i->first.address();
auto addr = i->first.stableAddress();
self->invalidLocalityAddr.insert(AddressExclusion(addr.ip, addr.port));
if (self->checkInvalidLocalities.isReady()) {
self->checkInvalidLocalities = checkAndRemoveInvalidLocalityAddr(self);
@ -2255,6 +2235,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
.detail("DoBuildTeams", self->doBuildTeams)
.trackLatest("TeamCollectionInfo");
}
} else {
self->lastBuildTeamsFailed = true;
}
self->evaluateTeamQuality();
@ -2297,7 +2279,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
allServers.push_back( newServer.id() );
TraceEvent("AddedStorageServer", distributorId).detail("ServerID", newServer.id()).detail("ProcessClass", processClass.toString()).detail("WaitFailureToken", newServer.waitFailure.getEndpoint().token).detail("Address", newServer.waitFailure.getEndpoint().getPrimaryAddress());
auto &r = server_info[newServer.id()] = Reference<TCServerInfo>( new TCServerInfo( newServer, processClass, includedDCs.empty() || std::find(includedDCs.begin(), includedDCs.end(), newServer.locality.dcId()) != includedDCs.end(), storageServerSet ) );
auto &r = server_info[newServer.id()] = Reference<TCServerInfo>( new TCServerInfo( newServer, this, processClass, includedDCs.empty() || std::find(includedDCs.begin(), includedDCs.end(), newServer.locality.dcId()) != includedDCs.end(), storageServerSet ) );
// Establish the relation between server and machine
checkAndCreateMachine(r);
@ -2586,6 +2568,80 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
}
};
TCServerInfo::~TCServerInfo() {
if (ssVersionTooFarBehind.get()) {
collection->removeLaggingStorageServer(lastKnownInterface.locality.zoneId().get());
}
}
ACTOR Future<Void> updateServerMetrics( TCServerInfo *server ) {
state StorageServerInterface ssi = server->lastKnownInterface;
state Future<ErrorOr<GetStorageMetricsReply>> metricsRequest = ssi.getStorageMetrics.tryGetReply( GetStorageMetricsRequest(), TaskPriority::DataDistributionLaunch );
state Future<Void> resetRequest = Never();
state Future<std::pair<StorageServerInterface, ProcessClass>> interfaceChanged( server->onInterfaceChanged );
state Future<Void> serverRemoved( server->onRemoved );
loop {
choose {
when( ErrorOr<GetStorageMetricsReply> rep = wait( metricsRequest ) ) {
if( rep.present() ) {
server->serverMetrics = rep;
if(server->updated.canBeSet()) {
server->updated.send(Void());
}
break;
}
metricsRequest = Never();
resetRequest = delay( SERVER_KNOBS->METRIC_DELAY, TaskPriority::DataDistributionLaunch );
}
when( std::pair<StorageServerInterface,ProcessClass> _ssi = wait( interfaceChanged ) ) {
ssi = _ssi.first;
interfaceChanged = server->onInterfaceChanged;
resetRequest = Void();
}
when( wait( serverRemoved ) ) {
return Void();
}
when( wait( resetRequest ) ) { //To prevent a tight spin loop
if(IFailureMonitor::failureMonitor().getState(ssi.getStorageMetrics.getEndpoint()).isFailed()) {
resetRequest = IFailureMonitor::failureMonitor().onStateEqual(ssi.getStorageMetrics.getEndpoint(), FailureStatus(false));
}
else {
resetRequest = Never();
metricsRequest = ssi.getStorageMetrics.tryGetReply( GetStorageMetricsRequest(), TaskPriority::DataDistributionLaunch );
}
}
}
}
if ( server->serverMetrics.get().lastUpdate < now() - SERVER_KNOBS->DD_SS_STUCK_TIME_LIMIT ) {
if (server->ssVersionTooFarBehind.get() == false) {
TraceEvent("StorageServerStuck", server->collection->distributorId).detail("ServerId", server->id.toString()).detail("LastUpdate", server->serverMetrics.get().lastUpdate);
server->ssVersionTooFarBehind.set(true);
server->collection->addLaggingStorageServer(server->lastKnownInterface.locality.zoneId().get());
}
} else if ( server->serverMetrics.get().versionLag > SERVER_KNOBS->DD_SS_FAILURE_VERSIONLAG ) {
if (server->ssVersionTooFarBehind.get() == false) {
TraceEvent("SSVersionDiffLarge", server->collection->distributorId).detail("ServerId", server->id.toString()).detail("VersionLag", server->serverMetrics.get().versionLag);
server->ssVersionTooFarBehind.set(true);
server->collection->addLaggingStorageServer(server->lastKnownInterface.locality.zoneId().get());
}
} else if ( server->serverMetrics.get().versionLag < SERVER_KNOBS->DD_SS_ALLOWED_VERSIONLAG ) {
if (server->ssVersionTooFarBehind.get() == true) {
TraceEvent("SSVersionDiffNormal", server->collection->distributorId).detail("ServerId", server->id.toString()).detail("VersionLag", server->serverMetrics.get().versionLag);
server->ssVersionTooFarBehind.set(false);
server->collection->removeLaggingStorageServer(server->lastKnownInterface.locality.zoneId().get());
}
}
return Void();
}
ACTOR Future<Void> updateServerMetrics( Reference<TCServerInfo> server) {
wait( updateServerMetrics( server.getPtr() ) );
return Void();
}
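The stuck-time check and the two version-lag thresholds above form a hysteresis band: the flag trips once lag exceeds DD_SS_FAILURE_VERSIONLAG and resets only after lag falls back below DD_SS_ALLOWED_VERSIONLAG, so a server hovering near a single threshold cannot flap. A minimal sketch of the same state machine:

```cpp
#include <cstdint>

// Sketch: a boolean with hysteresis so the flag does not flap near one threshold.
struct LagFlag {
	bool tooFarBehind = false;
	void update(int64_t versionLag, int64_t failLag, int64_t allowedLag) {
		if (!tooFarBehind && versionLag > failLag)
			tooFarBehind = true;   // trip once lag exceeds the failure threshold
		else if (tooFarBehind && versionLag < allowedLag)
			tooFarBehind = false;  // reset only after lag falls below the lower threshold
		// between allowedLag and failLag the previous state is kept
	}
};
```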
ACTOR Future<Void> waitUntilHealthy(DDTeamCollection* self, double extraDelay = 0) {
state int waitCount = 0;
loop {
@ -2858,6 +2914,14 @@ bool teamContainsFailedServer(DDTeamCollection* self, Reference<TCTeamInfo> team
self->excludedServers.get(ipaddr) == DDTeamCollection::Status::FAILED) {
return true;
}
if(ssi.secondaryAddress().present()) {
AddressExclusion saddr(ssi.secondaryAddress().get().ip, ssi.secondaryAddress().get().port);
AddressExclusion sipaddr(ssi.secondaryAddress().get().ip);
if (self->excludedServers.get(saddr) == DDTeamCollection::Status::FAILED ||
self->excludedServers.get(sipaddr) == DDTeamCollection::Status::FAILED) {
return true;
}
}
}
return false;
}
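A minimal model of the dual-address check above, where a server with an optional secondary (e.g. TLS) address is treated as failed if any of its addresses matches an exclusion, either by ip:port or by bare ip (hypothetical Addr type):

```cpp
#include <optional>
#include <set>
#include <string>
#include <utility>

using Addr = std::pair<std::string, int>; // (ip, port); port 0 stands for a whole-IP exclusion

// Sketch: an interface is excluded if either of its addresses matches an entry.
bool isExcluded(const std::set<Addr>& excl, const Addr& primary, const std::optional<Addr>& secondary) {
	auto hit = [&](const Addr& a) { return excl.count(a) || excl.count({ a.first, 0 }); };
	return hit(primary) || (secondary && hit(*secondary));
}
```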
@ -3332,7 +3396,7 @@ ACTOR Future<Void> waitHealthyZoneChange( DDTeamCollection* self ) {
}
}
ACTOR Future<Void> serverMetricsPolling( TCServerInfo *server) {
ACTOR Future<Void> serverMetricsPolling( TCServerInfo *server ) {
state double lastUpdate = now();
loop {
wait( updateServerMetrics( server ) );
@ -3479,6 +3543,7 @@ ACTOR Future<Void> storageServerTracker(
state ServerStatus status( false, false, server->lastKnownInterface.locality );
state bool lastIsUnhealthy = false;
state Future<Void> metricsTracker = serverMetricsPolling( server );
state Future<std::pair<StorageServerInterface, ProcessClass>> interfaceChanged = server->onInterfaceChanged;
state Future<Void> storeTypeTracker = keyValueStoreTypeTracker(self, server);
@ -3489,7 +3554,7 @@ ACTOR Future<Void> storageServerTracker(
try {
loop {
status.isUndesired = false;
status.isUndesired = !self->disableFailingLaggingServers.get() && server->ssVersionTooFarBehind.get();
status.isWrongConfiguration = false;
hasWrongDC = !isCorrectDC(self, server);
hasInvalidLocality =
@ -3569,29 +3634,41 @@ ACTOR Future<Void> storageServerTracker(
// If the storage server is in the excluded servers list, it is undesired
NetworkAddress a = server->lastKnownInterface.address();
state AddressExclusion addr( a.ip, a.port );
state AddressExclusion ipaddr( a.ip );
state DDTeamCollection::Status addrStatus = self->excludedServers.get(addr);
state DDTeamCollection::Status ipaddrStatus = self->excludedServers.get(ipaddr);
if (addrStatus != DDTeamCollection::Status::NONE || ipaddrStatus != DDTeamCollection::Status::NONE) {
AddressExclusion worstAddr( a.ip, a.port );
DDTeamCollection::Status worstStatus = self->excludedServers.get( worstAddr );
otherChanges.push_back( self->excludedServers.onChange( worstAddr ) );
for(int i = 0; i < 3; i++) {
if(i > 0 && !server->lastKnownInterface.secondaryAddress().present()) {
break;
}
AddressExclusion testAddr;
if(i == 0) testAddr = AddressExclusion(a.ip);
else if(i == 1) testAddr = AddressExclusion(server->lastKnownInterface.secondaryAddress().get().ip, server->lastKnownInterface.secondaryAddress().get().port);
else if(i == 2) testAddr = AddressExclusion(server->lastKnownInterface.secondaryAddress().get().ip);
DDTeamCollection::Status testStatus = self->excludedServers.get(testAddr);
if(testStatus > worstStatus) {
worstStatus = testStatus;
worstAddr = testAddr;
}
otherChanges.push_back( self->excludedServers.onChange( testAddr ) );
}
if (worstStatus != DDTeamCollection::Status::NONE) {
TraceEvent(SevWarn, "UndesiredStorageServer", self->distributorId)
.detail("Server", server->id)
.detail("Excluded",
ipaddrStatus == DDTeamCollection::Status::NONE ? addr.toString() : ipaddr.toString());
.detail("Excluded", worstAddr.toString());
status.isUndesired = true;
status.isWrongConfiguration = true;
if (addrStatus == DDTeamCollection::Status::FAILED ||
ipaddrStatus == DDTeamCollection::Status::FAILED) {
if (worstStatus == DDTeamCollection::Status::FAILED) {
TraceEvent(SevWarn, "FailedServerRemoveKeys", self->distributorId)
.detail("Address", addr.toString())
.detail("ServerID", server->id);
.detail("Server", server->id)
.detail("Excluded", worstAddr.toString());
wait(removeKeysFromFailedServer(cx, server->id, self->lock));
if (BUGGIFY) wait(delay(5.0));
self->shardsAffectedByTeamFailure->eraseServer(server->id);
}
}
otherChanges.push_back( self->excludedServers.onChange( addr ) );
otherChanges.push_back( self->excludedServers.onChange( ipaddr ) );
failureTracker = storageServerFailureTracker(self, server, cx, &status, addedVersion);
//We need to recruit new storage servers if the key value store type has changed
@ -3599,6 +3676,7 @@ ACTOR Future<Void> storageServerTracker(
self->restartRecruiting.trigger();
}
if (lastIsUnhealthy && !status.isUnhealthy() &&
( server->teams.size() < targetTeamNumPerServer || self->lastBuildTeamsFailed)) {
self->doBuildTeams = true;
@ -3753,6 +3831,8 @@ ACTOR Future<Void> storageServerTracker(
server->wakeUpTracker = Promise<Void>();
}
when(wait(storeTypeTracker)) {}
when(wait(server->ssVersionTooFarBehind.onChange())) { }
when(wait(self->disableFailingLaggingServers.onChange())) { }
}
if (recordTeamCollectionInfo) {
@ -3861,7 +3941,7 @@ ACTOR Future<Void> checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self) {
int numExistingSSOnAddr(DDTeamCollection* self, const AddressExclusion& addr) {
int numExistingSS = 0;
for (auto& server : self->server_info) {
const NetworkAddress& netAddr = server.second->lastKnownInterface.address();
const NetworkAddress& netAddr = server.second->lastKnownInterface.stableAddress();
AddressExclusion usedAddr(netAddr.ip, netAddr.port);
if (usedAddr == addr) {
++numExistingSS;
@ -3875,10 +3955,10 @@ ACTOR Future<Void> initializeStorage(DDTeamCollection* self, RecruitStorageReply
// SOMEDAY: Cluster controller waits for availability, retry quickly if a server's Locality changes
self->recruitingStream.set(self->recruitingStream.get() + 1);
const NetworkAddress& netAddr = candidateWorker.worker.address();
const NetworkAddress& netAddr = candidateWorker.worker.stableAddress();
AddressExclusion workerAddr(netAddr.ip, netAddr.port);
if (numExistingSSOnAddr(self, workerAddr) <= 2 &&
self->recruitingLocalities.find(candidateWorker.worker.address()) == self->recruitingLocalities.end()) {
self->recruitingLocalities.find(candidateWorker.worker.stableAddress()) == self->recruitingLocalities.end()) {
// Only allow at most 2 storage servers on an address, because
// too many storage servers on the same address (i.e., process) can cause OOM.
// Ask the candidateWorker to initialize a SS only if the worker does not have a pending request
@ -3899,7 +3979,7 @@ ACTOR Future<Void> initializeStorage(DDTeamCollection* self, RecruitStorageReply
.detail("RecruitingStream", self->recruitingStream.get());
self->recruitingIds.insert(interfaceId);
self->recruitingLocalities.insert(candidateWorker.worker.address());
self->recruitingLocalities.insert(candidateWorker.worker.stableAddress());
state ErrorOr<InitializeStorageReply> newServer =
wait(candidateWorker.worker.storage.tryGetReply(isr, TaskPriority::DataDistribution));
if (newServer.isError()) {
@ -3910,7 +3990,7 @@ ACTOR Future<Void> initializeStorage(DDTeamCollection* self, RecruitStorageReply
wait(delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY, TaskPriority::DataDistribution));
}
self->recruitingIds.erase(interfaceId);
self->recruitingLocalities.erase(candidateWorker.worker.address());
self->recruitingLocalities.erase(candidateWorker.worker.stableAddress());
TraceEvent("DDRecruiting")
.detail("Primary", self->primary)
@ -3956,7 +4036,7 @@ ACTOR Future<Void> storageRecruiter( DDTeamCollection* self, Reference<AsyncVar<
TraceEvent(SevDebug, "DDRecruitExcl1")
.detail("Primary", self->primary)
.detail("Excluding", s->second->lastKnownInterface.address());
auto addr = s->second->lastKnownInterface.address();
auto addr = s->second->lastKnownInterface.stableAddress();
AddressExclusion addrExcl(addr.ip, addr.port);
exclusions.insert(addrExcl);
numSSPerAddr[addrExcl]++; // increase from 0
@ -4007,8 +4087,8 @@ ACTOR Future<Void> storageRecruiter( DDTeamCollection* self, Reference<AsyncVar<
choose {
when( RecruitStorageReply candidateWorker = wait( fCandidateWorker ) ) {
AddressExclusion candidateSSAddr(candidateWorker.worker.address().ip,
candidateWorker.worker.address().port);
AddressExclusion candidateSSAddr(candidateWorker.worker.stableAddress().ip,
candidateWorker.worker.stableAddress().port);
int numExistingSS = numSSPerAddr[candidateSSAddr];
if (numExistingSS >= 2) {
TraceEvent(SevWarnAlways, "StorageRecruiterTooManySSOnSameAddr", self->distributorId)
@ -4742,7 +4822,7 @@ ACTOR Future<Void> ddExclusionSafetyCheck(DistributorExclusionSafetyCheckRequest
// Go through storage server interfaces and translate Address -> server ID (UID)
for (const AddressExclusion& excl : req.exclusions) {
for (const auto& ssi : ssis) {
if (excl.excludes(ssi.address())) {
if (excl.excludes(ssi.address()) || (ssi.secondaryAddress().present() && excl.excludes(ssi.secondaryAddress().get()))) {
excludeServerIDs.push_back(ssi.id());
}
}
@ -4844,7 +4924,7 @@ DDTeamCollection* testTeamCollection(int teamSize, Reference<IReplicationPolicy>
interface.locality.set(LiteralStringRef("machineid"), Standalone<StringRef>(std::to_string(id)));
interface.locality.set(LiteralStringRef("zoneid"), Standalone<StringRef>(std::to_string(id % 5)));
interface.locality.set(LiteralStringRef("data_hall"), Standalone<StringRef>(std::to_string(id % 3)));
collection->server_info[uid] = Reference<TCServerInfo>(new TCServerInfo(interface, ProcessClass(), true, collection->storageServerSet));
collection->server_info[uid] = Reference<TCServerInfo>(new TCServerInfo(interface, collection, ProcessClass(), true, collection->storageServerSet));
collection->server_status.set(uid, ServerStatus(false, false, interface.locality));
collection->checkAndCreateMachine(collection->server_info[uid]);
}
@ -4885,7 +4965,7 @@ DDTeamCollection* testMachineTeamCollection(int teamSize, Reference<IReplication
interface.locality.set(LiteralStringRef("data_hall"), Standalone<StringRef>(std::to_string(data_hall_id)));
interface.locality.set(LiteralStringRef("dcid"), Standalone<StringRef>(std::to_string(dc_id)));
collection->server_info[uid] =
Reference<TCServerInfo>(new TCServerInfo(interface, ProcessClass(), true, collection->storageServerSet));
Reference<TCServerInfo>(new TCServerInfo(interface, collection, ProcessClass(), true, collection->storageServerSet));
collection->server_status.set(uid, ServerStatus(false, false, interface.locality));
}

View File

@ -25,7 +25,6 @@
#define FDBSERVER_DATA_DISTRIBUTION_ACTOR_H
#include "fdbclient/NativeAPI.actor.h"
#include "fdbserver/ClusterRecruitmentInterface.h"
#include "fdbserver/MoveKeys.actor.h"
#include "fdbserver/LogSystem.h"
#include "flow/actorcompiler.h" // This must be the last #include.

File diff suppressed because it is too large.

View File

@ -29,24 +29,30 @@
#define REDWOOD_DEBUG 0
#define debug_printf_stream stderr
#define debug_printf_always(...) { fprintf(debug_printf_stream, "%s %f %04d ", g_network->getLocalAddress().toString().c_str(), now(), __LINE__); fprintf(debug_printf_stream, __VA_ARGS__); fflush(debug_printf_stream); }
#define debug_printf_stream stdout
#define debug_printf_always(...) \
{ \
fprintf(debug_printf_stream, "%s %f %04d ", g_network->getLocalAddress().toString().c_str(), now(), __LINE__); \
fprintf(debug_printf_stream, __VA_ARGS__); \
fflush(debug_printf_stream); \
}
#define debug_printf_noop(...)
#if defined(NO_INTELLISENSE)
#if REDWOOD_DEBUG
#define debug_printf debug_printf_always
#else
#define debug_printf debug_printf_noop
#endif
#if REDWOOD_DEBUG
#define debug_printf debug_printf_always
#else
// To get error-checking on debug_printf statements in IDE
#define debug_printf printf
#define debug_printf debug_printf_noop
#endif
#else
// To get error-checking on debug_printf statements in IDE
#define debug_printf printf
#endif
#define BEACON debug_printf_always("HERE\n")
#define TRACE debug_printf_always("%s: %s line %d %s\n", __FUNCTION__, __FILE__, __LINE__, platform::get_backtrace().c_str());
#define TRACE \
debug_printf_always("%s: %s line %d %s\n", __FUNCTION__, __FILE__, __LINE__, platform::get_backtrace().c_str());
#ifndef VALGRIND
#define VALGRIND_MAKE_MEM_UNDEFINED(x, y)
@ -67,12 +73,10 @@ public:
// Must return the same size for all pages created by the same pager instance
virtual int size() const = 0;
StringRef asStringRef() const {
return StringRef(begin(), size());
}
StringRef asStringRef() const { return StringRef(begin(), size()); }
virtual ~IPage() {
if(userData != nullptr && userDataDestructor != nullptr) {
if (userData != nullptr && userDataDestructor != nullptr) {
userDataDestructor(userData);
}
}
@ -82,8 +86,8 @@ public:
virtual void addref() const = 0;
virtual void delref() const = 0;
mutable void *userData;
mutable void (*userDataDestructor)(void *);
mutable void* userData;
mutable void (*userDataDestructor)(void*);
};
class IPagerSnapshot {

View File

@ -50,8 +50,9 @@ public:
virtual StorageBytes getStorageBytes() = 0;
// Writes are provided in an ordered stream.
// A write is considered part of (a change leading to) the version determined by the previous call to setWriteVersion()
// A write shall not become durable until the following call to commit() begins, and shall be durable once the following call to commit() returns
// A write is considered part of (a change leading to) the version determined by the previous call to
// setWriteVersion(). A write shall not become durable until the following call to commit() begins, and shall be
// durable once the following call to commit() returns
virtual void set(KeyValueRef keyValue) = 0;
virtual void clear(KeyRangeRef range) = 0;
virtual void mutate(int op, StringRef param1, StringRef param2) = 0;
@ -63,11 +64,15 @@ public:
virtual Future<Void> init() = 0;
virtual Version getLatestVersion() = 0;
// readAtVersion() may only be called on a version which has previously been passed to setWriteVersion() and never previously passed
// to forgetVersion. The returned results when violating this precondition are unspecified; the store is not required to be able to detect violations.
// The returned read cursor provides a consistent snapshot of the versioned store, corresponding to all the writes done with write versions less
// readAtVersion() may only be called on a version which has previously been passed to setWriteVersion() and never
// previously passed
// to forgetVersion. The returned results when violating this precondition are unspecified; the store is not
// required to be able to detect violations.
// The returned read cursor provides a consistent snapshot of the versioned store, corresponding to all the writes
// done with write versions less
// than or equal to the given version.
// If readAtVersion() is called on the *current* write version, the given read cursor MAY reflect subsequent writes at the same
// If readAtVersion() is called on the *current* write version, the given read cursor MAY reflect subsequent writes
// at the same
// write version, OR it may represent a snapshot as of the call to readAtVersion().
virtual Reference<IStoreCursor> readAtVersion(Version) = 0;
};

View File

@ -88,6 +88,8 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( TLOG_IGNORE_POP_AUTO_ENABLE_DELAY, 300.0 );
init( TXS_POPPED_MAX_DELAY, 1.0 ); if ( randomize && BUGGIFY ) TXS_POPPED_MAX_DELAY = deterministicRandom()->random01();
init( TLOG_MAX_CREATE_DURATION, 10.0 );
init( PEEK_LOGGING_AMOUNT, 5 );
init( PEEK_LOGGING_DELAY, 5.0 );
// disk snapshot max timeout, to be put in TLog, storage and coordinator nodes
init( SNAP_CREATE_MAX_TIMEOUT, 300.0 );
@ -201,7 +203,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( STORAGE_METRICS_POLLING_DELAY, 2.0 ); if( randomize && BUGGIFY ) STORAGE_METRICS_POLLING_DELAY = 15.0;
init( STORAGE_METRICS_RANDOM_DELAY, 0.2 );
init( AVAILABLE_SPACE_RATIO_CUTOFF, 0.05 );
init( DESIRED_TEAMS_PER_SERVER, 5 ); DESIRED_TEAMS_PER_SERVER = deterministicRandom()->randomInt(1, 10);
init( DESIRED_TEAMS_PER_SERVER, 5 ); if( randomize && BUGGIFY ) DESIRED_TEAMS_PER_SERVER = deterministicRandom()->randomInt(1, 10);
init( MAX_TEAMS_PER_SERVER, 5*DESIRED_TEAMS_PER_SERVER );
init( DD_SHARD_SIZE_GRANULARITY, 5000000 );
init( DD_SHARD_SIZE_GRANULARITY_SIM, 500000 ); if( randomize && BUGGIFY ) DD_SHARD_SIZE_GRANULARITY_SIM = 0;
@ -219,6 +221,9 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( DD_VALIDATE_LOCALITY, true ); if( randomize && BUGGIFY ) DD_VALIDATE_LOCALITY = false;
init( DD_CHECK_INVALID_LOCALITY_DELAY, 60 ); if( randomize && BUGGIFY ) DD_CHECK_INVALID_LOCALITY_DELAY = 1 + deterministicRandom()->random01() * 600;
init( DD_ENABLE_VERBOSE_TRACING, false ); if( randomize && BUGGIFY ) DD_ENABLE_VERBOSE_TRACING = true;
init( DD_SS_FAILURE_VERSIONLAG, 250000000 );
init( DD_SS_ALLOWED_VERSIONLAG, 200000000 ); if( randomize && BUGGIFY ) { DD_SS_FAILURE_VERSIONLAG = deterministicRandom()->randomInt(15000000, 500000000); DD_SS_ALLOWED_VERSIONLAG = 0.75 * DD_SS_FAILURE_VERSIONLAG; }
init( DD_SS_STUCK_TIME_LIMIT, 300.0 ); if( randomize && BUGGIFY ) { DD_SS_STUCK_TIME_LIMIT = 200.0 + deterministicRandom()->random01() * 100.0; }
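Under BUGGIFY the two lag knobs are randomized together so the hysteresis gap survives: for example, a draw of DD_SS_FAILURE_VERSIONLAG = 400,000,000 forces DD_SS_ALLOWED_VERSIONLAG = 0.75 * 400,000,000 = 300,000,000, keeping the reset threshold 25% below the trip threshold.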
// TeamRemover
init( TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true
@ -344,6 +349,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( MAX_PROXY_COMPUTE, 2.0 );
init( PROXY_COMPUTE_BUCKETS, 20000 );
init( PROXY_COMPUTE_GROWTH_RATE, 0.01 );
init( TXN_STATE_SEND_AMOUNT, 2 );
// Master Server
// masterCommitter() in the master server will allow lower priority tasks (e.g. DataDistibution)
@ -411,6 +417,8 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( POLICY_RATING_TESTS, 200 ); if( randomize && BUGGIFY ) POLICY_RATING_TESTS = 20;
init( POLICY_GENERATIONS, 100 ); if( randomize && BUGGIFY ) POLICY_GENERATIONS = 10;
init( DBINFO_SEND_AMOUNT, 2 );
init( DBINFO_BATCH_DELAY, 0.1 );
//Move Keys
init( SHARD_READY_DELAY, 0.25 );
@ -522,13 +530,13 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
//Worker
init( WORKER_LOGGING_INTERVAL, 5.0 );
init( INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING, 5.0 );
init( HEAP_PROFILER_INTERVAL, 30.0 );
init( DEGRADED_RESET_INTERVAL, 24*60*60 ); if ( randomize && BUGGIFY ) DEGRADED_RESET_INTERVAL = 10;
init( DEGRADED_WARNING_LIMIT, 1 );
init( DEGRADED_WARNING_RESET_DELAY, 7*24*60*60 );
init( TRACE_LOG_FLUSH_FAILURE_CHECK_INTERVAL_SECONDS, 10 );
init( TRACE_LOG_PING_TIMEOUT_SECONDS, 5.0 );
init( DBINFO_FAILED_DELAY, 1.0 );
// Test harness
init( WORKER_POLL_DELAY, 1.0 );
@ -561,19 +569,19 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
// Fast Restore
init( FASTRESTORE_FAILURE_TIMEOUT, 3600 );
init( FASTRESTORE_HEARTBEAT_INTERVAL, 60 );
init( FASTRESTORE_SAMPLING_PERCENT, 1 ); if( randomize ) { FASTRESTORE_SAMPLING_PERCENT = deterministicRandom()->random01() * 100; }
init( FASTRESTORE_NUM_LOADERS, 3 ); if( randomize ) { FASTRESTORE_NUM_LOADERS = deterministicRandom()->random01() * 10 + 1; }
init( FASTRESTORE_NUM_APPLIERS, 3 ); if( randomize ) { FASTRESTORE_NUM_APPLIERS = deterministicRandom()->random01() * 10 + 1; }
init( FASTRESTORE_TXN_BATCH_MAX_BYTES, 512.0 ); if( randomize ) { FASTRESTORE_TXN_BATCH_MAX_BYTES = deterministicRandom()->random01() * 1024.0 * 1024.0 + 1.0; }
init( FASTRESTORE_VERSIONBATCH_MAX_BYTES, 10.0 * 1024.0 * 1024.0 ); if( randomize ) { FASTRESTORE_VERSIONBATCH_MAX_BYTES = deterministicRandom()->random01() * 10.0 * 1024.0 * 1024.0 * 1024.0; }
init( FASTRESTORE_VB_PARALLELISM, 3 ); if( randomize ) { FASTRESTORE_VB_PARALLELISM = deterministicRandom()->random01() * 20 + 1; }
init( FASTRESTORE_VB_MONITOR_DELAY, 5 ); if( randomize ) { FASTRESTORE_VB_MONITOR_DELAY = deterministicRandom()->random01() * 20 + 1; }
init( FASTRESTORE_VB_LAUNCH_DELAY, 5 ); if( randomize ) { FASTRESTORE_VB_LAUNCH_DELAY = deterministicRandom()->random01() * 60 + 1; }
init( FASTRESTORE_ROLE_LOGGING_DELAY, 5 ); if( randomize ) { FASTRESTORE_ROLE_LOGGING_DELAY = deterministicRandom()->random01() * 60 + 1; }
init( FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL, 5 ); if( randomize ) { FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL = deterministicRandom()->random01() * 60 + 1; }
init( FASTRESTORE_ATOMICOP_WEIGHT, 100 ); if( randomize ) { FASTRESTORE_ATOMICOP_WEIGHT = deterministicRandom()->random01() * 200 + 1; }
init( FASTRESTORE_APPLYING_PARALLELISM, 100 ); if( randomize ) { FASTRESTORE_APPLYING_PARALLELISM = deterministicRandom()->random01() * 10 + 1; }
init( FASTRESTORE_MONITOR_LEADER_DELAY, 5 ); if( randomize ) { FASTRESTORE_MONITOR_LEADER_DELAY = deterministicRandom()->random01() * 100; }
init( FASTRESTORE_SAMPLING_PERCENT, 1 ); if( randomize && BUGGIFY ) { FASTRESTORE_SAMPLING_PERCENT = deterministicRandom()->random01() * 100; }
init( FASTRESTORE_NUM_LOADERS, 3 ); if( randomize && BUGGIFY ) { FASTRESTORE_NUM_LOADERS = deterministicRandom()->random01() * 10 + 1; }
init( FASTRESTORE_NUM_APPLIERS, 3 ); if( randomize && BUGGIFY ) { FASTRESTORE_NUM_APPLIERS = deterministicRandom()->random01() * 10 + 1; }
init( FASTRESTORE_TXN_BATCH_MAX_BYTES, 512.0 ); if( randomize && BUGGIFY ) { FASTRESTORE_TXN_BATCH_MAX_BYTES = deterministicRandom()->random01() * 1024.0 * 1024.0 + 1.0; }
init( FASTRESTORE_VERSIONBATCH_MAX_BYTES, 10.0 * 1024.0 * 1024.0 ); if( randomize && BUGGIFY ) { FASTRESTORE_VERSIONBATCH_MAX_BYTES = deterministicRandom()->random01() * 10.0 * 1024.0 * 1024.0 * 1024.0; }
init( FASTRESTORE_VB_PARALLELISM, 3 ); if( randomize && BUGGIFY ) { FASTRESTORE_VB_PARALLELISM = deterministicRandom()->random01() * 20 + 1; }
init( FASTRESTORE_VB_MONITOR_DELAY, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_VB_MONITOR_DELAY = deterministicRandom()->random01() * 20 + 1; }
init( FASTRESTORE_VB_LAUNCH_DELAY, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_VB_LAUNCH_DELAY = deterministicRandom()->random01() * 60 + 1; }
init( FASTRESTORE_ROLE_LOGGING_DELAY, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_ROLE_LOGGING_DELAY = deterministicRandom()->random01() * 60 + 1; }
init( FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL = deterministicRandom()->random01() * 60 + 1; }
init( FASTRESTORE_ATOMICOP_WEIGHT, 100 ); if( randomize && BUGGIFY ) { FASTRESTORE_ATOMICOP_WEIGHT = deterministicRandom()->random01() * 200 + 1; }
init( FASTRESTORE_APPLYING_PARALLELISM, 100 ); if( randomize && BUGGIFY ) { FASTRESTORE_APPLYING_PARALLELISM = deterministicRandom()->random01() * 10 + 1; }
init( FASTRESTORE_MONITOR_LEADER_DELAY, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_MONITOR_LEADER_DELAY = deterministicRandom()->random01() * 100; }
init( FASTRESTORE_STRAGGLER_THRESHOLD_SECONDS, 60 ); if( randomize && BUGGIFY ) { FASTRESTORE_STRAGGLER_THRESHOLD_SECONDS = deterministicRandom()->random01() * 240 + 10; }
init( FASTRESTORE_TRACK_REQUEST_LATENCY, true ); if( randomize && BUGGIFY ) { FASTRESTORE_TRACK_REQUEST_LATENCY = false; }
init( FASTRESTORE_TRACK_LOADER_SEND_REQUESTS, false ); if( randomize && BUGGIFY ) { FASTRESTORE_TRACK_LOADER_SEND_REQUESTS = true; }
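The FASTRESTORE knob hunk above tightens the randomization guard from `if( randomize )` to `if( randomize && BUGGIFY )`, so these restore knobs keep their production defaults unless buggification is also enabled. A tiny illustrative sketch of the pattern (the RNG and knob here are simplified stand-ins for the knob machinery):

```cpp
#include <iostream>
#include <random>

int main() {
    bool randomize = true;
    bool buggify   = false;                 // per-run chaos switch
    std::mt19937 rng(42);                   // stand-in for deterministicRandom()
    std::uniform_real_distribution<double> random01(0.0, 1.0);

    int FASTRESTORE_NUM_LOADERS = 3;        // production default
    if (randomize && buggify)               // previously just `if (randomize)`
        FASTRESTORE_NUM_LOADERS = int(random01(rng) * 10 + 1);

    std::cout << FASTRESTORE_NUM_LOADERS << "\n"; // stays 3 unless buggified
}
```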


@ -86,6 +86,8 @@ public:
int64_t MAX_CACHE_VERSIONS;
double TXS_POPPED_MAX_DELAY;
double TLOG_MAX_CREATE_DURATION;
int PEEK_LOGGING_AMOUNT;
double PEEK_LOGGING_DELAY;
// Data distribution queue
double HEALTH_POLL_TIME;
@ -173,6 +175,9 @@ public:
bool DD_VALIDATE_LOCALITY;
int DD_CHECK_INVALID_LOCALITY_DELAY;
bool DD_ENABLE_VERBOSE_TRACING;
int64_t DD_SS_FAILURE_VERSIONLAG; // Allowed SS version lag from the current read version before marking it as failed.
int64_t DD_SS_ALLOWED_VERSIONLAG; // SS will be marked as healthy if its version lag goes below this value.
double DD_SS_STUCK_TIME_LIMIT; // If a storage server is not getting new versions for this amount of time, then it becomes undesired.
// TeamRemover to remove redundant teams
bool TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER; // disable the machineTeamRemover actor
@ -281,6 +286,7 @@ public:
double MAX_PROXY_COMPUTE;
int PROXY_COMPUTE_BUCKETS;
double PROXY_COMPUTE_GROWTH_RATE;
int TXN_STATE_SEND_AMOUNT;
// Master Server
double COMMIT_SLEEP_TIME;
@ -345,6 +351,8 @@ public:
int EXPECTED_PROXY_FITNESS;
int EXPECTED_RESOLVER_FITNESS;
double RECRUITMENT_TIMEOUT;
int DBINFO_SEND_AMOUNT;
double DBINFO_BATCH_DELAY;
//Move Keys
double SHARD_READY_DELAY;
@ -457,13 +465,13 @@ public:
//Worker
double WORKER_LOGGING_INTERVAL;
double INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING;
double HEAP_PROFILER_INTERVAL;
double DEGRADED_RESET_INTERVAL;
double DEGRADED_WARNING_LIMIT;
double DEGRADED_WARNING_RESET_DELAY;
int64_t TRACE_LOG_FLUSH_FAILURE_CHECK_INTERVAL_SECONDS;
double TRACE_LOG_PING_TIMEOUT_SECONDS;
double DBINFO_FAILED_DELAY;
// Test harness
double WORKER_POLL_DELAY;


@ -20,7 +20,6 @@
#include "fdbrpc/FailureMonitor.h"
#include "fdbrpc/Locality.h"
#include "fdbserver/ClusterRecruitmentInterface.h"
#include "fdbserver/CoordinationInterface.h"
#include "fdbclient/MonitorLeader.h"
#include "flow/actorcompiler.h" // This must be the last #include.


@ -40,6 +40,7 @@ struct MasterInterface {
RequestStream<struct BackupWorkerDoneRequest> notifyBackupWorkerDone;
NetworkAddress address() const { return changeCoordinators.getEndpoint().getPrimaryAddress(); }
NetworkAddressList addresses() const { return changeCoordinators.getEndpoint().addresses; }
UID id() const { return changeCoordinators.getEndpoint().token; }
template <class Archive>


@ -48,6 +48,29 @@
#include "flow/TDMetric.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
ACTOR Future<Void> broadcastTxnRequest(TxnStateRequest req, int sendAmount, bool sendReply) {
state ReplyPromise<Void> reply = req.reply;
resetReply( req );
std::vector<Future<Void>> replies;
int currentStream = 0;
std::vector<Endpoint> broadcastEndpoints = req.broadcastInfo;
for(int i = 0; i < sendAmount && currentStream < broadcastEndpoints.size(); i++) {
std::vector<Endpoint> endpoints;
RequestStream<TxnStateRequest> cur(broadcastEndpoints[currentStream++]);
while(currentStream < broadcastEndpoints.size()*(i+1)/sendAmount) {
endpoints.push_back(broadcastEndpoints[currentStream++]);
}
req.broadcastInfo = endpoints;
replies.push_back(brokenPromiseToNever( cur.getReply( req ) ));
resetReply( req );
}
wait( waitForAll(replies) );
if(sendReply) {
reply.send(Void());
}
return Void();
}
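broadcastTxnRequest above fans a TxnStateRequest out as a two-level broadcast tree: the endpoint list is split into at most sendAmount contiguous groups, the head of each group receives the request directly, and the rest of the group travels along in req.broadcastInfo so the head can forward it one level down. A stand-alone sketch of just the partitioning arithmetic (plain ints stand in for Endpoint):

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    std::vector<int> endpoints = {0, 1, 2, 3, 4, 5, 6, 7, 8}; // stand-ins for Endpoint
    const std::size_t sendAmount = 2;

    std::size_t currentStream = 0;
    for (std::size_t i = 0; i < sendAmount && currentStream < endpoints.size(); i++) {
        int target = endpoints[currentStream++];   // this peer receives the request...
        std::vector<int> forwarded;                // ...and forwards it to these peers
        while (currentStream < endpoints.size() * (i + 1) / sendAmount)
            forwarded.push_back(endpoints[currentStream++]);
        std::cout << "send to " << target << ", forwarding";
        for (int e : forwarded) std::cout << ' ' << e;
        std::cout << '\n';
    }
    // Output: "send to 0, forwarding 1 2 3" and "send to 4, forwarding 5 6 7 8"
}
```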
struct ProxyStats {
CounterCollection cc;
Counter txnRequestIn, txnRequestOut, txnRequestErrors;
@ -157,7 +180,7 @@ struct TransactionRateInfo {
}
void setRate(double rate) {
ASSERT(rate >= 0 && rate != std::numeric_limits<double>::infinity() && !isnan(rate));
ASSERT(rate >= 0 && rate != std::numeric_limits<double>::infinity() && !std::isnan(rate));
this->rate = rate;
if(disabled) {
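The one-token change from isnan to std::isnan above is worth noting: unqualified isnan may resolve to the C library macro or a platform-specific function (or fail to compile at all, depending on which headers are in scope), while std::isnan from <cmath> is the overloaded, type-safe C++ form. A minimal demonstration:

```cpp
#include <cmath>
#include <iostream>
#include <limits>

int main() {
    double x = std::numeric_limits<double>::quiet_NaN();
    float  y = 1.0f;
    std::cout << std::boolalpha
              << std::isnan(x) << ' '    // true
              << std::isnan(y) << '\n';  // false; the overload set handles float too
}
```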
@ -1954,7 +1977,7 @@ ACTOR Future<Void> masterProxyServerCore(
when(ExclusionSafetyCheckRequest exclCheckReq = waitNext(proxy.exclusionSafetyCheckReq.getFuture())) {
addActor.send(proxyCheckSafeExclusion(db, exclCheckReq));
}
when(TxnStateRequest req = waitNext(proxy.txnState.getFuture())) {
when(state TxnStateRequest req = waitNext(proxy.txnState.getFuture())) {
state ReplyPromise<Void> reply = req.reply;
if(req.last) maxSequence = req.sequence + 1;
if (!txnSequences.count(req.sequence)) {
@ -2022,7 +2045,7 @@ ACTOR Future<Void> masterProxyServerCore(
commitData.txnStateStore->enableSnapshot();
}
}
reply.send(Void());
addActor.send(broadcastTxnRequest(req, SERVER_KNOBS->TXN_STATE_SEND_AMOUNT, true));
wait(yield());
}
}


@ -774,6 +774,7 @@ ACTOR Future<std::pair<Version, Tag>> addStorageServer( Database cx, StorageServ
try {
state Future<Standalone<RangeResultRef>> fTagLocalities = tr.getRange( tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY );
state Future<Optional<Value>> fv = tr.get( serverListKeyFor(server.id()) );
state Future<Optional<Value>> fExclProc = tr.get(
StringRef(encodeExcludedServersKey( AddressExclusion( server.address().ip, server.address().port ))) );
state Future<Optional<Value>> fExclIP = tr.get(
@ -782,14 +783,28 @@ ACTOR Future<std::pair<Version, Tag>> addStorageServer( Database cx, StorageServ
StringRef(encodeFailedServersKey( AddressExclusion( server.address().ip, server.address().port ))) );
state Future<Optional<Value>> fFailIP = tr.get(
StringRef(encodeFailedServersKey( AddressExclusion( server.address().ip ))) );
state Future<Optional<Value>> fExclProc2 = server.secondaryAddress().present() ? tr.get(
StringRef(encodeExcludedServersKey( AddressExclusion( server.secondaryAddress().get().ip, server.secondaryAddress().get().port ))) ) : Future<Optional<Value>>( Optional<Value>() );
state Future<Optional<Value>> fExclIP2 = server.secondaryAddress().present() ? tr.get(
StringRef(encodeExcludedServersKey( AddressExclusion( server.secondaryAddress().get().ip ))) ) : Future<Optional<Value>>( Optional<Value>() );
state Future<Optional<Value>> fFailProc2 = server.secondaryAddress().present() ? tr.get(
StringRef(encodeFailedServersKey( AddressExclusion( server.secondaryAddress().get().ip, server.secondaryAddress().get().port ))) ) : Future<Optional<Value>>( Optional<Value>() );
state Future<Optional<Value>> fFailIP2 = server.secondaryAddress().present() ? tr.get(
StringRef(encodeFailedServersKey( AddressExclusion( server.secondaryAddress().get().ip ))) ) : Future<Optional<Value>>( Optional<Value>() );
state Future<Standalone<RangeResultRef>> fTags = tr.getRange( serverTagKeys, CLIENT_KNOBS->TOO_MANY, true);
state Future<Standalone<RangeResultRef>> fHistoryTags = tr.getRange( serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY, true);
wait( success(fTagLocalities) && success(fv) && success(fExclProc) && success(fExclIP) && success(fFailProc) && success(fFailIP) && success(fTags) && success(fHistoryTags) );
wait( success(fTagLocalities) && success(fv) && success(fTags) && success(fHistoryTags) &&
success(fExclProc) && success(fExclIP) && success(fFailProc) && success(fFailIP) &&
success(fExclProc2) && success(fExclIP2) && success(fFailProc2) && success(fFailIP2) );
// If we have been added to the excluded or failed servers lists, we have to fail
if (fExclProc.get().present() || fExclIP.get().present() || fFailProc.get().present() || fFailIP.get().present() )
if (fExclProc.get().present() || fExclIP.get().present() || fFailProc.get().present() || fFailIP.get().present() ||
fExclProc2.get().present() || fExclIP2.get().present() || fFailProc2.get().present() || fFailIP2.get().present() ) {
throw recruitment_failed();
}
if(fTagLocalities.get().more || fTags.get().more || fHistoryTags.get().more)
ASSERT(false);
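The addStorageServer hunk above extends the exclusion check: previously only the candidate's primary address was tested against the excluded and failed server lists; now a process with a secondary address (e.g. a TLS and a non-TLS listener) is also rejected when that second address matches, by either ip or ip:port. A simplified sketch of the combined check (strings stand in for AddressExclusion keys):

```cpp
#include <iostream>
#include <set>
#include <string>

// Stand-in for AddressExclusion: an entry is either "ip" or "ip:port".
static bool excluded(const std::set<std::string>& list, const std::string& ip, int port) {
    return list.count(ip) > 0 || list.count(ip + ":" + std::to_string(port)) > 0;
}

int main() {
    std::set<std::string> excludedList = {"10.0.0.2", "10.0.0.1:4500"};

    std::string primaryIp = "10.0.0.1"; int primaryPort = 4000;
    bool hasSecondary = true;
    std::string secondaryIp = "10.0.0.2"; int secondaryPort = 4000;

    bool reject = excluded(excludedList, primaryIp, primaryPort) ||
                  (hasSecondary && excluded(excludedList, secondaryIp, secondaryPort));
    std::cout << std::boolalpha << reject << '\n'; // true: the secondary ip is excluded
}
```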


@ -385,6 +385,43 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
struct PeekTrackerData {
std::map<int, Promise<std::pair<Version, bool>>> sequence_version;
double lastUpdate;
Tag tag;
double lastLogged;
int64_t totalPeeks;
int64_t replyBytes;
int64_t duplicatePeeks;
double queueTime;
double queueMax;
double blockTime;
double blockMax;
double workTime;
double workMax;
int64_t unblockedPeeks;
double idleTime;
double idleMax;
PeekTrackerData() : lastUpdate(0) {
resetMetrics();
}
void resetMetrics() {
lastLogged = now();
totalPeeks = 0;
replyBytes = 0;
duplicatePeeks = 0;
queueTime = 0;
queueMax = 0;
blockTime = 0;
blockMax = 0;
workTime = 0;
workMax = 0;
unblockedPeeks = 0;
idleTime = 0;
idleMax = 0;
}
};
std::map<UID, PeekTrackerData> peekTracker;
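PeekTrackerData above splits each tracked peek's latency into phases, recorded later in tLogPeekMessages via queueStart, blockStart, and workStart: queue (waiting behind the previous sequence number), block (waiting for data to arrive), and work (building the reply), keeping a running sum and a max per phase. A rough non-actor sketch of the same bookkeeping:

```cpp
#include <algorithm>
#include <chrono>
#include <iostream>
#include <thread>

struct PhaseStats {
    double sum = 0, max = 0;
    void add(double t) { sum += t; max = std::max(max, t); }
};

static double now() {
    using namespace std::chrono;
    return duration<double>(steady_clock::now().time_since_epoch()).count();
}

int main() {
    PhaseStats queue, block, work;
    double queueStart = now();
    std::this_thread::sleep_for(std::chrono::milliseconds(5));  // waiting our turn
    double blockStart = now();
    std::this_thread::sleep_for(std::chrono::milliseconds(10)); // waiting for data
    double workStart = now();
    std::this_thread::sleep_for(std::chrono::milliseconds(2));  // building the reply
    queue.add(blockStart - queueStart);
    block.add(workStart - blockStart);
    work.add(now() - workStart);
    std::cout << "queue " << queue.sum << " block " << block.sum
              << " work " << work.sum << "\n";
}
```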
@ -1049,6 +1086,7 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
state BinaryWriter messages2(Unversioned());
state int sequence = -1;
state UID peekId;
state double queueStart = now();
if(req.sequence.present()) {
try {
@ -1059,6 +1097,7 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
}
auto& trackerData = logData->peekTracker[peekId];
if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) {
trackerData.tag = req.tag;
trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled));
}
auto seqBegin = trackerData.sequence_version.begin();
@ -1074,8 +1113,16 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
throw operation_obsolete();
}
Future<std::pair<Version, bool>> fPrevPeekData = trackerData.sequence_version[sequence].getFuture();
if(fPrevPeekData.isReady()) {
trackerData.unblockedPeeks++;
double t = now() - trackerData.lastUpdate;
if(t > trackerData.idleMax) trackerData.idleMax = t;
trackerData.idleTime += t;
}
trackerData.lastUpdate = now();
std::pair<Version, bool> prevPeekData = wait(trackerData.sequence_version[sequence].getFuture());
std::pair<Version, bool> prevPeekData = wait(fPrevPeekData);
req.begin = std::max(prevPeekData.first, req.begin);
req.onlySpilled = prevPeekData.second;
wait(yield());
@ -1089,6 +1136,8 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
}
}
state double blockStart = now();
if( req.returnIfBlocked && logData->version.get() < req.begin ) {
req.reply.sendError(end_of_stream());
if(req.sequence.present()) {
@ -1123,6 +1172,8 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
wait(delay(0, TaskPriority::TLogSpilledPeekReply));
}
state double workStart = now();
Version poppedVer = poppedVersion(logData, req.tag);
if(poppedVer > req.begin) {
TLogPeekReply rep;
@ -1211,6 +1262,22 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
if(req.sequence.present()) {
auto& trackerData = logData->peekTracker[peekId];
trackerData.lastUpdate = now();
double queueT = blockStart-queueStart;
double blockT = workStart-blockStart;
double workT = now()-workStart;
trackerData.totalPeeks++;
trackerData.replyBytes += reply.messages.size();
if(queueT > trackerData.queueMax) trackerData.queueMax = queueT;
if(blockT > trackerData.blockMax) trackerData.blockMax = blockT;
if(workT > trackerData.workMax) trackerData.workMax = workT;
trackerData.queueTime += queueT;
trackerData.blockTime += blockT;
trackerData.workTime += workT;
auto& sequenceData = trackerData.sequence_version[sequence+1];
if(trackerData.sequence_version.size() && sequence+1 < trackerData.sequence_version.begin()->first) {
req.reply.sendError(operation_obsolete());
@ -1219,6 +1286,7 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
return Void();
}
if(sequenceData.isSet()) {
trackerData.duplicatePeeks++;
if(sequenceData.getFuture().get().first != reply.end) {
TEST(true); //tlog peek second attempt ended at a different version
req.reply.sendError(operation_obsolete());
@ -1542,6 +1610,47 @@ ACTOR Future<Void> cleanupPeekTrackers( LogData* logData ) {
}
}
ACTOR Future<Void> logPeekTrackers( LogData* logData ) {
loop {
int64_t logThreshold = 1;
if(logData->peekTracker.size() > SERVER_KNOBS->PEEK_LOGGING_AMOUNT) {
std::vector<int64_t> peekCounts;
peekCounts.reserve(logData->peekTracker.size());
for( auto& it : logData->peekTracker ) {
peekCounts.push_back(it.second.totalPeeks);
}
size_t pivot = peekCounts.size()-SERVER_KNOBS->PEEK_LOGGING_AMOUNT;
std::nth_element(peekCounts.begin(), peekCounts.begin()+pivot, peekCounts.end());
logThreshold = std::max<int64_t>(1,peekCounts[pivot]);
}
int logCount = 0;
for( auto& it : logData->peekTracker ) {
if(it.second.totalPeeks >= logThreshold) {
logCount++;
TraceEvent("PeekMetrics", logData->logId)
.detail("Tag", it.second.tag.toString())
.detail("Elapsed", now() - it.second.lastLogged)
.detail("MeanReplyBytes", it.second.replyBytes/it.second.totalPeeks)
.detail("TotalPeeks", it.second.totalPeeks)
.detail("UnblockedPeeks", it.second.unblockedPeeks)
.detail("DuplicatePeeks", it.second.duplicatePeeks)
.detail("Sequence", it.second.sequence_version.size() ? it.second.sequence_version.begin()->first : -1)
.detail("IdleSeconds", it.second.idleTime)
.detail("IdleMax", it.second.idleMax)
.detail("QueueSeconds", it.second.queueTime)
.detail("QueueMax", it.second.queueMax)
.detail("BlockSeconds", it.second.blockTime)
.detail("BlockMax", it.second.blockMax)
.detail("WorkSeconds", it.second.workTime)
.detail("WorkMax", it.second.workMax);
it.second.resetMetrics();
}
}
wait( delay(SERVER_KNOBS->PEEK_LOGGING_DELAY * std::max(1,logCount)) );
}
}
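logPeekTrackers above bounds its trace volume with a selection trick: when there are more trackers than PEEK_LOGGING_AMOUNT, std::nth_element finds, in linear time, the peek count that only the busiest PEEK_LOGGING_AMOUNT trackers meet or exceed, and only those emit a PeekMetrics event (the loop delay then also scales with how many were logged). A stand-alone sketch of the threshold selection:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    const std::size_t PEEK_LOGGING_AMOUNT = 3;
    std::vector<int64_t> peekCounts = {5, 40, 2, 17, 9, 23, 1}; // one per tracker

    int64_t logThreshold = 1;
    if (peekCounts.size() > PEEK_LOGGING_AMOUNT) {
        std::size_t pivot = peekCounts.size() - PEEK_LOGGING_AMOUNT;
        // Partially sorts so peekCounts[pivot] is the (pivot+1)-th smallest count.
        std::nth_element(peekCounts.begin(), peekCounts.begin() + pivot, peekCounts.end());
        logThreshold = std::max<int64_t>(1, peekCounts[pivot]);
    }
    std::cout << "threshold " << logThreshold << '\n'; // 17: only 40, 23, 17 get logged
}
```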
void getQueuingMetrics( TLogData* self, Reference<LogData> logData, TLogQueuingMetricsRequest const& req ) {
TLogQueuingMetricsReply reply;
reply.localTime = now();
@ -1880,6 +1989,7 @@ ACTOR Future<Void> tLogCore( TLogData* self, Reference<LogData> logData, TLogInt
logData->addActor.send( traceCounters("TLogMetrics", logData->logId, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &logData->cc, logData->logId.toString() + "/TLogMetrics"));
logData->addActor.send( serveTLogInterface(self, tli, logData, warningCollectorInput) );
logData->addActor.send( cleanupPeekTrackers(logData.getPtr()) );
logData->addActor.send( logPeekTrackers(logData.getPtr()) );
if(!logData->isPrimary) {
std::vector<Tag> tags;


@ -495,6 +495,44 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
struct PeekTrackerData {
std::map<int, Promise<std::pair<Version, bool>>> sequence_version;
double lastUpdate;
Tag tag;
double lastLogged;
int64_t totalPeeks;
int64_t replyBytes;
int64_t duplicatePeeks;
double queueTime;
double queueMax;
double blockTime;
double blockMax;
double workTime;
double workMax;
int64_t unblockedPeeks;
double idleTime;
double idleMax;
PeekTrackerData() : lastUpdate(0) {
resetMetrics();
}
void resetMetrics() {
lastLogged = now();
totalPeeks = 0;
replyBytes = 0;
duplicatePeeks = 0;
queueTime = 0;
queueMax = 0;
blockTime = 0;
blockMax = 0;
workTime = 0;
workMax = 0;
unblockedPeeks = 0;
idleTime = 0;
idleMax = 0;
}
};
std::map<UID, PeekTrackerData> peekTracker;
@ -1352,6 +1390,7 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
state BinaryWriter messages2(Unversioned());
state int sequence = -1;
state UID peekId;
state double queueStart = now();
if(req.sequence.present()) {
try {
@ -1362,6 +1401,7 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
}
auto& trackerData = logData->peekTracker[peekId];
if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) {
trackerData.tag = req.tag;
trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled));
}
auto seqBegin = trackerData.sequence_version.begin();
@ -1378,8 +1418,15 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
throw operation_obsolete();
}
Future<std::pair<Version, bool>> fPrevPeekData = trackerData.sequence_version[sequence].getFuture();
if(fPrevPeekData.isReady()) {
trackerData.unblockedPeeks++;
double t = now() - trackerData.lastUpdate;
if(t > trackerData.idleMax) trackerData.idleMax = t;
trackerData.idleTime += t;
}
trackerData.lastUpdate = now();
std::pair<Version, bool> prevPeekData = wait(trackerData.sequence_version[sequence].getFuture());
std::pair<Version, bool> prevPeekData = wait(fPrevPeekData);
req.begin = std::max(prevPeekData.first, req.begin);
req.onlySpilled = prevPeekData.second;
wait(yield());
@ -1393,6 +1440,8 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
}
}
state double blockStart = now();
if( req.returnIfBlocked && logData->version.get() < req.begin ) {
req.reply.sendError(end_of_stream());
if(req.sequence.present()) {
@ -1427,6 +1476,8 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
wait(delay(0, TaskPriority::TLogSpilledPeekReply));
}
state double workStart = now();
Version poppedVer = poppedVersion(logData, req.tag);
if(poppedVer > req.begin) {
TLogPeekReply rep;
@ -1603,6 +1654,22 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
if(req.sequence.present()) {
auto& trackerData = logData->peekTracker[peekId];
trackerData.lastUpdate = now();
double queueT = blockStart-queueStart;
double blockT = workStart-blockStart;
double workT = now()-workStart;
trackerData.totalPeeks++;
trackerData.replyBytes += reply.messages.size();
if(queueT > trackerData.queueMax) trackerData.queueMax = queueT;
if(blockT > trackerData.blockMax) trackerData.blockMax = blockT;
if(workT > trackerData.workMax) trackerData.workMax = workT;
trackerData.queueTime += queueT;
trackerData.blockTime += blockT;
trackerData.workTime += workT;
auto& sequenceData = trackerData.sequence_version[sequence+1];
if(trackerData.sequence_version.size() && sequence+1 < trackerData.sequence_version.begin()->first) {
req.reply.sendError(operation_obsolete());
@ -1611,6 +1678,7 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
return Void();
}
if(sequenceData.isSet()) {
trackerData.duplicatePeeks++;
if(sequenceData.getFuture().get().first != reply.end) {
TEST(true); //tlog peek second attempt ended at a different version
req.reply.sendError(operation_obsolete());
@ -1934,6 +2002,47 @@ ACTOR Future<Void> cleanupPeekTrackers( LogData* logData ) {
}
}
ACTOR Future<Void> logPeekTrackers( LogData* logData ) {
loop {
int64_t logThreshold = 1;
if(logData->peekTracker.size() > SERVER_KNOBS->PEEK_LOGGING_AMOUNT) {
std::vector<int64_t> peekCounts;
peekCounts.reserve(logData->peekTracker.size());
for( auto& it : logData->peekTracker ) {
peekCounts.push_back(it.second.totalPeeks);
}
size_t pivot = peekCounts.size()-SERVER_KNOBS->PEEK_LOGGING_AMOUNT;
std::nth_element(peekCounts.begin(), peekCounts.begin()+pivot, peekCounts.end());
logThreshold = std::max<int64_t>(1,peekCounts[pivot]);
}
int logCount = 0;
for( auto& it : logData->peekTracker ) {
if(it.second.totalPeeks >= logThreshold) {
logCount++;
TraceEvent("PeekMetrics", logData->logId)
.detail("Tag", it.second.tag.toString())
.detail("Elapsed", now() - it.second.lastLogged)
.detail("MeanReplyBytes", it.second.replyBytes/it.second.totalPeeks)
.detail("TotalPeeks", it.second.totalPeeks)
.detail("UnblockedPeeks", it.second.unblockedPeeks)
.detail("DuplicatePeeks", it.second.duplicatePeeks)
.detail("Sequence", it.second.sequence_version.size() ? it.second.sequence_version.begin()->first : -1)
.detail("IdleSeconds", it.second.idleTime)
.detail("IdleMax", it.second.idleMax)
.detail("QueueSeconds", it.second.queueTime)
.detail("QueueMax", it.second.queueMax)
.detail("BlockSeconds", it.second.blockTime)
.detail("BlockMax", it.second.blockMax)
.detail("WorkSeconds", it.second.workTime)
.detail("WorkMax", it.second.workMax);
it.second.resetMetrics();
}
}
wait( delay(SERVER_KNOBS->PEEK_LOGGING_DELAY * std::max(1,logCount)) );
}
}
void getQueuingMetrics( TLogData* self, Reference<LogData> logData, TLogQueuingMetricsRequest const& req ) {
TLogQueuingMetricsReply reply;
reply.localTime = now();
@ -2283,6 +2392,7 @@ ACTOR Future<Void> tLogCore( TLogData* self, Reference<LogData> logData, TLogInt
logData->addActor.send( traceCounters("TLogMetrics", logData->logId, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &logData->cc, logData->logId.toString() + "/TLogMetrics"));
logData->addActor.send( serveTLogInterface(self, tli, logData, warningCollectorInput) );
logData->addActor.send( cleanupPeekTrackers(logData.getPtr()) );
logData->addActor.send( logPeekTrackers(logData.getPtr()) );
if(!logData->isPrimary) {
std::vector<Tag> tags;


@ -829,7 +829,7 @@ ACTOR static Future<Void> _parseRangeFileToMutationsOnLoader(
// All key-value pairs in a range file share the same version, namely rangeFile.version
Reference<IAsyncFile> inFile = wait(bc->readFile(asset.filename));
state VectorRef<KeyValueRef> blockData;
state Standalone<VectorRef<KeyValueRef>> blockData;
try {
Standalone<VectorRef<KeyValueRef>> kvs =
wait(fileBackup::decodeRangeFileBlock(inFile, asset.offset, asset.len));


@ -717,7 +717,7 @@ ACTOR static Future<Version> collectBackupFiles(Reference<IBackupContainer> bc,
ACTOR static Future<Void> insertRangeVersion(KeyRangeMap<Version>* pRangeVersions, RestoreFileFR* file,
Reference<IBackupContainer> bc) {
TraceEvent("FastRestoreMasterDecodeRangeVersion").detail("File", file->toString());
RangeFile rangeFile(file->version, file->blockSize, file->fileName, file->fileSize);
RangeFile rangeFile = { file->version, (uint32_t)file->blockSize, file->fileName, file->fileSize };
// First and last key are the range for this file: endKey is exclusive
KeyRange fileRange = wait(bc->getSnapshotFileKeyRange(rangeFile));


@ -38,22 +38,6 @@
#define SevFRMutationInfo SevVerbose
//#define SevFRMutationInfo SevInfo
struct VersionedMutation {
MutationRef mutation;
LogMessageVersion version;
VersionedMutation() = default;
explicit VersionedMutation(MutationRef mutation, LogMessageVersion version)
: mutation(mutation), version(version) {}
explicit VersionedMutation(Arena& to, const VersionedMutation& from)
: mutation(to, from.mutation), version(from.version) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, mutation, version);
}
};
using MutationsVec = Standalone<VectorRef<MutationRef>>;
using LogMessageVersionVec = Standalone<VectorRef<LogMessageVersion>>;
using VersionedMutationsVec = Standalone<VectorRef<VersionedMutation>>;


@ -147,7 +147,10 @@ ACTOR Future<Void> collectRestoreWorkerInterface(Reference<RestoreWorkerData> se
}
break;
}
TraceEvent("FastRestore").suppressFor(10.0).detail("NotEnoughWorkers", agentValues.size());
TraceEvent("FastRestore")
.suppressFor(10.0)
.detail("NotEnoughWorkers", agentValues.size())
.detail("MinWorkers", min_num_workers);
wait(delay(5.0));
} catch (Error& e) {
wait(tr.onError(e));


@ -22,13 +22,13 @@
#define FDBSERVER_SERVERDBINFO_H
#pragma once
#include "fdbserver/ClusterRecruitmentInterface.h"
#include "fdbserver/DataDistributorInterface.h"
#include "fdbserver/MasterInterface.h"
#include "fdbserver/LogSystemConfig.h"
#include "fdbserver/RatekeeperInterface.h"
#include "fdbserver/RecoveryState.h"
#include "fdbserver/LatencyBandConfig.h"
#include "fdbserver/WorkerInterface.actor.h"
struct ServerDBInfo {
constexpr static FileIdentifier file_identifier = 13838807;
@ -51,29 +51,45 @@ struct ServerDBInfo {
std::vector<UID> priorCommittedLogServers; // If !fullyRecovered and logSystemConfig refers to a new log system which may not have been committed to the coordinated state yet, then priorCommittedLogServers are the previous, fully committed generation which need to stay alive in case this recovery fails
Optional<LatencyBandConfig> latencyBandConfig;
std::vector<std::pair<uint16_t,StorageServerInterface>> storageCaches;
int64_t infoGeneration;
explicit ServerDBInfo() : recoveryCount(0), recoveryState(RecoveryState::UNINITIALIZED), logSystemConfig(0) {}
ServerDBInfo() : recoveryCount(0), recoveryState(RecoveryState::UNINITIALIZED), logSystemConfig(0), infoGeneration(0) {}
bool operator == (ServerDBInfo const& r) const { return id == r.id; }
bool operator != (ServerDBInfo const& r) const { return id != r.id; }
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, id, clusterInterface, client, distributor, master, ratekeeper, resolvers, recoveryCount, recoveryState, masterLifetime, logSystemConfig, priorCommittedLogServers, latencyBandConfig, storageCaches);
serializer(ar, id, clusterInterface, client, distributor, master, ratekeeper, resolvers, recoveryCount, recoveryState, masterLifetime, logSystemConfig, priorCommittedLogServers, latencyBandConfig, storageCaches, infoGeneration);
}
};
struct UpdateServerDBInfoRequest {
constexpr static FileIdentifier file_identifier = 9467438;
Standalone<StringRef> serializedDbInfo;
std::vector<Endpoint> broadcastInfo;
ReplyPromise<std::vector<Endpoint>> reply;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, serializedDbInfo, broadcastInfo, reply);
}
};
struct GetServerDBInfoRequest {
constexpr static FileIdentifier file_identifier = 9467438;
constexpr static FileIdentifier file_identifier = 9467439;
UID knownServerInfoID;
Standalone<VectorRef<StringRef>> issues;
std::vector<NetworkAddress> incompatiblePeers;
ReplyPromise< CachedSerialization<struct ServerDBInfo> > reply;
ReplyPromise<struct ServerDBInfo> reply;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, knownServerInfoID, issues, incompatiblePeers, reply);
serializer(ar, knownServerInfoID, reply);
}
};
Future<Void> broadcastTxnRequest(TxnStateRequest const& req, int const& sendAmount, bool const& sendReply);
Future<std::vector<Endpoint>> broadcastDBInfoRequest(UpdateServerDBInfoRequest const& req, int const& sendAmount, Optional<Endpoint> const& sender, bool const& sendReply);
#endif
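ServerDBInfo now carries an infoGeneration, and the new UpdateServerDBInfoRequest/broadcastDBInfoRequest pair pushes it through the same kind of broadcast tree as broadcastTxnRequest. One plausible reading (this sketch is an assumption about intent, with simplified types, not the actual worker code): since a worker may hear updates out of order from different forwarders, it should apply an incoming ServerDBInfo only if its generation is newer:

```cpp
#include <cstdint>
#include <iostream>

struct DBInfo { int64_t infoGeneration = 0; int payload = 0; };

bool applyIfNewer(DBInfo& local, const DBInfo& incoming) {
    if (incoming.infoGeneration <= local.infoGeneration) return false; // stale or duplicate
    local = incoming;
    return true;
}

int main() {
    DBInfo local;
    std::cout << applyIfNewer(local, {2, 20}) << "\n"; // 1: applied
    std::cout << applyIfNewer(local, {1, 10}) << "\n"; // 0: out-of-order update ignored
}
```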


@ -25,7 +25,6 @@
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbclient/ClusterInterface.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/ClusterRecruitmentInterface.h"
#include "fdbserver/CoordinationInterface.h"
#include "fdbmonitor/SimpleIni.h"
#include "fdbrpc/AsyncFileNonDurable.actor.h"
@ -737,7 +736,7 @@ void SimulationConfig::generateNormalConfig(int minimumReplication, int minimumR
if (deterministicRandom()->random01() < 0.25) db.desiredTLogCount = deterministicRandom()->randomInt(1,7);
if (deterministicRandom()->random01() < 0.25) db.masterProxyCount = deterministicRandom()->randomInt(1,7);
if (deterministicRandom()->random01() < 0.25) db.resolverCount = deterministicRandom()->randomInt(1,7);
int storage_engine_type = deterministicRandom()->randomInt(0, 3);
int storage_engine_type = deterministicRandom()->randomInt(0, 4);
switch (storage_engine_type) {
case 0: {
TEST(true); // Simulated cluster using ssd storage engine


@ -25,7 +25,6 @@
#include "fdbclient/SystemData.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/ClusterRecruitmentInterface.h"
#include <time.h>
#include "fdbserver/CoordinationInterface.h"
#include "fdbserver/DataDistribution.actor.h"
@ -35,28 +34,6 @@
#include "fdbclient/JsonBuilder.h"
#include "flow/actorcompiler.h" // This must be the last #include.
void setIssues(ProcessIssuesMap& issueMap, NetworkAddress const& addr, VectorRef<StringRef> const& issues,
Optional<UID>& issueID) {
if (issues.size()) {
auto& e = issueMap[addr];
e.first = issues;
e.second = deterministicRandom()->randomUniqueID();
issueID = e.second;
} else {
issueMap.erase(addr);
issueID = Optional<UID>();
}
}
void removeIssues(ProcessIssuesMap& issueMap, NetworkAddress const& addr, Optional<UID>& issueID) {
if (!issueID.present()) {
return;
}
if (issueMap.count(addr) && issueMap[addr].second == issueID.get()) {
issueMap.erase( addr );
}
}
const char* RecoveryStatus::names[] = {
"reading_coordinated_state", "locking_coordinated_state", "locking_old_transaction_servers", "reading_transaction_system_state",
"configuration_missing", "configuration_never_created", "configuration_invalid",
@ -364,7 +341,10 @@ static JsonBuilderObject machineStatusFetcher(WorkerEvents mMetrics, vector<Work
machineJsonMap[machineId] = statusObj;
}
if (configuration.present() && !configuration.get().isExcludedServer(it->first))
// FIXME: this will not catch the case where the secondary address of the process was excluded
NetworkAddressList tempList;
tempList.address = it->first;
if (configuration.present() && !configuration.get().isExcludedServer(tempList))
notExcludedMap[machineId] = false;
workerContribMap[machineId] ++;
}
@ -569,7 +549,7 @@ struct RolesInfo {
};
ACTOR static Future<JsonBuilderObject> processStatusFetcher(
Reference<AsyncVar<CachedSerialization<ServerDBInfo>>> db, std::vector<WorkerDetails> workers, WorkerEvents pMetrics,
Reference<AsyncVar<ServerDBInfo>> db, std::vector<WorkerDetails> workers, WorkerEvents pMetrics,
WorkerEvents mMetrics, WorkerEvents nMetrics, WorkerEvents errors, WorkerEvents traceFileOpenErrors,
WorkerEvents programStarts, std::map<std::string, std::vector<JsonBuilderObject>> processIssues,
vector<std::pair<StorageServerInterface, EventMap>> storageServers,
@ -627,18 +607,18 @@ ACTOR static Future<JsonBuilderObject> processStatusFetcher(
state RolesInfo roles;
roles.addRole("master", db->get().read().master);
roles.addRole("cluster_controller", db->get().read().clusterInterface.clientInterface);
roles.addRole("master", db->get().master);
roles.addRole("cluster_controller", db->get().clusterInterface.clientInterface);
if (db->get().read().distributor.present()) {
roles.addRole("data_distributor", db->get().read().distributor.get());
if (db->get().distributor.present()) {
roles.addRole("data_distributor", db->get().distributor.get());
}
if (db->get().read().ratekeeper.present()) {
roles.addRole("ratekeeper", db->get().read().ratekeeper.get());
if (db->get().ratekeeper.present()) {
roles.addRole("ratekeeper", db->get().ratekeeper.get());
}
for(auto& tLogSet : db->get().read().logSystemConfig.tLogs) {
for(auto& tLogSet : db->get().logSystemConfig.tLogs) {
for(auto& it : tLogSet.logRouters) {
if(it.present()) {
roles.addRole("router", it.interf());
@ -646,7 +626,7 @@ ACTOR static Future<JsonBuilderObject> processStatusFetcher(
}
}
for(auto& old : db->get().read().logSystemConfig.oldTLogs) {
for(auto& old : db->get().logSystemConfig.oldTLogs) {
for(auto& tLogSet : old.tLogs) {
for(auto& it : tLogSet.logRouters) {
if(it.present()) {
@ -689,7 +669,7 @@ ACTOR static Future<JsonBuilderObject> processStatusFetcher(
}
state std::vector<ResolverInterface>::const_iterator res;
state std::vector<ResolverInterface> resolvers = db->get().read().resolvers;
state std::vector<ResolverInterface> resolvers = db->get().resolvers;
for(res = resolvers.begin(); res != resolvers.end(); ++res) {
roles.addRole( "resolver", *res );
wait(yield());
@ -850,7 +830,7 @@ ACTOR static Future<JsonBuilderObject> processStatusFetcher(
statusObj["roles"] = roles.getStatusForAddress(address);
if (configuration.present()){
statusObj["excluded"] = configuration.get().isExcludedServer(address);
statusObj["excluded"] = configuration.get().isExcludedServer(workerItr->interf.addresses());
}
statusObj["class_type"] = workerItr->processClass.toString();
@ -1551,17 +1531,17 @@ ACTOR static Future<vector<std::pair<StorageServerInterface, EventMap>>> getStor
return results;
}
ACTOR static Future<vector<std::pair<TLogInterface, EventMap>>> getTLogsAndMetrics(Reference<AsyncVar<CachedSerialization<ServerDBInfo>>> db, std::unordered_map<NetworkAddress, WorkerInterface> address_workers) {
vector<TLogInterface> servers = db->get().read().logSystemConfig.allPresentLogs();
ACTOR static Future<vector<std::pair<TLogInterface, EventMap>>> getTLogsAndMetrics(Reference<AsyncVar<ServerDBInfo>> db, std::unordered_map<NetworkAddress, WorkerInterface> address_workers) {
vector<TLogInterface> servers = db->get().logSystemConfig.allPresentLogs();
vector<std::pair<TLogInterface, EventMap>> results =
wait(getServerMetrics(servers, address_workers, std::vector<std::string>{ "TLogMetrics" }));
return results;
}
ACTOR static Future<vector<std::pair<MasterProxyInterface, EventMap>>> getProxiesAndMetrics(Reference<AsyncVar<CachedSerialization<ServerDBInfo>>> db, std::unordered_map<NetworkAddress, WorkerInterface> address_workers) {
ACTOR static Future<vector<std::pair<MasterProxyInterface, EventMap>>> getProxiesAndMetrics(Reference<AsyncVar<ServerDBInfo>> db, std::unordered_map<NetworkAddress, WorkerInterface> address_workers) {
vector<std::pair<MasterProxyInterface, EventMap>> results = wait(getServerMetrics(
db->get().read().client.proxies, address_workers, std::vector<std::string>{ "GRVLatencyMetrics", "CommitLatencyMetrics" }));
db->get().client.proxies, address_workers, std::vector<std::string>{ "GRVLatencyMetrics", "CommitLatencyMetrics" }));
return results;
}
@ -1571,7 +1551,7 @@ static int getExtraTLogEligibleZones(const vector<WorkerDetails>& workers, const
std::map<Key,std::set<StringRef>> dcId_zone;
for(auto const& worker : workers) {
if(worker.processClass.machineClassFitness(ProcessClass::TLog) < ProcessClass::NeverAssign
&& !configuration.isExcludedServer(worker.interf.address()))
&& !configuration.isExcludedServer(worker.interf.addresses()))
{
allZones.insert(worker.interf.locality.zoneId().get());
if(worker.interf.locality.dcId().present()) {
@ -1629,7 +1609,7 @@ JsonBuilderObject getPerfLimit(TraceEventFields const& ratekeeper, double transP
return perfLimit;
}
ACTOR static Future<JsonBuilderObject> workloadStatusFetcher(Reference<AsyncVar<CachedSerialization<ServerDBInfo>>> db, vector<WorkerDetails> workers, WorkerDetails mWorker, WorkerDetails rkWorker,
ACTOR static Future<JsonBuilderObject> workloadStatusFetcher(Reference<AsyncVar<ServerDBInfo>> db, vector<WorkerDetails> workers, WorkerDetails mWorker, WorkerDetails rkWorker,
JsonBuilderObject *qos, JsonBuilderObject *data_overlay, std::set<std::string> *incomplete_reasons, Future<ErrorOr<vector<std::pair<StorageServerInterface, EventMap>>>> storageServerFuture)
{
state JsonBuilderObject statusObj;
@ -1644,7 +1624,7 @@ ACTOR static Future<JsonBuilderObject> workloadStatusFetcher(Reference<AsyncVar<
for (auto const& w : workers) {
workersMap[w.interf.address()] = w;
}
for (auto &p : db->get().read().client.proxies) {
for (auto &p : db->get().client.proxies) {
auto worker = getWorker(workersMap, p.address());
if (worker.present())
proxyStatFutures.push_back(timeoutError(worker.get().interf.eventLogRequest.getReply(EventLogRequest(LiteralStringRef("ProxyMetrics"))), 1.0));
@ -1859,11 +1839,11 @@ ACTOR static Future<JsonBuilderObject> clusterSummaryStatisticsFetcher(WorkerEve
return statusObj;
}
static JsonBuilderArray oldTlogFetcher(int* oldLogFaultTolerance, Reference<AsyncVar<CachedSerialization<ServerDBInfo>>> db, std::unordered_map<NetworkAddress, WorkerInterface> const& address_workers) {
static JsonBuilderArray oldTlogFetcher(int* oldLogFaultTolerance, Reference<AsyncVar<ServerDBInfo>> db, std::unordered_map<NetworkAddress, WorkerInterface> const& address_workers) {
JsonBuilderArray oldTlogsArray;
if(db->get().read().recoveryState >= RecoveryState::ACCEPTING_COMMITS) {
for(auto it : db->get().read().logSystemConfig.oldTLogs) {
if(db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS) {
for(auto it : db->get().logSystemConfig.oldTLogs) {
JsonBuilderObject statusObj;
JsonBuilderArray logsObj;
Optional<int32_t> sat_log_replication_factor, sat_log_write_anti_quorum, sat_log_fault_tolerance, log_replication_factor, log_write_anti_quorum, log_fault_tolerance, remote_log_replication_factor, remote_log_fault_tolerance;
@ -1986,15 +1966,14 @@ static std::string getIssueDescription(std::string name) {
}
static std::map<std::string, std::vector<JsonBuilderObject>> getProcessIssuesAsMessages(
ProcessIssuesMap const& _issues) {
std::vector<ProcessIssues> const& issues) {
std::map<std::string, std::vector<JsonBuilderObject>> issuesMap;
try {
ProcessIssuesMap issues = _issues;
for (auto processIssues : issues) {
for (auto issue : processIssues.second.first) {
for (auto issue : processIssues.issues) {
std::string issueStr = issue.toString();
issuesMap[processIssues.first.toString()].push_back(
issuesMap[processIssues.address.toString()].push_back(
JsonString::makeMessage(issueStr.c_str(), getIssueDescription(issueStr).c_str()));
}
}
@ -2109,7 +2088,7 @@ ACTOR Future<JsonBuilderObject> layerStatusFetcher(Database cx, JsonBuilderArray
return statusObj;
}
ACTOR Future<JsonBuilderObject> lockedStatusFetcher(Reference<AsyncVar<CachedSerialization<ServerDBInfo>>> db, JsonBuilderArray *messages, std::set<std::string> *incomplete_reasons) {
ACTOR Future<JsonBuilderObject> lockedStatusFetcher(Reference<AsyncVar<ServerDBInfo>> db, JsonBuilderArray *messages, std::set<std::string> *incomplete_reasons) {
state JsonBuilderObject statusObj;
state Database cx = openDBOnServer(db, TaskPriority::DefaultEndpoint, true, false); // Open a new database connection that isn't lock-aware
@ -2181,10 +2160,10 @@ ACTOR Future<Optional<Value>> getActivePrimaryDC(Database cx, JsonBuilderArray*
// constructs the cluster section of the json status output
ACTOR Future<StatusReply> clusterGetStatus(
Reference<AsyncVar<CachedSerialization<ServerDBInfo>>> db,
Reference<AsyncVar<ServerDBInfo>> db,
Database cx,
vector<WorkerDetails> workers,
ProcessIssuesMap workerIssues,
std::vector<ProcessIssues> workerIssues,
std::map<NetworkAddress, std::pair<double, OpenDatabaseRequest>>* clientStatus,
ServerCoordinators coordinators,
std::vector<NetworkAddress> incompatibleConnections,
@ -2201,7 +2180,7 @@ ACTOR Future<StatusReply> clusterGetStatus(
try {
// Get the master Worker interface
Optional<WorkerDetails> _mWorker = getWorker( workers, db->get().read().master.address() );
Optional<WorkerDetails> _mWorker = getWorker( workers, db->get().master.address() );
if (_mWorker.present()) {
mWorker = _mWorker.get();
} else {
@ -2209,11 +2188,11 @@ ACTOR Future<StatusReply> clusterGetStatus(
}
// Get the DataDistributor worker interface
Optional<WorkerDetails> _ddWorker;
if (db->get().read().distributor.present()) {
_ddWorker = getWorker( workers, db->get().read().distributor.get().address() );
if (db->get().distributor.present()) {
_ddWorker = getWorker( workers, db->get().distributor.get().address() );
}
if (!db->get().read().distributor.present() || !_ddWorker.present()) {
if (!db->get().distributor.present() || !_ddWorker.present()) {
messages.push_back(JsonString::makeMessage("unreachable_dataDistributor_worker", "Unable to locate the data distributor worker."));
} else {
ddWorker = _ddWorker.get();
@ -2221,11 +2200,11 @@ ACTOR Future<StatusReply> clusterGetStatus(
// Get the Ratekeeper worker interface
Optional<WorkerDetails> _rkWorker;
if (db->get().read().ratekeeper.present()) {
_rkWorker = getWorker( workers, db->get().read().ratekeeper.get().address() );
if (db->get().ratekeeper.present()) {
_rkWorker = getWorker( workers, db->get().ratekeeper.get().address() );
}
if (!db->get().read().ratekeeper.present() || !_rkWorker.present()) {
if (!db->get().ratekeeper.present() || !_rkWorker.present()) {
messages.push_back(JsonString::makeMessage("unreachable_ratekeeper_worker", "Unable to locate the ratekeeper worker."));
} else {
rkWorker = _rkWorker.get();
@ -2283,8 +2262,8 @@ ACTOR Future<StatusReply> clusterGetStatus(
state WorkerEvents programStarts = workerEventsVec[5].present() ? workerEventsVec[5].get().first : WorkerEvents();
state JsonBuilderObject statusObj;
if(db->get().read().recoveryCount > 0) {
statusObj["generation"] = db->get().read().recoveryCount;
if(db->get().recoveryCount > 0) {
statusObj["generation"] = db->get().recoveryCount;
}
state std::map<std::string, std::vector<JsonBuilderObject>> processIssues =
@ -2367,7 +2346,7 @@ ACTOR Future<StatusReply> clusterGetStatus(
state std::vector<JsonBuilderObject> workerStatuses = wait(getAll(futures2));
int oldLogFaultTolerance = 100;
if(db->get().read().recoveryState >= RecoveryState::ACCEPTING_COMMITS && db->get().read().logSystemConfig.oldTLogs.size() > 0) {
if(db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS && db->get().logSystemConfig.oldTLogs.size() > 0) {
statusObj["old_logs"] = oldTlogFetcher(&oldLogFaultTolerance, db, address_workers);
}


@ -27,14 +27,14 @@
#include "fdbserver/MasterInterface.h"
#include "fdbclient/ClusterInterface.h"
typedef Standalone<VectorRef<StringRef>> ProcessIssues;
typedef std::map<NetworkAddress, std::pair<ProcessIssues, UID>> ProcessIssuesMap;
struct ProcessIssues {
NetworkAddress address;
Standalone<VectorRef<StringRef>> issues;
void setIssues(ProcessIssuesMap& issueMap, NetworkAddress const& addr, VectorRef<StringRef> const& issues, Optional<UID>& issueID);
ProcessIssues(NetworkAddress address, Standalone<VectorRef<StringRef>> issues) : address(address), issues(issues) {}
};
void removeIssues(ProcessIssuesMap& issueMap, NetworkAddress const& addr, Optional<UID>& issueID);
Future<StatusReply> clusterGetStatus( Reference<AsyncVar<CachedSerialization<struct ServerDBInfo>>> const& db, Database const& cx, vector<WorkerDetails> const& workers,
ProcessIssuesMap const& workerIssues, std::map<NetworkAddress, std::pair<double, OpenDatabaseRequest>>* const& clientStatus, ServerCoordinators const& coordinators, std::vector<NetworkAddress> const& incompatibleConnections, Version const& datacenterVersionDifference );
Future<StatusReply> clusterGetStatus( Reference<AsyncVar<struct ServerDBInfo>> const& db, Database const& cx, vector<WorkerDetails> const& workers, std::vector<ProcessIssues> const& workerIssues,
std::map<NetworkAddress, std::pair<double, OpenDatabaseRequest>>* const& clientStatus, ServerCoordinators const& coordinators, std::vector<NetworkAddress> const& incompatibleConnections, Version const& datacenterVersionDifference );
#endif


@ -378,7 +378,7 @@ struct StorageServerMetrics {
}
}
void getStorageMetrics( GetStorageMetricsRequest req, StorageBytes sb, double bytesInputRate ){
void getStorageMetrics( GetStorageMetricsRequest req, StorageBytes sb, double bytesInputRate, int64_t versionLag, double lastUpdate ){
GetStorageMetricsReply rep;
// SOMEDAY: make bytes dynamic with hard disk space
@ -405,6 +405,9 @@ struct StorageServerMetrics {
rep.bytesInputRate = bytesInputRate;
rep.versionLag = versionLag;
rep.lastUpdate = lastUpdate;
req.reply.send(rep);
}


@ -491,6 +491,43 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
struct PeekTrackerData {
std::map<int, Promise<std::pair<Version, bool>>> sequence_version;
double lastUpdate;
Tag tag;
double lastLogged;
int64_t totalPeeks;
int64_t replyBytes;
int64_t duplicatePeeks;
double queueTime;
double queueMax;
double blockTime;
double blockMax;
double workTime;
double workMax;
int64_t unblockedPeeks;
double idleTime;
double idleMax;
PeekTrackerData() : lastUpdate(0) {
resetMetrics();
}
void resetMetrics() {
lastLogged = now();
totalPeeks = 0;
replyBytes = 0;
duplicatePeeks = 0;
queueTime = 0;
queueMax = 0;
blockTime = 0;
blockMax = 0;
workTime = 0;
workMax = 0;
unblockedPeeks = 0;
idleTime = 0;
idleMax = 0;
}
};
std::map<UID, PeekTrackerData> peekTracker;
@ -1366,6 +1403,7 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
state BinaryWriter messages2(Unversioned());
state int sequence = -1;
state UID peekId;
state double queueStart = now();
if(req.sequence.present()) {
try {
@ -1376,6 +1414,7 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
}
auto& trackerData = logData->peekTracker[peekId];
if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) {
trackerData.tag = req.tag;
trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled));
}
auto seqBegin = trackerData.sequence_version.begin();
@ -1392,8 +1431,15 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
throw operation_obsolete();
}
Future<std::pair<Version, bool>> fPrevPeekData = trackerData.sequence_version[sequence].getFuture();
if(fPrevPeekData.isReady()) {
trackerData.unblockedPeeks++;
double t = now() - trackerData.lastUpdate;
if(t > trackerData.idleMax) trackerData.idleMax = t;
trackerData.idleTime += t;
}
trackerData.lastUpdate = now();
std::pair<Version, bool> prevPeekData = wait(trackerData.sequence_version[sequence].getFuture());
std::pair<Version, bool> prevPeekData = wait(fPrevPeekData);
req.begin = std::max(prevPeekData.first, req.begin);
req.onlySpilled = prevPeekData.second;
wait(yield());
@ -1407,6 +1453,8 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
}
}
state double blockStart = now();
if( req.returnIfBlocked && logData->version.get() < req.begin ) {
req.reply.sendError(end_of_stream());
if(req.sequence.present()) {
@ -1442,6 +1490,8 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
wait(delay(0, TaskPriority::TLogSpilledPeekReply));
}
state double workStart = now();
Version poppedVer = poppedVersion(logData, req.tag);
if(poppedVer > req.begin) {
TLogPeekReply rep;
@ -1617,8 +1667,24 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
if(req.sequence.present()) {
auto& trackerData = logData->peekTracker[peekId];
auto& sequenceData = trackerData.sequence_version[sequence+1];
trackerData.lastUpdate = now();
double queueT = blockStart-queueStart;
double blockT = workStart-blockStart;
double workT = now()-workStart;
trackerData.totalPeeks++;
trackerData.replyBytes += reply.messages.size();
if(queueT > trackerData.queueMax) trackerData.queueMax = queueT;
if(blockT > trackerData.blockMax) trackerData.blockMax = blockT;
if(workT > trackerData.workMax) trackerData.workMax = workT;
trackerData.queueTime += queueT;
trackerData.blockTime += blockT;
trackerData.workTime += workT;
auto& sequenceData = trackerData.sequence_version[sequence+1];
if(trackerData.sequence_version.size() && sequence+1 < trackerData.sequence_version.begin()->first) {
req.reply.sendError(operation_obsolete());
if(!sequenceData.isSet()) {
@ -1631,6 +1697,7 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
return Void();
}
if(sequenceData.isSet()) {
trackerData.duplicatePeeks++;
if(sequenceData.getFuture().get().first != reply.end) {
TEST(true); //tlog peek second attempt ended at a different version
req.reply.sendError(operation_obsolete());
@ -1956,6 +2023,47 @@ ACTOR Future<Void> cleanupPeekTrackers( LogData* logData ) {
}
}
ACTOR Future<Void> logPeekTrackers( LogData* logData ) {
loop {
int64_t logThreshold = 1;
if(logData->peekTracker.size() > SERVER_KNOBS->PEEK_LOGGING_AMOUNT) {
std::vector<int64_t> peekCounts;
peekCounts.reserve(logData->peekTracker.size());
for( auto& it : logData->peekTracker ) {
peekCounts.push_back(it.second.totalPeeks);
}
size_t pivot = peekCounts.size()-SERVER_KNOBS->PEEK_LOGGING_AMOUNT;
std::nth_element(peekCounts.begin(), peekCounts.begin()+pivot, peekCounts.end());
logThreshold = std::max<int64_t>(1,peekCounts[pivot]);
}
int logCount = 0;
for( auto& it : logData->peekTracker ) {
if(it.second.totalPeeks >= logThreshold) {
logCount++;
TraceEvent("PeekMetrics", logData->logId)
.detail("Tag", it.second.tag.toString())
.detail("Elapsed", now() - it.second.lastLogged)
.detail("MeanReplyBytes", it.second.replyBytes/it.second.totalPeeks)
.detail("TotalPeeks", it.second.totalPeeks)
.detail("UnblockedPeeks", it.second.unblockedPeeks)
.detail("DuplicatePeeks", it.second.duplicatePeeks)
.detail("Sequence", it.second.sequence_version.size() ? it.second.sequence_version.begin()->first : -1)
.detail("IdleSeconds", it.second.idleTime)
.detail("IdleMax", it.second.idleMax)
.detail("QueueSeconds", it.second.queueTime)
.detail("QueueMax", it.second.queueMax)
.detail("BlockSeconds", it.second.blockTime)
.detail("BlockMax", it.second.blockMax)
.detail("WorkSeconds", it.second.workTime)
.detail("WorkMax", it.second.workMax);
it.second.resetMetrics();
}
}
wait( delay(SERVER_KNOBS->PEEK_LOGGING_DELAY * std::max(1,logCount)) );
}
}
void getQueuingMetrics( TLogData* self, Reference<LogData> logData, TLogQueuingMetricsRequest const& req ) {
TLogQueuingMetricsReply reply;
reply.localTime = now();
@ -2302,6 +2410,7 @@ ACTOR Future<Void> tLogCore( TLogData* self, Reference<LogData> logData, TLogInt
logData->addActor.send( traceCounters("TLogMetrics", logData->logId, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &logData->cc, logData->logId.toString() + "/TLogMetrics"));
logData->addActor.send( serveTLogInterface(self, tli, logData, warningCollectorInput) );
logData->addActor.send( cleanupPeekTrackers(logData.getPtr()) );
logData->addActor.send( logPeekTrackers(logData.getPtr()) );
if(!logData->isPrimary) {
std::vector<Tag> tags;

File diff suppressed because it is too large


@ -37,6 +37,7 @@
#include "fdbserver/LogSystemConfig.h"
#include "fdbrpc/MultiInterface.h"
#include "fdbclient/ClientWorkerInterface.h"
#include "fdbserver/RecoveryState.h"
#include "flow/actorcompiler.h"
struct WorkerInterface {
@ -60,14 +61,17 @@ struct WorkerInterface {
RequestStream< struct EventLogRequest > eventLogRequest;
RequestStream< struct TraceBatchDumpRequest > traceBatchDumpRequest;
RequestStream< struct DiskStoreRequest > diskStoreRequest;
RequestStream<struct ExecuteRequest> execReq;
RequestStream<struct WorkerSnapRequest> workerSnapReq;
RequestStream< struct ExecuteRequest> execReq;
RequestStream< struct WorkerSnapRequest> workerSnapReq;
RequestStream< struct UpdateServerDBInfoRequest > updateServerDBInfo;
TesterInterface testerInterface;
UID id() const { return tLog.getEndpoint().token; }
NetworkAddress address() const { return tLog.getEndpoint().getPrimaryAddress(); }
NetworkAddress stableAddress() const { return tLog.getEndpoint().getStableAddress(); }
Optional<NetworkAddress> secondaryAddress() const { return tLog.getEndpoint().addresses.secondaryAddress; }
NetworkAddressList addresses() const { return tLog.getEndpoint().addresses; }
WorkerInterface() {}
WorkerInterface( const LocalityData& locality ) : locality( locality ) {}
@ -81,11 +85,13 @@ struct WorkerInterface {
logRouter.getEndpoint( TaskPriority::Worker );
debugPing.getEndpoint( TaskPriority::Worker );
coordinationPing.getEndpoint( TaskPriority::Worker );
updateServerDBInfo.getEndpoint( TaskPriority::Worker );
eventLogRequest.getEndpoint( TaskPriority::Worker );
}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, clientInterface, locality, tLog, master, masterProxy, dataDistributor, ratekeeper, resolver, storage, logRouter, debugPing, coordinationPing, waitFailure, setMetricsRate, eventLogRequest, traceBatchDumpRequest, testerInterface, diskStoreRequest, execReq, workerSnapReq, backup);
serializer(ar, clientInterface, locality, tLog, master, masterProxy, dataDistributor, ratekeeper, resolver, storage, logRouter, debugPing, coordinationPing, waitFailure, setMetricsRate, eventLogRequest, traceBatchDumpRequest, testerInterface, diskStoreRequest, execReq, workerSnapReq, backup, updateServerDBInfo);
}
};
@ -104,6 +110,230 @@ struct WorkerDetails {
}
};
// This interface and its serialization depend on slicing, since the client will deserialize only the first part of this structure
struct ClusterControllerFullInterface {
constexpr static FileIdentifier file_identifier =
ClusterControllerClientInterface::file_identifier;
ClusterInterface clientInterface;
RequestStream< struct RecruitFromConfigurationRequest > recruitFromConfiguration;
RequestStream< struct RecruitRemoteFromConfigurationRequest > recruitRemoteFromConfiguration;
RequestStream< struct RecruitStorageRequest > recruitStorage;
RequestStream< struct RegisterWorkerRequest > registerWorker;
RequestStream< struct GetWorkersRequest > getWorkers;
RequestStream< struct RegisterMasterRequest > registerMaster;
RequestStream< struct GetServerDBInfoRequest > getServerDBInfo; //only used by testers; the cluster controller will send the serverDBInfo to workers
UID id() const { return clientInterface.id(); }
bool operator == (ClusterControllerFullInterface const& r) const { return id() == r.id(); }
bool operator != (ClusterControllerFullInterface const& r) const { return id() != r.id(); }
bool hasMessage() {
return clientInterface.hasMessage() ||
recruitFromConfiguration.getFuture().isReady() ||
recruitRemoteFromConfiguration.getFuture().isReady() ||
recruitStorage.getFuture().isReady() ||
registerWorker.getFuture().isReady() ||
getWorkers.getFuture().isReady() ||
registerMaster.getFuture().isReady() ||
getServerDBInfo.getFuture().isReady();
}
void initEndpoints() {
clientInterface.initEndpoints();
recruitFromConfiguration.getEndpoint( TaskPriority::ClusterControllerRecruit );
recruitRemoteFromConfiguration.getEndpoint( TaskPriority::ClusterControllerRecruit );
recruitStorage.getEndpoint( TaskPriority::ClusterController );
registerWorker.getEndpoint( TaskPriority::ClusterControllerWorker );
getWorkers.getEndpoint( TaskPriority::ClusterController );
registerMaster.getEndpoint( TaskPriority::ClusterControllerRegister );
getServerDBInfo.getEndpoint( TaskPriority::ClusterController );
}
template <class Ar>
void serialize(Ar& ar) {
if constexpr (!is_fb_function<Ar>) {
ASSERT(ar.protocolVersion().isValid());
}
serializer(ar, clientInterface, recruitFromConfiguration, recruitRemoteFromConfiguration, recruitStorage,
registerWorker, getWorkers, registerMaster, getServerDBInfo);
}
};
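The comment on ClusterControllerFullInterface notes that its serialization depends on slicing: the wire format begins with exactly the client-visible interface, so a client can deserialize only that prefix and ignore the trailing server-only fields. A toy stand-alone illustration of prefix-compatible slicing (the byte-stream Writer/Reader here are simplified stand-ins for the flow serializer):

```cpp
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

struct Writer {
    std::vector<uint8_t> bytes;
    void put(const void* p, size_t n) {
        const uint8_t* b = static_cast<const uint8_t*>(p);
        bytes.insert(bytes.end(), b, b + n);
    }
};
struct Reader {
    const uint8_t* p; size_t off = 0;
    void get(void* out, size_t k) { std::memcpy(out, p + off, k); off += k; }
};

struct ClientVisible {              // what clients know about
    uint64_t clientEndpoint;
    void serialize(Writer& w) const { w.put(&clientEndpoint, sizeof clientEndpoint); }
    void deserialize(Reader& r) { r.get(&clientEndpoint, sizeof clientEndpoint); }
};
struct FullInterface {              // server-side superset; wire prefix == ClientVisible
    ClientVisible client;
    uint64_t recruitEndpoint;
    void serialize(Writer& w) const {
        client.serialize(w);        // prefix must come first -- order is the contract
        w.put(&recruitEndpoint, sizeof recruitEndpoint);
    }
};

int main() {
    FullInterface full{{42}, 99};
    Writer w; full.serialize(w);
    Reader r{w.bytes.data()};
    ClientVisible c; c.deserialize(r); // reads only the prefix
    std::cout << c.clientEndpoint << "\n"; // 42; trailing server-only bytes ignored
}
```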
struct RegisterWorkerReply {
constexpr static FileIdentifier file_identifier = 16475696;
ProcessClass processClass;
ClusterControllerPriorityInfo priorityInfo;
Optional<uint16_t> storageCache;
RegisterWorkerReply() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown) {}
RegisterWorkerReply(ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Optional<uint16_t> storageCache) : processClass(processClass), priorityInfo(priorityInfo), storageCache(storageCache) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, processClass, priorityInfo, storageCache);
}
};
struct RegisterMasterRequest {
constexpr static FileIdentifier file_identifier = 10773445;
UID id;
LocalityData mi;
LogSystemConfig logSystemConfig;
std::vector<MasterProxyInterface> proxies;
std::vector<ResolverInterface> resolvers;
DBRecoveryCount recoveryCount;
int64_t registrationCount;
Optional<DatabaseConfiguration> configuration;
std::vector<UID> priorCommittedLogServers;
RecoveryState recoveryState;
bool recoveryStalled;
ReplyPromise<Void> reply;
RegisterMasterRequest() : logSystemConfig(0) {}
template <class Ar>
void serialize(Ar& ar) {
if constexpr (!is_fb_function<Ar>) {
ASSERT(ar.protocolVersion().isValid());
}
serializer(ar, id, mi, logSystemConfig, proxies, resolvers, recoveryCount, registrationCount, configuration,
priorCommittedLogServers, recoveryState, recoveryStalled, reply);
}
};
struct RecruitFromConfigurationReply {
constexpr static FileIdentifier file_identifier = 2224085;
std::vector<WorkerInterface> backupWorkers;
std::vector<WorkerInterface> tLogs;
std::vector<WorkerInterface> satelliteTLogs;
std::vector<WorkerInterface> proxies;
std::vector<WorkerInterface> resolvers;
std::vector<WorkerInterface> storageServers;
std::vector<WorkerInterface> oldLogRouters;
Optional<Key> dcId;
bool satelliteFallback;
RecruitFromConfigurationReply() : satelliteFallback(false) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, tLogs, satelliteTLogs, proxies, resolvers, storageServers, oldLogRouters, dcId,
satelliteFallback, backupWorkers);
}
};
struct RecruitFromConfigurationRequest {
constexpr static FileIdentifier file_identifier = 2023046;
DatabaseConfiguration configuration;
bool recruitSeedServers;
int maxOldLogRouters;
ReplyPromise< RecruitFromConfigurationReply > reply;
RecruitFromConfigurationRequest() {}
explicit RecruitFromConfigurationRequest(DatabaseConfiguration const& configuration, bool recruitSeedServers, int maxOldLogRouters)
: configuration(configuration), recruitSeedServers(recruitSeedServers), maxOldLogRouters(maxOldLogRouters) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, configuration, recruitSeedServers, maxOldLogRouters, reply);
}
};
struct RecruitRemoteFromConfigurationReply {
constexpr static FileIdentifier file_identifier = 9091392;
std::vector<WorkerInterface> remoteTLogs;
std::vector<WorkerInterface> logRouters;
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, remoteTLogs, logRouters);
}
};
struct RecruitRemoteFromConfigurationRequest {
constexpr static FileIdentifier file_identifier = 3235995;
DatabaseConfiguration configuration;
Optional<Key> dcId;
int logRouterCount;
std::vector<UID> exclusionWorkerIds;
ReplyPromise< RecruitRemoteFromConfigurationReply > reply;
RecruitRemoteFromConfigurationRequest() {}
RecruitRemoteFromConfigurationRequest(DatabaseConfiguration const& configuration, Optional<Key> const& dcId, int logRouterCount, const std::vector<UID> &exclusionWorkerIds) : configuration(configuration), dcId(dcId), logRouterCount(logRouterCount), exclusionWorkerIds(exclusionWorkerIds){}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, configuration, dcId, logRouterCount, exclusionWorkerIds, reply);
}
};
struct RecruitStorageReply {
constexpr static FileIdentifier file_identifier = 15877089;
WorkerInterface worker;
ProcessClass processClass;
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, worker, processClass);
}
};
struct RecruitStorageRequest {
constexpr static FileIdentifier file_identifier = 905920;
std::vector<Optional<Standalone<StringRef>>> excludeMachines; //< Don't recruit any of these machines
std::vector<AddressExclusion> excludeAddresses; //< Don't recruit any of these addresses
std::vector<Optional<Standalone<StringRef>>> includeDCs;
bool criticalRecruitment; //< True if machine classes are to be ignored
ReplyPromise< RecruitStorageReply > reply;
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, excludeMachines, excludeAddresses, includeDCs, criticalRecruitment, reply);
}
};
struct RegisterWorkerRequest {
constexpr static FileIdentifier file_identifier = 14332605;
WorkerInterface wi;
ProcessClass initialClass;
ProcessClass processClass;
ClusterControllerPriorityInfo priorityInfo;
Generation generation;
Optional<DataDistributorInterface> distributorInterf;
Optional<RatekeeperInterface> ratekeeperInterf;
Optional<std::pair<uint16_t,StorageServerInterface>> storageCacheInterf;
Standalone<VectorRef<StringRef>> issues;
std::vector<NetworkAddress> incompatiblePeers;
ReplyPromise<RegisterWorkerReply> reply;
bool degraded;
RegisterWorkerRequest() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown), degraded(false) {}
RegisterWorkerRequest(WorkerInterface wi, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Generation generation, Optional<DataDistributorInterface> ddInterf, Optional<RatekeeperInterface> rkInterf, Optional<std::pair<uint16_t,StorageServerInterface>> storageCacheInterf, bool degraded) :
wi(wi), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo), generation(generation), distributorInterf(ddInterf), ratekeeperInterf(rkInterf), storageCacheInterf(storageCacheInterf), degraded(degraded) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, wi, initialClass, processClass, priorityInfo, generation, distributorInterf, ratekeeperInterf, storageCacheInterf, issues, incompatiblePeers, reply, degraded);
}
};
struct GetWorkersRequest {
constexpr static FileIdentifier file_identifier = 1254174;
enum { TESTER_CLASS_ONLY = 0x1, NON_EXCLUDED_PROCESSES_ONLY = 0x2 };
int flags;
ReplyPromise<vector<WorkerDetails>> reply;
GetWorkersRequest() : flags(0) {}
explicit GetWorkersRequest(int fl) : flags(fl) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, flags, reply);
}
};
struct InitializeTLogRequest {
constexpr static FileIdentifier file_identifier = 15604392;
UID recruitmentID;
@ -462,7 +692,6 @@ void endRole(const Role &role, UID id, std::string reason, bool ok = true, Error
struct ServerDBInfo;
class Database openDBOnServer( Reference<AsyncVar<ServerDBInfo>> const& db, TaskPriority taskID = TaskPriority::DefaultEndpoint, bool enableLocalityLoadBalance = true, bool lockAware = false );
class Database openDBOnServer( Reference<AsyncVar<CachedSerialization<ServerDBInfo>>> const& db, TaskPriority taskID = TaskPriority::DefaultEndpoint, bool enableLocalityLoadBalance = true, bool lockAware = false );
ACTOR Future<Void> extractClusterInterface(Reference<AsyncVar<Optional<struct ClusterControllerFullInterface>>> a,
Reference<AsyncVar<Optional<struct ClusterInterface>>> b);
@ -496,12 +725,6 @@ ACTOR Future<Void> tLog(IKeyValueStore* persistentData, IDiskQueue* persistentQu
PromiseStream<InitializeTLogRequest> tlogRequests, UID tlogId, UID workerID,
bool restoreFromDisk, Promise<Void> oldLog, Promise<Void> recovered, std::string folder,
Reference<AsyncVar<bool>> degraded, Reference<AsyncVar<UID>> activeSharedTLog);
ACTOR Future<Void> monitorServerDBInfo(Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> ccInterface,
Reference<ClusterConnectionFile> ccf, LocalityData locality,
Reference<AsyncVar<ServerDBInfo>> dbInfo,
Optional<Reference<AsyncVar<std::set<std::string>>>> issues =
Optional<Reference<AsyncVar<std::set<std::string>>>>());
ACTOR Future<Void> resolver(ResolverInterface proxy, InitializeResolverRequest initReq,
Reference<AsyncVar<ServerDBInfo>> db);
ACTOR Future<Void> logRouter(TLogInterface interf, InitializeLogRouterRequest req,
@ -535,5 +758,6 @@ ACTOR Future<Void> tLog(IKeyValueStore* persistentData, IDiskQueue* persistentQu
typedef decltype(&tLog) TLogFn;
#include "fdbserver/ServerDBInfo.h"
#include "flow/unactorcompiler.h"
#endif

View File

@ -34,7 +34,6 @@
#include "fdbserver/CoordinationInterface.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbclient/RestoreWorkerInterface.actor.h"
#include "fdbserver/ClusterRecruitmentInterface.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/MoveKeys.actor.h"
#include "fdbserver/ConflictSet.h"
@ -1629,6 +1628,7 @@ int main(int argc, char* argv[]) {
openTraceFile(NetworkAddress(), opts.rollsize, opts.maxLogsSize, opts.logFolder, "trace", opts.logGroup);
} else {
g_network = newNet2(opts.tlsConfig, opts.useThreadPool, true);
g_network->addStopCallback( Net2FileSystem::stop );
FlowTransport::createInstance(false, 1);
const bool expectsPublicAddress = (role == FDBD || role == NetworkTestServer || role == Restore);

View File

@ -33,7 +33,6 @@
#include "fdbserver/MasterInterface.h"
#include "fdbserver/WaitFailure.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/ClusterRecruitmentInterface.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/CoordinatedState.h"
#include "fdbserver/CoordinationInterface.h" // copy constructors for ServerCoordinators class
@ -740,22 +739,27 @@ ACTOR Future<Void> sendInitialCommitToResolvers( Reference<MasterData> self ) {
ASSERT(self->recoveryTransactionVersion);
state Standalone<RangeResultRef> data = self->txnStateStore->readRange(txnKeys, BUGGIFY ? 3 : SERVER_KNOBS->DESIRED_TOTAL_BYTES, SERVER_KNOBS->DESIRED_TOTAL_BYTES).get();
state vector<Future<Void>> txnReplies;
state std::vector<Future<Void>> txnReplies;
state int64_t dataOutstanding = 0;
state std::vector<Endpoint> endpoints;
for(auto& it : self->proxies) {
endpoints.push_back(it.txnState.getEndpoint());
}
loop {
if(!data.size()) break;
((KeyRangeRef&)txnKeys) = KeyRangeRef( keyAfter(data.back().key, txnKeys.arena()), txnKeys.end );
Standalone<RangeResultRef> nextData = self->txnStateStore->readRange(txnKeys, BUGGIFY ? 3 : SERVER_KNOBS->DESIRED_TOTAL_BYTES, SERVER_KNOBS->DESIRED_TOTAL_BYTES).get();
for(auto& r : self->proxies) {
TxnStateRequest req;
req.arena = data.arena();
req.data = data;
req.sequence = txnSequence;
req.last = !nextData.size();
txnReplies.push_back( brokenPromiseToNever( r.txnState.getReply( req ) ) );
dataOutstanding += data.arena().getSize();
}
req.broadcastInfo = endpoints;
txnReplies.push_back(broadcastTxnRequest(req, SERVER_KNOBS->TXN_STATE_SEND_AMOUNT, false));
dataOutstanding += SERVER_KNOBS->TXN_STATE_SEND_AMOUNT*data.arena().getSize();
data = nextData;
txnSequence++;

View File

@ -3484,7 +3484,7 @@ ACTOR Future<Void> metricsCore( StorageServer* self, StorageServerInterface ssi
}
when (GetStorageMetricsRequest req = waitNext(ssi.getStorageMetrics.getFuture())) {
StorageBytes sb = self->storage.getStorageBytes();
self->metrics.getStorageMetrics( req, sb, self->counters.bytesInput.getRate() );
self->metrics.getStorageMetrics( req, sb, self->counters.bytesInput.getRate(), self->versionLag, self->lastUpdate );
}
when (wait(doPollMetrics) ) {
self->metrics.poll();

View File

@ -28,13 +28,13 @@
#include "fdbclient/SystemData.h"
#include "fdbserver/TesterInterface.actor.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/ClusterRecruitmentInterface.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbserver/Status.h"
#include "fdbserver/QuietDatabase.h"
#include "fdbclient/MonitorLeader.h"
#include "fdbserver/CoordinationInterface.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
using namespace std;
@ -1017,11 +1017,40 @@ vector<TestSpec> readTests( ifstream& ifs ) {
return result;
}
ACTOR Future<Void> monitorServerDBInfo(Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> ccInterface,
LocalityData locality,
Reference<AsyncVar<ServerDBInfo>> dbInfo) {
// Initially most of the serverDBInfo is not known, but we know our locality right away
ServerDBInfo localInfo;
localInfo.myLocality = locality;
dbInfo->set(localInfo);
loop {
GetServerDBInfoRequest req;
req.knownServerInfoID = dbInfo->get().id;
choose {
when( ServerDBInfo _localInfo = wait( ccInterface->get().present() ? brokenPromiseToNever( ccInterface->get().get().getServerDBInfo.getReply( req ) ) : Never() ) ) {
ServerDBInfo localInfo = _localInfo;
TraceEvent("GotServerDBInfoChange").detail("ChangeID", localInfo.id).detail("MasterID", localInfo.master.id())
.detail("RatekeeperID", localInfo.ratekeeper.present() ? localInfo.ratekeeper.get().id() : UID())
.detail("DataDistributorID", localInfo.distributor.present() ? localInfo.distributor.get().id() : UID());
localInfo.myLocality = locality;
dbInfo->set(localInfo);
}
when( wait( ccInterface->onChange() ) ) {
if(ccInterface->get().present())
TraceEvent("GotCCInterfaceChange").detail("CCID", ccInterface->get().get().id()).detail("CCMachine", ccInterface->get().get().getWorkers.getEndpoint().getPrimaryAddress());
}
}
}
}
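
getServerDBInfo here is a long poll: the request carries the id the tester already holds (knownServerInfoID), and the cluster controller replies only once a different ServerDBInfo is available. A standalone sketch of that shape using std::thread primitives; DBInfoServer, publish, and the int ids are hypothetical stand-ins:

```cpp
#include <condition_variable>
#include <mutex>

// Sketch only: reply to a poll only once the caller's known id is stale.
struct DBInfoServer {
    std::mutex m;
    std::condition_variable cv;
    int currentId = 0;

    int getServerDBInfo(int knownId) { // blocks, like getReply(req) above
        std::unique_lock<std::mutex> lk(m);
        cv.wait(lk, [&] { return currentId != knownId; });
        return currentId;
    }
    void publish(int newId) {
        { std::lock_guard<std::mutex> lk(m); currentId = newId; }
        cv.notify_all();
    }
};
```

A caller loops on getServerDBInfo(lastId), which is why the actor above can simply re-issue the request with the id it just stored.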
ACTOR Future<Void> runTests( Reference<AsyncVar<Optional<struct ClusterControllerFullInterface>>> cc, Reference<AsyncVar<Optional<struct ClusterInterface>>> ci, vector< TesterInterface > testers, vector<TestSpec> tests, StringRef startingConfiguration, LocalityData locality ) {
state Database cx;
state Reference<AsyncVar<ServerDBInfo>> dbInfo( new AsyncVar<ServerDBInfo> );
state Future<Void> ccMonitor =
monitorServerDBInfo(cc, Reference<ClusterConnectionFile>(), LocalityData(), dbInfo); // FIXME: locality
state Future<Void> ccMonitor = monitorServerDBInfo(cc, LocalityData(), dbInfo); // FIXME: locality
state bool useDB = false;
state bool waitForQuiescenceBegin = false;
@ -1192,7 +1221,7 @@ ACTOR Future<Void> runTests( Reference<ClusterConnectionFile> connFile, test_typ
if (at == TEST_HERE) {
Reference<AsyncVar<ServerDBInfo>> db( new AsyncVar<ServerDBInfo> );
vector<TesterInterface> iTesters(1);
actors.push_back( reportErrors(monitorServerDBInfo( cc, Reference<ClusterConnectionFile>(), LocalityData(), db ), "MonitorServerDBInfo") ); // FIXME: Locality
actors.push_back( reportErrors(monitorServerDBInfo( cc, LocalityData(), db ), "MonitorServerDBInfo") ); // FIXME: Locality
actors.push_back( reportErrors(testerServerCore( iTesters[0], connFile, db, locality ), "TesterServerCore") );
tests = runTests( cc, ci, iTesters, testSpecs, startingConfiguration, locality );
} else {

View File

@ -33,7 +33,6 @@
#include "fdbserver/TesterInterface.actor.h" // for poisson()
#include "fdbserver/IDiskQueue.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbserver/ClusterRecruitmentInterface.h"
#include "fdbserver/DataDistributorInterface.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/FDBExecHelper.actor.h"
@ -68,6 +67,44 @@ extern IKeyValueStore* keyValueStoreCompressTestData(IKeyValueStore* store);
# define KV_STORE(filename,uid) keyValueStoreMemory(filename,uid)
#endif
ACTOR Future<std::vector<Endpoint>> tryDBInfoBroadcast(RequestStream<UpdateServerDBInfoRequest> stream, UpdateServerDBInfoRequest req) {
ErrorOr<std::vector<Endpoint>> rep = wait( stream.getReplyUnlessFailedFor(req, SERVER_KNOBS->DBINFO_FAILED_DELAY, 0) );
if(rep.present()) {
return rep.get();
}
req.broadcastInfo.push_back(stream.getEndpoint());
return req.broadcastInfo;
}
ACTOR Future<std::vector<Endpoint>> broadcastDBInfoRequest(UpdateServerDBInfoRequest req, int sendAmount, Optional<Endpoint> sender, bool sendReply) {
state std::vector<Future<std::vector<Endpoint>>> replies;
state ReplyPromise<std::vector<Endpoint>> reply = req.reply;
resetReply( req );
int currentStream = 0;
std::vector<Endpoint> broadcastEndpoints = req.broadcastInfo;
for(int i = 0; i < sendAmount && currentStream < broadcastEndpoints.size(); i++) {
std::vector<Endpoint> endpoints;
RequestStream<UpdateServerDBInfoRequest> cur(broadcastEndpoints[currentStream++]);
while(currentStream < broadcastEndpoints.size()*(i+1)/sendAmount) {
endpoints.push_back(broadcastEndpoints[currentStream++]);
}
req.broadcastInfo = endpoints;
replies.push_back( tryDBInfoBroadcast( cur, req ) );
resetReply( req );
}
wait( waitForAll(replies) );
std::vector<Endpoint> notUpdated;
if(sender.present()) {
notUpdated.push_back(sender.get());
}
for(auto& it : replies) {
notUpdated.insert(notUpdated.end(), it.get().begin(), it.get().end());
}
if(sendReply) {
reply.send(notUpdated);
}
return notUpdated;
}
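
broadcastDBInfoRequest fans the update out as a tree: the endpoint list is cut into sendAmount contiguous slices, the head of each slice receives the request plus the tail of its slice to rebroadcast, and endpoints that could not be reached flow back so the sender can retry them. The slicing arithmetic is easy to misread, so here is a runnable standalone sketch of just that loop, with plain strings standing in for Endpoint:

```cpp
#include <cstdio>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> endpoints;
    for (int i = 0; i < 10; i++) endpoints.push_back("ep" + std::to_string(i));
    const int sendAmount = 3; // plays the role of SERVER_KNOBS->DBINFO_SEND_AMOUNT

    int currentStream = 0;
    for (int i = 0; i < sendAmount && currentStream < (int)endpoints.size(); i++) {
        std::string head = endpoints[currentStream++]; // this recipient rebroadcasts...
        std::vector<std::string> forward;              // ...to the rest of its slice
        while (currentStream < (int)endpoints.size() * (i + 1) / sendAmount)
            forward.push_back(endpoints[currentStream++]);
        std::printf("send to %s, delegating %zu endpoints\n", head.c_str(), forward.size());
    }
    return 0;
}
```

With 10 endpoints and sendAmount = 3 the slices are {ep0..ep2}, {ep3..ep5}, {ep6..ep9}, so every endpoint is covered exactly once.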
ACTOR static Future<Void> extractClientInfo( Reference<AsyncVar<ServerDBInfo>> db, Reference<AsyncVar<ClientDBInfo>> info ) {
state std::vector<UID> lastProxyUIDs;
@ -80,27 +117,11 @@ ACTOR static Future<Void> extractClientInfo( Reference<AsyncVar<ServerDBInfo>> d
}
}
ACTOR static Future<Void> extractClientInfo( Reference<AsyncVar<CachedSerialization<ServerDBInfo>>> db, Reference<AsyncVar<ClientDBInfo>> info ) {
state std::vector<UID> lastProxyUIDs;
state std::vector<MasterProxyInterface> lastProxies;
loop {
ClientDBInfo ni = db->get().read().client;
shrinkProxyList(ni, lastProxyUIDs, lastProxies);
info->set( ni );
wait( db->onChange() );
}
}
Database openDBOnServer( Reference<AsyncVar<ServerDBInfo>> const& db, TaskPriority taskID, bool enableLocalityLoadBalance, bool lockAware ) {
Reference<AsyncVar<ClientDBInfo>> info( new AsyncVar<ClientDBInfo> );
return DatabaseContext::create( info, extractClientInfo(db, info), enableLocalityLoadBalance ? db->get().myLocality : LocalityData(), enableLocalityLoadBalance, taskID, lockAware );
}
Database openDBOnServer( Reference<AsyncVar<CachedSerialization<ServerDBInfo>>> const& db, TaskPriority taskID, bool enableLocalityLoadBalance, bool lockAware ) {
Reference<AsyncVar<ClientDBInfo>> info( new AsyncVar<ClientDBInfo> );
return DatabaseContext::create( info, extractClientInfo(db, info), enableLocalityLoadBalance ? db->get().read().myLocality : LocalityData(), enableLocalityLoadBalance, taskID, lockAware );
}
struct ErrorInfo {
Error error;
const Role &role;
@ -413,7 +434,9 @@ ACTOR Future<Void> registrationClient(
Reference<AsyncVar<bool>> degraded,
PromiseStream< ErrorInfo > errors,
LocalityData locality,
Reference<AsyncVar<ServerDBInfo>> dbInfo) {
Reference<AsyncVar<ServerDBInfo>> dbInfo,
Reference<ClusterConnectionFile> connFile,
Reference<AsyncVar<std::set<std::string>>> issues) {
// Keeps the cluster controller (as it may be re-elected) informed that this worker exists
// The cluster controller uses waitFailureClient to find out if we die, and returns from registrationReply (requiring us to re-register)
// The registration request piggybacks the optional distributor interface, if one exists.
@ -422,8 +445,41 @@ ACTOR Future<Void> registrationClient(
state Reference<AsyncVar<Optional<std::pair<uint16_t,StorageServerInterface>>>> scInterf( new AsyncVar<Optional<std::pair<uint16_t,StorageServerInterface>>>() );
state Future<Void> cacheProcessFuture;
state Future<Void> cacheErrorsFuture;
state Optional<double> incorrectTime;
loop {
RegisterWorkerRequest request(interf, initialClass, processClass, asyncPriorityInfo->get(), requestGeneration++, ddInterf->get(), rkInterf->get(), scInterf->get(), degraded->get());
for (auto const& i : issues->get()) {
request.issues.push_back_deep(request.issues.arena(), i);
}
ClusterConnectionString fileConnectionString;
if (connFile && !connFile->fileContentsUpToDate(fileConnectionString)) {
request.issues.push_back_deep(request.issues.arena(), LiteralStringRef("incorrect_cluster_file_contents"));
std::string connectionString = connFile->getConnectionString().toString();
if(!incorrectTime.present()) {
incorrectTime = now();
}
if(connFile->canGetFilename()) {
// Don't log a SevWarnAlways initially to account for transient issues (e.g. someone else changing the file right before us)
TraceEvent(now() - incorrectTime.get() > 300 ? SevWarnAlways : SevWarn, "IncorrectClusterFileContents")
.detail("Filename", connFile->getFilename())
.detail("ConnectionStringFromFile", fileConnectionString.toString())
.detail("CurrentConnectionString", connectionString);
}
}
else {
incorrectTime = Optional<double>();
}
auto peers = FlowTransport::transport().getIncompatiblePeers();
for(auto it = peers->begin(); it != peers->end();) {
if( now() - it->second.second > FLOW_KNOBS->INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING ) {
request.incompatiblePeers.push_back(it->first);
it = peers->erase(it);
} else {
it++;
}
}
Future<RegisterWorkerReply> registrationReply = ccInterface->get().present() ? brokenPromiseToNever( ccInterface->get().get().registerWorker.getReply(request) ) : Never();
choose {
when ( RegisterWorkerReply reply = wait( registrationReply )) {
@ -464,6 +520,8 @@ ACTOR Future<Void> registrationClient(
when ( wait( rkInterf->onChange() ) ) {}
when ( wait( scInterf->onChange() ) ) {}
when ( wait( degraded->onChange() ) ) {}
when ( wait( FlowTransport::transport().onIncompatibleChanged() ) ) {}
when ( wait( issues->onChange() ) ) {}
}
}
}
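
The incompatible-peers pass inside registrationClient uses a prune-and-report idiom: entries older than INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING are attached to the request and erased, so the same peer is not re-reported on every registration. A standalone sketch of the idiom with hypothetical types (a map from address to first-seen time):

```cpp
#include <map>
#include <string>
#include <vector>

// Sketch only: report entries older than `delay` once, then drop them.
std::vector<std::string> takeStalePeers(std::map<std::string, double>& firstSeen,
                                        double now, double delay) {
    std::vector<std::string> stale;
    for (auto it = firstSeen.begin(); it != firstSeen.end();) {
        if (now - it->second > delay) {
            stale.push_back(it->first);
            it = firstSeen.erase(it); // erase returns the next valid iterator
        } else {
            ++it;
        }
    }
    return stale;
}
```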
@ -749,7 +807,10 @@ ACTOR Future<Void> workerSnapCreate(WorkerSnapRequest snapReq, StringRef snapFol
return Void();
}
ACTOR Future<Void> monitorTraceLogIssues(Optional<Reference<AsyncVar<std::set<std::string>>>> issues) {
// TODO: `issues` is right now only updated by `monitorTraceLogIssues` and thus is being `set` on every update.
// It could be changed to `insert` and `trigger` later if we want to use it as a generic way for the caller of this
// function to report issues to cluster controller.
ACTOR Future<Void> monitorTraceLogIssues(Reference<AsyncVar<std::set<std::string>>> issues) {
state bool pingTimeout = false;
loop {
wait(delay(SERVER_KNOBS->TRACE_LOG_FLUSH_FAILURE_CHECK_INTERVAL_SECONDS));
@ -764,7 +825,6 @@ ACTOR Future<Void> monitorTraceLogIssues(Optional<Reference<AsyncVar<std::set<st
throw;
}
}
if (issues.present()) {
std::set<std::string> _issues;
retriveTraceLogIssues(_issues);
if (pingTimeout) {
@ -772,79 +832,7 @@ ACTOR Future<Void> monitorTraceLogIssues(Optional<Reference<AsyncVar<std::set<st
_issues.insert("trace_log_writer_thread_unresponsive");
pingTimeout = false;
}
issues.get()->set(_issues);
}
}
}
// TODO: `issues` is right now only updated by `monitorTraceLogIssues` and thus is being `set` on every update.
// It could be changed to `insert` and `trigger` later if we want to use it as a generic way for the caller of this
// function to report issues to cluster controller.
ACTOR Future<Void> monitorServerDBInfo(Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> ccInterface,
Reference<ClusterConnectionFile> connFile, LocalityData locality,
Reference<AsyncVar<ServerDBInfo>> dbInfo,
Optional<Reference<AsyncVar<std::set<std::string>>>> issues) {
// Initially most of the serverDBInfo is not known, but we know our locality right away
ServerDBInfo localInfo;
localInfo.myLocality = locality;
dbInfo->set(localInfo);
state Optional<double> incorrectTime;
loop {
GetServerDBInfoRequest req;
req.knownServerInfoID = dbInfo->get().id;
if (issues.present()) {
for (auto const& i : issues.get()->get()) {
req.issues.push_back_deep(req.issues.arena(), i);
}
}
ClusterConnectionString fileConnectionString;
if (connFile && !connFile->fileContentsUpToDate(fileConnectionString)) {
req.issues.push_back_deep(req.issues.arena(), LiteralStringRef("incorrect_cluster_file_contents"));
std::string connectionString = connFile->getConnectionString().toString();
if(!incorrectTime.present()) {
incorrectTime = now();
}
if(connFile->canGetFilename()) {
// Don't log a SevWarnAlways initially to account for transient issues (e.g. someone else changing the file right before us)
TraceEvent(now() - incorrectTime.get() > 300 ? SevWarnAlways : SevWarn, "IncorrectClusterFileContents")
.detail("Filename", connFile->getFilename())
.detail("ConnectionStringFromFile", fileConnectionString.toString())
.detail("CurrentConnectionString", connectionString);
}
}
else {
incorrectTime = Optional<double>();
}
auto peers = FlowTransport::transport().getIncompatiblePeers();
for(auto it = peers->begin(); it != peers->end();) {
if( now() - it->second.second > SERVER_KNOBS->INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING ) {
req.incompatiblePeers.push_back(it->first);
it = peers->erase(it);
} else {
it++;
}
}
choose {
when( CachedSerialization<ServerDBInfo> ni = wait( ccInterface->get().present() ? brokenPromiseToNever( ccInterface->get().get().getServerDBInfo.getReply( req ) ) : Never() ) ) {
ServerDBInfo localInfo = ni.read();
TraceEvent("GotServerDBInfoChange").detail("ChangeID", localInfo.id).detail("MasterID", localInfo.master.id())
.detail("RatekeeperID", localInfo.ratekeeper.present() ? localInfo.ratekeeper.get().id() : UID())
.detail("DataDistributorID", localInfo.distributor.present() ? localInfo.distributor.get().id() : UID());
localInfo.myLocality = locality;
dbInfo->set(localInfo);
}
when( wait( ccInterface->onChange() ) ) {
if(ccInterface->get().present())
TraceEvent("GotCCInterfaceChange").detail("CCID", ccInterface->get().get().id()).detail("CCMachine", ccInterface->get().get().getWorkers.getEndpoint().getPrimaryAddress());
}
when(wait(issues.present() ? issues.get()->onChange() : Never())) {}
}
issues->set(_issues);
}
}
@ -934,8 +922,7 @@ ACTOR Future<Void> workerServer(
errorForwarders.add( resetAfter(degraded, SERVER_KNOBS->DEGRADED_RESET_INTERVAL, false, SERVER_KNOBS->DEGRADED_WARNING_LIMIT, SERVER_KNOBS->DEGRADED_WARNING_RESET_DELAY, "DegradedReset"));
errorForwarders.add( loadedPonger( interf.debugPing.getFuture() ) );
errorForwarders.add( waitFailureServer( interf.waitFailure.getFuture() ) );
errorForwarders.add(monitorTraceLogIssues(issues));
errorForwarders.add(monitorServerDBInfo(ccInterface, connFile, locality, dbInfo, issues));
errorForwarders.add( monitorTraceLogIssues(issues) );
errorForwarders.add( testerServerCore( interf.testerInterface, connFile, dbInfo, locality ) );
errorForwarders.add(monitorHighMemory(memoryProfileThreshold));
@ -958,6 +945,7 @@ ACTOR Future<Void> workerServer(
DUMPTOKEN(recruited.setMetricsRate);
DUMPTOKEN(recruited.eventLogRequest);
DUMPTOKEN(recruited.traceBatchDumpRequest);
DUMPTOKEN(recruited.updateServerDBInfo);
}
state std::vector<Future<Void>> recoveries;
@ -1051,12 +1039,34 @@ ACTOR Future<Void> workerServer(
wait(waitForAll(recoveries));
recoveredDiskFiles.send(Void());
errorForwarders.add( registrationClient( ccInterface, interf, asyncPriorityInfo, initialClass, ddInterf, rkInterf, degraded, errors, locality, dbInfo ) );
errorForwarders.add( registrationClient( ccInterface, interf, asyncPriorityInfo, initialClass, ddInterf, rkInterf, degraded, errors, locality, dbInfo, connFile, issues) );
TraceEvent("RecoveriesComplete", interf.id());
loop choose {
when( UpdateServerDBInfoRequest req = waitNext( interf.updateServerDBInfo.getFuture() ) ) {
ServerDBInfo localInfo = BinaryReader::fromStringRef<ServerDBInfo>(req.serializedDbInfo, AssumeVersion(currentProtocolVersion));
localInfo.myLocality = locality;
if(localInfo.infoGeneration < dbInfo->get().infoGeneration && localInfo.clusterInterface == dbInfo->get().clusterInterface) {
std::vector<Endpoint> rep = req.broadcastInfo;
rep.push_back(interf.updateServerDBInfo.getEndpoint());
req.reply.send(rep);
} else {
Optional<Endpoint> notUpdated;
if(!ccInterface->get().present() || localInfo.clusterInterface != ccInterface->get().get()) {
notUpdated = interf.updateServerDBInfo.getEndpoint();
}
else if(localInfo.infoGeneration > dbInfo->get().infoGeneration || dbInfo->get().clusterInterface != ccInterface->get().get()) {
TraceEvent("GotServerDBInfoChange").detail("ChangeID", localInfo.id).detail("MasterID", localInfo.master.id())
.detail("RatekeeperID", localInfo.ratekeeper.present() ? localInfo.ratekeeper.get().id() : UID())
.detail("DataDistributorID", localInfo.distributor.present() ? localInfo.distributor.get().id() : UID());
dbInfo->set(localInfo);
}
errorForwarders.add(success(broadcastDBInfoRequest(req, SERVER_KNOBS->DBINFO_SEND_AMOUNT, notUpdated, true)));
}
}
when( RebootRequest req = waitNext( interf.clientInterface.reboot.getFuture() ) ) {
state RebootRequest rebootReq = req;
// If suspendDuration is INT_MAX, the trace would not be logged if it were inside the next block
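
The UpdateServerDBInfoRequest handler above is gated on infoGeneration: a pushed ServerDBInfo is applied only when it is newer than what the worker already holds (the real code additionally checks that the cluster interface matches the one the worker is following). A standalone sketch of just the generation gate, with a hypothetical Info type:

```cpp
#include <cstdint>
#include <cstdio>

// Sketch only: never regress to an older or duplicate broadcast.
struct Info { uint64_t infoGeneration = 0; };

bool maybeApply(Info& held, const Info& incoming) {
    if (incoming.infoGeneration <= held.infoGeneration)
        return false; // stale or duplicate: acknowledge but ignore
    held = incoming;
    return true;
}

int main() {
    Info held;
    std::printf("%d\n", maybeApply(held, Info{2})); // 1: newer, applied
    std::printf("%d\n", maybeApply(held, Info{1})); // 0: stale, ignored
    return 0;
}
```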

View File

@ -1141,12 +1141,12 @@ struct ConsistencyCheckWorkload : TestWorkload
std::set<Optional<Key>> missingStorage;
for( int i = 0; i < workers.size(); i++ ) {
NetworkAddress addr = workers[i].interf.tLog.getEndpoint().addresses.getTLSAddress();
if( !configuration.isExcludedServer(addr) &&
NetworkAddress addr = workers[i].interf.stableAddress();
if( !configuration.isExcludedServer(workers[i].interf.addresses()) &&
( workers[i].processClass == ProcessClass::StorageClass || workers[i].processClass == ProcessClass::UnsetClass ) ) {
bool found = false;
for( int j = 0; j < storageServers.size(); j++ ) {
if( storageServers[j].getValue.getEndpoint().addresses.getTLSAddress() == addr ) {
if( storageServers[j].stableAddress() == addr ) {
found = true;
break;
}

View File

@ -595,7 +595,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
TestGet(unsigned int id, FuzzApiCorrectnessWorkload *workload) : BaseTest(id, workload, "TestGet") {
key = makeKey();
contract = {
std::make_pair( error_code_key_outside_legal_range, ExceptionContract::requiredIf((key >= (workload->useSystemKeys ? systemKeys.end : normalKeys.end))) ),
std::make_pair( error_code_key_outside_legal_range, ExceptionContract::requiredIf((key >= (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) && !specialKeys.contains(key)) ),
std::make_pair( error_code_client_invalid_operation, ExceptionContract::Possible ),
std::make_pair( error_code_accessed_unreadable, ExceptionContract::Possible )
};
@ -652,12 +652,15 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
limit = deterministicRandom()->randomInt(0, INT_MAX)+1;
}
bool isSpecialKeyRange = specialKeys.contains(keysel1.getKey()) && specialKeys.contains(keysel2.getKey());
contract = {
std::make_pair( error_code_range_limits_invalid, ExceptionContract::possibleButRequiredIf(limit < 0) ),
std::make_pair( error_code_client_invalid_operation, ExceptionContract::Possible ),
std::make_pair( error_code_key_outside_legal_range, ExceptionContract::requiredIf(
(keysel1.getKey() > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) ||
(keysel2.getKey() > (workload->useSystemKeys ? systemKeys.end : normalKeys.end))) ),
((keysel1.getKey() > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) ||
(keysel2.getKey() > (workload->useSystemKeys ? systemKeys.end : normalKeys.end))) &&
!isSpecialKeyRange) ),
std::make_pair( error_code_accessed_unreadable, ExceptionContract::Possible )
};
}
@ -681,12 +684,16 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
keysel1 = makeKeySel();
keysel2 = makeKeySel();
limits = makeRangeLimits();
bool isSpecialKeyRange = specialKeys.contains(keysel1.getKey()) && specialKeys.contains(keysel2.getKey());
contract = {
std::make_pair( error_code_range_limits_invalid, ExceptionContract::possibleButRequiredIf( !limits.isReached() && !limits.isValid()) ),
std::make_pair( error_code_client_invalid_operation, ExceptionContract::Possible ),
std::make_pair( error_code_key_outside_legal_range, ExceptionContract::requiredIf(
(keysel1.getKey() > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) ||
(keysel2.getKey() > (workload->useSystemKeys ? systemKeys.end : normalKeys.end))) ),
((keysel1.getKey() > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) ||
(keysel2.getKey() > (workload->useSystemKeys ? systemKeys.end : normalKeys.end))) &&
!isSpecialKeyRange) ),
std::make_pair( error_code_accessed_unreadable, ExceptionContract::Possible )
};
}
@ -721,13 +728,17 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
else
limit = deterministicRandom()->randomInt(0, INT_MAX)+1;
}
bool isSpecialKeyRange = specialKeys.contains(key1) && specialKeys.contains(key2);
contract = {
std::make_pair( error_code_inverted_range, ExceptionContract::requiredIf(key1 > key2) ),
std::make_pair( error_code_range_limits_invalid, ExceptionContract::possibleButRequiredIf(limit < 0) ),
std::make_pair( error_code_client_invalid_operation, ExceptionContract::Possible ),
std::make_pair( error_code_key_outside_legal_range, ExceptionContract::requiredIf(
(key1 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) ||
(key2 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end))) ),
((key1 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) ||
(key2 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)))
&& !isSpecialKeyRange) ),
std::make_pair( error_code_accessed_unreadable, ExceptionContract::Possible )
};
}
@ -752,13 +763,17 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
key1 = makeKey();
key2 = makeKey();
limits = makeRangeLimits();
bool isSpecialKeyRange = specialKeys.contains(key1) && specialKeys.contains(key2);
contract = {
std::make_pair( error_code_inverted_range, ExceptionContract::requiredIf(key1 > key2) ),
std::make_pair( error_code_range_limits_invalid, ExceptionContract::possibleButRequiredIf( !limits.isReached() && !limits.isValid()) ),
std::make_pair( error_code_client_invalid_operation, ExceptionContract::Possible ),
std::make_pair( error_code_key_outside_legal_range, ExceptionContract::requiredIf(
(key1 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) ||
(key2 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end))) ),
((key1 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) ||
(key2 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end))) &&
!isSpecialKeyRange) ),
std::make_pair( error_code_accessed_unreadable, ExceptionContract::Possible )
};
}
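
Each of these contracts is relaxed the same way: key_outside_legal_range remains required for keys beyond the legal keyspace, except when the key (or both range endpoints) falls inside the special-key space, which the workload may now read. A standalone sketch of the relaxed predicate, with hypothetical range bounds:

```cpp
#include <string>

// Sketch only: the error is required iff the key is outside the legal range
// and is not a special key.
bool keyOutsideLegalRangeRequired(const std::string& key, const std::string& legalEnd,
                                  const std::string& specialBegin, const std::string& specialEnd) {
    bool outsideLegal = key >= legalEnd;
    bool isSpecial = key >= specialBegin && key < specialEnd;
    return outsideLegal && !isSpecial;
}
```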

View File

@ -66,6 +66,13 @@ struct KillRegionWorkload : TestWorkload {
return Void();
}
ACTOR static Future<Void> waitForStorageRecovered( KillRegionWorkload *self ) {
while( self->dbInfo->get().recoveryState < RecoveryState::STORAGE_RECOVERED ) {
wait( self->dbInfo->onChange() );
}
return Void();
}
ACTOR static Future<Void> killRegion( KillRegionWorkload *self, Database cx ) {
ASSERT( g_network->isSimulated() );
if(deterministicRandom()->random01() < 0.5) {
@ -94,10 +101,13 @@ struct KillRegionWorkload : TestWorkload {
TraceEvent("ForceRecovery_GotConfig").detail("Conf", conf.toString());
if(conf.usableRegions>1) {
loop {
// Only needed if force recovery was unnecessary and we killed the secondary
wait( success( changeConfig( cx, g_simulator.disablePrimary + " repopulate_anti_quorum=1", true ) ) );
while( self->dbInfo->get().recoveryState < RecoveryState::STORAGE_RECOVERED ) {
wait( self->dbInfo->onChange() );
choose {
when( wait( waitForStorageRecovered(self) ) ) { break; }
when( wait( delay(300.0) ) ) { }
}
}
wait( success( changeConfig( cx, "usable_regions=1", true ) ) );
}
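
The loop above re-issues the configuration change and then waits up to 300 seconds for storage to recover before trying again, since the change only matters when force recovery was unnecessary and the secondary was killed. A standalone sketch of that retry-with-timeout shape, using std::shared_future in place of the dbInfo change signal (names are hypothetical):

```cpp
#include <chrono>
#include <functional>
#include <future>

// Sketch only: repeat a possibly-unneeded step until the recovery signal fires.
void driveRecovery(std::shared_future<void> storageRecovered,
                   const std::function<void()>& reissueConfigChange) {
    using namespace std::chrono_literals;
    for (;;) {
        reissueConfigChange(); // harmless if the previous attempt already took effect
        if (storageRecovered.wait_for(300s) == std::future_status::ready)
            break;
    }
}
```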

View File

@ -20,7 +20,6 @@
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/CoordinationInterface.h"
#include "fdbserver/ClusterRecruitmentInterface.h"
#include "fdbserver/TesterInterface.actor.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/workloads/workloads.actor.h"

View File

@ -22,7 +22,6 @@
#include "fdbserver/TesterInterface.actor.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbserver/QuietDatabase.h"
#include "fdbserver/ClusterRecruitmentInterface.h"
#include "flow/actorcompiler.h" // This must be the last #include.
struct PerformanceWorkload : TestWorkload {

View File

@ -28,7 +28,6 @@
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbserver/workloads/BulkSetup.actor.h"
#include "fdbserver/ClusterRecruitmentInterface.h"
#include "fdbclient/ReadYourWrites.h"
#include "flow/TDMetric.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.

View File

@ -4,7 +4,6 @@
#include "fdbclient/ReadYourWrites.h"
#include "fdbrpc/ContinuousSample.h"
#include "fdbmonitor/SimpleIni.h"
#include "fdbserver/ClusterRecruitmentInterface.h"
#include "fdbserver/Status.h"
#include "fdbserver/TesterInterface.actor.h"
#include "fdbserver/WorkerInterface.actor.h"

View File

@ -576,6 +576,12 @@ public:
return eatAny(StringRef((const uint8_t *)sep, strlen(sep)), foundSeparator);
}
// Copies the string contents to dst and returns a pointer to the byte just past the copied data
uint8_t * copyTo(uint8_t *dst) const {
memcpy(dst, data, length);
return dst + length;
}
private:
// Unimplemented; blocks conversion through std::string
StringRef( char* );
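
copyTo returns a cursor one past the bytes it wrote, which lets callers chain several copies into a single pre-sized buffer without tracking offsets by hand. A standalone usage sketch with std::string standing in for StringRef:

```cpp
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

// Sketch only: same contract as StringRef::copyTo, on a plain std::string.
uint8_t* copyTo(const std::string& s, uint8_t* dst) {
    std::memcpy(dst, s.data(), s.size());
    return dst + s.size();
}

int main() {
    std::string a = "key/", b = "subkey";
    std::vector<uint8_t> buf(a.size() + b.size());
    uint8_t* p = buf.data();
    p = copyTo(a, p); // each call advances the cursor
    p = copyTo(b, p); // buf now holds "key/subkey"
    return 0;
}
```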

View File

@ -19,6 +19,7 @@
*/
#pragma once
#include <stdint.h>
// A signed compressed integer format that retains ordering in compressed form.
// Format is: [~sign_bit] [unary_len] [value_bits]

View File

@ -80,6 +80,7 @@ void FlowKnobs::initialize(bool randomize, bool isSimulated) {
init( TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY, 5.0 );
init( TOO_MANY_CONNECTIONS_CLOSED_TIMEOUT, 20.0 );
init( PEER_UNAVAILABLE_FOR_LONG_TIME_TIMEOUT, 3600.0 );
init( INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING, 5.0 );
init( TLS_CERT_REFRESH_DELAY_SECONDS, 12*60*60 );
init( TLS_SERVER_CONNECTION_THROTTLE_TIMEOUT, 9.0 );
@ -206,6 +207,12 @@ void FlowKnobs::initialize(bool randomize, bool isSimulated) {
init( FUTURE_VERSION_BACKOFF_GROWTH, 2.0 );
init( LOAD_BALANCE_MAX_BAD_OPTIONS, 1 ); //should be the same as MAX_MACHINES_FALLING_BEHIND
init( LOAD_BALANCE_PENALTY_IS_BAD, true );
// Health Monitor
init( FAILURE_DETECTION_DELAY, 4.0 ); if( randomize && BUGGIFY ) FAILURE_DETECTION_DELAY = 1.0;
init( HEALTH_MONITOR_MARK_FAILED_UNSTABLE_CONNECTIONS, true );
init( HEALTH_MONITOR_CLIENT_REQUEST_INTERVAL_SECS, 30 );
init( HEALTH_MONITOR_CONNECTION_MAX_CLOSED, 5 );
}
// clang-format on

View File

@ -94,6 +94,7 @@ public:
double RECONNECTION_TIME_GROWTH_RATE;
double RECONNECTION_RESET_TIME;
int ACCEPT_BATCH_SIZE;
double INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING;
int TLS_CERT_REFRESH_DELAY_SECONDS;
double TLS_SERVER_CONNECTION_THROTTLE_TIMEOUT;
@ -226,6 +227,12 @@ public:
int LOAD_BALANCE_MAX_BAD_OPTIONS;
bool LOAD_BALANCE_PENALTY_IS_BAD;
// Health Monitor
int FAILURE_DETECTION_DELAY;
bool HEALTH_MONITOR_MARK_FAILED_UNSTABLE_CONNECTIONS;
int HEALTH_MONITOR_CLIENT_REQUEST_INTERVAL_SECS;
int HEALTH_MONITOR_CONNECTION_MAX_CLOSED;
FlowKnobs();
void initialize(bool randomize = false, bool isSimulated = false);
};

View File

@ -23,12 +23,13 @@
#define BOOST_SYSTEM_NO_LIB
#define BOOST_DATE_TIME_NO_LIB
#define BOOST_REGEX_NO_LIB
#include "boost/asio.hpp"
#include "boost/bind.hpp"
#include "boost/date_time/posix_time/posix_time_types.hpp"
#include <boost/asio.hpp>
#include <boost/bind.hpp>
#include <boost/date_time/posix_time/posix_time_types.hpp>
#include <boost/range.hpp>
#include <boost/algorithm/string/join.hpp>
#include "flow/network.h"
#include "flow/IThreadPool.h"
#include "boost/range.hpp"
#include "flow/ActorCollection.h"
#include "flow/ThreadSafeQueue.h"
@ -142,9 +143,14 @@ public:
if ( thread_network == this )
stopImmediately();
else
// SOMEDAY: NULL for deferred error, no analysis of correctness (itp)
onMainThreadVoid( [this] { this->stopImmediately(); }, NULL );
}
virtual void addStopCallback( std::function<void()> fn ) {
if ( thread_network == this )
stopCallbacks.emplace_back(std::move(fn));
else
onMainThreadVoid( [this, fn] { this->stopCallbacks.emplace_back(std::move(fn)); }, nullptr );
}
virtual bool isSimulated() const { return false; }
virtual THREAD_HANDLE startThread( THREAD_FUNC_RETURN (*func) (void*), void *arg);
@ -232,6 +238,7 @@ public:
EventMetricHandle<SlowTask> slowTaskMetric;
std::vector<std::string> blobCredentialFiles;
std::vector<std::function<void()>> stopCallbacks;
};
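
stopCallbacks gives components a hook that runs once when the network loop exits; addStopCallback marshals registration onto the network thread when called from elsewhere, and run() invokes the callbacks after the loop ends (see the main() wiring of Net2FileSystem::stop earlier in this diff). A single-threaded standalone sketch of the shape, with a hypothetical RunLoop type:

```cpp
#include <cstdio>
#include <functional>
#include <utility>
#include <vector>

// Sketch only: registered teardown work runs once, after the loop exits.
struct RunLoop {
    std::vector<std::function<void()>> stopCallbacks;
    void addStopCallback(std::function<void()> fn) { stopCallbacks.push_back(std::move(fn)); }
    void run() {
        // ... the event loop would run here until stopped ...
        for (auto& fn : stopCallbacks) fn(); // teardown in registration order
    }
};

int main() {
    RunLoop net;
    net.addStopCallback([] { std::printf("flush and close files\n"); });
    net.run();
    return 0;
}
```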
static boost::asio::ip::address tcpAddress(IPAddress const& n) {
@ -261,11 +268,19 @@ public:
try {
if (error) {
// Log the error...
TraceEvent(SevWarn, errContext, errID).suppressFor(1.0).detail("ErrorCode", error.value()).detail("Message", error.message())
{
TraceEvent evt(SevWarn, errContext, errID);
evt.suppressFor(1.0).detail("ErrorCode", error.value()).detail("Message", error.message());
#ifndef TLS_DISABLED
.detail("WhichMeans", TLSPolicy::ErrorString(error))
// There is no function in OpenSSL to use to check if an error code is from OpenSSL,
// but all OpenSSL errors have a non-zero "library" code set in bits 24-32, and linux
// error codes should never go that high.
if (error.value() >= (1 << 24L)) {
evt.detail("WhichMeans", TLSPolicy::ErrorString(error));
}
#endif
;
}
p.sendError( connection_failed() );
} else
p.send( Void() );
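
The threshold test above relies on OpenSSL's error packing: the library number occupies the high bits of the code, while errno-style OS codes stay small. A standalone sketch of the same discrimination (the example code values are made up):

```cpp
#include <cstdio>

int main() {
    long sslError = (20L << 24) | 0x42; // hypothetical: library 20, some reason code
    long osError  = 111;                // hypothetical: a small errno-style value
    std::printf("%d %d\n", sslError >= (1L << 24),  // 1: treated as an OpenSSL error
                           osError  >= (1L << 24)); // 0: treated as an OS error
    return 0;
}
```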
@ -790,11 +805,11 @@ private:
}
void onReadError( const boost::system::error_code& error ) {
TraceEvent(SevWarn, "N2_ReadError", id).suppressFor(1.0).detail("Message", error.value());
TraceEvent(SevWarn, "N2_ReadError", id).suppressFor(1.0).detail("ErrorCode", error.value()).detail("Message", error.message());
closeSocket();
}
void onWriteError( const boost::system::error_code& error ) {
TraceEvent(SevWarn, "N2_WriteError", id).suppressFor(1.0).detail("Message", error.value());
TraceEvent(SevWarn, "N2_WriteError", id).suppressFor(1.0).detail("ErrorCode", error.value()).detail("Message", error.message());
closeSocket();
}
};
@ -896,13 +911,19 @@ ACTOR static Future<Void> watchFileForChanges( std::string filename, AsyncTrigge
if (filename == "") {
return Never();
}
state std::time_t lastModTime = wait(IAsyncFileSystem::filesystem()->lastWriteTime(filename));
state bool firstRun = true;
state bool statError = false;
state std::time_t lastModTime = 0;
loop {
wait(delay(FLOW_KNOBS->TLS_CERT_REFRESH_DELAY_SECONDS));
try {
std::time_t modtime = wait(IAsyncFileSystem::filesystem()->lastWriteTime(filename));
if (lastModTime != modtime) {
if (firstRun) {
lastModTime = modtime;
firstRun = false;
}
if (lastModTime != modtime || statError) {
lastModTime = modtime;
statError = false;
fileChanged->trigger();
}
} catch (Error& e) {
@ -912,10 +933,12 @@ ACTOR static Future<Void> watchFileForChanges( std::string filename, AsyncTrigge
// certificates, then there's no point in crashing, but we should complain
// loudly. IAsyncFile will log the error, but not necessarily as a warning.
TraceEvent(SevWarnAlways, "TLSCertificateRefreshStatError").detail("File", filename);
statError = true;
} else {
throw;
}
}
wait(delay(FLOW_KNOBS->TLS_CERT_REFRESH_DELAY_SECONDS));
}
}
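
watchFileForChanges was reworked to poll first and stat later: the first successful stat just records a baseline mtime, a stat failure is remembered rather than fatal, and a change fires when the mtime moves or when a previous stat error clears. A standalone sketch of one polling step using std::filesystem instead of IAsyncFileSystem (pollOnce is a hypothetical name):

```cpp
#include <filesystem>
#include <functional>
#include <system_error>

// Sketch only: mirrors the firstRun / statError bookkeeping above.
void pollOnce(const std::filesystem::path& file,
              std::filesystem::file_time_type& lastMod,
              bool& firstRun, bool& statError,
              const std::function<void()>& onChange) {
    std::error_code ec;
    auto mod = std::filesystem::last_write_time(file, ec);
    if (ec) { statError = true; return; }        // keep polling; report once it recovers
    if (firstRun) { lastMod = mod; firstRun = false; }
    if (mod != lastMod || statError) {
        lastMod = mod;
        statError = false;
        onChange();                              // e.g. fileChanged->trigger()
    }
}
```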
@ -964,16 +987,22 @@ void Net2::initTLS() {
return;
}
#ifndef TLS_DISABLED
auto onPolicyFailure = [this]() { this->countTLSPolicyFailures++; };
try {
boost::asio::ssl::context newContext(boost::asio::ssl::context::tls);
auto onPolicyFailure = [this]() { this->countTLSPolicyFailures++; };
const LoadedTLSConfig& loaded = tlsConfig.loadSync();
TraceEvent("Net2TLSConfig")
.detail("CAPath", tlsConfig.getCAPathSync())
.detail("CertificatePath", tlsConfig.getCertificatePathSync())
.detail("KeyPath", tlsConfig.getKeyPathSync())
.detail("HasPassword", !loaded.getPassword().empty())
.detail("VerifyPeers", boost::algorithm::join(loaded.getVerifyPeers(), "|"));
ConfigureSSLContext( tlsConfig.loadSync(), &newContext, onPolicyFailure );
sslContextVar.set(ReferencedObject<boost::asio::ssl::context>::from(std::move(newContext)));
backgroundCertRefresh = reloadCertificatesOnChange( tlsConfig, onPolicyFailure, &sslContextVar );
} catch (Error& e) {
TraceEvent("Net2TLSInitError").error(e);
throw tls_error();
}
backgroundCertRefresh = reloadCertificatesOnChange( tlsConfig, onPolicyFailure, &sslContextVar );
#endif
tlsInitialized = true;
}
@ -1199,6 +1228,10 @@ void Net2::run() {
TraceEvent("SomewhatSlowRunLoopBottom").detail("Elapsed", nnow - now); // This includes the time spent running tasks
}
for ( auto& fn : stopCallbacks ) {
fn();
}
#ifdef WIN32
timeEndPeriod(1);
#endif

Some files were not shown because too many files have changed in this diff.