diff --git a/README.md b/README.md index a3964f63a8..a3e7ef5979 100755 --- a/README.md +++ b/README.md @@ -206,37 +206,3 @@ will automatically find it and build with TLS support. If you installed WIX before running `cmake` you should find the `FDBInstaller.msi` in your build directory under `packaging/msi`. -## Makefile (Deprecated - all users should transition to using cmake) - -#### MacOS - -1. Check out this repo on your Mac. -1. Install the Xcode command-line tools. -1. Download version 1.67.0 of [Boost](https://sourceforge.net/projects/boost/files/boost/1.67.0/). -1. Set the `BOOSTDIR` environment variable to the location containing this boost installation. -1. Install [Mono](http://www.mono-project.com/download/stable/). -1. Install a [JDK](http://www.oracle.com/technetwork/java/javase/downloads/index.html). FoundationDB currently builds with Java 8. -1. Navigate to the directory where you checked out the foundationdb repo. -1. Run `make`. - -#### Linux - -1. Install [Docker](https://www.docker.com/). -1. Check out the foundationdb repo. -1. Run the docker image interactively with [Docker Run](https://docs.docker.com/engine/reference/run/#general-form), and with the directory containing the foundationdb repo mounted via [Docker Mounts](https://docs.docker.com/storage/volumes/). - - ```shell - docker run -it -v '/local/dir/path/foundationdb:/docker/dir/path/foundationdb' foundationdb/foundationdb-build:latest - ``` - -1. Run `$ scl enable devtoolset-8 python27 rh-python36 rh-ruby24 -- bash` within the running container. This enables a more modern compiler, which is required to build FoundationDB. -1. Navigate to the container's mounted directory which contains the foundationdb repo. - - ```shell - cd /docker/dir/path/foundationdb - ``` - -1. Run `make`. - -This will build the fdbserver binary and the python bindings. If you want to build our other bindings, you will need to install a runtime for the language whose binding you want to build. Each binding has an `.mk` file which provides specific targets for that binding. - diff --git a/bindings/go/CMakeLists.txt b/bindings/go/CMakeLists.txt index 793089a3f7..701fa49ca8 100644 --- a/bindings/go/CMakeLists.txt +++ b/bindings/go/CMakeLists.txt @@ -99,6 +99,8 @@ function(build_go_package) endif() add_custom_command(OUTPUT ${outfile} COMMAND ${CMAKE_COMMAND} -E env ${go_env} + ${GO_EXECUTABLE} get -d ${GO_IMPORT_PATH}/${BGP_PATH} && + ${CMAKE_COMMAND} -E env ${go_env} ${GO_EXECUTABLE} install ${GO_IMPORT_PATH}/${BGP_PATH} DEPENDS ${fdb_options_file} COMMENT "Building ${BGP_NAME}") diff --git a/bindings/go/src/fdb/database.go b/bindings/go/src/fdb/database.go index 6d914d7928..c9bf818fab 100644 --- a/bindings/go/src/fdb/database.go +++ b/bindings/go/src/fdb/database.go @@ -27,7 +27,7 @@ package fdb import "C" import ( - "sync" + "runtime" ) // Database is a handle to a FoundationDB database. 
Database is a lightweight @@ -74,14 +74,13 @@ func (d Database) CreateTransaction() (Transaction, error) { return Transaction{}, Error{int(err)} } - t := &transaction{outt, d, sync.Once{}} + t := &transaction{outt, d} + runtime.SetFinalizer(t, (*transaction).destroy) return Transaction{t}, nil } -func retryable(t Transaction, wrapped func() (interface{}, error), onError func(Error) FutureNil) (ret interface{}, e error) { - defer t.Close() - +func retryable(wrapped func() (interface{}, error), onError func(Error) FutureNil) (ret interface{}, e error) { for { ret, e = wrapped() @@ -141,7 +140,7 @@ func (d Database) Transact(f func(Transaction) (interface{}, error)) (interface{ return } - return retryable(tr, wrapped, tr.OnError) + return retryable(wrapped, tr.OnError) } // ReadTransact runs a caller-provided function inside a retry loop, providing @@ -181,7 +180,7 @@ func (d Database) ReadTransact(f func(ReadTransaction) (interface{}, error)) (in return } - return retryable(tr, wrapped, tr.OnError) + return retryable(wrapped, tr.OnError) } // Options returns a DatabaseOptions instance suitable for setting options diff --git a/bindings/go/src/fdb/directory/directoryLayer.go b/bindings/go/src/fdb/directory/directoryLayer.go index 5be70e5dd1..63574d9148 100644 --- a/bindings/go/src/fdb/directory/directoryLayer.go +++ b/bindings/go/src/fdb/directory/directoryLayer.go @@ -417,7 +417,6 @@ func (dl directoryLayer) subdirNames(rtr fdb.ReadTransaction, node subspace.Subs rr := rtr.GetRange(sd, fdb.RangeOptions{}) ri := rr.Iterator() - defer ri.Close() var ret []string @@ -443,7 +442,6 @@ func (dl directoryLayer) subdirNodes(tr fdb.Transaction, node subspace.Subspace) rr := tr.GetRange(sd, fdb.RangeOptions{}) ri := rr.Iterator() - defer ri.Close() var ret []subspace.Subspace diff --git a/bindings/go/src/fdb/fdb_test.go b/bindings/go/src/fdb/fdb_test.go index 2c10100e30..ed9478878a 100644 --- a/bindings/go/src/fdb/fdb_test.go +++ b/bindings/go/src/fdb/fdb_test.go @@ -246,7 +246,6 @@ func ExampleRangeIterator() { rr := tr.GetRange(fdb.KeyRange{fdb.Key(""), fdb.Key{0xFF}}, fdb.RangeOptions{}) ri := rr.Iterator() - defer ri.Close() // Advance will return true until the iterator is exhausted for ri.Advance() { diff --git a/bindings/go/src/fdb/futures.go b/bindings/go/src/fdb/futures.go index aa58e7c81b..17ae1d70a4 100644 --- a/bindings/go/src/fdb/futures.go +++ b/bindings/go/src/fdb/futures.go @@ -39,6 +39,7 @@ package fdb import "C" import ( + "runtime" "sync" "unsafe" ) @@ -74,7 +75,9 @@ type future struct { } func newFuture(ptr *C.FDBFuture) *future { - return &future{ptr} + f := &future{ptr} + runtime.SetFinalizer(f, func(f *future) { C.fdb_future_destroy(f.ptr) }) + return f } // Note: This function guarantees the callback will be executed **at most once**. 
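For readers unfamiliar with this pattern, the combination introduced above (a finalizer that owns the native pointer, plus `runtime.KeepAlive` at every use) can be sketched outside of cgo. This is a minimal illustration only: `destroyHandle` and `useHandle` are invented stand-ins for C calls such as `fdb_future_destroy` and the future getters.

```go
package main

import (
	"fmt"
	"runtime"
	"unsafe"
)

// Invented stand-ins for C calls such as fdb_future_destroy and
// fdb_future_get_error; in the real bindings these cross into cgo.
func destroyHandle(p unsafe.Pointer) { fmt.Println("native object destroyed") }
func useHandle(p unsafe.Pointer)     { fmt.Println("native object used") }

type handle struct{ ptr unsafe.Pointer }

func newHandle(p unsafe.Pointer) *handle {
	h := &handle{ptr: p}
	// The finalizer frees the native object once the Go wrapper is unreachable.
	runtime.SetFinalizer(h, func(h *handle) { destroyHandle(h.ptr) })
	return h
}

func (h *handle) use() {
	// Pin h for the duration of the call: without KeepAlive, the GC may decide
	// h is unreachable as soon as h.ptr has been loaded, run the finalizer,
	// and free the native object while it is still in use.
	defer runtime.KeepAlive(h)
	useHandle(h.ptr)
}

func main() {
	x := 42
	h := newHandle(unsafe.Pointer(&x))
	h.use()
	runtime.GC() // the finalizer may run at any point after this
}
```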
@@ -97,14 +100,17 @@ func fdb_future_block_until_ready(f *C.FDBFuture) { } func (f *future) BlockUntilReady() { + defer runtime.KeepAlive(f) fdb_future_block_until_ready(f.ptr) } func (f *future) IsReady() bool { + defer runtime.KeepAlive(f) return C.fdb_future_is_ready(f.ptr) != 0 } func (f *future) Cancel() { + defer runtime.KeepAlive(f) C.fdb_future_cancel(f.ptr) } @@ -136,7 +142,7 @@ type futureByteSlice struct { func (f *futureByteSlice) Get() ([]byte, error) { f.o.Do(func() { - defer C.fdb_future_destroy(f.ptr) + defer runtime.KeepAlive(f.future) var present C.fdb_bool_t var value *C.uint8_t @@ -150,14 +156,10 @@ func (f *futureByteSlice) Get() ([]byte, error) { } if present != 0 { - // Copy the native `value` into a Go byte slice so the underlying - // native Future can be freed. This avoids the need for finalizers. - valueDestination := make([]byte, length) - valueSource := C.GoBytes(unsafe.Pointer(value), length) - copy(valueDestination, valueSource) - - f.v = valueDestination + f.v = C.GoBytes(unsafe.Pointer(value), length) } + + C.fdb_future_release_memory(f.ptr) }) return f.v, f.e @@ -197,7 +199,7 @@ type futureKey struct { func (f *futureKey) Get() (Key, error) { f.o.Do(func() { - defer C.fdb_future_destroy(f.ptr) + defer runtime.KeepAlive(f.future) var value *C.uint8_t var length C.int @@ -209,11 +211,8 @@ func (f *futureKey) Get() (Key, error) { return } - keySource := C.GoBytes(unsafe.Pointer(value), length) - keyDestination := make([]byte, length) - copy(keyDestination, keySource) - - f.k = keyDestination + f.k = C.GoBytes(unsafe.Pointer(value), length) + C.fdb_future_release_memory(f.ptr) }) return f.k, f.e @@ -246,21 +245,17 @@ type FutureNil interface { type futureNil struct { *future - o sync.Once - e error } func (f *futureNil) Get() error { - f.o.Do(func() { - defer C.fdb_future_destroy(f.ptr) + defer runtime.KeepAlive(f.future) - f.BlockUntilReady() - if err := C.fdb_future_get_error(f.ptr); err != 0 { - f.e = Error{int(err)} - } - }) + f.BlockUntilReady() + if err := C.fdb_future_get_error(f.ptr); err != 0 { + return Error{int(err)} + } - return f.e + return nil } func (f *futureNil) MustGet() { @@ -273,6 +268,7 @@ type futureKeyValueArray struct { *future } +//go:nocheckptr func stringRefToSlice(ptr unsafe.Pointer) []byte { size := *((*C.int)(unsafe.Pointer(uintptr(ptr) + 8))) @@ -286,6 +282,8 @@ func stringRefToSlice(ptr unsafe.Pointer) []byte { } func (f *futureKeyValueArray) Get() ([]KeyValue, bool, error) { + defer runtime.KeepAlive(f.future) + f.BlockUntilReady() var kvs *C.FDBKeyValue @@ -296,42 +294,13 @@ func (f *futureKeyValueArray) Get() ([]KeyValue, bool, error) { return nil, false, Error{int(err)} } - // To minimize the number of individual allocations, we first calculate the - // final size used by all keys and values returned from this iteration, - // then perform one larger allocation and slice within it. 
- - poolSize := 0 - for i := 0; i < int(count); i++ { - kvptr := unsafe.Pointer(uintptr(unsafe.Pointer(kvs)) + uintptr(i*24)) - - poolSize += len(stringRefToSlice(kvptr)) - poolSize += len(stringRefToSlice(unsafe.Pointer(uintptr(kvptr) + 12))) - } - - poolOffset := 0 - pool := make([]byte, poolSize) - ret := make([]KeyValue, int(count)) for i := 0; i < int(count); i++ { kvptr := unsafe.Pointer(uintptr(unsafe.Pointer(kvs)) + uintptr(i*24)) - keySource := stringRefToSlice(kvptr) - valueSource := stringRefToSlice(unsafe.Pointer(uintptr(kvptr) + 12)) - - keyDestination := pool[poolOffset : poolOffset+len(keySource)] - poolOffset += len(keySource) - - valueDestination := pool[poolOffset : poolOffset+len(valueSource)] - poolOffset += len(valueSource) - - copy(keyDestination, keySource) - copy(valueDestination, valueSource) - - ret[i] = KeyValue{ - Key: keyDestination, - Value: valueDestination, - } + ret[i].Key = stringRefToSlice(kvptr) + ret[i].Value = stringRefToSlice(unsafe.Pointer(uintptr(kvptr) + 12)) } return ret, (more != 0), nil @@ -356,28 +325,19 @@ type FutureInt64 interface { type futureInt64 struct { *future - o sync.Once - e error - v int64 } func (f *futureInt64) Get() (int64, error) { - f.o.Do(func() { - defer C.fdb_future_destroy(f.ptr) + defer runtime.KeepAlive(f.future) - f.BlockUntilReady() + f.BlockUntilReady() - var ver C.int64_t - if err := C.fdb_future_get_int64(f.ptr, &ver); err != 0 { - f.v = 0 - f.e = Error{int(err)} - return - } + var ver C.int64_t + if err := C.fdb_future_get_int64(f.ptr, &ver); err != 0 { + return 0, Error{int(err)} + } - f.v = int64(ver) - }) - - return f.v, f.e + return int64(ver), nil } func (f *futureInt64) MustGet() int64 { @@ -408,40 +368,27 @@ type FutureStringSlice interface { type futureStringSlice struct { *future - o sync.Once - e error - v []string } func (f *futureStringSlice) Get() ([]string, error) { - f.o.Do(func() { - defer C.fdb_future_destroy(f.ptr) + defer runtime.KeepAlive(f.future) - f.BlockUntilReady() + f.BlockUntilReady() - var strings **C.char - var count C.int + var strings **C.char + var count C.int - if err := C.fdb_future_get_string_array(f.ptr, (***C.char)(unsafe.Pointer(&strings)), &count); err != 0 { - f.e = Error{int(err)} - return - } + if err := C.fdb_future_get_string_array(f.ptr, (***C.char)(unsafe.Pointer(&strings)), &count); err != 0 { + return nil, Error{int(err)} + } - ret := make([]string, int(count)) + ret := make([]string, int(count)) - for i := 0; i < int(count); i++ { - source := C.GoString((*C.char)(*(**C.char)(unsafe.Pointer(uintptr(unsafe.Pointer(strings)) + uintptr(i*8))))) + for i := 0; i < int(count); i++ { + ret[i] = C.GoString((*C.char)(*(**C.char)(unsafe.Pointer(uintptr(unsafe.Pointer(strings)) + uintptr(i*8))))) + } - destination := make([]byte, len(source)) - copy(destination, source) - - ret[i] = string(destination) - } - - f.v = ret - }) - - return f.v, f.e + return ret, nil } func (f *futureStringSlice) MustGet() []string { diff --git a/bindings/go/src/fdb/generated.go b/bindings/go/src/fdb/generated.go index 2a60a479b1..f8cf89f5fd 100644 --- a/bindings/go/src/fdb/generated.go +++ b/bindings/go/src/fdb/generated.go @@ -304,7 +304,7 @@ func (o DatabaseOptions) SetTransactionTimeout(param int64) error { return o.setOpt(500, int64ToBytes(param)) } -// Set a timeout in milliseconds which, when elapsed, will cause a transaction automatically to be cancelled. This sets the ``retry_limit`` option of each transaction created by this database. 
See the transaction option description for more information. +// Set a maximum number of retries after which additional calls to ``onError`` will throw the most recently seen error code. This sets the ``retry_limit`` option of each transaction created by this database. See the transaction option description for more information. // // Parameter: number of times to retry func (o DatabaseOptions) SetTransactionRetryLimit(param int64) error { @@ -330,7 +330,7 @@ func (o DatabaseOptions) SetTransactionCausalReadRisky() error { return o.setOpt(504, nil) } -// Addresses returned by get_addresses_for_key include the port when enabled. As of api version 700, this option is enabled by default and setting this has no effect. +// Addresses returned by get_addresses_for_key include the port when enabled. As of api version 630, this option is enabled by default and setting this has no effect. func (o DatabaseOptions) SetTransactionIncludePortInAddress() error { return o.setOpt(505, nil) } @@ -350,7 +350,7 @@ func (o TransactionOptions) SetCausalReadDisable() error { return o.setOpt(21, nil) } -// Addresses returned by get_addresses_for_key include the port when enabled. As of api version 700, this option is enabled by default and setting this has no effect. +// Addresses returned by get_addresses_for_key include the port when enabled. As of api version 630, this option is enabled by default and setting this has no effect. func (o TransactionOptions) SetIncludePortInAddress() error { return o.setOpt(23, nil) } @@ -429,7 +429,7 @@ func (o TransactionOptions) SetDebugTransactionIdentifier(param string) error { return o.setOpt(403, []byte(param)) } -// Enables tracing for this transaction and logs results to the client trace logs. The DEBUG_TRANSACTION_IDENTIFIER option must be set before using this option, and client trace logging must be enabled and to get log output. +// Enables tracing for this transaction and logs results to the client trace logs. The DEBUG_TRANSACTION_IDENTIFIER option must be set before using this option, and client trace logging must be enabled to get log output. func (o TransactionOptions) SetLogTransaction() error { return o.setOpt(404, nil) } @@ -479,7 +479,7 @@ func (o TransactionOptions) SetSnapshotRywDisable() error { return o.setOpt(601, nil) } -// The transaction can read and write to locked databases, and is resposible for checking that it took the lock. +// The transaction can read and write to locked databases, and is responsible for checking that it took the lock. func (o TransactionOptions) SetLockAware() error { return o.setOpt(700, nil) } diff --git a/bindings/go/src/fdb/range.go b/bindings/go/src/fdb/range.go index 5d9e635b39..67a45c63b2 100644 --- a/bindings/go/src/fdb/range.go +++ b/bindings/go/src/fdb/range.go @@ -28,7 +28,6 @@ import "C" import ( "fmt" - "sync" ) // KeyValue represents a single key-value pair in the database. @@ -141,7 +140,6 @@ func (rr RangeResult) GetSliceWithError() ([]KeyValue, error) { var ret []KeyValue ri := rr.Iterator() - defer ri.Close() if rr.options.Limit != 0 { ri.options.Mode = StreamingModeExact @@ -209,18 +207,6 @@ type RangeIterator struct { index int err error snapshot bool - o sync.Once -} - -// Close releases the underlying native resources for all the `KeyValue`s -// ever returned by this iterator. The `KeyValue`s themselves are copied -// before they're returned, so they are still safe to use after calling -// this function. This is instended to be called with `defer` inside -// your transaction function. 
-func (ri *RangeIterator) Close() { - ri.o.Do(func() { - C.fdb_future_destroy(ri.f.ptr) - }) } // Advance attempts to advance the iterator to the next key-value pair. Advance diff --git a/bindings/go/src/fdb/transaction.go b/bindings/go/src/fdb/transaction.go index 9547cb64f5..4102a0556b 100644 --- a/bindings/go/src/fdb/transaction.go +++ b/bindings/go/src/fdb/transaction.go @@ -25,7 +25,6 @@ package fdb // #define FDB_API_VERSION 630 // #include import "C" -import "sync" // A ReadTransaction can asynchronously read from a FoundationDB // database. Transaction and Snapshot both satisfy the ReadTransaction @@ -71,7 +70,6 @@ type Transaction struct { type transaction struct { ptr *C.FDBTransaction db Database - o sync.Once } // TransactionOptions is a handle with which to set options that affect a @@ -87,18 +85,16 @@ func (opt TransactionOptions) setOpt(code int, param []byte) error { }, param) } +func (t *transaction) destroy() { + C.fdb_transaction_destroy(t.ptr) +} + // GetDatabase returns a handle to the database with which this transaction is // interacting. func (t Transaction) GetDatabase() Database { return t.transaction.db } -func (t Transaction) Close() { - t.o.Do(func() { - C.fdb_transaction_destroy(t.ptr) - }) -} - // Transact executes the caller-provided function, passing it the Transaction // receiver object. // @@ -410,6 +406,9 @@ func (t *transaction) getApproximateSize() FutureInt64 { } } +// Returns a future that is the approximate transaction size so far in this +// transaction, which is the summation of the estimated size of mutations, +// read conflict ranges, and write conflict ranges. func (t Transaction) GetApproximateSize() FutureInt64 { return t.getApproximateSize() } diff --git a/bindings/python/LICENSE b/bindings/python/LICENSE new file mode 100644 index 0000000000..19586598a8 --- /dev/null +++ b/bindings/python/LICENSE @@ -0,0 +1,207 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +------------------------------------------------------------------------------- +SOFTWARE DISTRIBUTED WITH FOUNDATIONDB: + +The FoundationDB software includes a number of subcomponents with separate +copyright notices and license terms - please see the file ACKNOWLEDGEMENTS. 
+------------------------------------------------------------------------------- diff --git a/build/gen_dev_docker.sh b/build/gen_dev_docker.sh index 03b171f969..89129d5a86 100755 --- a/build/gen_dev_docker.sh +++ b/build/gen_dev_docker.sh @@ -20,7 +20,7 @@ cd ${tmpdir} echo cat <> Dockerfile -FROM foundationdb/foundationdb-build:latest +FROM foundationdb/foundationdb-dev:0.11.1 RUN yum install -y sudo RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers RUN groupadd -g 1100 sudo @@ -64,13 +64,19 @@ then ccache_args=\$args fi +if [ -t 1 ] ; then + TERMINAL_ARGS=-it `# Run in interactive mode and simulate a TTY` +else + TERMINAL_ARGS=-i `# Run in interactive mode` +fi sudo docker run --rm `# delete (temporary) image after return` \\ - -it `# Run in interactive mode and simulate a TTY` \\ + \${TERMINAL_ARGS} \\ --privileged=true `# Run in privileged mode ` \\ --cap-add=SYS_PTRACE \\ --security-opt seccomp=unconfined \\ -v "${HOME}:${HOME}" `# Mount home directory` \\ + -w="\$(pwd)" \\ \${ccache_args} \\ ${image} "\$@" EOF @@ -87,6 +93,7 @@ then echo -e "\tThis can cause problems with some scripts (like fdb-clangd)" fi chmod +x $HOME/bin/fdb-dev +chmod +x $HOME/bin/clangd echo "To start the dev docker image run $HOME/bin/fdb-dev" echo "$HOME/bin/clangd can be used for IDE integration" echo "You can edit these files but be aware that this script will overwrite your changes if you rerun it" diff --git a/cmake/AddFdbTest.cmake b/cmake/AddFdbTest.cmake index a8fae7837b..77aec8b561 100644 --- a/cmake/AddFdbTest.cmake +++ b/cmake/AddFdbTest.cmake @@ -87,6 +87,9 @@ function(add_fdb_test) if (NOT "${ADD_FDB_TEST_TEST_NAME}" STREQUAL "") set(test_name ${ADD_FDB_TEST_TEST_NAME}) endif() + if((NOT test_name MATCHES "${TEST_INCLUDE}") OR (test_name MATCHES "${TEST_EXCLUDE}")) + return() + endif() math(EXPR test_idx "${CURRENT_TEST_INDEX} + ${NUM_TEST_FILES}") set(CURRENT_TEST_INDEX "${test_idx}" PARENT_SCOPE) # set( PARENT_SCOPE) doesn't set the @@ -160,8 +163,6 @@ function(create_test_package) string(SUBSTRING ${file} ${base_length} -1 rel_out_file) set(out_file ${CMAKE_BINARY_DIR}/packages/tests/${rel_out_file}) list(APPEND out_files ${out_file}) - get_filename_component(test_dir ${out_file} DIRECTORY) - file(MAKE_DIRECTORY packages/tests/${test_dir}) add_custom_command( OUTPUT ${out_file} DEPENDS ${file} diff --git a/cmake/CompilerChecks.cmake b/cmake/CompilerChecks.cmake new file mode 100644 index 0000000000..027be35796 --- /dev/null +++ b/cmake/CompilerChecks.cmake @@ -0,0 +1,53 @@ +include(CheckCXXCompilerFlag) + +function(env_set var_name default_value type docstring) + set(val ${default_value}) + if(DEFINED ENV{${var_name}}) + set(val $ENV{${var_name}}) + endif() + set(${var_name} ${val} CACHE ${type} "${docstring}") +endfunction() + +function(default_linker var_name) + if(APPLE) + set("${var_name}" "DEFAULT" PARENT_SCOPE) + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + find_program(lld_path ld.lld "Path to LLD - is only used to determine default linker") + if(lld_path) + set("${var_name}" "LLD" PARENT_SCOPE) + else() + set("${var_name}" "DEFAULT" PARENT_SCOPE) + endif() + else() + set("${var_name}" "DEFAULT" PARENT_SCOPE) + endif() +endfunction() + +function(use_libcxx out) + if(APPLE OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + set("${out}" ON PARENT_SCOPE) + else() + set("${out}" OFF PARENT_SCOPE) + endif() +endfunction() + +function(static_link_libcxx out) + if(APPLE) + set("${out}" OFF PARENT_SCOPE) + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + default_linker(linker) + if(NOT 
linker STREQUAL "LLD") + set("${out}" OFF PARENT_SCOPE) + return() + endif() + find_library(libcxx_a libc++.a) + find_library(libcxx_abi libc++abi.a) + if(libcxx_a AND libcxx_abi) + set("${out}" ON PARENT_SCOPE) + else() + set("${out}" OFF PARENT_SCOPE) + endif() + else() + set("${out}" ON PARENT_SCOPE) + endif() +endfunction() diff --git a/cmake/ConfigureCompiler.cmake b/cmake/ConfigureCompiler.cmake index 5f263788b2..33b749c0ee 100644 --- a/cmake/ConfigureCompiler.cmake +++ b/cmake/ConfigureCompiler.cmake @@ -1,25 +1,23 @@ -function(env_set var_name default_value type docstring) - set(val ${default_value}) - if(DEFINED ENV{${var_name}}) - set(val $ENV{${var_name}}) - endif() - set(${var_name} ${val} CACHE ${type} "${docstring}") -endfunction() +include(CompilerChecks) -set(USE_GPERFTOOLS OFF CACHE BOOL "Use gperfools for profiling") +env_set(USE_GPERFTOOLS OFF BOOL "Use gperfools for profiling") env_set(USE_VALGRIND OFF BOOL "Compile for valgrind usage") -set(USE_VALGRIND_FOR_CTEST ${USE_VALGRIND} CACHE BOOL "Use valgrind for ctest") -set(ALLOC_INSTRUMENTATION OFF CACHE BOOL "Instrument alloc") -set(WITH_UNDODB OFF CACHE BOOL "Use rr or undodb") -set(USE_ASAN OFF CACHE BOOL "Compile with address sanitizer") -set(USE_UBSAN OFF CACHE BOOL "Compile with undefined behavior sanitizer") -set(FDB_RELEASE OFF CACHE BOOL "This is a building of a final release") -env_set(USE_LD "DEFAULT" STRING "The linker to use for building: can be LD (system default, default choice), BFD, GOLD, or LLD") -env_set(USE_LIBCXX OFF BOOL "Use libc++") +env_set(USE_VALGRIND_FOR_CTEST ${USE_VALGRIND} BOOL "Use valgrind for ctest") +env_set(ALLOC_INSTRUMENTATION OFF BOOL "Instrument alloc") +env_set(WITH_UNDODB OFF BOOL "Use rr or undodb") +env_set(USE_ASAN OFF BOOL "Compile with address sanitizer") +env_set(USE_UBSAN OFF BOOL "Compile with undefined behavior sanitizer") +env_set(FDB_RELEASE OFF BOOL "This is a building of a final release") env_set(USE_CCACHE OFF BOOL "Use ccache for compilation if available") -set(RELATIVE_DEBUG_PATHS OFF CACHE BOOL "Use relative file paths in debug info") -set(STATIC_LINK_LIBCXX ON CACHE BOOL "Statically link libstdcpp/libc++") -set(USE_WERROR OFF CACHE BOOL "Compile with -Werror. Recommended for local development and CI.") +env_set(RELATIVE_DEBUG_PATHS OFF BOOL "Use relative file paths in debug info") +env_set(USE_WERROR OFF BOOL "Compile with -Werror. 
Recommended for local development and CI.") +default_linker(_use_ld) +env_set(USE_LD "${_use_ld}" STRING + "The linker to use for building: can be LD (system default and same as DEFAULT), BFD, GOLD, or LLD - will be LLD for Clang if available, DEFAULT otherwise") +use_libcxx(_use_libcxx) +env_set(USE_LIBCXX "${_use_libcxx}" BOOL "Use libc++") +static_link_libcxx(_static_link_libcxx) +env_set(STATIC_LINK_LIBCXX "${_static_link_libcxx}" BOOL "Statically link libstdcpp/libc++") if(USE_LIBCXX AND STATIC_LINK_LIBCXX AND NOT USE_LD STREQUAL "LLD") message(FATAL_ERROR "Unsupported configuration: STATIC_LINK_LIBCXX with libc+++ only works if USE_LD=LLD") diff --git a/cmake/FlowCommands.cmake b/cmake/FlowCommands.cmake index a9546d0bcb..53cdd7a33b 100644 --- a/cmake/FlowCommands.cmake +++ b/cmake/FlowCommands.cmake @@ -185,12 +185,12 @@ function(add_flow_target) if(WIN32) add_custom_command(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${generated}" COMMAND $ "${CMAKE_CURRENT_SOURCE_DIR}/${src}" "${CMAKE_CURRENT_BINARY_DIR}/${generated}" ${actor_compiler_flags} - DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${src}" + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${src}" ${actor_exe} COMMENT "Compile actor: ${src}") else() add_custom_command(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${generated}" COMMAND ${MONO_EXECUTABLE} ${actor_exe} "${CMAKE_CURRENT_SOURCE_DIR}/${src}" "${CMAKE_CURRENT_BINARY_DIR}/${generated}" ${actor_compiler_flags} > /dev/null - DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${src}" + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${src}" ${actor_exe} COMMENT "Compile actor: ${src}") endif() else() diff --git a/cmake/InstallLayout.cmake b/cmake/InstallLayout.cmake index b1fb33c68e..76a9889dc9 100644 --- a/cmake/InstallLayout.cmake +++ b/cmake/InstallLayout.cmake @@ -131,9 +131,9 @@ set(install_destination_for_log_el6 "var/log/foundationdb") set(install_destination_for_log_el7 "var/log/foundationdb") set(install_destination_for_log_pm "") set(install_destination_for_data_tgz "lib/foundationdb") -set(install_destination_for_data_deb "var/lib/foundationdb") -set(install_destination_for_data_el6 "var/lib/foundationdb") -set(install_destination_for_data_el7 "var/lib/foundationdb") +set(install_destination_for_data_deb "var/lib/foundationdb/data") +set(install_destination_for_data_el6 "var/lib/foundationdb/data") +set(install_destination_for_data_el7 "var/lib/foundationdb/data") set(install_destination_for_data_pm "") set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated") diff --git a/design/backup_v2_partitioned_logs.md b/design/backup_v2_partitioned_logs.md new file mode 100644 index 0000000000..2fb6528baf --- /dev/null +++ b/design/backup_v2_partitioned_logs.md @@ -0,0 +1,336 @@ +# The New FDB Backup System: Requirements & Design + +Github tracking issue: https://github.com/apple/foundationdb/issues/1003 + +## Purpose and Audience + +The purpose of this document is to capture functional requirements as well as propose a high level design for implementation of the new backup system in FoundationDB. The intended audience for this document includes: + +* **FDB users** - Users can understand what are the changes in the new backup system, especially how to start a backup using the new backup system. The restore for new backup is handled by the [Performant Restore System](https://github.com/apple/foundationdb/issues/1049). +* **SRE's and Support** - can understand the high level architecture and know the requirements, including the metrics, tooling, and documentation to ensure that the new FDB backup can be supported. 
* **Developers** - can learn why this feature is needed, what it does, and how it is to be implemented. The hope is that this document becomes the starting point for any developer wishing to understand or be involved in the related aspects of FDB.

## Functional Requirements

As an essential component of a database system, backup and restore is a commonly used technique for disaster recovery, reliability, audit, and compliance purposes. The current FDB backup system consumes about half of the cluster's write bandwidth, causes write skew among storage servers, increases storage space usage, and triggers unnecessary data balancing. The new backup system aims to double the cluster's usable write bandwidth for *HA clusters* (old DR clusters still need the old-style backup system).

## Background

The FDB backup system continuously scans the database's key-value space and saves key-value pairs and mutations at versions into range files and log files in blob storage. Specifically, mutation logs are generated at the Proxy and are written to transaction logs along with regular mutations. In production clusters like CK clusters, the backup system is always on, which means each mutation is written twice to transaction logs, consuming about half of the write bandwidth and about 40% of Proxy CPU time.

The design of the old backup system is [here](https://github.com/apple/foundationdb/blob/master/design/backup.md), and the data format of range files and mutation files is [here](https://github.com/apple/foundationdb/blob/master/design/backup-dataFormat.md). The technical overview of FDB is [here](https://github.com/apple/foundationdb/wiki/Technical-Overview-of-the-Database). FDB recovery is described in this [doc](https://github.com/apple/foundationdb/blob/master/design/recovery-internals.md).

## Terminology

* **Blob storage**: An object store for unstructured data. Backup files are encoded in a binary format and saved in blob storage, e.g., Amazon S3.
* **Version**: FDB continuously generates an increasing version number and uses it to order mutations. Versions typically advance by one million per second. To restore an FDB cluster to a specified date and time, the restore system first converts the date and time to the corresponding version number and then restores the cluster to that version.
* **Epoch**: A generation of FDB's transaction system. After a component of the transaction system fails, FDB automatically initiates a recovery and restores the system to a new healthy generation, which is called an epoch.
* **Backup worker**: A new role added to the FDB cluster that is responsible for pulling mutations from transaction logs and saving them to blob storage.
* **Tag**: A tag is a short address for a mutation's destination, which includes a locality (`int8_t`, representing the data center ID; a negative number denotes a special system locality) and an ID (`int16_t`). The idea is that a tag is a small data structure that consumes fewer bytes than an IP address or a storage server's UID (16 bytes each), since tags are attached to every mutation and are stored both in memory and on disk (see the sketch after this list).
* **Tag partitioned log system**: FDB's write-ahead log is a tag partitioned log system, where each mutation is assigned a number of tags.
* **Log router tag**: A special system tag, e.g., `-2:0`, where the locality `-2` means log router tag and `0` is the ID. Originally, attaching this tag to a mutation meant the mutation should be sent to a remote log router. In the new backup system, we reuse this tag so that backup workers receive all mutations in a number of partitioned streams.
* **Restorable version:** The version that a backup can be restored to. A version `v` is a restorable version if the entire key-space and the mutations in versions `[v1, v)` are recorded in backup files.
* **Node**: A node is a machine or a process in a cluster.
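For illustration only, here is a minimal Go sketch of the tag layout described above; the type and field names are invented, not the actual FDB identifiers.

```go
package main

import "fmt"

// Tag is a sketch of the tag described above: a compact destination address
// (3 bytes of payload) instead of a 16-byte UID or an IP address.
type Tag struct {
	Locality int8  // data center ID; negative values denote system localities
	ID       int16
}

func main() {
	// The log router tag `-2:0` from the terminology list.
	logRouterTag := Tag{Locality: -2, ID: 0}
	fmt.Printf("%d:%d\n", logRouterTag.Locality, logRouterTag.ID) // -2:0
}
```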
## Detailed Feature Requirements

Feature priorities: Features 1-5 are must-have; Feature 6 is nice to have.

1. **Write bandwidth reduction by half**: removes the requirement to generate backup mutations at the Proxy, thus reducing TLog write bandwidth usage by half and significantly improving Proxy CPU usage;
2. **Correctness**: The restored database must be consistent: each *restored* state (i.e., key-value pair) at a version `v` must match the original state at version `v`.
3. **Performance**: The backup system should be performant, mostly measured as a small CPU overhead on transaction logs and backup workers. The version lag on backup workers is an indicator of performance.
4. **Fault-tolerant**: The backup system should tolerate node failures in the FDB cluster.
5. **Restore ready**: The new backup system should be restorable by the Performant Restore System. As a fallback for the new performant restore system, we can convert new backup logs into the format of old backup logs, thus enabling restores of new backups with the existing restore system.
6. **Backward compatibility**: The new backup system should allow both old-style backup and DR (FDB 6.2 and below) to be performed, as well as support the new backup in FDB 6.3 and above.

## Security and Privacy Requirements

**Security**: The backup system's components are assumed to be trusted, because they run on the nodes of an FDB cluster. Transmission from the cluster to the blob store goes through SSL connections. Blob credentials are passed in on the `fdbserver` command line.

**Privacy**: Backup data is stored in the blob store with appropriate access control. A data retention policy can be set with the `fdbbackup` tool to delete older backup data.

## Operational and Maintainability Requirements

This section discusses changes that may need to be identified or accounted for on the back-end in order to support the feature from a monitoring or management perspective.

### Tooling / Front-End

A workflow is needed for DBAs to start, pause, resume, and abort the new type of backup. Starting a backup should differ from the old type only by a flag. The FDB cluster then generates backups as specified by the flag.

A command line tool `fdbconvert` has been written to convert new backup logs into the format of old backup logs. Thus, if the new restore system has issues, we can still restore a new backup with the existing restore system.

**Deployment instructions for tooling development**

* A new stateless role "`Backup Worker`" (or "`BW`" for short) is introduced in an FDB cluster. The number of BW processes is based on the number of log routers (usually they are the same). If there are no log routers, the number of transaction logs is used. Note that occasionally the cluster may recruit more backup workers for version ranges in an old epoch. Since these version ranges are small, the resource requirements for these short-lived backup workers are very small.
* As in the old backup system, backup agents need to be started for saving snapshot files to blob storage. In contrast, backup workers in the new backup system run in the primary DC and are responsible for saving mutation logs to blob storage.
* A backup worker's memory should be large enough to hold tens of seconds' worth of mutation data from TLogs. The memory requirement can be calculated as `WriteThroughput * BufferPeriod / partitions + SafetyMargin`, where `WriteThroughput` is the aggregated TLog write bandwidth and `partitions` is the number of log router tags (see the sketch after this list).
* A new process class "backup" is defined for backup workers.
* How to start a backup of the new type, e.g.:

  ```
  fdbbackup start -C fdb.cluster -p -d blob_url
  ```
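As a worked example of the sizing formula above, here is a small Go helper; the throughput, buffer period, and margin are invented numbers for illustration, not recommendations.

```go
package main

import "fmt"

// backupWorkerMemory applies the sizing formula quoted above:
// WriteThroughput * BufferPeriod / partitions + SafetyMargin.
func backupWorkerMemory(writeThroughputMBps, bufferPeriodSec float64, partitions int, safetyMarginMB float64) float64 {
	return writeThroughputMBps*bufferPeriodSec/float64(partitions) + safetyMarginMB
}

func main() {
	// Assumed numbers: 200 MB/s aggregate TLog writes, a 10 s buffer period,
	// 8 log router tags, and a 100 MB safety margin.
	fmt.Printf("%.0f MB per worker\n", backupWorkerMemory(200, 10, 8, 100)) // 350 MB
}
```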
### KPIs and Health

The solution must provide at least the following KPIs:

* How fast (MB/s) the transaction logs commit writes (already exists);
* How much backup data has been processed;
* An estimate of the backup delay.

### Customer Care

The feature does not require any specific customer care awareness or interaction.

### Roll-out

The feature must follow the usual roll-out process. It needs to coexist with the existing backup system, and clusters should periodically be restored to test its correctness. Only after we gain enough confidence will we deprecate the existing backup system.

Note the new backup system is designed for HA clusters. Existing DR clusters still use the old backup system. Thus, the roll-out of the new backup system is only for HA clusters.

### Quota

This feature requires blob storage for saving all log files. The blob storage must have enough:

* disk capacity for all backup data;
* write bandwidth for uploading backup data;
* file count for backup data: the new backup system stores partitioned mutation logs, so the file count is expected to increase several-fold.

## Success Criteria

* Write bandwidth reduction meets the expectation: TLog write bandwidth is reduced by half;
* The new backup workflow is available to SREs;
* Continuous backup and restore should be performed to validate the restore.

# Design

**One sentence summary**: the new backup system introduces a new role, the backup worker, to pull mutations from transaction logs and save them, thus removing the burden of saving mutation logs into the database.

The old backup system writes the mutation log to the database itself, thus doubling the write bandwidth usage. Backup agents later fetch mutation logs from the database, upload them to blob storage, and then remove the mutation logs from the database.

This project saves the mutation log to blob storage directly from the FDB cluster, which should almost double the database's usable write bandwidth when backup is enabled. In FDB, every mutation already has exactly one log router tag, so the idea of the new system is to back up data for each log router tag individually (i.e., saving mutation logs into multiple partitioned logs). At restore time, these partitioned mutation logs are combined together to form a continuous mutation log stream.

## Design choices

**Design question 1**: Should backup workers be recruited as part of the log system or not?
There are two design alternatives:

1. The backup worker is external to the log system. In other words, backup workers survive master recovery. Thus, backup workers are recruited and monitored by the cluster controller.
   1. The advantage is that the failure of backup workers does not cause a master recovery.
   2. The disadvantage is that backup workers need to monitor master recovery, especially configuration changes.
Because the number of log routers can change after a recovery, we might need to recruit more backup workers for an increase, or pause or shut down backup workers for a decrease, which complicates the recruitment logic; alternatively, we might need to change the mapping of tags to backup workers, which is also complex. A further complication is that backup workers need to constantly monitor master recovery and be very careful about the version boundary between two consecutive epochs, because the number of tags may change.
2. The backup worker is recruited during master recovery as part of the log system. The master recruits a fixed number of backup workers, i.e., the same number as log routers.
   1. The advantage is that recruiting and mapping backup workers to log router tags is simple, i.e., one tag per worker.
   2. The disadvantage is that backup workers are tied to master recovery: the failure of a backup worker results in a master recovery, and a master recovery stops old backup workers and starts new ones.

**Decision**: We choose the second approach for the simplicity of the recruiting process and of the mapping of log router tags to backup workers.

**Design question 2**: Should backup workers be placed in the primary or the remote Data Center (DC)?
Placing backup workers on the primary side has the advantage of supporting any deployment configuration (single DC, multi DC).

Placing them on the remote side is desirable to reduce the workload on the primary DC's transaction logs. Since log routers on the remote side are already pulling mutations from the primary DC, backup workers could simply pull from these log routers.

**Decision**: We choose to recruit backup workers in the primary DC, because not all clusters are configured with multiple DCs and the backup system needs to support all types of deployment.

## Design Assumptions

The design proposed below is based upon the following assumptions:

* The blob system has enough write bandwidth and storage space for backup workers to save log files.
* The FDB cluster has enough stateless processes to run as backup workers, and these processes have enough memory to buffer tens of seconds of commit data.

## Design Challenges

The requirements of the new backup system raise several design challenges:

1. Correctness of the new backup files. Backup files must be complete and accurate to capture all data; otherwise, we end up with corrupted data in the backup. The challenge here is to make sure no mutation is missing, even when the FDB cluster experiences failures and has to perform a recovery.
2. Testing of the new backup system. How can we test the new backup system when there is no restore system available? We need to verify that backup files are correct without performing a full restore.

## System components

**Backup Worker**: This is a new role introduced in the new backup system. A backup worker is an `fdbserver` process running inside an FDB cluster, responsible for pulling mutations from transaction logs and saving them to blob storage.

**Master**: The master is responsible for coordinating the transition of the FDB transaction sub-system from one generation to the next. In particular, the master recruits backup workers during recovery.

**Transaction Logs (TLogs)**: The transaction logs make mutations durable to disk for fast commit latencies. The logs receive commits from the proxy in version order, and only respond to the proxy once the data has been written and fsync'ed to an append-only mutation log on disk.
Storage servers retrieve mutations from the TLogs and, once they have persisted those mutations, pop them from the TLogs.

**Proxy**: The proxies are responsible for providing read versions, committing transactions, and tracking the storage servers responsible for each range of keys. In the old backup system, proxies are responsible for grouping mutations into backup mutations and writing them to the database.

## System overview

From an end-to-end perspective, the new backup system works in the following steps:

1. Operators issue a new backup request via the `fdbbackup` command line tool;
2. The FDB cluster receives the request and registers it in the database (internal `TaskBucket` and system keys);
3. Backup workers monitor changes to the system keys, register the request in their own internal queues, and start logging mutations for the requested key range; at the same time, backup agents (scheduled by `TaskBucket`) start taking snapshots of key ranges in the database;
4. Periodically, backup workers upload mutations to the requested blob storage and save their progress in the database;
5. The backup is restorable when backup workers have saved versions larger than the complete snapshot's end version, and the backup is stopped if a stop-on-restorable flag is set in the request.

The new backup has five major components: 1) backup workers; 2) recruitment of backup workers; 3) extension of the tag partitioned log system to support pseudo tags; 4) integration with the existing `TaskBucket`-based backup command interface; and 5) integration with the Performant Restore System.

### Backup workers

The backup worker is a new role introduced in the new backup system. A backup worker is responsible for pulling mutations from transaction logs and saving them to blob storage. Internally, a backup worker maintains a message buffer that keeps mutations pulled from transaction logs which have not yet been saved to blob storage. Periodically, the backup worker parses the mutations in the message buffer, extracts those that are within the user-specified key ranges, and then uploads the mutation data to blob storage. After the data is saved, the backup worker removes these messages from its internal buffer and saves its progress in the database, so that after a failure, a new backup worker can start from the previously saved version.

A backup worker has two modes of operation: *no-op* mode and *working* mode. When there is no active backup in the cluster, the backup worker operates in no-op mode, which simply obtains the recently committed version from proxies and then pops mutations from transaction logs. After operators submit a new backup request to the cluster, backup workers transition into working mode, pulling mutations from transaction logs and saving the mutation data to blob storage.

In working mode, backup workers must pop in a strictly increasing version order. For the same tag, there can be multiple backup workers, each responsible for a different epoch. These backup workers must coordinate their popping order; otherwise, the backup can miss some mutation data. This coordination is achieved by deferring the popping of a later epoch and allowing only the oldest epoch to pop first. After the oldest epoch has finished, its backup workers notify the master, which then advances the oldest backup epoch so that the next epoch can proceed with popping.
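A rough sketch of this deferred-popping rule follows; the `worker` type and helper names are invented for illustration and are not FDB code.

```go
package main

import "fmt"

// worker is an invented stand-in for a backup worker's state.
type worker struct {
	backupEpoch int
	done        bool // finished uploading and saving progress
}

// oldestBackupEpoch returns the smallest epoch among unfinished workers.
func oldestBackupEpoch(workers []worker) int {
	oldest := int(^uint(0) >> 1) // max int
	for _, w := range workers {
		if !w.done && w.backupEpoch < oldest {
			oldest = w.backupEpoch
		}
	}
	return oldest
}

// mayPop defers popping for any worker whose epoch is newer than the oldest
// unfinished backup epoch, so workers of older epochs never lose data.
func mayPop(w worker, workers []worker) bool {
	return w.backupEpoch == oldestBackupEpoch(workers)
}

func main() {
	ws := []worker{{backupEpoch: 3}, {backupEpoch: 4}}
	fmt.Println(mayPop(ws[0], ws), mayPop(ws[1], ws)) // true false
	ws[0].done = true                                 // epoch 3 finishes
	fmt.Println(mayPop(ws[1], ws))                    // true
}
```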
A subtle issue for a displaced backup worker (i.e., one displaced because a new epoch begins) is that its last pop can cause missing version ranges in the mutation logs. This is because the transaction that saves its progress may be delayed during recovery. As a result, the master may already have recruited a new backup worker for the old epoch, starting at the previously saved progress version. If the saving transaction then succeeds, the displaced worker pops mutations that the new backup worker is supposed to save, resulting in missing data in the new backup worker's log. The solution to this problem can be: 1) the old backup worker aborts immediately upon learning that it is displaced, without trying to save its progress; or 2) the old backup worker skips its last pop, since the next epoch will pop versions larger than its progress. Because the second approach avoids duplicating work in the new epoch, we choose the second approach.

Finally, multiple concurrent backups are supported. Each backup worker keeps track of the current backup jobs and saves mutations to the corresponding backup containers for the same batch of mutations.

### Recruitment of Backup workers

Backup workers are recruited during master recovery as part of the log system. The master recruits a fixed number of backup workers, one for each log router tag. During the recruiting process, the master sends a backup worker initialization request:

```
struct InitializeBackupRequest {
	UID reqId;
	LogEpoch epoch;       // epoch this worker is recruited
	LogEpoch backupEpoch; // epoch that this worker actually works on
	Tag routerTag;
	Version startVersion;
	Optional<Version> endVersion; // only present for an unfinished old epoch
	ReplyPromise<struct InitializeBackupReply> reply;
	… // additional methods elided
};
```

Note that we need two epochs here: one is the epoch in which the worker is recruited, and the other is the epoch it backs up. The recruited epoch is the epoch of the log system, which a backup worker uses to find out whether it still works for the current epoch. If not, the worker should save its progress and immediately exit. The `backupEpoch` is used for saving progress. The `backupEpoch` is usually the same as the epoch in which the worker is recruited. However, it can be an earlier epoch than the recruiting epoch, signifying that the worker is responsible for data in that earlier epoch. In this case, when the worker is done and exits, the master should not flag its departure as a trigger of recovery. This is solved by the following protocol:

1. The backup worker finishes its work, including saving progress to the key-value store and uploading to cloud storage, and then sends a `BackupWorkerDoneRequest` to the master;
2. The master receives the request, removes the worker from its log system, and updates the oldest backing-up epoch `oldestBackupEpoch`;
3. The master sends a reply message to the backup worker and registers the new log system with the cluster controller;
4. The backup worker exits after receiving the reply. Other backup workers in the system get the new log system from the cluster controller. If a backup worker's `backupEpoch` is equal to `oldestBackupEpoch`, then the worker may start popping from TLogs.

Note `oldestBackupEpoch` is introduced to prevent a backup worker for a newer epoch from popping while there are still backup workers for older epochs. Otherwise, these older backup workers may lose data.

### Extension of tag partitioned log system to support pseudo tags

The tag partitioned log system is modeled like a FIFO queue, where proxies push mutations to the queue and storage servers or log routers pop mutations from it. Specifically, consumers of the tag partitioned log system use two operations, `peek` and `pop`, to read mutations for a given tag and to pop mutations from the queue. Because proxies assign each mutation a unique log router tag, the backup system reuses this tag to obtain the whole mutation stream. As a result, each log router tag now has two consumers: a log router and a backup worker.

To support multiple consumers of the log router tag, the `peek` and `pop` operations have been extended to support pseudo tags. In other words, each log router tag can be mapped to multiple pseudo tags. Log routers and backup workers still `peek` mutations with the log router tag, but `pop` with different pseudo tags. Only after both pseudo tags are popped can TLogs pop the mutations from their internal queues.

Note the introduction of pseudo tags opens the possibility for more usage scenarios. For instance, a change stream can be implemented with a pseudo tag, where the new consumer can look at each mutation and emit mutations on specified key ranges.
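The popping rule for pseudo tags can be sketched as follows; `pseudoPops` and its key names are invented for illustration, not the TLog implementation.

```go
package main

import "fmt"

// pseudoPops records, per pseudo tag, the version popped so far.
type pseudoPops map[string]int64

// popThrough is the version up to which the TLog may discard data for the
// underlying log router tag: the minimum across all pseudo-tag consumers.
func popThrough(p pseudoPops) int64 {
	first := true
	var min int64
	for _, v := range p {
		if first || v < min {
			min, first = v, false
		}
	}
	return min
}

func main() {
	p := pseudoPops{"logRouter": 200, "backupWorker": 150}
	fmt.Println(popThrough(p)) // 150: the backup worker lags behind
}
```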
+ +### Extension of tag partitioned log system to support pseudo tags + +The tag partitioned log system is modeled like a FIFO queue, where Proxies push mutations to the queue and Storage Servers or Log Routers pop mutations from the queue. Specifically, consumers of the tag partitioned log system use two operations, `peek` and `pop`, to read mutations for a given tag and to pop mutations from the queue. Because Proxies assign each mutation a unique log router tag, the backup system reuses this tag to obtain the whole mutation stream. As a result, each log router tag now has two consumers, a log router and a backup worker. + +To support multiple consumers of the log router tag, the `peek` and `pop` operations have been extended to support pseudo tags. In other words, each log router tag can be mapped to multiple pseudo tags. Log routers and backup workers still `peek` mutations with the log router tag, but `pop` with different pseudo tags. Only after both pseudo tags are popped can TLogs pop the mutations from their internal queues. + +Note the introduction of pseudo tags opens the possibility for more usage scenarios. For instance, a change stream can be implemented with a pseudo tag, where the new consumer can look at each mutation and emit mutations on specified key ranges. + +### Integration with existing taskbucket based backup command interface + +We strive to keep the operational interface the same as in the old backup system. That is, the new backup is initiated by the client as before, with an additional flag. The FDB cluster receives the backup request, sees the flag being set, and uses the new system for generating mutation logs. + +By default, backup workers are not enabled in the system. When operators submit a new backup request for the first time, the database performs a configuration change (`backup_worker_enabled:=1`) that enables backup workers. + +The operator’s backup request can indicate whether the old or the new backup system is used. This is a command line option (i.e., `-p` or `--partitioned_log`) of the `fdbbackup` command. A backup request of the new type is started in the following steps: + +1. Operators use the `fdbbackup` tool to write the backup range to a system key, i.e., `\xff\x02/backupStarted`. +2. All backup workers monitor the key `\xff\x02/backupStarted`, see the change, and start logging mutations. +3. After all backup workers have started, the `fdbbackup` tool initiates the backup of all or specified key ranges by issuing a transaction `Ts`. + +Compared to the old backup system, steps 1 and 2 above are new and are only triggered if the client requests the new type of backup. The purpose is to allow backup workers to remain no-ops if there are no ongoing backups. However, the backup workers should still continuously pop their corresponding tags, otherwise mutations will accumulate in the TLogs. In order to know the version to pop, backup workers can obtain the read version from any proxy. Because the read version must be a committed version, popping to this version is safe. + +**Backup Submission Protocol** Protocol for `submitBackup()` to ensure that all backup workers of the current epoch have started logging mutations: + +1. After the `submitBackup()` call, the task bucket (i.e., `StartFullBackupTaskFunc`) starts by creating a `BackupConfig` object in the system key space. +2. Each backup worker monitors the `\xff\x02/backupStarted` key and notices the new backup job.
Then the backup worker inserts the new job into its internal queue, and writes to the `startedBackupWorkers` key in the `BackupConfig` object if the worker’s `backupEpoch` is the current epoch. Among these workers, the worker with Log Router Tag `-2:0` monitors the `startedBackupWorkers` key and sets the `allWorkerStarted` key after all workers have updated the `startedBackupWorkers` key. +3. The task bucket watches changes to the `allWorkerStarted` key and declares the job submission successful. + +This protocol replaced an earlier, abandoned protocol, in which the `startedBackupWorkers` key was set after all backup workers had saved logs with versions larger than the version of the `submitBackup()` call. That protocol fails if there is already a backup job and some backup worker doesn’t notice the change to the `\xff\x02/backupStarted` key. As a result, the worker keeps saving versions larger than the new job’s start version, but into the old backup container. Thus the new container misses some mutations. + +**Protocol for Determining a Backup is Restorable** + +1. Each backup worker independently logs mutations to a backup container and updates its progress in the system key space. +2. The worker with Log Router Tag `-2:0` of the current epoch monitors all workers’ progress. If the oldest backup epoch is the current epoch (i.e., there are no backup workers for any old epochs, thus no version ranges are missing before this epoch), this worker updates the `latestBackupWorkerSavedVersion` key in the `BackupConfig` object with the minimum saved version among workers. +3. The client calls `describeBackup()`, which eventually calls `getLatestRestorableVersion` to read the value from the `latestBackupWorkerSavedVersion` key. If this version is larger than the first snapshot’s end version, then the backup is restorable. + +**Pause and Resume Backups** The command line for pausing or resuming backups remains the same, but the implementation for the new backup system is different from the old one. This is because in the old backup system, both mutation logs and range logs are handled by `TaskBucket`, an asynchronous task scheduling framework that stores its state in the FDB database. Thus, the old backup system simply pauses or resumes the `TaskBucket`. In the new backup system, mutation logs are generated by backup workers, so the pause or resume command needs to tell all backup workers to pause or resume pulling mutations from TLogs. Specifically, + +1. The operator issues a pause or resume request that updates both the `TaskBucket` and the `\xff\x02/backupPaused` key. +2. Each backup worker monitors the `\xff\x02/backupPaused` key and notices the change. Then the backup worker pauses or resumes pulling from TLogs. + +**Backup Container Changes** + +* Partitioned mutation logs are stored in the `plogs/XXXX/XXXX` directory and their names are in the format `log,[startVersion],[endVersion],[UID],[N-of-M],[blockSize]`, where `M` is the total number of partitions and `N` can be any number from `0` to `M - 1`. In contrast, old mutation logs are stored in the `logs/XXXX/XXXX` directory and are named differently. +* To restore a version range, all partitioned logs for the range need to be available. The restore process should read all partitioned logs and combine mutations from different logs into one mutation stream, ordered by the `(commit_version, subsequence)` pair. It is guaranteed that all mutations form a total order.
Note in the old backup files, there is no subsequence number, as each version’s mutations are serialized in order in one file. + +### Integration with the [Performant Restore System](https://github.com/apple/foundationdb/issues/1049) + +As discussed above, the new backup system splits mutation logs into multiple partitions. Thus, the restore process must verify that the backup files are continuous for all partitions within the restore’s version range. This is possible because each log file’s name contains its partition number and the total number of partitions. + +Once the restore system verifies that the version range is continuous, it needs to filter out duplicated version ranges among different log files (both the log continuity analysis and the dedup logic are implemented in the `BackupContainer` abstraction). A given version range may be stored in **multiple** mutation log files. This can happen because a recruited backup worker can upload mutation files successfully, but fail to save its progress before another recovery happens. As a result, the new epoch tries to back up this version range again, producing the same version ranges (though the file names are different). + +Finally, the restore system loads the same version’s mutations from all partitions, and then merges these mutations in the order of their subsequence numbers before they are applied on the restore cluster. Note the mutations in the old backup system lack subsequence numbers. As a result, restoring old backups needs to assign subsequence numbers to mutations. + +## Ordered and Complete Guarantee of Mutation Logs + +The backup system must generate log files from which the restore system can apply all the mutations to the restored cluster in the same order, exactly once. + +**Ordering guarantee**. To maintain the ordering of mutations, each mutation is stored with its commit version and a subsequence number, both assigned by Proxies during commit. The restore system can load all mutations and derive a total order among all the mutations. + +**Completeness guarantee**. All mutations must be saved in log files; no mutation may be missing from the backup. This is guaranteed by the fault tolerance discussed below. Essentially, all backup workers checkpoint their progress in the database. After a recovery, the new master reads the previous checkpoints and recruits new backup workers for any missing version ranges. + +## Backup File Format + +The old backup file format is documented [here](https://github.com/apple/foundationdb/blob/release-6.2/design/backup-dataFormat.md). We can’t use this file format, because our backup files are created per log router tag. When there is more than one log router (almost always the case), the mutations in one transaction can be given different log router tags. As a result, for the same version, mutations are distributed across many files. Another subtle issue is that there can be two mutations (e.g., `a = 1` and `a = 2` in a transaction) that are given two different tags. We have to preserve the order of these two mutations in the restore process. Even though the order is saved in the subsequence number of a version, we still need to merge mutations from multiple files and apply them in the correct order. + +In the new backup system, each mutation log file is named `log,[startVersion],[endVersion],[UID],[N-of-M],[blockSize]`, where `startVersion` is inclusive and `endVersion` is *not* inclusive, e.g., `log,332850851,332938927,7be23c0a3e80df8ab1530fa76fa66980,1-of-4,1048576`.
With the information from all file names, the restore process can find all files for a version range, i.e., files whose versions intersect with the range, across all log router tags. `M` is the total number of tags, and `N` is from `0` to `M - 1`. Note `tagId` is not required in the old backup filename, since all mutations for a version are included in one file. + +Each file’s content is a list of fixed-size blocks. Each block contains a sequence of mutations, where each mutation consists of a serialized `Version`, an `int32_t`, an `int32_t` (all three numbers are in big endian), and a `Mutation`, where `Mutation` is of the format `type|kLen|vLen|Key|Value`, where `type` is the mutation type (e.g., `Set` or `Clear`), and `kLen` and `vLen` respectively are the lengths of the key and value in the mutation. `Key` and `Value` are the serialized values of the Key and Value in the mutation. The paddings at the end of the block are bytes of `0xFF`. + +``` +<Version_1><Subseq_1><MutationLen_1><Mutation_1> +<Version_2><Subseq_2><MutationLen_2><Mutation_2> +… +<Padding: bytes of 0xFF to the end of the block> +``` + +Note the big endianness for the version is required, as `0xFF` is used as the padding to indicate the block end. A little-endian number can easily be mistaken for the end. In contrast, a big-endian version almost guarantees the first byte is not `0xFF` (it should always be `0x00`). + +## Performance optimization + +### Future Optimizations + +Add a metadata file describing each backup file: + +* The number of mutations; +* The number of atomic operations; +* The key range and version range of mutations in each backup file. + +This information can be used to optimize the restore process. For instance, the number of mutations can be used to make better load balancing decisions; if there are no atomic operations, the restore can apply mutations in a backward fashion -- skipping mutations with earlier versions. + +## Fault Tolerance + +Failure of a backup worker triggers a master recovery. After the recovery, the new master recruits a new set of backup workers. Among them, a new backup worker shall continue the work of the failed backup worker from the previous epoch. + +The interesting part is the handling of old epochs, since the backup workers for an old epoch are in the “displaced” state and should exit. The basic idea is that we need a set of backup workers for the data left in the old epochs. To figure out the set of data not yet backed up, the master first loads the saved backup progress data from the database, and then computes, for each epoch, which version ranges have not been backed up. For each such version range and tag, the master recruits a worker to resume the backup for that version range and tag. Note that this worker has a different worker UID from the worker in the original epoch. As a result, for a given epoch and a tag, there might be multiple progress entries, as these workers are recruited in different epochs. + +## KPIs and Metrics + +The backup system emits the following metrics: + +* How much backup data has been processed: the backup command line tool `fdbbackup` can show the status of a backup, including the size of mutation logs (`LogBytes written`) and snapshots (`RangeBytes written`). By taking two consecutive backup statuses, the backup speed can be estimated as `(2nd_LogBytes - 1st_LogBytes) / interval`. +* An estimation of backup delay: each backup worker emits `BackupWorkerMetrics` trace events every 5 seconds, which include `SavedVersion`, `MinKnownCommittedVersion`, and `MsgQ`.
The backup delay can be estimated as `(MinKnownCommittedVersion - SavedVersion) / 1,000,000` seconds, which is the difference between a worker’s saved version and the current committed version, divided by 1M versions per second. `MsgQ` is the size of the backup worker’s in-memory message buffer. + +## Controlling Properties + +System operators can control the following backup properties: + +* **Backup key ranges**: The non-overlapping key ranges that will be backed up to blob storage. +* **Blob URL**: The root path in blob storage that hosts all backup files. +* **Performance knobs**: The knobs that control performance: + * The backup interval (knob `BACKUP_UPLOAD_DELAY`) for saving mutation logs to blob storage. + +## Testing + +The feature will be tested both in simulation and in real clusters: + +* New test cases are added into the test folder in FDB. The nightly correctness (i.e., simulation) tests will test the correctness of both backup and restore. +* Tests will be added to constantly back up a cluster with the new backup system and restore the backup, to ensure the restore works on real clusters. While a backup is active, the cluster should have better write performance than with the old backup system. +* Tests should also be conducted with production data. This ensures backup data is restorable and catches potential bugs in backup and restore. This test is preferably conducted regularly, e.g., weekly per cluster. + +Before the restore system is available, the testing strategy for backup files is to keep the old backup system running, so that both new and old backup files are generated. Both types of log files are then decoded and compared against each other. The new backup files are considered correct if their content matches the content of the old log files. diff --git a/documentation/sphinx/source/api-common.rst.inc b/documentation/sphinx/source/api-common.rst.inc index 8518dc058b..39e09a83d0 100644 --- a/documentation/sphinx/source/api-common.rst.inc +++ b/documentation/sphinx/source/api-common.rst.inc @@ -176,6 +176,9 @@ .. |transaction-get-committed-version-blurb| replace:: Gets the version number at which a successful commit modified the database. This must be called only after the successful (non-error) completion of a call to |commit-func| on this Transaction, or the behavior is undefined. Read-only transactions do not modify the database when committed and will have a committed version of -1. Keep in mind that a transaction which reads keys and then sets them to their current values may be optimized to a read-only transaction. +.. |transaction-get-approximate-size-blurb| replace:: + Gets the approximate transaction size so far, which is the summation of the estimated size of mutations, read conflict ranges, and write conflict ranges. + .. |transaction-get-versionstamp-blurb| replace:: Returns a future which will contain the versionstamp which was used by any versionstamp operations in this transaction. This function must be called before a call to |commit-func| on this Transaction. The future will be ready only after the successful completion of a call to |commit-func| on this Transaction. Read-only transactions do not modify the database when committed and will result in the future completing with an error. Keep in mind that a transaction which reads keys and then sets them to their current values may be optimized to a read-only transaction.
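A minimal usage sketch of the approximate-size API documented above, from a C/C++ client. This assumes the C binding exposes it as `fdb_transaction_get_approximate_size` with an int64 future (an assumption; consult the fdb_c header of your client version). Error handling is mostly elided.

```cpp
#define FDB_API_VERSION 620
#include <foundationdb/fdb_c.h>

// Returns the approximate transaction size so far: the summed estimated
// size of mutations, read conflict ranges, and write conflict ranges.
// Assumed C API names; see the note above.
int64_t approximateSize(FDBTransaction* tr) {
    FDBFuture* f = fdb_transaction_get_approximate_size(tr);
    fdb_future_block_until_ready(f); // wait synchronously for the result
    int64_t size = -1;
    if (!fdb_future_get_error(f)) {
        fdb_future_get_int64(f, &size); // extract the int64 payload
    }
    fdb_future_destroy(f); // futures must be destroyed explicitly
    return size;
}
```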
diff --git a/documentation/sphinx/source/api-python.rst b/documentation/sphinx/source/api-python.rst index 086e942f0e..086eeb0bf0 100644 --- a/documentation/sphinx/source/api-python.rst +++ b/documentation/sphinx/source/api-python.rst @@ -805,6 +805,13 @@ Transaction misc functions .. _api-python-transaction-options: +Transaction misc functions +-------------------------- + +.. method:: Transaction.get_approximate_size() + + |transaction-get-approximate-size-blurb|. Returns a :class:`FutureInt64`. + Transaction options ------------------- diff --git a/documentation/sphinx/source/api-ruby.rst b/documentation/sphinx/source/api-ruby.rst index 8471183f2c..77ae67ceac 100644 --- a/documentation/sphinx/source/api-ruby.rst +++ b/documentation/sphinx/source/api-ruby.rst @@ -736,7 +736,7 @@ Most applications should use the read version that FoundationDB determines autom |infrequent| |transaction-get-committed-version-blurb| -.. method:: Transaction.get_verionstamp() -> String +.. method:: Transaction.get_versionstamp() -> String |infrequent| |transaction-get-versionstamp-blurb| @@ -747,6 +747,10 @@ Transaction misc functions Get the estimated byte size of the given key range. Returns a :class:`Int64Future`. +.. method:: Transaction.get_approximate_size() -> Int64Future + + |transaction-get-approximate-size-blurb|. Returns a :class:`Int64Future`. + Transaction options ------------------- diff --git a/documentation/sphinx/source/command-line-interface.rst b/documentation/sphinx/source/command-line-interface.rst index ec49c4d517..0259525a7c 100644 --- a/documentation/sphinx/source/command-line-interface.rst +++ b/documentation/sphinx/source/command-line-interface.rst @@ -167,6 +167,11 @@ getversion The ``getversion`` command fetches the current read version of the cluster or currently running transaction. +advanceversion +-------------- + +Forces the cluster to recover at the specified version. If the specified version is larger than the current version of the cluster, the cluster version is advanced to the specified version via a forced recovery. + help ---- diff --git a/documentation/sphinx/source/downloads.rst b/documentation/sphinx/source/downloads.rst index 32a7260605..65dd0713ef 100644 --- a/documentation/sphinx/source/downloads.rst +++ b/documentation/sphinx/source/downloads.rst @@ -10,38 +10,38 @@ macOS The macOS installation package is supported on macOS 10.7+. It includes the client and (optionally) the server. -* `FoundationDB-6.2.19.pkg `_ +* `FoundationDB-6.2.20.pkg `_ Ubuntu ------ The Ubuntu packages are supported on 64-bit Ubuntu 12.04+, but beware of the Linux kernel bug in Ubuntu 12.x. -* `foundationdb-clients-6.2.19-1_amd64.deb `_ -* `foundationdb-server-6.2.19-1_amd64.deb `_ (depends on the clients package) +* `foundationdb-clients-6.2.20-1_amd64.deb `_ +* `foundationdb-server-6.2.20-1_amd64.deb `_ (depends on the clients package) RHEL/CentOS EL6 --------------- The RHEL/CentOS EL6 packages are supported on 64-bit RHEL/CentOS 6.x. -* `foundationdb-clients-6.2.19-1.el6.x86_64.rpm `_ -* `foundationdb-server-6.2.19-1.el6.x86_64.rpm `_ (depends on the clients package) +* `foundationdb-clients-6.2.20-1.el6.x86_64.rpm `_ +* `foundationdb-server-6.2.20-1.el6.x86_64.rpm `_ (depends on the clients package) RHEL/CentOS EL7 --------------- The RHEL/CentOS EL7 packages are supported on 64-bit RHEL/CentOS 7.x. 
-* `foundationdb-clients-6.2.19-1.el7.x86_64.rpm `_ -* `foundationdb-server-6.2.19-1.el7.x86_64.rpm `_ (depends on the clients package) +* `foundationdb-clients-6.2.20-1.el7.x86_64.rpm `_ +* `foundationdb-server-6.2.20-1.el7.x86_64.rpm `_ (depends on the clients package) Windows ------- The Windows installer is supported on 64-bit Windows XP and later. It includes the client and (optionally) the server. -* `foundationdb-6.2.19-x64.msi `_ +* `foundationdb-6.2.20-x64.msi `_ API Language Bindings ===================== @@ -58,18 +58,18 @@ On macOS and Windows, the FoundationDB Python API bindings are installed as part If you need to use the FoundationDB Python API from other Python installations or paths, use the Python package manager ``pip`` (``pip install foundationdb``) or download the Python package: -* `foundationdb-6.2.19.tar.gz `_ +* `foundationdb-6.2.20.tar.gz `_ Ruby 1.9.3/2.0.0+ ----------------- -* `fdb-6.2.19.gem `_ +* `fdb-6.2.20.gem `_ Java 8+ ------- -* `fdb-java-6.2.19.jar `_ -* `fdb-java-6.2.19-javadoc.jar `_ +* `fdb-java-6.2.20.jar `_ +* `fdb-java-6.2.20-javadoc.jar `_ Go 1.11+ -------- diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 1ab89c2b6c..72e0df1b5c 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -2,7 +2,7 @@ Release Notes ############# -7.0.0 +6.3.0 ===== Features @@ -28,9 +28,6 @@ Bindings * Java: Introduced ``keyAfter`` utility function that can be used to create the immediate next key for a given byte array. `(PR #2458) `_ * C: The ``FDBKeyValue`` struct's ``key`` and ``value`` members have changed type from ``void*`` to ``uint8_t*``. `(PR #2622) `_ * Deprecated ``enable_slow_task_profiling`` transaction option and replaced it with ``enable_run_loop_profiling``. `(PR #2608) `_ -* Go: Added a ``Close`` function to ``RangeIterator`` which **must** be called to free resources returned from ``Transaction.GetRange``. `(PR #1910) `_. -* Go: Finalizers are no longer used to clean up native resources. ``Future`` results are now copied from the native heap to the Go heap, and native resources are freed immediately. `(PR #1910) `_. - Other Changes ------------- diff --git a/fdbbackup/CMakeLists.txt b/fdbbackup/CMakeLists.txt index 3c6fd0ef58..b9259935f3 100644 --- a/fdbbackup/CMakeLists.txt +++ b/fdbbackup/CMakeLists.txt @@ -45,11 +45,11 @@ if(NOT OPEN_FOR_IDE) symlink_files( LOCATION packages/bin SOURCE fdbbackup - TARGETS fdbdr dr_agent backup_agent fdbrestore) + TARGETS fdbdr dr_agent backup_agent fdbrestore fastrestore_agent) symlink_files( LOCATION bin SOURCE fdbbackup - TARGETS fdbdr dr_agent backup_agent fdbrestore) + TARGETS fdbdr dr_agent backup_agent fdbrestore fastrestore_agent) endif() if (GPERFTOOLS_FOUND) diff --git a/fdbbackup/FileConverter.actor.cpp b/fdbbackup/FileConverter.actor.cpp index 006b311f87..67f0e3493d 100644 --- a/fdbbackup/FileConverter.actor.cpp +++ b/fdbbackup/FileConverter.actor.cpp @@ -373,17 +373,6 @@ struct LogFileWriter { return wr.toValue(); } - // Return a block of contiguous padding bytes, growing if needed. 
- static Value makePadding(int size) { - static Value pad; - if (pad.size() < size) { - pad = makeString(size); - memset(mutateString(pad), '\xff', pad.size()); - } - - return pad.substr(0, size); - } - // Start a new block if needed, then write the key and value ACTOR static Future writeKV_impl(LogFileWriter* self, Key k, Value v) { // If key and value do not fit in this block, end it and start a new one @@ -392,7 +381,7 @@ struct LogFileWriter { // Write padding if needed int bytesLeft = self->blockEnd - self->file->size(); if (bytesLeft > 0) { - state Value paddingFFs = makePadding(bytesLeft); + state Value paddingFFs = fileBackup::makePadding(bytesLeft); wait(self->file->append(paddingFFs.begin(), bytesLeft)); } diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 7eef8ebc15..1354ab2685 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -2192,8 +2192,7 @@ ACTOR Future runRestore(Database db, std::string originalClusterFile, std: // Fast restore agent that kicks off the restore: send restore requests to restore workers. ACTOR Future runFastRestoreAgent(Database db, std::string tagName, std::string container, Standalone> ranges, Version dbVersion, - bool performRestore, bool verbose, bool waitForDone, std::string addPrefix, - std::string removePrefix) { + bool performRestore, bool verbose, bool waitForDone) { try { state FileBackupAgent backupAgent; state Version restoreVersion = invalidVersion; @@ -2219,9 +2218,26 @@ ACTOR Future runFastRestoreAgent(Database db, std::string tagName, std::st dbVersion = desc.maxRestorableVersion.get(); TraceEvent("FastRestoreAgent").detail("TargetRestoreVersion", dbVersion); } - Version _restoreVersion = wait(fastRestore(db, KeyRef(tagName), KeyRef(container), waitForDone, dbVersion, - verbose, range, KeyRef(addPrefix), KeyRef(removePrefix))); - restoreVersion = _restoreVersion; + state UID randomUID = deterministicRandom()->randomUniqueID(); + TraceEvent("FastRestoreAgent") + .detail("SubmitRestoreRequests", ranges.size()) + .detail("RestoreUID", randomUID); + wait(backupAgent.submitParallelRestore(db, KeyRef(tagName), ranges, KeyRef(container), dbVersion, true, + randomUID)); + if (waitForDone) { + // Wait for parallel restore to finish and unlock DB after that + TraceEvent("FastRestoreAgent").detail("BackupAndParallelRestore", "WaitForRestoreToFinish"); + wait(backupAgent.parallelRestoreFinish(db, randomUID)); + TraceEvent("FastRestoreAgent").detail("BackupAndParallelRestore", "RestoreFinished"); + } else { + TraceEvent("FastRestoreAgent") + .detail("RestoreUID", randomUID) + .detail("OperationGuide", "Manually unlock DB when restore finishes"); + printf("WARNING: DB will be in locked state after restore. 
Need UID:%s to unlock DB\n", + randomUID.toString().c_str()); + } + + restoreVersion = dbVersion; } else { state Reference bc = IBackupContainer::openContainer(container); state BackupDescription description = wait(bc->describeBackup()); @@ -3740,7 +3756,7 @@ int main(int argc, char* argv[]) { switch (restoreType) { case RESTORE_START: f = stopAfter(runFastRestoreAgent(db, tagName, restoreContainer, backupKeys, restoreVersion, !dryRun, - !quietDisplay, waitForDone, addPrefix, removePrefix)); + !quietDisplay, waitForDone)); break; case RESTORE_WAIT: printf("[TODO][ERROR] FastRestore does not support RESTORE_WAIT yet!\n"); @@ -3887,102 +3903,3 @@ int main(int argc, char* argv[]) { flushAndExit(status); } - -//------Restore Agent: Kick off the restore by sending the restore requests -ACTOR static Future waitFastRestore(Database cx, Key tagName, bool verbose) { - // We should wait on all restore to finish before proceeds - TraceEvent("FastRestore").detail("Progress", "WaitForRestoreToFinish"); - state ReadYourWritesTransaction tr(cx); - state Future fRestoreRequestDone; - state bool restoreRequestDone = false; - - loop { - try { - tr.reset(); - tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr.setOption(FDBTransactionOptions::LOCK_AWARE); - tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); - // In case restoreRequestDoneKey is already set before we set watch on it - Optional restoreRequestDoneKeyValue = wait(tr.get(restoreRequestDoneKey)); - if (restoreRequestDoneKeyValue.present()) { - restoreRequestDone = true; - tr.clear(restoreRequestDoneKey); - wait(tr.commit()); - break; - } else if (!restoreRequestDone) { - fRestoreRequestDone = tr.watch(restoreRequestDoneKey); - wait(tr.commit()); - wait(fRestoreRequestDone); - } else { - break; - } - } catch (Error& e) { - wait(tr.onError(e)); - } - } - - TraceEvent("FastRestore").detail("Progress", "RestoreFinished"); - - return FileBackupAgent::ERestoreState::COMPLETED; -} - -ACTOR static Future _fastRestore(Database cx, Key tagName, Key url, bool waitForComplete, - Version targetVersion, bool verbose, KeyRange range, Key addPrefix, - Key removePrefix) { - state Reference bc = IBackupContainer::openContainer(url.toString()); - state BackupDescription desc = wait(bc->describeBackup()); - wait(desc.resolveVersionTimes(cx)); - - if (targetVersion == invalidVersion && desc.maxRestorableVersion.present()) - targetVersion = desc.maxRestorableVersion.get(); - - Optional restoreSet = wait(bc->getRestoreSet(targetVersion)); - TraceEvent("FastRestore").detail("BackupDesc", desc.toString()).detail("TargetVersion", targetVersion); - - if (!restoreSet.present()) { - TraceEvent(SevWarn, "FileBackupAgentRestoreNotPossible") - .detail("BackupContainer", bc->getURL()) - .detail("TargetVersion", targetVersion); - throw restore_invalid_version(); - } - - // NOTE: The restore agent makes sure we only support 1 restore range for each restore request for now! - // The simulation test did test restoring multiple restore ranges in one restore request though. 
- state Reference tr(new ReadYourWritesTransaction(cx)); - state int restoreIndex = 0; - loop { - try { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - Standalone restoreTag(tagName.toString() + "_" + std::to_string(restoreIndex)); - bool locked = true; - struct RestoreRequest restoreRequest(restoreIndex, restoreTag, KeyRef(bc->getURL()), true, targetVersion, - true, range, Key(), Key(), locked, - deterministicRandom()->randomUniqueID()); - tr->set(restoreRequestKeyFor(restoreRequest.index), restoreRequestValue(restoreRequest)); - // backupRanges.size = 1 because we only support restoring 1 range in real mode for now - tr->set(restoreRequestTriggerKey, restoreRequestTriggerValue(deterministicRandom()->randomUniqueID(),1)); - wait(tr->commit()); // Trigger fast restore - break; - } catch (Error& e) { - if (e.code() != error_code_restore_duplicate_tag) { - wait(tr->onError(e)); - } - } - } - - if (waitForComplete) { - FileBackupAgent::ERestoreState finalState = wait(waitFastRestore(cx, tagName, verbose)); - if (finalState != FileBackupAgent::ERestoreState::COMPLETED) throw restore_error(); - } - - return targetVersion; -} - -ACTOR Future fastRestore(Database cx, Standalone tagName, Standalone url, - bool waitForComplete, long targetVersion, bool verbose, Standalone range, - Standalone addPrefix, Standalone removePrefix) { - Version result = - wait(_fastRestore(cx, tagName, url, waitForComplete, targetVersion, verbose, range, addPrefix, removePrefix)); - return result; -} diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index fffb018c20..bd2ac38913 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -525,6 +525,11 @@ void initHelp() { helpMap["getversion"] = CommandHelp("getversion", "Fetch the current read version", "Displays the current read version of the database or currently running transaction."); + helpMap["advanceversion"] = CommandHelp( + "advanceversion ", "Force the cluster to recover at the specified version", + "Forces the cluster to recover at the specified version. 
If the specified version is larger than the current " + "version of the cluster, the cluster version is advanced " + "to the specified version via a forced recovery."); helpMap["reset"] = CommandHelp( "reset", "reset the current transaction", @@ -3217,6 +3222,23 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { continue; } + if (tokencmp(tokens[0], "advanceversion")) { + if (tokens.size() != 2) { + printUsage(tokens[0]); + is_error = true; + } else { + Version v; + int n = 0; + if (sscanf(tokens[1].toString().c_str(), "%ld%n", &v, &n) != 1 || n != tokens[1].size()) { + printUsage(tokens[0]); + is_error = true; + } else { + wait(makeInterruptable(advanceVersion(db, v))); + } + } + continue; + } + if (tokencmp(tokens[0], "kill")) { getTransaction(db, tr, options, intrans); if (tokens.size() == 1) { diff --git a/fdbclient/BackupAgent.actor.h b/fdbclient/BackupAgent.actor.h index f728bcd488..962d37b10b 100644 --- a/fdbclient/BackupAgent.actor.h +++ b/fdbclient/BackupAgent.actor.h @@ -278,7 +278,7 @@ public: // parallel restore Future parallelRestoreFinish(Database cx, UID randomUID); Future submitParallelRestore(Database cx, Key backupTag, Standalone> backupRanges, - KeyRef bcUrl, Version targetVersion, bool lockDB, UID randomUID); + Key bcUrl, Version targetVersion, bool lockDB, UID randomUID); Future atomicParallelRestore(Database cx, Key tagName, Standalone> ranges, Key addPrefix, Key removePrefix); @@ -893,10 +893,6 @@ public: } }; -ACTOR Future fastRestore(Database cx, Standalone tagName, Standalone url, - bool waitForComplete, long targetVersion, bool verbose, Standalone range, - Standalone addPrefix, Standalone removePrefix); - // Helper class for reading restore data from a buffer and throwing the right errors. struct StringRefReader { StringRefReader(StringRef s = StringRef(), Error e = Error()) : rptr(s.begin()), end(s.end()), failure_error(e) {} @@ -937,6 +933,9 @@ struct StringRefReader { namespace fileBackup { ACTOR Future>> decodeRangeFileBlock(Reference file, int64_t offset, int len); + +// Return a block of contiguous padding bytes "\0xff" for backup files, growing if needed. 
+Value makePadding(int size); } #include "flow/unactorcompiler.h" diff --git a/fdbclient/BackupContainer.actor.cpp b/fdbclient/BackupContainer.actor.cpp index a5ec9223f2..0fabe6e83a 100644 --- a/fdbclient/BackupContainer.actor.cpp +++ b/fdbclient/BackupContainer.actor.cpp @@ -1343,6 +1343,7 @@ public: Standalone> blockData = wait(fileBackup::decodeRangeFileBlock(inFile, j, len)); if (!beginKeySet) { beginKey = blockData.front().key; + beginKeySet = true; } endKey = blockData.back().key; } @@ -2096,6 +2097,8 @@ ACTOR Future> timeKeeperEpochsFromVersion(Version v, Reference return found.first + (v - found.second) / CLIENT_KNOBS->CORE_VERSIONSPERSECOND; } +namespace backup_test { + int chooseFileSize(std::vector &sizes) { int size = 1000; if(!sizes.empty()) { @@ -2133,7 +2136,30 @@ Version nextVersion(Version v) { return v + increment; } -ACTOR Future testBackupContainer(std::string url) { +// Write a snapshot file with only begin & end key +ACTOR static Future testWriteSnapshotFile(Reference file, Key begin, Key end, uint32_t blockSize) { + ASSERT(blockSize > 3 * sizeof(uint32_t) + begin.size() + end.size()); + + uint32_t fileVersion = BACKUP_AGENT_SNAPSHOT_FILE_VERSION; + // write Header + wait(file->append((uint8_t*)&fileVersion, sizeof(fileVersion))); + + // write begin key length and key + wait(file->appendStringRefWithLen(begin)); + + // write end key length and key + wait(file->appendStringRefWithLen(end)); + + int bytesLeft = blockSize - file->size(); + if (bytesLeft > 0) { + Value paddings = fileBackup::makePadding(bytesLeft); + wait(file->append(paddings.begin(), bytesLeft)); + } + wait(file->finish()); + return Void(); +} + +ACTOR static Future testBackupContainer(std::string url) { printf("BackupContainerTest URL %s\n", url.c_str()); state Reference c = IBackupContainer::openContainer(url); @@ -2162,6 +2188,9 @@ ACTOR Future testBackupContainer(std::string url) { loop { state Version logStart = v; state int kvfiles = deterministicRandom()->randomInt(0, 3); + state Key begin = LiteralStringRef(""); + state Key end = LiteralStringRef(""); + state int blockSize = 3 * sizeof(uint32_t) + begin.size() + end.size() + 8; while(kvfiles > 0) { if(snapshots.empty()) { @@ -2172,15 +2201,17 @@ ACTOR Future testBackupContainer(std::string url) { v = nextVersion(v); } } - Reference range = wait(c->writeRangeFile(snapshots.rbegin()->first, 0, v, 10)); + Reference range = wait(c->writeRangeFile(snapshots.rbegin()->first, 0, v, blockSize)); ++nRangeFiles; v = nextVersion(v); snapshots.rbegin()->second.push_back(range->getFileName()); - snapshotBeginEndKeys.rbegin()->second.emplace_back(LiteralStringRef(""), LiteralStringRef("")); + snapshotBeginEndKeys.rbegin()->second.emplace_back(begin, end); int size = chooseFileSize(fileSizes); snapshotSizes.rbegin()->second += size; - writes.push_back(writeAndVerifyFile(c, range, size)); + // Write in actual range file format, instead of random data. 
+ // writes.push_back(writeAndVerifyFile(c, range, size)); + wait(testWriteSnapshotFile(range, begin, end, blockSize)); if(deterministicRandom()->random01() < .2) { writes.push_back(c->writeKeyspaceSnapshotFile( @@ -2376,4 +2407,6 @@ TEST_CASE("/backup/continuous") { ASSERT(BackupContainerFileSystem::getPartitionedLogsContinuousEndVersion(files, 250) == 399); return Void(); -} \ No newline at end of file +} + +} // namespace backup_test diff --git a/fdbclient/BackupContainer.h b/fdbclient/BackupContainer.h index 92c03b1985..8ac79937dd 100644 --- a/fdbclient/BackupContainer.h +++ b/fdbclient/BackupContainer.h @@ -68,6 +68,9 @@ static const uint32_t BACKUP_AGENT_MLOG_VERSION = 2001; // Mutation log version written by BackupWorker static const uint32_t PARTITIONED_MLOG_VERSION = 4110; +// Snapshot file version written by FileBackupAgent +static const uint32_t BACKUP_AGENT_SNAPSHOT_FILE_VERSION = 1001; + struct LogFile { Version beginVersion; Version endVersion; @@ -108,12 +111,6 @@ struct RangeFile { std::string fileName; int64_t fileSize; - RangeFile() {} - RangeFile(Version v, uint32_t bSize, std::string name, int64_t size) - : version(v), blockSize(bSize), fileName(name), fileSize(size) {} - RangeFile(const RangeFile& f) - : version(f.version), blockSize(f.blockSize), fileName(f.fileName), fileSize(f.fileSize) {} - // Order by version, break ties with name bool operator< (const RangeFile &rhs) const { return version == rhs.version ? fileName < rhs.fileName : version < rhs.version; diff --git a/fdbclient/DatabaseConfiguration.cpp b/fdbclient/DatabaseConfiguration.cpp index 11938f29fa..d7b8468f25 100644 --- a/fdbclient/DatabaseConfiguration.cpp +++ b/fdbclient/DatabaseConfiguration.cpp @@ -494,11 +494,16 @@ Optional DatabaseConfiguration::get( KeyRef key ) const { } } -bool DatabaseConfiguration::isExcludedServer( NetworkAddress a ) const { - return get( encodeExcludedServersKey( AddressExclusion(a.ip, a.port) ) ).present() || - get( encodeExcludedServersKey( AddressExclusion(a.ip) ) ).present() || - get( encodeFailedServersKey( AddressExclusion(a.ip, a.port) ) ).present() || - get( encodeFailedServersKey( AddressExclusion(a.ip) ) ).present(); +bool DatabaseConfiguration::isExcludedServer( NetworkAddressList a ) const { + return get( encodeExcludedServersKey( AddressExclusion(a.address.ip, a.address.port) ) ).present() || + get( encodeExcludedServersKey( AddressExclusion(a.address.ip) ) ).present() || + get( encodeFailedServersKey( AddressExclusion(a.address.ip, a.address.port) ) ).present() || + get( encodeFailedServersKey( AddressExclusion(a.address.ip) ) ).present() || + ( a.secondaryAddress.present() && ( + get( encodeExcludedServersKey( AddressExclusion(a.secondaryAddress.get().ip, a.secondaryAddress.get().port) ) ).present() || + get( encodeExcludedServersKey( AddressExclusion(a.secondaryAddress.get().ip) ) ).present() || + get( encodeFailedServersKey( AddressExclusion(a.secondaryAddress.get().ip, a.secondaryAddress.get().port) ) ).present() || + get( encodeFailedServersKey( AddressExclusion(a.secondaryAddress.get().ip) ) ).present() ) ); } std::set DatabaseConfiguration::getExcludedServers() const { const_cast(this)->makeConfigurationImmutable(); diff --git a/fdbclient/DatabaseConfiguration.h b/fdbclient/DatabaseConfiguration.h index c2a99de9c4..46e0fbfc1f 100644 --- a/fdbclient/DatabaseConfiguration.h +++ b/fdbclient/DatabaseConfiguration.h @@ -187,7 +187,7 @@ struct DatabaseConfiguration { std::vector regions; // Excluded servers (no state should be here) - bool isExcludedServer( 
NetworkAddress ) const; + bool isExcludedServer( NetworkAddressList ) const; std::set getExcludedServers() const; int32_t getDesiredProxies() const { if(masterProxyCount == -1) return autoMasterProxyCount; return masterProxyCount; } diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index 7d765392fd..318832a227 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -284,6 +284,7 @@ struct KeyRangeRef { force_inline void serialize(Ar& ar) { serializer(ar, const_cast(begin), const_cast(end)); if( begin > end ) { + TraceEvent("InvertedRange").detail("Begin", begin).detail("End", end); throw inverted_range(); }; } diff --git a/fdbclient/FileBackupAgent.actor.cpp b/fdbclient/FileBackupAgent.actor.cpp index 59f1837374..49bd98816d 100644 --- a/fdbclient/FileBackupAgent.actor.cpp +++ b/fdbclient/FileBackupAgent.actor.cpp @@ -461,7 +461,8 @@ namespace fileBackup { // then the space after the final key to the next 1MB boundary would // just be padding anyway. struct RangeFileWriter { - RangeFileWriter(Reference file = Reference(), int blockSize = 0) : file(file), blockSize(blockSize), blockEnd(0), fileVersion(1001) {} + RangeFileWriter(Reference file = Reference(), int blockSize = 0) + : file(file), blockSize(blockSize), blockEnd(0), fileVersion(BACKUP_AGENT_SNAPSHOT_FILE_VERSION) {} // Handles the first block and internal blocks. Ends current block if needed. // The final flag is used in simulation to pad the file's final block to a whole block size @@ -557,8 +558,8 @@ namespace fileBackup { state StringRefReader reader(buf, restore_corrupted_data()); try { - // Read header, currently only decoding version 1001 - if(reader.consume() != 1001) + // Read header, currently only decoding BACKUP_AGENT_SNAPSHOT_FILE_VERSION + if(reader.consume() != BACKUP_AGENT_SNAPSHOT_FILE_VERSION) throw restore_unsupported_file_version(); // Read begin key, if this fails then block was invalid. 
@@ -2406,6 +2407,7 @@ namespace fileBackup { state bool backupWorkerEnabled = dbConfig.backupWorkerEnabled; if (!backupWorkerEnabled) { wait(success(changeConfig(cx, "backup_worker_enabled:=1", true))); + backupWorkerEnabled = true; } // Set the "backupStartedKey" and wait for all backup worker started @@ -3626,8 +3628,32 @@ public: } ACTOR static Future submitParallelRestore(Database cx, Key backupTag, - Standalone> backupRanges, KeyRef bcUrl, + Standalone> backupRanges, Key bcUrl, Version targetVersion, bool lockDB, UID randomUID) { + // Sanity check backup is valid + state Reference bc = IBackupContainer::openContainer(bcUrl.toString()); + state BackupDescription desc = wait(bc->describeBackup()); + wait(desc.resolveVersionTimes(cx)); + + if (targetVersion == invalidVersion && desc.maxRestorableVersion.present()) { + targetVersion = desc.maxRestorableVersion.get(); + TraceEvent(SevWarn, "FastRestoreSubmitRestoreRequestWithInvalidTargetVersion") + .detail("OverrideTargetVersion", targetVersion); + } + + Optional restoreSet = wait(bc->getRestoreSet(targetVersion)); + + if (!restoreSet.present()) { + TraceEvent(SevWarn, "FileBackupAgentRestoreNotPossible") + .detail("BackupContainer", bc->getURL()) + .detail("TargetVersion", targetVersion); + throw restore_invalid_version(); + } + + TraceEvent("FastRestoreSubmitRestoreRequest") + .detail("BackupDesc", desc.toString()) + .detail("TargetVersion", targetVersion); + state Reference tr(new ReadYourWritesTransaction(cx)); state int restoreIndex = 0; state int numTries = 0; @@ -4606,7 +4632,7 @@ Future FileBackupAgent::parallelRestoreFinish(Database cx, UID randomUID) } Future FileBackupAgent::submitParallelRestore(Database cx, Key backupTag, - Standalone> backupRanges, KeyRef bcUrl, + Standalone> backupRanges, Key bcUrl, Version targetVersion, bool lockDB, UID randomUID) { return FileBackupAgentImpl::submitParallelRestore(cx, backupTag, backupRanges, bcUrl, targetVersion, lockDB, randomUID); diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index 77f2cc7a86..06aff74ff2 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -1803,6 +1803,26 @@ ACTOR Future checkDatabaseLock( Reference tr, U return Void(); } +ACTOR Future advanceVersion(Database cx, Version v) { + state Transaction tr(cx); + loop { + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + try { + Version rv = wait(tr.getReadVersion()); + if (rv <= v) { + tr.set(minRequiredCommitVersionKey, BinaryWriter::toValue(v + 1, Unversioned())); + wait(tr.commit()); + } else { + printf("Current read version is %ld\n", rv); + return Void(); + } + } catch (Error& e) { + wait(tr.onError(e)); + } + } +} + ACTOR Future forceRecovery( Reference clusterFile, Key dcId ) { state Reference>> clusterInterface(new AsyncVar>); state Future leaderMon = monitorLeader(clusterFile, clusterInterface); diff --git a/fdbclient/ManagementAPI.actor.h b/fdbclient/ManagementAPI.actor.h index fe18d42717..a024f596c8 100644 --- a/fdbclient/ManagementAPI.actor.h +++ b/fdbclient/ManagementAPI.actor.h @@ -178,6 +178,8 @@ ACTOR Future unlockDatabase( Database cx, UID id ); ACTOR Future checkDatabaseLock( Transaction* tr, UID id ); ACTOR Future checkDatabaseLock( Reference tr, UID id ); +ACTOR Future advanceVersion(Database cx, Version v); + ACTOR Future setDDMode( Database cx, int mode ); ACTOR Future forceRecovery( Reference clusterFile, Standalone dcId ); diff --git a/fdbclient/MasterProxyInterface.h 
b/fdbclient/MasterProxyInterface.h index d24a26da3b..83f4606038 100644 --- a/fdbclient/MasterProxyInterface.h +++ b/fdbclient/MasterProxyInterface.h @@ -278,11 +278,12 @@ struct TxnStateRequest { VectorRef data; Sequence sequence; bool last; + std::vector broadcastInfo; ReplyPromise reply; template void serialize(Ar& ar) { - serializer(ar, data, sequence, last, reply, arena); + serializer(ar, data, sequence, last, broadcastInfo, reply, arena); } }; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 45ecb87047..9944eec5d7 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -76,16 +76,7 @@ TLSConfig tlsConfig(TLSEndpointType::CLIENT); NetworkOptions::NetworkOptions() : localAddress(""), clusterFile(""), traceDirectory(Optional()), traceRollSize(TRACE_DEFAULT_ROLL_SIZE), traceMaxLogsSize(TRACE_DEFAULT_MAX_LOGS_SIZE), traceLogGroup("default"), - traceFormat("xml"), traceClockSource("now"), runLoopProfilingEnabled(false) { - - Standalone> defaultSupportedVersions; - - StringRef sourceVersion = StringRef((const uint8_t*)getSourceVersion(), strlen(getSourceVersion())); - std::string protocolVersionString = format("%llx", currentProtocolVersion.version()); - defaultSupportedVersions.push_back_deep(defaultSupportedVersions.arena(), ClientVersionRef(LiteralStringRef(FDB_VT_VERSION), sourceVersion, protocolVersionString)); - - supportedVersions = ReferencedObject>>::from(defaultSupportedVersions); -} + traceFormat("xml"), traceClockSource("now"), runLoopProfilingEnabled(false), supportedVersions(new ReferencedObject>>()) {} static const Key CLIENT_LATENCY_INFO_PREFIX = LiteralStringRef("client_latency/"); static const Key CLIENT_LATENCY_INFO_CTR_PREFIX = LiteralStringRef("client_latency_counter/"); @@ -1045,7 +1036,10 @@ void setupNetwork(uint64_t transportId, bool useMetrics) { if (!networkOptions.logClientInfo.present()) networkOptions.logClientInfo = true; + TLS::DisableOpenSSLAtExitHandler(); g_network = newNet2(tlsConfig, false, useMetrics || networkOptions.traceDirectory.present()); + g_network->addStopCallback( Net2FileSystem::stop ); + g_network->addStopCallback( TLS::DestroyOpenSSLGlobalState ); FlowTransport::createInstance(true, transportId); Net2FileSystem::newFileSystem(); } diff --git a/fdbclient/ReadYourWrites.actor.cpp b/fdbclient/ReadYourWrites.actor.cpp index f47a01f54a..ff0b5dba3e 100644 --- a/fdbclient/ReadYourWrites.actor.cpp +++ b/fdbclient/ReadYourWrites.actor.cpp @@ -1229,8 +1229,8 @@ Future< Optional > ReadYourWritesTransaction::get( const Key& key, bool s return Optional(); } - // special key space are only allowed to query if both begin and end start with \xff\xff - if (key.startsWith(specialKeys.begin)) + // special key space are only allowed to query if both begin and end are in \xff\xff, \xff\xff\xff + if (specialKeys.contains(key)) return getDatabase()->specialKeySpace->get(Reference::addRef(this), key); if(checkUsedDuringCommit()) { @@ -1284,8 +1284,8 @@ Future< Standalone > ReadYourWritesTransaction::getRange( } } - // special key space are only allowed to query if both begin and end start with \xff\xff - if (begin.getKey().startsWith(specialKeys.begin) && end.getKey().startsWith(specialKeys.begin)) + // special key space are only allowed to query if both begin and end are in \xff\xff, \xff\xff\xff + if (specialKeys.contains(begin.getKey()) && specialKeys.contains(end.getKey())) return getDatabase()->specialKeySpace->getRange(Reference::addRef(this), begin, end, limits, reverse); diff --git 
a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h index 3ba0ea7562..69fe31c5ee 100644 --- a/fdbclient/StorageServerInterface.h +++ b/fdbclient/StorageServerInterface.h @@ -74,6 +74,7 @@ struct StorageServerInterface { explicit StorageServerInterface(UID uid) : uniqueID( uid ) {} StorageServerInterface() : uniqueID( deterministicRandom()->randomUniqueID() ) {} NetworkAddress address() const { return getValue.getEndpoint().getPrimaryAddress(); } + NetworkAddress stableAddress() const { return getValue.getEndpoint().getStableAddress(); } Optional secondaryAddress() const { return getValue.getEndpoint().addresses.secondaryAddress; } UID id() const { return uniqueID; } std::string toString() const { return id().shortString(); } @@ -394,12 +395,14 @@ struct GetStorageMetricsReply { StorageMetrics available; StorageMetrics capacity; double bytesInputRate; + int64_t versionLag; + double lastUpdate; GetStorageMetricsReply() : bytesInputRate(0) {} template void serialize(Ar& ar) { - serializer(ar, load, available, capacity, bytesInputRate); + serializer(ar, load, available, capacity, bytesInputRate, versionLag, lastUpdate); } }; diff --git a/fdbrpc/AsyncFileEIO.actor.h b/fdbrpc/AsyncFileEIO.actor.h index f3450af847..512a6c95aa 100644 --- a/fdbrpc/AsyncFileEIO.actor.h +++ b/fdbrpc/AsyncFileEIO.actor.h @@ -52,6 +52,10 @@ public: } } + static void stop() { + eio_set_max_parallel(0); + } + static bool should_poll() { return want_poll; } static bool lock_fd( int fd ) { diff --git a/fdbrpc/AsyncFileWinASIO.actor.h b/fdbrpc/AsyncFileWinASIO.actor.h index 961d5a62f4..19b6ffcb9c 100644 --- a/fdbrpc/AsyncFileWinASIO.actor.h +++ b/fdbrpc/AsyncFileWinASIO.actor.h @@ -39,6 +39,8 @@ class AsyncFileWinASIO : public IAsyncFile, public ReferenceCounted waitForStateEqual( IFailureMonitor* monitor, Endpoint endpoint, FailureStatus status ) { +ACTOR Future waitForStateEqual(IFailureMonitor* monitor, Endpoint endpoint, FailureStatus status) { loop { Future change = monitor->onStateChanged(endpoint); - if (monitor->getState(endpoint) == status) - return Void(); - wait( change ); + if (monitor->getState(endpoint) == status) return Void(); + wait(change); } } -ACTOR Future waitForContinuousFailure( IFailureMonitor* monitor, Endpoint endpoint, double sustainedFailureDuration, double slope ) { +ACTOR Future waitForContinuousFailure(IFailureMonitor* monitor, Endpoint endpoint, + double sustainedFailureDuration, double slope) { state double startT = now(); + loop { - wait( monitor->onFailed( endpoint ) ); - if(monitor->permanentlyFailed(endpoint)) - return Void(); + wait(monitor->onFailed(endpoint)); + if (monitor->permanentlyFailed(endpoint)) return Void(); // X == sustainedFailureDuration + slope * (now()-startT+X) - double waitDelay = (sustainedFailureDuration + slope * (now()-startT)) / (1-slope); + double waitDelay = (sustainedFailureDuration + slope * (now() - startT)) / (1 - slope); - //SOMEDAY: if we know that this process is a server or client we can tune this optimization better - if(waitDelay < std::min(FLOW_KNOBS->CLIENT_REQUEST_INTERVAL, FLOW_KNOBS->SERVER_REQUEST_INTERVAL)) //We will not get a failure monitoring update in this amount of time, so there is no point in waiting for changes + // SOMEDAY: if we know that this process is a server or client we can tune this optimization better + if (waitDelay < + std::min(FLOW_KNOBS->CLIENT_REQUEST_INTERVAL, + FLOW_KNOBS->SERVER_REQUEST_INTERVAL)) // We will not get a failure monitoring update in this amount + // of time, so there is no point in 
waiting for changes waitDelay = 0; choose { - when (wait( monitor->onStateEqual( endpoint, FailureStatus(false) ) )) {} // SOMEDAY: Use onStateChanged() for efficiency - when (wait( delay(waitDelay) )) { - return Void(); - } + when(wait(monitor->onStateEqual(endpoint, FailureStatus(false)))) { + } // SOMEDAY: Use onStateChanged() for efficiency + when(wait(delay(waitDelay))) { return Void(); } } } } -Future IFailureMonitor::onStateEqual( Endpoint const& endpoint, FailureStatus status ) { - if ( status == getState(endpoint) ) return Void(); +Future IFailureMonitor::onStateEqual(Endpoint const& endpoint, FailureStatus status) { + if (status == getState(endpoint)) return Void(); return waitForStateEqual(this, endpoint, status); } -Future IFailureMonitor::onFailedFor( Endpoint const& endpoint, double sustainedFailureDuration, double slope ) { - ASSERT( slope < 1.0 ); - return waitForContinuousFailure( this, endpoint, sustainedFailureDuration, slope ); +Future IFailureMonitor::onFailedFor(Endpoint const& endpoint, double sustainedFailureDuration, double slope) { + ASSERT(slope < 1.0); + return waitForContinuousFailure(this, endpoint, sustainedFailureDuration, slope); } -void SimpleFailureMonitor::setStatus( NetworkAddress const& address, FailureStatus const& status ) { +void SimpleFailureMonitor::setStatus(NetworkAddress const& address, FailureStatus const& status) { - //if (status.failed) - // printf("On machine '%s': Machine '%s' is failed\n", g_network->getLocalAddress().toString().c_str(), address.toString().c_str()); - //printf("%s.setState(%s, %s) %p\n", g_network->getLocalAddress().toString(), address.toString(), status.failed ? "FAILED" : "OK", this); - //addressStatus.set( address, status ); + // if (status.failed) + // printf("On machine '%s': Machine '%s' is failed\n", g_network->getLocalAddress().toString().c_str(), + // address.toString().c_str()); printf("%s.setState(%s, %s) %p\n", g_network->getLocalAddress().toString(), + // address.toString(), status.failed ? 
"FAILED" : "OK", this); addressStatus.set( address, status ); // onStateChanged() will be waiting on endpointKnownFailed only where it is false, so if the address status // for an endpoint that is waited on changes, the waiter sees its failure status change @@ -96,22 +98,29 @@ void SimpleFailureMonitor::setStatus( NetworkAddress const& address, FailureStat } } -void SimpleFailureMonitor::endpointNotFound( Endpoint const& endpoint ) { +void SimpleFailureMonitor::endpointNotFound(Endpoint const& endpoint) { // SOMEDAY: Expiration (this "leaks" memory) - if(endpoint.token.first() == -1) { - TraceEvent("WellKnownEndpointNotFound").suppressFor(1.0).detail("Address", endpoint.getPrimaryAddress()).detail("TokenFirst", endpoint.token.first()).detail("TokenSecond", endpoint.token.second()); + if (endpoint.token.first() == -1) { + TraceEvent("WellKnownEndpointNotFound") + .suppressFor(1.0) + .detail("Address", endpoint.getPrimaryAddress()) + .detail("TokenFirst", endpoint.token.first()) + .detail("TokenSecond", endpoint.token.second()); return; } - TraceEvent("EndpointNotFound").suppressFor(1.0).detail("Address", endpoint.getPrimaryAddress()).detail("Token", endpoint.token); - endpointKnownFailed.set( endpoint, true ); + TraceEvent("EndpointNotFound") + .suppressFor(1.0) + .detail("Address", endpoint.getPrimaryAddress()) + .detail("Token", endpoint.token); + endpointKnownFailed.set(endpoint, true); } -void SimpleFailureMonitor::notifyDisconnect( NetworkAddress const& address ) { +void SimpleFailureMonitor::notifyDisconnect(NetworkAddress const& address) { //TraceEvent("NotifyDisconnect").detail("Address", address); - endpointKnownFailed.triggerRange( Endpoint({address}, UID()), Endpoint({address}, UID(-1,-1)) ); + endpointKnownFailed.triggerRange(Endpoint({ address }, UID()), Endpoint({ address }, UID(-1, -1))); } -Future SimpleFailureMonitor::onDisconnectOrFailure( Endpoint const& endpoint ) { +Future SimpleFailureMonitor::onDisconnectOrFailure(Endpoint const& endpoint) { // If the endpoint or address is already failed, return right away auto i = addressStatus.find(endpoint.getPrimaryAddress()); if (i == addressStatus.end() || i->second.isFailed() || endpointKnownFailed.get(endpoint)) { @@ -120,12 +129,12 @@ Future SimpleFailureMonitor::onDisconnectOrFailure( Endpoint const& endpoi } // Return when the endpoint is triggered, which means that either the endpoint has become known failed, or the - // address has changed state (and since it was previously not failed, it must now be failed), or notifyDisconnect() - // has been called. + // address has changed state (and since it was previously not failed, it must now be failed), or + // notifyDisconnect() has been called. return endpointKnownFailed.onChange(endpoint); } -Future SimpleFailureMonitor::onStateChanged( Endpoint const& endpoint ) { +Future SimpleFailureMonitor::onStateChanged(Endpoint const& endpoint) { // Wait on endpointKnownFailed if it is false, to pick up both endpointNotFound errors (which set it to true) // and changes to addressStatus (which trigger a range). 
Don't wait on endpointKnownFailed if it is true, because // failure status for that endpoint can never change (and we could be spuriously triggered by setStatus) @@ -137,36 +146,42 @@ Future SimpleFailureMonitor::onStateChanged( Endpoint const& endpoint ) { return endpointKnownFailed.onChange(endpoint); } -FailureStatus SimpleFailureMonitor::getState( Endpoint const& endpoint ) { +FailureStatus SimpleFailureMonitor::getState(Endpoint const& endpoint) { if (endpointKnownFailed.get(endpoint)) return FailureStatus(true); else { auto a = addressStatus.find(endpoint.getPrimaryAddress()); - if (a == addressStatus.end()) return FailureStatus(); - else return a->second; - //printf("%s.getState(%s) = %s %p\n", g_network->getLocalAddress().toString(), endpoint.address.toString(), a.failed ? "FAILED" : "OK", this); + if (a == addressStatus.end()) + return FailureStatus(); + else + return a->second; + // printf("%s.getState(%s) = %s %p\n", g_network->getLocalAddress().toString(), endpoint.address.toString(), + // a.failed ? "FAILED" : "OK", this); } } -FailureStatus SimpleFailureMonitor::getState( NetworkAddress const& address ) { +FailureStatus SimpleFailureMonitor::getState(NetworkAddress const& address) { auto a = addressStatus.find(address); - if (a == addressStatus.end()) return FailureStatus(); - else return a->second; + if (a == addressStatus.end()) + return FailureStatus(); + else + return a->second; } -bool SimpleFailureMonitor::onlyEndpointFailed( Endpoint const& endpoint ) { - if(!endpointKnownFailed.get(endpoint)) - return false; +bool SimpleFailureMonitor::onlyEndpointFailed(Endpoint const& endpoint) { + if (!endpointKnownFailed.get(endpoint)) return false; auto a = addressStatus.find(endpoint.getPrimaryAddress()); - if (a == addressStatus.end()) return true; - else return !a->second.failed; + if (a == addressStatus.end()) + return true; + else + return !a->second.failed; } -bool SimpleFailureMonitor::permanentlyFailed( Endpoint const& endpoint ) { +bool SimpleFailureMonitor::permanentlyFailed(Endpoint const& endpoint) { return endpointKnownFailed.get(endpoint); } void SimpleFailureMonitor::reset() { - addressStatus = std::unordered_map< NetworkAddress, FailureStatus >(); + addressStatus = std::unordered_map(); endpointKnownFailed.resetNoWaiting(); } diff --git a/fdbrpc/FailureMonitor.h b/fdbrpc/FailureMonitor.h index ef82846558..a62a57f3e5 100644 --- a/fdbrpc/FailureMonitor.h +++ b/fdbrpc/FailureMonitor.h @@ -76,8 +76,8 @@ struct FailureStatus { bool isFailed() const { return failed; } bool isAvailable() const { return !failed; } - bool operator == (FailureStatus const& r) const { return failed == r.failed; } - bool operator != (FailureStatus const& r) const { return failed != r.failed; } + bool operator==(FailureStatus const& r) const { return failed == r.failed; } + bool operator!=(FailureStatus const& r) const { return failed != r.failed; } template void serialize(Ar& ar) { serializer(ar, failed); @@ -87,43 +87,43 @@ struct FailureStatus { class IFailureMonitor { public: // Returns the currently known status for the endpoint - virtual FailureStatus getState( Endpoint const& endpoint ) = 0; + virtual FailureStatus getState(Endpoint const& endpoint) = 0; // Returns the currently known status for the address - virtual FailureStatus getState( NetworkAddress const& address ) = 0; + virtual FailureStatus getState(NetworkAddress const& address) = 0; // Only use this function when the endpoint is known to be failed - virtual void endpointNotFound( Endpoint const& ) = 0; + virtual void 
endpointNotFound(Endpoint const&) = 0; // The next time the known status for the endpoint changes, returns the new status. - virtual Future onStateChanged( Endpoint const& endpoint ) = 0; + virtual Future onStateChanged(Endpoint const& endpoint) = 0; // Returns when onFailed(endpoint) || transport().onDisconnect( endpoint.getPrimaryAddress() ), but more efficiently - virtual Future onDisconnectOrFailure( Endpoint const& endpoint ) = 0; + virtual Future onDisconnectOrFailure(Endpoint const& endpoint) = 0; // Returns true if the endpoint is failed but the address of the endpoint is not failed. - virtual bool onlyEndpointFailed( Endpoint const& endpoint ) = 0; + virtual bool onlyEndpointFailed(Endpoint const& endpoint) = 0; // Returns true if the endpoint will never become available. - virtual bool permanentlyFailed( Endpoint const& endpoint ) = 0; + virtual bool permanentlyFailed(Endpoint const& endpoint) = 0; // Called by FlowTransport when a connection closes and a prior request or reply might be lost - virtual void notifyDisconnect( NetworkAddress const& ) = 0; + virtual void notifyDisconnect(NetworkAddress const&) = 0; // Called to update the failure status of network address directly when running client. virtual void setStatus(NetworkAddress const& address, FailureStatus const& status) = 0; // Returns when the known status of endpoint is next equal to status. Returns immediately // if appropriate. - Future onStateEqual( Endpoint const& endpoint, FailureStatus status ); + Future onStateEqual(Endpoint const& endpoint, FailureStatus status); // Returns when the status of the given endpoint is next considered "failed" - Future onFailed( Endpoint const& endpoint ) { - return onStateEqual( endpoint, FailureStatus() ); - } + Future onFailed(Endpoint const& endpoint) { return onStateEqual(endpoint, FailureStatus()); } - // Returns when the status of the given endpoint has continuously been "failed" for sustainedFailureDuration + (elapsedTime*sustainedFailureSlope) - Future onFailedFor( Endpoint const& endpoint, double sustainedFailureDuration, double sustainedFailureSlope = 0.0 ); + // Returns when the status of the given endpoint has continuously been "failed" for sustainedFailureDuration + + // (elapsedTime*sustainedFailureSlope) + Future onFailedFor(Endpoint const& endpoint, double sustainedFailureDuration, + double sustainedFailureSlope = 0.0); // Returns the failure monitor that the calling machine should use static IFailureMonitor& failureMonitor() { @@ -137,22 +137,23 @@ public: class SimpleFailureMonitor : public IFailureMonitor { public: - SimpleFailureMonitor() : endpointKnownFailed() { } - void setStatus( NetworkAddress const& address, FailureStatus const& status ); - void endpointNotFound( Endpoint const& ); - virtual void notifyDisconnect( NetworkAddress const& ); + SimpleFailureMonitor() : endpointKnownFailed() {} + void setStatus(NetworkAddress const& address, FailureStatus const& status); + void endpointNotFound(Endpoint const&); + virtual void notifyDisconnect(NetworkAddress const&); - virtual Future onStateChanged( Endpoint const& endpoint ); - virtual FailureStatus getState( Endpoint const& endpoint ); - virtual FailureStatus getState( NetworkAddress const& address ); - virtual Future onDisconnectOrFailure( Endpoint const& endpoint ); - virtual bool onlyEndpointFailed( Endpoint const& endpoint ); - virtual bool permanentlyFailed( Endpoint const& endpoint ); + virtual Future onStateChanged(Endpoint const& endpoint); + virtual FailureStatus getState(Endpoint const& endpoint); + 
virtual FailureStatus getState(NetworkAddress const& address); + virtual Future<Void> onDisconnectOrFailure(Endpoint const& endpoint); + virtual bool onlyEndpointFailed(Endpoint const& endpoint); + virtual bool permanentlyFailed(Endpoint const& endpoint); void reset(); + private: - std::unordered_map< NetworkAddress, FailureStatus > addressStatus; - YieldedAsyncMap< Endpoint, bool > endpointKnownFailed; + std::unordered_map<NetworkAddress, FailureStatus> addressStatus; + YieldedAsyncMap<Endpoint, bool> endpointKnownFailed; friend class OnStateChangedActorActor; }; diff --git a/fdbrpc/FlowTests.actor.cpp b/fdbrpc/FlowTests.actor.cpp index ade8946c35..16805e9cd5 100644 --- a/fdbrpc/FlowTests.actor.cpp +++ b/fdbrpc/FlowTests.actor.cpp @@ -236,6 +236,7 @@ struct YieldMockNetwork : INetwork, ReferenceCounted<YieldMockNetwork> { virtual double now() { return baseNetwork->now(); } virtual double timer() { return baseNetwork->timer(); } virtual void stop() { return baseNetwork->stop(); } + virtual void addStopCallback( std::function<void()> fn ) { ASSERT(false); return; } virtual bool isSimulated() const { return baseNetwork->isSimulated(); } virtual void onMainThread(Promise<Void>&& signal, TaskPriority taskID) { return baseNetwork->onMainThread(std::move(signal), taskID); } bool isOnMainThread() const override { return baseNetwork->isOnMainThread(); } diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 95a832c6d4..ccd762be91 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -28,6 +28,7 @@ #include "flow/crc32c.h" #include "fdbrpc/fdbrpc.h" #include "fdbrpc/FailureMonitor.h" +#include "fdbrpc/HealthMonitor.h" #include "fdbrpc/genericactors.actor.h" #include "fdbrpc/simulator.h" #include "flow/ActorCollection.h" @@ -189,6 +190,7 @@ public: std::vector<Future<Void>> listeners; std::unordered_map<NetworkAddress, Reference<Peer>> peers; std::unordered_map<NetworkAddress, std::pair<double, double>> closedPeers; + HealthMonitor healthMonitor; Reference<AsyncVar<bool>> degraded; bool warnAlwaysForLargePacket; @@ -206,6 +208,7 @@ public: Int64MetricHandle countConnClosedWithoutError; std::map<NetworkAddress, std::pair<uint64_t, double>> incompatiblePeers; + AsyncTrigger incompatiblePeersChanged; uint32_t numIncompatibleConnections; std::map<uint64_t, double> multiVersionConnections; double lastIncompatibleMessage; @@ -295,7 +298,7 @@ static ReliablePacket* sendPacket( TransportData* self, Reference<Peer> peer, IS ACTOR Future<Void> connectionMonitor( Reference<Peer> peer ) { state Endpoint remotePingEndpoint({ peer->destination }, WLTOKEN_PING_PACKET); loop { - if (!FlowTransport::transport().isClient() && !peer->destination.isPublic() && peer->compatible) { + if (!FlowTransport::isClient() && !peer->destination.isPublic() && peer->compatible) { // Don't send ping messages to clients unless necessary. Instead monitor incoming client pings. // We ignore this block for incompatible clients because pings from server would trigger the // peer->resetPing and prevent 'connection_failed' due to ping timeout. @@ -324,7 +327,7 @@ ACTOR Future<Void> connectionMonitor( Reference<Peer> peer ) { (peer->lastDataPacketSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_UNREFERENCED_CLOSE_DELAY)) { // TODO: What about when peerReference == -1? throw connection_unreferenced(); - } else if (FlowTransport::transport().isClient() && peer->compatible && peer->destination.isPublic() && + } else if (FlowTransport::isClient() && peer->compatible && peer->destination.isPublic() && (peer->lastConnectTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT) && (peer->lastDataPacketSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT)) { // First condition is necessary because we may get here if we are server.
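The TransportData hunk above adds an AsyncTrigger, incompatiblePeersChanged, next to the existing incompatiblePeers map. A minimal sketch of the trigger pattern this enables, assuming only flow's AsyncTrigger semantics (the two helper functions here are hypothetical, not from the patch):

```cpp
// Sketch of flow's AsyncTrigger pattern, as used by incompatiblePeersChanged.
AsyncTrigger incompatiblePeersChanged;

// Consumer: the returned future resolves the next time trigger() is called.
Future<Void> waitForIncompatiblePeerChange() {
	return incompatiblePeersChanged.onTrigger();
}

// Producer: wakes every waiter currently blocked on onTrigger().
void noteIncompatiblePeer() {
	incompatiblePeersChanged.trigger();
}
```

As the multiVersionCleanupWorker hunk further down shows, the trigger only fires for peers that have stayed incompatible past INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING, so waiters are not woken for transient mismatches.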
@@ -396,80 +399,133 @@ ACTOR Future connectionWriter( Reference self, Reference delayedHealthUpdate(NetworkAddress address) { + state double start = now(); + state bool delayed = false; + loop { + if (FLOW_KNOBS->HEALTH_MONITOR_MARK_FAILED_UNSTABLE_CONNECTIONS && + FlowTransport::transport().healthMonitor()->tooManyConnectionsClosed(address) && address.isPublic()) { + if (!delayed) { + TraceEvent("TooManyConnectionsClosedMarkFailed") + .detail("Dest", address) + .detail("StartTime", start) + .detail("ClosedCount", FlowTransport::transport().healthMonitor()->closedConnectionsCount(address)); + IFailureMonitor::failureMonitor().setStatus(address, FailureStatus(true)); + } + delayed = true; + wait(delayJittered(FLOW_KNOBS->MAX_RECONNECTION_TIME * 2.0)); + } else { + if (delayed) { + TraceEvent("TooManyConnectionsClosedMarkAvailable") + .detail("Dest", address) + .detail("StartTime", start) + .detail("TimeElapsed", now() - start) + .detail("ClosedCount", FlowTransport::transport().healthMonitor()->closedConnectionsCount(address)); + } + IFailureMonitor::failureMonitor().setStatus(address, FailureStatus(false)); + break; + } + } + return Void(); +} + ACTOR Future connectionKeeper( Reference self, Reference conn = Reference(), Future reader = Void()) { TraceEvent(SevDebug, "ConnectionKeeper", conn ? conn->getDebugID() : UID()) .detail("PeerAddr", self->destination) .detail("ConnSet", (bool)conn); + ASSERT_WE_THINK(FlowTransport::transport().getLocalAddress() != self->destination); state Optional firstConnFailedTime = Optional(); + state int retryConnect = false; + loop { try { + state Future delayedHealthUpdateF = Future(); + if (!conn) { // Always, except for the first loop with an incoming connection self->outgoingConnectionIdle = true; // Wait until there is something to send. while (self->unsent.empty()) { - if (self->destination.isPublic() && - IFailureMonitor::failureMonitor().getState(self->destination).isFailed()) { - break; + // Override waiting, if we are in failed state to update failure monitoring status. + Future retryConnectF = Never(); + if (retryConnect) { + retryConnectF = IFailureMonitor::failureMonitor().getState(self->destination).isAvailable() + ? delay(FLOW_KNOBS->FAILURE_DETECTION_DELAY) + : delay(FLOW_KNOBS->SERVER_REQUEST_INTERVAL); } - wait (self->dataToSend.onTrigger()); + choose { + when(wait(self->dataToSend.onTrigger())) {} + when(wait(retryConnectF)) { break; } + } } - ASSERT( self->destination.isPublic() ); + ASSERT(self->destination.isPublic()); self->outgoingConnectionIdle = false; wait(delayJittered( std::max(0.0, self->lastConnectTime + self->reconnectionDelay - now()))); // Don't connect() to the same peer more than once per 2 sec self->lastConnectTime = now(); - TraceEvent("ConnectingTo", conn ? conn->getDebugID() : UID()).suppressFor(1.0).detail("PeerAddr", self->destination); + TraceEvent("ConnectingTo", conn ? conn->getDebugID() : UID()) + .suppressFor(1.0) + .detail("PeerAddr", self->destination) + .detail("PeerReferences", self->peerReferences) + .detail("FailureStatus", IFailureMonitor::failureMonitor().getState(self->destination).isAvailable() + ? 
"OK" + : "FAILED"); try { choose { - when( Reference _conn = wait( INetworkConnections::net()->connect(self->destination) ) ) { + when(Reference _conn = + wait(INetworkConnections::net()->connect(self->destination))) { conn = _conn; wait(conn->connectHandshake()); - IFailureMonitor::failureMonitor().setStatus(self->destination, FailureStatus(false)); if (self->unsent.empty()) { - conn->close(); - conn = Reference(); - continue; - } else { - TraceEvent("ConnectionExchangingConnectPacket", conn->getDebugID()) - .suppressFor(1.0) - .detail("PeerAddr", self->destination); - self->prependConnectPacket(); + delayedHealthUpdateF = delayedHealthUpdate(self->destination); + choose { + when(wait(delayedHealthUpdateF)) { + conn->close(); + conn = Reference(); + retryConnect = false; + continue; + } + when(wait(self->dataToSend.onTrigger())) {} + } } - reader = connectionReader( self->transport, conn, self, Promise>()); + + TraceEvent("ConnectionExchangingConnectPacket", conn->getDebugID()) + .suppressFor(1.0) + .detail("PeerAddr", self->destination); + self->prependConnectPacket(); + reader = connectionReader(self->transport, conn, self, Promise>()); } when( wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT ) ) ) { throw connection_failed(); } } - } catch( Error &e ) { - if(e.code() != error_code_connection_failed) { + } catch (Error& e) { + if (e.code() != error_code_connection_failed) { throw; } TraceEvent("ConnectionTimedOut", conn ? conn->getDebugID() : UID()) .suppressFor(1.0) .detail("PeerAddr", self->destination); - IFailureMonitor::failureMonitor().setStatus( - self->destination, FailureStatus(e.code() == error_code_connection_failed)); throw; } } else { - IFailureMonitor::failureMonitor().setStatus(self->destination, FailureStatus(false)); self->outgoingConnectionIdle = false; } firstConnFailedTime.reset(); try { self->transport->countConnEstablished++; - wait( connectionWriter( self, conn ) || reader || connectionMonitor(self) ); + if (!delayedHealthUpdateF.isValid()) + delayedHealthUpdateF = delayedHealthUpdate(self->destination); + wait(connectionWriter(self, conn) || reader || connectionMonitor(self)); } catch (Error& e) { if (e.code() == error_code_connection_failed || e.code() == error_code_actor_cancelled || e.code() == error_code_connection_unreferenced || @@ -483,6 +539,7 @@ ACTOR Future connectionKeeper( Reference self, ASSERT( false ); } catch (Error& e) { + delayedHealthUpdateF.cancel(); if(now() - self->lastConnectTime > FLOW_KNOBS->RECONNECTION_RESET_TIME) { self->reconnectionDelay = FLOW_KNOBS->INITIAL_RECONNECTION_TIME; } else { @@ -499,6 +556,18 @@ ACTOR Future connectionKeeper( Reference self, firstConnFailedTime = now(); } + // Don't immediately mark connection as failed. To stay closed to earlier behaviour of centralized + // failure monitoring, wait until connection stays failed for FLOW_KNOBS->FAILURE_DETECTION_DELAY timeout. + retryConnect = self->destination.isPublic() && e.code() == error_code_connection_failed; + if (e.code() == error_code_connection_failed) { + if (!self->destination.isPublic()) { + // Can't connect back to non-public addresses. 
+ IFailureMonitor::failureMonitor().setStatus(self->destination, FailureStatus(true)); + } else if (now() - firstConnFailedTime.get() > FLOW_KNOBS->FAILURE_DETECTION_DELAY) { + IFailureMonitor::failureMonitor().setStatus(self->destination, FailureStatus(true)); + } + } + self->discardUnreliablePackets(); reader = Future(); bool ok = e.code() == error_code_connection_failed || e.code() == error_code_actor_cancelled || @@ -521,7 +590,7 @@ ACTOR Future connectionKeeper( Reference self, if(self->destination.isPublic() && IFailureMonitor::failureMonitor().getState(self->destination).isAvailable() - && !FlowTransport::transport().isClient()) + && !FlowTransport::isClient()) { auto& it = self->transport->closedPeers[self->destination]; if(now() - it.second > FLOW_KNOBS->TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY) { @@ -536,6 +605,10 @@ ACTOR Future connectionKeeper( Reference self, } if (conn) { + if (self->destination.isPublic() && e.code() == error_code_connection_failed) { + FlowTransport::transport().healthMonitor()->reportPeerClosed(self->destination); + } + conn->close(); conn = Reference(); } @@ -556,6 +629,14 @@ ACTOR Future connectionKeeper( Reference self, } } +Peer::Peer(TransportData* transport, NetworkAddress const& destination) + : transport(transport), destination(destination), outgoingConnectionIdle(true), lastConnectTime(0.0), + reconnectionDelay(FLOW_KNOBS->INITIAL_RECONNECTION_TIME), compatible(true), outstandingReplies(0), + incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0), lastDataPacketSentTime(now()) { + + IFailureMonitor::failureMonitor().setStatus(destination, FailureStatus(false)); +} + void Peer::send(PacketBuffer* pb, ReliablePacket* rp, bool firstUnsent) { unsent.setWriteBuffer(pb); if (rp) reliable.insert(rp); @@ -662,6 +743,9 @@ ACTOR static void deliver(TransportData* self, Endpoint destination, ArenaReader } catch (Error& e) { g_currentDeliveryPeerAddress = {NetworkAddress()}; TraceEvent(SevError, "ReceiverError").error(e).detail("Token", destination.token.toString()).detail("Peer", destination.getPrimaryAddress()); + if(!FlowTransport::isClient()) { + flushAndExit(FDB_EXIT_ERROR); + } throw; } } else if (destination.token.first() & TOKEN_STREAM_FLAG) { @@ -1023,7 +1107,7 @@ Reference TransportData::getOrOpenPeer( NetworkAddress const& address, boo auto peer = getPeer(address); if(!peer) { peer = Reference( new Peer(this, address) ); - if(startConnectionKeeper) { + if(startConnectionKeeper && !isLocalAddress(address)) { peer->connect = connectionKeeper(peer); } peers[address] = peer; @@ -1039,10 +1123,14 @@ bool TransportData::isLocalAddress(const NetworkAddress& address) const { ACTOR static Future multiVersionCleanupWorker( TransportData* self ) { loop { wait(delay(FLOW_KNOBS->CONNECTION_CLEANUP_DELAY)); + bool foundIncompatible = false; for(auto it = self->incompatiblePeers.begin(); it != self->incompatiblePeers.end();) { if( self->multiVersionConnections.count(it->second.first) ) { it = self->incompatiblePeers.erase(it); } else { + if( now() - it->second.second > FLOW_KNOBS->INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING ) { + foundIncompatible = true; + } it++; } } @@ -1054,6 +1142,10 @@ ACTOR static Future multiVersionCleanupWorker( TransportData* self ) { it++; } } + + if(foundIncompatible) { + self->incompatiblePeersChanged.trigger(); + } } } @@ -1084,6 +1176,10 @@ std::map>* FlowTransport::getIncompa return &self->incompatiblePeers; } +Future FlowTransport::onIncompatibleChanged() { + return self->incompatiblePeersChanged.onTrigger(); +} 
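onIncompatibleChanged() is new public API on FlowTransport, paired with getIncompatiblePeers(). A hypothetical caller, purely illustrative (no such consumer is added by this patch):

```cpp
// Hypothetical consumer of the new notification: trace the incompatible-peer
// set whenever it gains a long-lived entry.
ACTOR Future<Void> traceIncompatiblePeers() {
	loop {
		wait(FlowTransport::transport().onIncompatibleChanged());
		for (const auto& peer : *FlowTransport::transport().getIncompatiblePeers()) {
			TraceEvent("IncompatiblePeerSeen").detail("Address", peer.first);
		}
	}
}
```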
+ Future<Void> FlowTransport::bind( NetworkAddress publicAddress, NetworkAddress listenAddress ) { ASSERT( publicAddress.isPublic() ); if(self->localAddresses.address == NetworkAddress()) { @@ -1107,9 +1203,7 @@ void FlowTransport::addPeerReference(const Endpoint& endpoint, bool isStream) { return; Reference<Peer> peer = self->getOrOpenPeer(endpoint.getPrimaryAddress()); - - if(peer->peerReferences == -1) { - IFailureMonitor::failureMonitor().setStatus(endpoint.getPrimaryAddress(), FailureStatus(false)); + if (peer->peerReferences == -1) { peer->peerReferences = 1; } else { peer->peerReferences++; @@ -1173,7 +1267,8 @@ static void sendLocal( TransportData* self, ISerializeSource const& what, const deliver(self, destination, ArenaReader(copy.arena(), copy, AssumeVersion(currentProtocolVersion)), false); } -static ReliablePacket* sendPacket( TransportData* self, Reference<Peer> peer, ISerializeSource const& what, const Endpoint& destination, bool reliable ) { +static ReliablePacket* sendPacket(TransportData* self, Reference<Peer> peer, ISerializeSource const& what, + const Endpoint& destination, bool reliable) { const bool checksumEnabled = !destination.getPrimaryAddress().isTLS(); ++self->countPacketsGenerated; @@ -1315,4 +1410,15 @@ void FlowTransport::createInstance(bool isClient, uint64_t transportId) { g_network->setGlobal(INetwork::enFlowTransport, (flowGlobalType) new FlowTransport(transportId)); g_network->setGlobal(INetwork::enNetworkAddressFunc, (flowGlobalType) &FlowTransport::getGlobalLocalAddress); g_network->setGlobal(INetwork::enNetworkAddressesFunc, (flowGlobalType) &FlowTransport::getGlobalLocalAddresses); + + // Mark ourselves as available in FailureMonitor + const auto& localAddresses = FlowTransport::transport().getLocalAddresses(); + IFailureMonitor::failureMonitor().setStatus(localAddresses.address, FailureStatus(false)); + if (localAddresses.secondaryAddress.present()) { + IFailureMonitor::failureMonitor().setStatus(localAddresses.secondaryAddress.get(), FailureStatus(false)); + } +} + +HealthMonitor* FlowTransport::healthMonitor() { + return &self->healthMonitor; } diff --git a/fdbrpc/FlowTransport.h b/fdbrpc/FlowTransport.h index f99554fdc2..7ff104e1a5 100644 --- a/fdbrpc/FlowTransport.h +++ b/fdbrpc/FlowTransport.h @@ -23,6 +23,7 @@ #pragma once #include +#include "fdbrpc/HealthMonitor.h" #include "flow/genericactors.actor.h" #include "flow/network.h" #include "flow/FileIdentifier.h" @@ -44,7 +45,9 @@ public: } void choosePrimaryAddress() { - if(addresses.secondaryAddress.present() && !g_network->getLocalAddresses().secondaryAddress.present() && (addresses.address.isTLS() != g_network->getLocalAddresses().address.isTLS())) { + if(addresses.secondaryAddress.present() && + ((!g_network->getLocalAddresses().secondaryAddress.present() && (addresses.address.isTLS() != g_network->getLocalAddresses().address.isTLS())) || + (g_network->getLocalAddresses().secondaryAddress.present() && !addresses.address.isTLS()))) { std::swap(addresses.address, addresses.secondaryAddress.get()); } } @@ -58,6 +61,10 @@ public: return addresses.address; } + NetworkAddress getStableAddress() const { + return addresses.getTLSAddress(); + } + bool operator == (Endpoint const& r) const { return getPrimaryAddress() == r.getPrimaryAddress() && token == r.token; } @@ -123,10 +130,7 @@ struct Peer : public ReferenceCounted<Peer> { double lastDataPacketSentTime; int outstandingReplies; - explicit Peer(TransportData* transport, NetworkAddress const& destination) : transport(transport), destination(destination),
outgoingConnectionIdle(true), lastConnectTime(0.0), - reconnectionDelay(FLOW_KNOBS->INITIAL_RECONNECTION_TIME), compatible(true), outstandingReplies(0), - incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0), lastDataPacketSentTime(now()) {} + explicit Peer(TransportData* transport, NetworkAddress const& destination); void send(PacketBuffer* pb, ReliablePacket* rp, bool firstUnsent); @@ -164,6 +168,9 @@ public: std::map<NetworkAddress, std::pair<uint64_t, double>>* getIncompatiblePeers(); // Returns the set of all peers that have attempted to connect, but have incompatible protocol versions + Future<Void> onIncompatibleChanged(); + // Returns when getIncompatiblePeers has at least one peer which is incompatible. + void addPeerReference(const Endpoint&, bool isStream); // Signal that a peer connection is being used, even if no messages are currently being sent to the peer @@ -205,6 +212,8 @@ public: Endpoint loadedEndpoint(const UID& token); + HealthMonitor* healthMonitor(); + private: class TransportData* self; }; diff --git a/fdbrpc/HealthMonitor.actor.cpp b/fdbrpc/HealthMonitor.actor.cpp new file mode 100644 index 0000000000..bf03370fd2 --- /dev/null +++ b/fdbrpc/HealthMonitor.actor.cpp @@ -0,0 +1,51 @@ +/* + * HealthMonitor.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2020 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbrpc/FailureMonitor.h" +#include "fdbrpc/FlowTransport.h" +#include "fdbrpc/HealthMonitor.h" + +void HealthMonitor::reportPeerClosed(const NetworkAddress& peerAddress) { + purgeOutdatedHistory(); + peerClosedHistory.push_back(std::make_pair(now(), peerAddress)); + peerClosedNum[peerAddress] += 1; +} + +void HealthMonitor::purgeOutdatedHistory() { + for (auto it : peerClosedHistory) { + if (it.first < now() - FLOW_KNOBS->HEALTH_MONITOR_CLIENT_REQUEST_INTERVAL_SECS) { + peerClosedNum[it.second] -= 1; + ASSERT(peerClosedNum[it.second] >= 0); + peerClosedHistory.pop_front(); + } else { + break; + } + } +} + +bool HealthMonitor::tooManyConnectionsClosed(const NetworkAddress& peerAddress) { + purgeOutdatedHistory(); + return peerClosedNum[peerAddress] > FLOW_KNOBS->HEALTH_MONITOR_CONNECTION_MAX_CLOSED; +} + +int HealthMonitor::closedConnectionsCount(const NetworkAddress& peerAddress) { + purgeOutdatedHistory(); + return peerClosedNum[peerAddress]; +} diff --git a/fdbrpc/HealthMonitor.h b/fdbrpc/HealthMonitor.h new file mode 100644 index 0000000000..ef301cc7e1 --- /dev/null +++ b/fdbrpc/HealthMonitor.h @@ -0,0 +1,41 @@ +/* + * HealthMonitor.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2020 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FDBRPC_HEALTH_MONITOR_H +#define FDBRPC_HEALTH_MONITOR_H + +#include <deque> +#include <unordered_map> + +#include + +class HealthMonitor { +public: + void reportPeerClosed(const NetworkAddress& peerAddress); + bool tooManyConnectionsClosed(const NetworkAddress& peerAddress); + int closedConnectionsCount(const NetworkAddress& peerAddress); +private: + void purgeOutdatedHistory(); + + std::deque<std::pair<double, NetworkAddress>> peerClosedHistory; + std::unordered_map<NetworkAddress, int> peerClosedNum; +}; + +#endif // FDBRPC_HEALTH_MONITOR_H diff --git a/fdbrpc/Net2FileSystem.cpp b/fdbrpc/Net2FileSystem.cpp index 2377ddfd7e..84b71e83e4 100644 --- a/fdbrpc/Net2FileSystem.cpp +++ b/fdbrpc/Net2FileSystem.cpp @@ -115,3 +115,7 @@ Net2FileSystem::Net2FileSystem(double ioTimeout, std::string fileSystemPath) } #endif } + +void Net2FileSystem::stop() { + Net2AsyncFile::stop(); +} diff --git a/fdbrpc/Net2FileSystem.h b/fdbrpc/Net2FileSystem.h index ab9a8911dc..19cd223c5f 100644 --- a/fdbrpc/Net2FileSystem.h +++ b/fdbrpc/Net2FileSystem.h @@ -36,6 +36,7 @@ public: virtual Future< std::time_t > lastWriteTime( std::string filename ); //void init(); + static void stop(); Net2FileSystem(double ioTimeout=0.0, std::string fileSystemPath = ""); diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 6335dab89f..c5003b73fc 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -871,7 +871,12 @@ public: return emptyConfig; } - virtual void stop() { isStopped = true; } + virtual void stop() { + isStopped = true; + } + virtual void addStopCallback( std::function<void()> fn ) { + stopCallbacks.emplace_back(std::move(fn)); + } virtual bool isSimulated() const { return true; } struct SimThreadArgs { @@ -995,6 +1000,9 @@ public: } self->currentProcess = callingMachine; self->net2->stop(); + for ( auto& fn : self->stopCallbacks ) { + fn(); + } return Void(); } @@ -1615,6 +1623,7 @@ public: // Not letting currentProcess be NULL eliminates some annoying special cases currentProcess = new ProcessInfo("NoMachine", LocalityData(Optional<Standalone<StringRef>>(), StringRef(), StringRef(), StringRef()), ProcessClass(), {NetworkAddress()}, this, "", ""); g_network = net2 = newNet2(TLSConfig(), false, true); + g_network->addStopCallback( Net2FileSystem::stop ); Net2FileSystem::newFileSystem(); check_yield(TaskPriority::Zero); } @@ -1713,6 +1722,8 @@ public: //tasks is guarded by ISimulator::mutex std::priority_queue<Task, std::vector<Task>> tasks; + std::vector<std::function<void()>> stopCallbacks; + //Sim2Net network; INetwork *net2; diff --git a/fdbserver/BackupWorker.actor.cpp b/fdbserver/BackupWorker.actor.cpp index 066ccc0143..2ff0502188 100644 --- a/fdbserver/BackupWorker.actor.cpp +++ b/fdbserver/BackupWorker.actor.cpp @@ -68,7 +68,6 @@ struct BackupData { const UID myId; const Tag tag; // LogRouter tag for this worker, i.e., (-2, i) const int totalTags; // Total log router tags - // Backup request's commit version. Mutations are logged at some version after this.
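The HealthMonitor declared above is a small sliding-window counter: reportPeerClosed appends a (time, address) pair to a deque, and every query first purges entries older than HEALTH_MONITOR_CLIENT_REQUEST_INTERVAL_SECS. The same bookkeeping as a standalone, compilable sketch, with plain ints standing in for NetworkAddress and the knobs:

```cpp
#include <deque>
#include <unordered_map>
#include <utility>

// Standalone sketch of HealthMonitor's sliding-window bookkeeping.
struct MiniHealthMonitor {
	double windowSecs; // stands in for HEALTH_MONITOR_CLIENT_REQUEST_INTERVAL_SECS
	int maxClosed;     // stands in for HEALTH_MONITOR_CONNECTION_MAX_CLOSED
	std::deque<std::pair<double, int>> history; // (closeTime, peer)
	std::unordered_map<int, int> closedCount;   // peer -> closes inside window

	void purge(double now) {
		while (!history.empty() && history.front().first < now - windowSecs) {
			closedCount[history.front().second] -= 1;
			history.pop_front();
		}
	}
	void reportClosed(double now, int peer) {
		purge(now);
		history.emplace_back(now, peer);
		closedCount[peer] += 1;
	}
	bool tooManyClosed(double now, int peer) {
		purge(now);
		return closedCount[peer] > maxClosed;
	}
};
```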
const Version startVersion; // This worker's start version const Optional endVersion; // old epoch's end version (inclusive), or empty for current epoch const LogEpoch recruitedEpoch; // current epoch whose tLogs are receiving mutations @@ -209,8 +208,12 @@ struct BackupData { } BackupData* self = nullptr; + + // Backup request's commit version. Mutations are logged at some version after this. Version startVersion = invalidVersion; + // The last mutation log's saved version (not inclusive), i.e., next log's begin version. Version lastSavedVersion = invalidVersion; + Future>> container; Future>> ranges; // Key ranges of this backup Future updateWorker; @@ -568,17 +571,6 @@ ACTOR Future saveProgress(BackupData* self, Version backupVersion) { } } -// Return a block of contiguous padding bytes, growing if needed. -static Value makePadding(int size) { - static Value pad; - if (pad.size() < size) { - pad = makeString(size); - memset(mutateString(pad), '\xff', pad.size()); - } - - return pad.substr(0, size); -} - // Write a mutation to a log file. Note the mutation can be different from // message.message for clear mutations. ACTOR Future addMutation(Reference logFile, VersionedMessage message, StringRef mutation, @@ -599,7 +591,7 @@ ACTOR Future addMutation(Reference logFile, VersionedMessage // Write padding if needed const int bytesLeft = *blockEnd - logFile->size(); if (bytesLeft > 0) { - state Value paddingFFs = makePadding(bytesLeft); + state Value paddingFFs = fileBackup::makePadding(bytesLeft); wait(logFile->append(paddingFFs.begin(), bytesLeft)); } @@ -762,6 +754,10 @@ ACTOR Future uploadData(BackupData* self) { state int numMsg = 0; Version lastPopVersion = popVersion; + // index of last version's end position in self->messages + int lastVersionIndex = 0; + Version lastVersion = invalidVersion; + if (self->messages.empty()) { // Even though messages is empty, we still want to advance popVersion. if (!self->endVersion.present()) { @@ -770,18 +766,30 @@ ACTOR Future uploadData(BackupData* self) { } else { for (const auto& message : self->messages) { // message may be prefetched in peek; uncommitted message should not be uploaded. 
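A few hunks above, the file-local makePadding helper is deleted in favour of the shared fileBackup::makePadding. For reference, the removed logic as a self-contained sketch: a cached buffer of 0xFF bytes that only grows, from which callers take a prefix to fill a log block up to its boundary before the next write.

```cpp
#include <string>

// Standalone restatement of the removed makePadding helper.
static std::string makePaddingSketch(size_t size) {
	static std::string pad;                     // cached; grows monotonically
	if (pad.size() < size) pad.assign(size, '\xff');
	return pad.substr(0, size);                 // a prefix of the cached buffer
}
```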
- if (message.getVersion() > self->maxPopVersion()) break; - popVersion = std::max(popVersion, message.getVersion()); + const Version version = message.getVersion(); + if (version > self->maxPopVersion()) break; + if (version > popVersion) { + lastVersionIndex = numMsg; + lastVersion = popVersion; + popVersion = version; + } numMsg++; } } if (self->pullFinished()) { popVersion = self->endVersion.get(); + } else { + // make sure file is saved on version boundary + popVersion = lastVersion; + numMsg = lastVersionIndex; } if (((numMsg > 0 || popVersion > lastPopVersion) && self->pulling) || self->pullFinished()) { TraceEvent("BackupWorkerSave", self->myId) .detail("Version", popVersion) + .detail("LastPopVersion", lastPopVersion) + .detail("Pulling", self->pulling) .detail("SavedVersion", self->savedVersion) + .detail("NumMsg", numMsg) .detail("MsgQ", self->messages.size()); // save an empty file for old epochs so that log file versions are continuous wait(saveMutationsToFile(self, popVersion, numMsg)); diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index b6f3cb339b..4bf4aa1c5c 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -6,7 +6,6 @@ set(FDBSERVER_SRCS BackupProgress.actor.h BackupWorker.actor.cpp ClusterController.actor.cpp - ClusterRecruitmentInterface.h ConflictSet.h CoordinatedState.actor.cpp CoordinatedState.h diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index fc00bccbf0..3cf16b2ec6 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -36,7 +36,6 @@ #include "fdbserver/LeaderElection.h" #include "fdbserver/LogSystemConfig.h" #include "fdbserver/WaitFailure.h" -#include "fdbserver/ClusterRecruitmentInterface.h" #include "fdbserver/RatekeeperInterface.h" #include "fdbserver/ServerDBInfo.h" #include "fdbserver/Status.h" @@ -63,14 +62,15 @@ struct WorkerInfo : NonCopyable { Future haltRatekeeper; Future haltDistributor; Optional storageCacheInfo; + Standalone> issues; WorkerInfo() : gen(-1), reboots(0), priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown) {} - WorkerInfo( Future watcher, ReplyPromise reply, Generation gen, WorkerInterface interf, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, bool degraded ) : - watcher(watcher), reply(reply), gen(gen), reboots(0), initialClass(initialClass), priorityInfo(priorityInfo), details(interf, processClass, degraded) {} + WorkerInfo( Future watcher, ReplyPromise reply, Generation gen, WorkerInterface interf, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, bool degraded, Standalone> issues ) : + watcher(watcher), reply(reply), gen(gen), reboots(0), initialClass(initialClass), priorityInfo(priorityInfo), details(interf, processClass, degraded), issues(issues) {} WorkerInfo( WorkerInfo&& r ) BOOST_NOEXCEPT : watcher(std::move(r.watcher)), reply(std::move(r.reply)), gen(r.gen), reboots(r.reboots), initialClass(r.initialClass), priorityInfo(r.priorityInfo), details(std::move(r.details)), - haltRatekeeper(r.haltRatekeeper), haltDistributor(r.haltDistributor), storageCacheInfo(r.storageCacheInfo) {} + haltRatekeeper(r.haltRatekeeper), haltDistributor(r.haltDistributor), storageCacheInfo(r.storageCacheInfo), issues(r.issues) {} void operator=( WorkerInfo&& r ) BOOST_NOEXCEPT { watcher = std::move(r.watcher); reply = std::move(r.reply); @@ -82,6 +82,7 @@ struct WorkerInfo : NonCopyable { 
haltRatekeeper = r.haltRatekeeper; haltDistributor = r.haltDistributor; storageCacheInfo = r.storageCacheInfo; + issues = r.issues; } }; @@ -98,13 +99,11 @@ class ClusterControllerData { public: struct DBInfo { Reference> clientInfo; - Reference>> serverInfo; - CachedSerialization serverInfoMasterOnly; - std::set requiredAddresses; - ProcessIssuesMap workersWithIssues; + Reference> serverInfo; std::map incompatibleConnections; AsyncTrigger forceMasterFailure; int64_t masterRegistrationCount; + int64_t dbInfoCount; bool recoveryStalled; bool forceRecovery; DatabaseConfiguration config; // Asynchronously updated via master registration @@ -117,42 +116,36 @@ public: std::map> clientStatus; DBInfo() : masterRegistrationCount(0), recoveryStalled(false), forceRecovery(false), unfinishedRecoveries(0), logGenerations(0), cachePopulated(false), - clientInfo( new AsyncVar( ClientDBInfo() ) ), - serverInfo( new AsyncVar>( CachedSerialization() ) ), + clientInfo( new AsyncVar( ClientDBInfo() ) ), dbInfoCount(0), + serverInfo( new AsyncVar( ServerDBInfo() ) ), db( DatabaseContext::create( clientInfo, Future(), LocalityData(), true, TaskPriority::DefaultEndpoint, true ) ) // SOMEDAY: Locality! { } - void addRequiredAddresses(const std::vector& interfaces) { - for(auto& it : interfaces) { - requiredAddresses.insert(it.address()); - } - } - void setDistributor(const DataDistributorInterface& interf) { - CachedSerialization newInfoCache = serverInfo->get(); - auto& newInfo = newInfoCache.mutate(); + auto newInfo = serverInfo->get(); newInfo.id = deterministicRandom()->randomUniqueID(); + newInfo.infoGeneration = ++dbInfoCount; newInfo.distributor = interf; - serverInfo->set( newInfoCache ); + serverInfo->set( newInfo ); } void setRatekeeper(const RatekeeperInterface& interf) { - CachedSerialization newInfoCache = serverInfo->get(); - auto& newInfo = newInfoCache.mutate(); + auto newInfo = serverInfo->get(); newInfo.id = deterministicRandom()->randomUniqueID(); + newInfo.infoGeneration = ++dbInfoCount; newInfo.ratekeeper = interf; - serverInfo->set( newInfoCache ); + serverInfo->set( newInfo ); } void setStorageCache(uint16_t id, const StorageServerInterface& interf) { - CachedSerialization newInfoCache = serverInfo->get(); - auto& newInfo = newInfoCache.mutate(); + auto newInfo = serverInfo->get(); bool found = false; for(auto& it : newInfo.storageCaches) { if(it.first == id) { if(it.second != interf) { newInfo.id = deterministicRandom()->randomUniqueID(); + newInfo.infoGeneration = ++dbInfoCount; it.second = interf; } found = true; @@ -161,36 +154,36 @@ public: } if(!found) { newInfo.id = deterministicRandom()->randomUniqueID(); + newInfo.infoGeneration = ++dbInfoCount; newInfo.storageCaches.push_back(std::make_pair(id, interf)); } - serverInfo->set( newInfoCache ); + serverInfo->set( newInfo ); } void clearInterf(ProcessClass::ClassType t) { - CachedSerialization newInfoCache = serverInfo->get(); - auto& newInfo = newInfoCache.mutate(); + auto newInfo = serverInfo->get(); newInfo.id = deterministicRandom()->randomUniqueID(); + newInfo.infoGeneration = ++dbInfoCount; if (t == ProcessClass::DataDistributorClass) { newInfo.distributor = Optional(); } else if (t == ProcessClass::RatekeeperClass) { newInfo.ratekeeper = Optional(); } - serverInfo->set( newInfoCache ); + serverInfo->set( newInfo ); } void clearStorageCache(uint16_t id) { - CachedSerialization newInfoCache = serverInfo->get(); - auto& newInfo = newInfoCache.mutate(); + auto newInfo = serverInfo->get(); for(auto it = 
newInfo.storageCaches.begin(); it != newInfo.storageCaches.end(); ++it) { if(it->first == id) { newInfo.id = deterministicRandom()->randomUniqueID(); + newInfo.infoGeneration = ++dbInfoCount; newInfo.storageCaches.erase(it); break; } } - serverInfo->set( newInfoCache ); + serverInfo->set( newInfo ); } - }; struct UpdateWorkerList { @@ -256,8 +249,8 @@ public: } bool isLongLivedStateless( Optional const& processId ) { - return (db.serverInfo->get().read().distributor.present() && db.serverInfo->get().read().distributor.get().locality.processId() == processId) || - (db.serverInfo->get().read().ratekeeper.present() && db.serverInfo->get().read().ratekeeper.get().locality.processId() == processId); + return (db.serverInfo->get().distributor.present() && db.serverInfo->get().distributor.get().locality.processId() == processId) || + (db.serverInfo->get().ratekeeper.present() && db.serverInfo->get().ratekeeper.get().locality.processId() == processId); } WorkerDetails getStorageWorker( RecruitStorageRequest const& req ) { @@ -270,6 +263,7 @@ public: !excludedMachines.count(it.second.details.interf.locality.zoneId()) && ( includeDCs.size() == 0 || includeDCs.count(it.second.details.interf.locality.dcId()) ) && !addressExcluded(excludedAddresses, it.second.details.interf.address()) && + ( !it.second.details.interf.secondaryAddress().present() || !addressExcluded(excludedAddresses, it.second.details.interf.secondaryAddress().get()) ) && it.second.details.processClass.machineClassFitness( ProcessClass::Storage ) <= ProcessClass::UnsetFit ) { return it.second.details; } @@ -306,7 +300,7 @@ public: for( auto& it : id_worker ) { auto fitness = it.second.details.processClass.machineClassFitness( ProcessClass::Storage ); - if( workerAvailable(it.second, false) && !conf.isExcludedServer(it.second.details.interf.address()) && fitness != ProcessClass::NeverAssign && ( !dcId.present() || it.second.details.interf.locality.dcId()==dcId.get() ) ) { + if( workerAvailable(it.second, false) && !conf.isExcludedServer(it.second.details.interf.addresses()) && fitness != ProcessClass::NeverAssign && ( !dcId.present() || it.second.details.interf.locality.dcId()==dcId.get() ) ) { fitness_workers[ fitness ].push_back(it.second.details); } } @@ -351,7 +345,7 @@ public: for( auto& it : id_worker ) { if (std::find(exclusionWorkerIds.begin(), exclusionWorkerIds.end(), it.second.details.interf.id()) == exclusionWorkerIds.end()) { auto fitness = it.second.details.processClass.machineClassFitness(ProcessClass::TLog); - if (workerAvailable(it.second, checkStable) && !conf.isExcludedServer(it.second.details.interf.address()) && fitness != ProcessClass::NeverAssign && (!dcIds.size() || dcIds.count(it.second.details.interf.locality.dcId()))) { + if (workerAvailable(it.second, checkStable) && !conf.isExcludedServer(it.second.details.interf.addresses()) && fitness != ProcessClass::NeverAssign && (!dcIds.size() || dcIds.count(it.second.details.interf.locality.dcId()))) { fitness_workers[std::make_pair(fitness, it.second.details.degraded)].push_back(it.second.details); } else { @@ -507,7 +501,7 @@ public: for( auto& it : id_worker ) { auto fitness = it.second.details.processClass.machineClassFitness( role ); - if(conf.isExcludedServer(it.second.details.interf.address())) { + if(conf.isExcludedServer(it.second.details.interf.addresses())) { fitness = std::max(fitness, ProcessClass::ExcludeFit); } if( workerAvailable(it.second, checkStable) && fitness < unacceptableFitness && it.second.details.interf.locality.dcId()==dcId ) { @@ -545,7 
+539,7 @@ public: for( auto& it : id_worker ) { auto fitness = it.second.details.processClass.machineClassFitness( role ); - if( workerAvailable(it.second, checkStable) && !conf.isExcludedServer(it.second.details.interf.address()) && it.second.details.interf.locality.dcId() == dcId && + if( workerAvailable(it.second, checkStable) && !conf.isExcludedServer(it.second.details.interf.addresses()) && it.second.details.interf.locality.dcId() == dcId && ( !minWorker.present() || ( it.second.details.interf.id() != minWorker.get().worker.interf.id() && ( fitness < minWorker.get().fitness || (fitness == minWorker.get().fitness && id_used[it.first] <= minWorker.get().used ) ) ) ) ) { if (isLongLivedStateless(it.first)) { fitness_workers[ std::make_pair(fitness, id_used[it.first]) ].second.push_back(it.second.details); @@ -664,7 +658,7 @@ public: std::set>> getDatacenters( DatabaseConfiguration const& conf, bool checkStable = false ) { std::set>> result; for( auto& it : id_worker ) - if( workerAvailable( it.second, checkStable ) && !conf.isExcludedServer( it.second.details.interf.address() ) ) + if( workerAvailable( it.second, checkStable ) && !conf.isExcludedServer( it.second.details.interf.addresses() ) ) result.insert(it.second.details.interf.locality.dcId()); return result; } @@ -984,7 +978,7 @@ public: } void checkRecoveryStalled() { - if( (db.serverInfo->get().read().recoveryState == RecoveryState::RECRUITING || db.serverInfo->get().read().recoveryState == RecoveryState::ACCEPTING_COMMITS || db.serverInfo->get().read().recoveryState == RecoveryState::ALL_LOGS_RECRUITED) && db.recoveryStalled ) { + if( (db.serverInfo->get().recoveryState == RecoveryState::RECRUITING || db.serverInfo->get().recoveryState == RecoveryState::ACCEPTING_COMMITS || db.serverInfo->get().recoveryState == RecoveryState::ALL_LOGS_RECRUITED) && db.recoveryStalled ) { if (db.config.regions.size() > 1) { auto regions = db.config.regions; if(clusterControllerDcId.get() == regions[0].dcId) { @@ -998,7 +992,7 @@ public: //FIXME: determine when to fail the cluster controller when a primaryDC has not been set bool betterMasterExists() { - const ServerDBInfo dbi = db.serverInfo->get().read(); + const ServerDBInfo dbi = db.serverInfo->get(); if(dbi.recoveryState < RecoveryState::ACCEPTING_COMMITS) { return false; @@ -1094,7 +1088,7 @@ public: // Check master fitness. Don't return false if master is excluded in case all the processes are excluded, we still need master for recovery. 
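The address()-to-addresses() substitutions that run through this file change exclusion semantics: a worker now counts as excluded if any address it listens on (TLS or non-TLS) is excluded, not only its primary address. The rule, simplified (isExcludedServer's real parameter is a NetworkAddressList; a plain vector is used here for brevity, and addressExcluded is the existing helper already used in this file):

```cpp
// Simplified any-address exclusion rule.
bool anyAddressExcluded(const std::vector<NetworkAddress>& addrs,
                        const std::set<AddressExclusion>& exclusions) {
	for (const auto& addr : addrs) {
		if (addressExcluded(exclusions, addr)) return true;
	}
	return false;
}
```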
ProcessClass::Fitness oldMasterFit = masterWorker->second.details.processClass.machineClassFitness( ProcessClass::Master ); - if(db.config.isExcludedServer(dbi.master.address())) { + if(db.config.isExcludedServer(dbi.master.addresses())) { oldMasterFit = std::max(oldMasterFit, ProcessClass::ExcludeFit); } @@ -1102,7 +1096,7 @@ public: id_used[clusterControllerProcessId]++; WorkerFitnessInfo mworker = getWorkerForRoleInDatacenter(clusterControllerDcId, ProcessClass::Master, ProcessClass::NeverAssign, db.config, id_used, true); auto newMasterFit = mworker.worker.processClass.machineClassFitness( ProcessClass::Master ); - if(db.config.isExcludedServer(mworker.worker.interf.address())) { + if(db.config.isExcludedServer(mworker.worker.interf.addresses())) { newMasterFit = std::max(newMasterFit, ProcessClass::ExcludeFit); } @@ -1263,7 +1257,7 @@ public: ASSERT(masterProcessId.present()); if (processId == masterProcessId) return false; - auto& dbInfo = db.serverInfo->get().read(); + auto& dbInfo = db.serverInfo->get(); for (const auto& tlogset : dbInfo.logSystemConfig.tLogs) { for (const auto& tlog: tlogset.tLogs) { if (tlog.present() && tlog.interf().locality.processId() == processId) return true; @@ -1293,7 +1287,7 @@ public: std::map>, int> idUsed; updateKnownIds(&idUsed); - auto& dbInfo = db.serverInfo->get().read(); + auto& dbInfo = db.serverInfo->get(); for (const auto& tlogset : dbInfo.logSystemConfig.tLogs) { for (const auto& tlog: tlogset.tLogs) { if (tlog.present()) { @@ -1331,6 +1325,9 @@ public: UpdateWorkerList updateWorkerList; Future outstandingRequestChecker; Future outstandingRemoteRequestChecker; + AsyncTrigger updateDBInfo; + std::set updateDBInfoEndpoints; + std::set removedDBInfoEndpoints; DBInfo db; Database cx; @@ -1351,7 +1348,6 @@ public: Counter getWorkersRequests; Counter getClientWorkersRequests; Counter registerMasterRequests; - Counter getServerDBInfoRequests; Counter statusRequests; Counter failureMonitoringRequests; @@ -1370,18 +1366,18 @@ public: getWorkersRequests("GetWorkersRequests", clusterControllerMetrics), getClientWorkersRequests("GetClientWorkersRequests", clusterControllerMetrics), registerMasterRequests("RegisterMasterRequests", clusterControllerMetrics), - getServerDBInfoRequests("GetServerDBInfoRequests", clusterControllerMetrics), statusRequests("StatusRequests", clusterControllerMetrics), failureMonitoringRequests("FailureMonitoringRequests", clusterControllerMetrics), serversFailed("ServersFailed", clusterControllerMetrics), serversUnfailed("ServersUnfailed", clusterControllerMetrics) { - auto& serverInfo = db.serverInfoMasterOnly.mutate(); + auto serverInfo = ServerDBInfo(); serverInfo.id = deterministicRandom()->randomUniqueID(); + serverInfo.infoGeneration = ++db.dbInfoCount; serverInfo.masterLifetime.ccID = id; serverInfo.clusterInterface = ccInterface; serverInfo.myLocality = locality; - db.serverInfo->set( db.serverInfoMasterOnly ); + db.serverInfo->set( serverInfo ); cx = openDBOnServer(db.serverInfo, TaskPriority::DefaultEndpoint, true, true); } @@ -1416,7 +1412,7 @@ ACTOR Future clusterWatchDatabase( ClusterControllerData* cluster, Cluster continue; } RecruitMasterRequest rmq; - rmq.lifetime = db->serverInfo->get().read().masterLifetime; + rmq.lifetime = db->serverInfo->get().masterLifetime; rmq.forceRecovery = db->forceRecovery; cluster->masterProcessId = masterWorker.worker.interf.locality.processId(); @@ -1436,22 +1432,20 @@ ACTOR Future clusterWatchDatabase( ClusterControllerData* cluster, Cluster db->masterRegistrationCount = 0; 
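The recurring change in this class replaces CachedSerialization&lt;ServerDBInfo&gt; with a plain ServerDBInfo held in an AsyncVar, and stamps every mutation with ++dbInfoCount. Distilled, every setter above and below now follows the same copy-bump-set shape:

```cpp
// The copy-bump-set pattern shared by setDistributor, setRatekeeper,
// setStorageCache, clearInterf, etc. in this patch.
auto newInfo = db->serverInfo->get();         // copy the current value
newInfo.id = deterministicRandom()->randomUniqueID();
newInfo.infoGeneration = ++db->dbInfoCount;   // strictly increasing stamp
newInfo.distributor = interf;                 // whatever field is changing
db->serverInfo->set(newInfo);                 // wakes all onChange() waiters
```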
db->recoveryStalled = false; - db->serverInfoMasterOnly = CachedSerialization(); - auto& dbInfo = db->serverInfoMasterOnly.mutate(); - + auto dbInfo = ServerDBInfo(); dbInfo.master = iMaster; dbInfo.id = deterministicRandom()->randomUniqueID(); - dbInfo.masterLifetime = db->serverInfo->get().read().masterLifetime; + dbInfo.infoGeneration = ++db->dbInfoCount; + dbInfo.masterLifetime = db->serverInfo->get().masterLifetime; ++dbInfo.masterLifetime; - dbInfo.clusterInterface = db->serverInfo->get().read().clusterInterface; - dbInfo.distributor = db->serverInfo->get().read().distributor; - dbInfo.ratekeeper = db->serverInfo->get().read().ratekeeper; - dbInfo.storageCaches = db->serverInfo->get().read().storageCaches; - dbInfo.latencyBandConfig = db->serverInfo->get().read().latencyBandConfig; + dbInfo.clusterInterface = db->serverInfo->get().clusterInterface; + dbInfo.distributor = db->serverInfo->get().distributor; + dbInfo.ratekeeper = db->serverInfo->get().ratekeeper; + dbInfo.storageCaches = db->serverInfo->get().storageCaches; + dbInfo.latencyBandConfig = db->serverInfo->get().latencyBandConfig; TraceEvent("CCWDB", cluster->id).detail("Lifetime", dbInfo.masterLifetime.toString()).detail("ChangeID", dbInfo.id); - db->requiredAddresses.clear(); - db->serverInfo->set( db->serverInfoMasterOnly ); + db->serverInfo->set( dbInfo ); state Future spinDelay = delay(SERVER_KNOBS->MASTER_SPIN_DELAY); // Don't retry master recovery more than once per second, but don't delay the "first" recovery after more than a second of normal operation @@ -1486,30 +1480,14 @@ ACTOR Future clusterWatchDatabase( ClusterControllerData* cluster, Cluster } ACTOR Future clusterGetServerInfo(ClusterControllerData::DBInfo* db, UID knownServerInfoID, - Standalone> issues, - std::vector incompatiblePeers, - ReplyPromise> reply) { - state Optional issueID; - state bool useMasterOnly = false; - setIssues(db->workersWithIssues, reply.getEndpoint().getPrimaryAddress(), issues, issueID); - for(auto it : incompatiblePeers) { - db->incompatibleConnections[it] = now() + SERVER_KNOBS->INCOMPATIBLE_PEERS_LOGGING_INTERVAL; - } - - loop { - useMasterOnly = db->serverInfo->get().read().recoveryState < RecoveryState::ACCEPTING_COMMITS && !db->requiredAddresses.count(reply.getEndpoint().getPrimaryAddress()); - if((useMasterOnly ? db->serverInfoMasterOnly.read().id : db->serverInfo->get().read().id) != knownServerInfoID) { - break; - } + ReplyPromise reply) { + while(db->serverInfo->get().id == knownServerInfoID) { choose { when (wait( yieldedFuture(db->serverInfo->onChange()) )) {} when (wait( delayJittered( 300 ) )) { break; } // The server might be long gone! } } - - removeIssues(db->workersWithIssues, reply.getEndpoint().getPrimaryAddress(), issueID); - - reply.send( useMasterOnly ? 
db->serverInfoMasterOnly : db->serverInfo->get() ); + reply.send( db->serverInfo->get() ); return Void(); } @@ -1535,12 +1513,6 @@ void checkOutstandingRecruitmentRequests( ClusterControllerData* self ) { RecruitFromConfigurationRequest& req = self->outstandingRecruitmentRequests[i]; try { RecruitFromConfigurationReply rep = self->findWorkersForConfiguration( req ); - self->db.addRequiredAddresses(rep.oldLogRouters); - self->db.addRequiredAddresses(rep.proxies); - self->db.addRequiredAddresses(rep.resolvers); - self->db.addRequiredAddresses(rep.satelliteTLogs); - self->db.addRequiredAddresses(rep.tLogs); - self->db.serverInfo->trigger(); req.reply.send( rep ); swapAndPop( &self->outstandingRecruitmentRequests, i-- ); } catch (Error& e) { @@ -1559,9 +1531,6 @@ void checkOutstandingRemoteRecruitmentRequests( ClusterControllerData* self ) { RecruitRemoteFromConfigurationRequest& req = self->outstandingRemoteRecruitmentRequests[i]; try { RecruitRemoteFromConfigurationReply rep = self->findRemoteWorkersForConfiguration( req ); - self->db.addRequiredAddresses(rep.remoteTLogs); - self->db.addRequiredAddresses(rep.logRouters); - self->db.serverInfo->trigger(); req.reply.send( rep ); swapAndPop( &self->outstandingRemoteRecruitmentRequests, i-- ); } catch (Error& e) { @@ -1609,7 +1578,7 @@ void checkOutstandingStorageRequests( ClusterControllerData* self ) { } void checkBetterDDOrRK(ClusterControllerData* self) { - if (!self->masterProcessId.present() || self->db.serverInfo->get().read().recoveryState < RecoveryState::ACCEPTING_COMMITS) { + if (!self->masterProcessId.present() || self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS) { return; } @@ -1628,11 +1597,11 @@ void checkBetterDDOrRK(ClusterControllerData* self) { newDDWorker = self->id_worker[self->masterProcessId.get()].details; } auto bestFitnessForRK = newRKWorker.processClass.machineClassFitness(ProcessClass::Ratekeeper); - if(self->db.config.isExcludedServer(newRKWorker.interf.address())) { + if(self->db.config.isExcludedServer(newRKWorker.interf.addresses())) { bestFitnessForRK = std::max(bestFitnessForRK, ProcessClass::ExcludeFit); } auto bestFitnessForDD = newDDWorker.processClass.machineClassFitness(ProcessClass::DataDistributor); - if(self->db.config.isExcludedServer(newDDWorker.interf.address())) { + if(self->db.config.isExcludedServer(newDDWorker.interf.addresses())) { bestFitnessForDD = std::max(bestFitnessForDD, ProcessClass::ExcludeFit); } //TraceEvent("CheckBetterDDorRKNewRecruits", self->id).detail("MasterProcessId", self->masterProcessId) @@ -1641,7 +1610,7 @@ void checkBetterDDOrRK(ClusterControllerData* self) { Optional> currentRKProcessId; Optional> currentDDProcessId; - auto& db = self->db.serverInfo->get().read(); + auto& db = self->db.serverInfo->get(); bool ratekeeperHealthy = false; if (db.ratekeeper.present() && self->id_worker.count(db.ratekeeper.get().locality.processId()) && (!self->recruitingRatekeeperID.present() || (self->recruitingRatekeeperID.get() == db.ratekeeper.get().id()))) { @@ -1700,7 +1669,7 @@ ACTOR Future doCheckOutstandingRequests( ClusterControllerData* self ) { self->checkRecoveryStalled(); if (self->betterMasterExists()) { self->db.forceMasterFailure.trigger(); - TraceEvent("MasterRegistrationKill", self->id).detail("MasterId", self->db.serverInfo->get().read().master.id()); + TraceEvent("MasterRegistrationKill", self->id).detail("MasterId", self->db.serverInfo->get().master.id()); } } catch( Error &e ) { if(e.code() != error_code_no_more_servers) { @@ -1757,12 +1726,14 
@@ ACTOR Future rebootAndCheck( ClusterControllerData* cluster, Optional workerAvailabilityWatch( WorkerInterface worker, ProcessClass startingClass, ClusterControllerData* cluster ) { +ACTOR Future workerAvailabilityWatch( WorkerInterface worker, ProcessClass startingClass, ClusterControllerData* cluster) { state Future failed = (worker.address() == g_network->getLocalAddress() || startingClass.classType() == ProcessClass::TesterClass) ? Never() : waitFailureClient(worker.waitFailure, SERVER_KNOBS->WORKER_FAILURE_TIME); - cluster->updateWorkerList.set( worker.locality.processId(), ProcessData(worker.locality, startingClass, worker.address()) ); + cluster->updateWorkerList.set( worker.locality.processId(), ProcessData(worker.locality, startingClass, worker.stableAddress()) ); + cluster->updateDBInfoEndpoints.insert(worker.updateServerDBInfo.getEndpoint()); + cluster->updateDBInfo.trigger(); // This switching avoids a race where the worker can be added to id_worker map after the workerAvailabilityWatch fails for the worker. wait(delay(0)); @@ -1801,6 +1772,7 @@ ACTOR Future workerAvailabilityWatch( WorkerInterface worker, ProcessClass if (worker.locality.processId() == cluster->masterProcessId) { cluster->masterProcessId = Optional(); } + cluster->removedDBInfoEndpoints.insert(worker.updateServerDBInfo.getEndpoint()); cluster->id_worker.erase( worker.locality.processId() ); cluster->updateWorkerList.set( worker.locality.processId(), Optional() ); return Void(); @@ -1996,12 +1968,6 @@ ACTOR Future clusterRecruitFromConfiguration( ClusterControllerData* self, loop { try { auto rep = self->findWorkersForConfiguration( req ); - self->db.addRequiredAddresses(rep.oldLogRouters); - self->db.addRequiredAddresses(rep.proxies); - self->db.addRequiredAddresses(rep.resolvers); - self->db.addRequiredAddresses(rep.satelliteTLogs); - self->db.addRequiredAddresses(rep.tLogs); - self->db.serverInfo->trigger(); req.reply.send( rep ); return Void(); } catch (Error& e) { @@ -2027,9 +1993,6 @@ ACTOR Future clusterRecruitRemoteFromConfiguration( ClusterControllerData* loop { try { RecruitRemoteFromConfigurationReply rep = self->findRemoteWorkersForConfiguration( req ); - self->db.addRequiredAddresses(rep.remoteTLogs); - self->db.addRequiredAddresses(rep.logRouters); - self->db.serverInfo->trigger(); req.reply.send( rep ); return Void(); } catch (Error& e) { @@ -2066,8 +2029,8 @@ void clusterRegisterMaster( ClusterControllerData* self, RegisterMasterRequest c //make sure the request comes from an active database auto db = &self->db; - if ( db->serverInfo->get().read().master.id() != req.id || req.registrationCount <= db->masterRegistrationCount ) { - TraceEvent("MasterRegistrationNotFound", self->id).detail("MasterId", req.id).detail("ExistingId", db->serverInfo->get().read().master.id()).detail("RegCount", req.registrationCount).detail("ExistingRegCount", db->masterRegistrationCount); + if ( db->serverInfo->get().master.id() != req.id || req.registrationCount <= db->masterRegistrationCount ) { + TraceEvent("MasterRegistrationNotFound", self->id).detail("MasterId", req.id).detail("ExistingId", db->serverInfo->get().master.id()).detail("RegCount", req.registrationCount).detail("ExistingRegCount", db->masterRegistrationCount); return; } @@ -2088,7 +2051,7 @@ void clusterRegisterMaster( ClusterControllerData* self, RegisterMasterRequest c self->gotFullyRecoveredConfig = true; db->fullyRecoveredConfig = req.configuration.get(); for ( auto& it : self->id_worker ) { - bool isExcludedFromConfig = 
db->fullyRecoveredConfig.isExcludedServer(it.second.details.interf.address()); + bool isExcludedFromConfig = db->fullyRecoveredConfig.isExcludedServer(it.second.details.interf.addresses()); if ( it.second.priorityInfo.isExcluded != isExcludedFromConfig ) { it.second.priorityInfo.isExcluded = isExcludedFromConfig; if( !it.second.reply.isSet() ) { @@ -2100,8 +2063,7 @@ void clusterRegisterMaster( ClusterControllerData* self, RegisterMasterRequest c } bool isChanged = false; - auto cachedInfo = self->db.serverInfo->get(); - auto& dbInfo = cachedInfo.mutate(); + auto dbInfo = self->db.serverInfo->get(); if (dbInfo.recoveryState != req.recoveryState) { dbInfo.recoveryState = req.recoveryState; @@ -2142,7 +2104,8 @@ void clusterRegisterMaster( ClusterControllerData* self, RegisterMasterRequest c if( isChanged ) { dbInfo.id = deterministicRandom()->randomUniqueID(); - self->db.serverInfo->set( cachedInfo ); + dbInfo.infoGeneration = ++self->db.dbInfoCount; + self->db.serverInfo->set( dbInfo ); } checkOutstandingRequests(self); @@ -2155,6 +2118,11 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) { ClusterControllerPriorityInfo newPriorityInfo = req.priorityInfo; newPriorityInfo.processClassFitness = newProcessClass.machineClassFitness(ProcessClass::ClusterController); + for(auto it : req.incompatiblePeers) { + self->db.incompatibleConnections[it] = now() + SERVER_KNOBS->INCOMPATIBLE_PEERS_LOGGING_INTERVAL; + } + self->removedDBInfoEndpoints.erase(w.updateServerDBInfo.getEndpoint()); + if(info == self->id_worker.end()) { TraceEvent("ClusterControllerActualWorkers", self->id).detail("WorkerId",w.id()).detail("ProcessId", w.locality.processId()).detail("ZoneId", w.locality.zoneId()).detail("DataHall", w.locality.dataHallId()).detail("PClass", req.processClass.toString()).detail("Workers", self->id_worker.size()); self->goodRecruitmentTime = lowPriorityDelay(SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY); @@ -2194,13 +2162,13 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) { } if ( self->gotFullyRecoveredConfig ) { - newPriorityInfo.isExcluded = self->db.fullyRecoveredConfig.isExcludedServer(w.address()); + newPriorityInfo.isExcluded = self->db.fullyRecoveredConfig.isExcludedServer(w.addresses()); } } if( info == self->id_worker.end() ) { - self->id_worker[w.locality.processId()] = WorkerInfo( workerAvailabilityWatch( w, newProcessClass, self ), req.reply, req.generation, w, req.initialClass, newProcessClass, newPriorityInfo, req.degraded ); - if (!self->masterProcessId.present() && w.locality.processId() == self->db.serverInfo->get().read().master.locality.processId()) { + self->id_worker[w.locality.processId()] = WorkerInfo( workerAvailabilityWatch( w, newProcessClass, self ), req.reply, req.generation, w, req.initialClass, newProcessClass, newPriorityInfo, req.degraded, req.issues ); + if (!self->masterProcessId.present() && w.locality.processId() == self->db.serverInfo->get().master.locality.processId()) { self->masterProcessId = w.locality.processId(); } checkOutstandingRequests( self ); @@ -2214,8 +2182,10 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) { info->second.initialClass = req.initialClass; info->second.details.degraded = req.degraded; info->second.gen = req.generation; + info->second.issues = req.issues; if(info->second.details.interf.id() != w.id()) { + self->removedDBInfoEndpoints.insert(info->second.details.interf.updateServerDBInfo.getEndpoint()); info->second.details.interf = w; 
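// Editor's note (not part of the patch): this hunk is one piece of a larger change in how
// ServerDBInfo reaches workers, from workers polling GetServerDBInfoRequest to the cluster
// controller pushing updates. The bookkeeping is spread across three places in this file:
//   - workerAvailabilityWatch: updateDBInfoEndpoints.insert(worker.updateServerDBInfo.getEndpoint())
//     plus updateDBInfo.trigger(), so a newly seen worker promptly receives the current ServerDBInfo;
//   - the failure path and the interface-change branch here: the stale endpoint goes into
//     removedDBInfoEndpoints so the broadcaster stops contacting it;
//   - registerWorker: removedDBInfoEndpoints.erase(w.updateServerDBInfo.getEndpoint()), so a worker
//     that re-registers before the next broadcast is not dropped from the broadcast set.
// The dbInfoUpdater actor (later in this section) consumes both sets on each trigger.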
info->second.watcher = workerAvailabilityWatch( w, newProcessClass, self ); } @@ -2224,7 +2194,7 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) { TEST(true); // Received an old worker registration request. } - if (req.distributorInterf.present() && !self->db.serverInfo->get().read().distributor.present() && + if (req.distributorInterf.present() && !self->db.serverInfo->get().distributor.present() && self->clusterControllerDcId == req.distributorInterf.get().locality.dcId() && !self->recruitingDistributor) { const DataDistributorInterface& di = req.distributorInterf.get(); @@ -2244,7 +2214,7 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) { req.ratekeeperInterf.get().haltRatekeeper.getReply(HaltRatekeeperRequest(self->id))); } else if (!self->recruitingRatekeeperID.present()) { const RatekeeperInterface& rki = req.ratekeeperInterf.get(); - const auto& ratekeeper = self->db.serverInfo->get().read().ratekeeper; + const auto& ratekeeper = self->db.serverInfo->get().ratekeeper; TraceEvent("CCRegisterRatekeeper", self->id).detail("RKID", rki.id()); if (ratekeeper.present() && ratekeeper.get().id() != rki.id() && self->id_worker.count(ratekeeper.get().locality.processId())) { TraceEvent("CCHaltPreviousRatekeeper", self->id).detail("RKID", ratekeeper.get().id()) @@ -2425,8 +2395,14 @@ ACTOR Future statusServer(FutureStream< StatusRequest> requests, // Get status but trap errors to send back to client. vector workers; - for(auto& it : self->id_worker) + std::vector workerIssues; + + for(auto& it : self->id_worker) { workers.push_back(it.second.details); + if(it.second.issues.size()) { + workerIssues.push_back(ProcessIssues(it.second.details.interf.address(), it.second.issues)); + } + } std::vector incompatibleConnections; for(auto it = self->db.incompatibleConnections.begin(); it != self->db.incompatibleConnections.end();) { @@ -2438,7 +2414,7 @@ ACTOR Future statusServer(FutureStream< StatusRequest> requests, } } - state ErrorOr result = wait(errorOr(clusterGetStatus(self->db.serverInfo, self->cx, workers, self->db.workersWithIssues, &self->db.clientStatus, coordinators, incompatibleConnections, self->datacenterVersionDifference))); + state ErrorOr result = wait(errorOr(clusterGetStatus(self->db.serverInfo, self->cx, workers, workerIssues, &self->db.clientStatus, coordinators, incompatibleConnections, self->datacenterVersionDifference))); if (result.isError() && result.getError().code() == error_code_actor_cancelled) throw result.getError(); @@ -2565,13 +2541,13 @@ ACTOR Future monitorServerInfoConfig(ClusterControllerData::DBInfo* db) { config = LatencyBandConfig::parse(configVal.get()); } - auto cachedInfo = db->serverInfo->get(); - auto& serverInfo = cachedInfo.mutate(); + auto serverInfo = db->serverInfo->get(); if(config != serverInfo.latencyBandConfig) { TraceEvent("LatencyBandConfigChanged").detail("Present", config.present()); serverInfo.id = deterministicRandom()->randomUniqueID(); + serverInfo.infoGeneration = ++db->dbInfoCount; serverInfo.latencyBandConfig = config; - db->serverInfo->set(cachedInfo); + db->serverInfo->set(serverInfo); } state Future configChangeFuture = tr.watch(latencyBandConfigKey); @@ -2799,7 +2775,7 @@ ACTOR Future updateDatacenterVersionDifference( ClusterControllerData *sel state double lastLogTime = 0; loop { self->versionDifferenceUpdated = false; - if(self->db.serverInfo->get().read().recoveryState >= RecoveryState::ACCEPTING_COMMITS && self->db.config.usableRegions == 1) { + 
if(self->db.serverInfo->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS && self->db.config.usableRegions == 1) { bool oldDifferenceTooLarge = !self->versionDifferenceUpdated || self->datacenterVersionDifference >= SERVER_KNOBS->MAX_VERSION_DIFFERENCE; self->versionDifferenceUpdated = true; self->datacenterVersionDifference = 0; @@ -2814,8 +2790,8 @@ ACTOR Future updateDatacenterVersionDifference( ClusterControllerData *sel state Optional primaryLog; state Optional remoteLog; - if(self->db.serverInfo->get().read().recoveryState >= RecoveryState::ALL_LOGS_RECRUITED) { - for(auto& logSet : self->db.serverInfo->get().read().logSystemConfig.tLogs) { + if(self->db.serverInfo->get().recoveryState >= RecoveryState::ALL_LOGS_RECRUITED) { + for(auto& logSet : self->db.serverInfo->get().logSystemConfig.tLogs) { if(logSet.isLocal && logSet.locality != tagLocalitySatellite) { for(auto& tLog : logSet.tLogs) { if(tLog.present()) { @@ -2916,12 +2892,12 @@ ACTOR Future startDataDistributor( ClusterControllerDa TraceEvent("CCStartDataDistributor", self->id); loop { try { - state bool no_distributor = !self->db.serverInfo->get().read().distributor.present(); - while (!self->masterProcessId.present() || self->masterProcessId != self->db.serverInfo->get().read().master.locality.processId() || self->db.serverInfo->get().read().recoveryState < RecoveryState::ACCEPTING_COMMITS) { + state bool no_distributor = !self->db.serverInfo->get().distributor.present(); + while (!self->masterProcessId.present() || self->masterProcessId != self->db.serverInfo->get().master.locality.processId() || self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS) { wait(self->db.serverInfo->onChange() || delay(SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY)); } - if (no_distributor && self->db.serverInfo->get().read().distributor.present()) { - return self->db.serverInfo->get().read().distributor.get(); + if (no_distributor && self->db.serverInfo->get().distributor.present()) { + return self->db.serverInfo->get().distributor.get(); } std::map>, int> id_used = self->getUsedIds(); @@ -2951,15 +2927,15 @@ ACTOR Future startDataDistributor( ClusterControllerDa } ACTOR Future monitorDataDistributor(ClusterControllerData *self) { - while(self->db.serverInfo->get().read().recoveryState < RecoveryState::ACCEPTING_COMMITS) { + while(self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS) { wait(self->db.serverInfo->onChange()); } loop { - if ( self->db.serverInfo->get().read().distributor.present() ) { - wait( waitFailureClient( self->db.serverInfo->get().read().distributor.get().waitFailure, SERVER_KNOBS->DD_FAILURE_TIME ) ); + if ( self->db.serverInfo->get().distributor.present() ) { + wait( waitFailureClient( self->db.serverInfo->get().distributor.get().waitFailure, SERVER_KNOBS->DD_FAILURE_TIME ) ); TraceEvent("CCDataDistributorDied", self->id) - .detail("DistributorId", self->db.serverInfo->get().read().distributor.get().id()); + .detail("DistributorId", self->db.serverInfo->get().distributor.get().id()); self->db.clearInterf(ProcessClass::DataDistributorClass); } else { self->recruitingDistributor = true; @@ -2976,11 +2952,11 @@ ACTOR Future startRatekeeper(ClusterControllerData *self) { TraceEvent("CCStartRatekeeper", self->id); loop { try { - state bool no_ratekeeper = !self->db.serverInfo->get().read().ratekeeper.present(); - while (!self->masterProcessId.present() || self->masterProcessId != self->db.serverInfo->get().read().master.locality.processId() || 
self->db.serverInfo->get().read().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
+ state bool no_ratekeeper = !self->db.serverInfo->get().ratekeeper.present();
+ while (!self->masterProcessId.present() || self->masterProcessId != self->db.serverInfo->get().master.locality.processId() || self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
 wait(self->db.serverInfo->onChange() || delay(SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY)); }
- if (no_ratekeeper && self->db.serverInfo->get().read().ratekeeper.present()) {
+ if (no_ratekeeper && self->db.serverInfo->get().ratekeeper.present()) {
 // Existing ratekeeper registers while waiting, so skip. return Void(); }
@@ -3000,7 +2976,7 @@ ACTOR Future<Void> startRatekeeper(ClusterControllerData *self) { if (interf.present()) { self->recruitRatekeeper.set(false); self->recruitingRatekeeperID = interf.get().id();
- const auto& ratekeeper = self->db.serverInfo->get().read().ratekeeper;
+ const auto& ratekeeper = self->db.serverInfo->get().ratekeeper;
 TraceEvent("CCRatekeeperRecruited", self->id).detail("Addr", worker.interf.address()).detail("RKID", interf.get().id()); if (ratekeeper.present() && ratekeeper.get().id() != interf.get().id() && self->id_worker.count(ratekeeper.get().locality.processId())) { TraceEvent("CCHaltRatekeeperAfterRecruit", self->id).detail("RKID", ratekeeper.get().id())
@@ -3025,16 +3001,16 @@ } ACTOR Future<Void> monitorRatekeeper(ClusterControllerData *self) {
- while(self->db.serverInfo->get().read().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
+ while(self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
 wait(self->db.serverInfo->onChange()); } loop {
- if ( self->db.serverInfo->get().read().ratekeeper.present() && !self->recruitRatekeeper.get() ) {
+ if ( self->db.serverInfo->get().ratekeeper.present() && !self->recruitRatekeeper.get() ) {
 choose {
- when(wait(waitFailureClient( self->db.serverInfo->get().read().ratekeeper.get().waitFailure, SERVER_KNOBS->RATEKEEPER_FAILURE_TIME ))) {
+ when(wait(waitFailureClient( self->db.serverInfo->get().ratekeeper.get().waitFailure, SERVER_KNOBS->RATEKEEPER_FAILURE_TIME ))) {
 TraceEvent("CCRatekeeperDied", self->id)
- .detail("RKID", self->db.serverInfo->get().read().ratekeeper.get().id());
+ .detail("RKID", self->db.serverInfo->get().ratekeeper.get().id());
 self->db.clearInterf(ProcessClass::RatekeeperClass); } when(wait(self->recruitRatekeeper.onChange())) {}
@@ -3045,6 +3021,54 @@ ACTOR Future<Void> monitorRatekeeper(ClusterControllerData *self) { } }
+ACTOR Future<Void> dbInfoUpdater( ClusterControllerData* self ) {
+ state Future<Void> dbInfoChange = self->db.serverInfo->onChange();
+ state Future<Void> updateDBInfo = self->updateDBInfo.onTrigger();
+ loop {
+ choose {
+ when(wait(updateDBInfo)) {
+ wait(delay(SERVER_KNOBS->DBINFO_BATCH_DELAY) || dbInfoChange);
+ }
+ when(wait(dbInfoChange)) {}
+ }
+
+ UpdateServerDBInfoRequest req;
+ if(dbInfoChange.isReady()) {
+ for(auto &it : self->id_worker) {
+ req.broadcastInfo.push_back(it.second.details.interf.updateServerDBInfo.getEndpoint());
+ }
+ } else {
+ for(auto it : self->removedDBInfoEndpoints) {
+ self->updateDBInfoEndpoints.erase(it);
+ }
+ req.broadcastInfo = std::vector<Endpoint>(self->updateDBInfoEndpoints.begin(), self->updateDBInfoEndpoints.end());
+ }
+
+ self->updateDBInfoEndpoints.clear();
+ self->removedDBInfoEndpoints.clear();
+
+ dbInfoChange = self->db.serverInfo->onChange();
+ updateDBInfo = self->updateDBInfo.onTrigger();
+
+ req.serializedDbInfo = BinaryWriter::toValue(self->db.serverInfo->get(), AssumeVersion(currentProtocolVersion));
+
+ TraceEvent("DBInfoStartBroadcast", self->id);
+ choose {
+ when(std::vector<Endpoint> notUpdated = wait( broadcastDBInfoRequest(req, SERVER_KNOBS->DBINFO_SEND_AMOUNT, Optional<Endpoint>(), false) )) {
+ TraceEvent("DBInfoFinishBroadcast", self->id);
+ for(auto &it : notUpdated) {
+ TraceEvent("DBInfoNotUpdated", self->id).detail("Addr", it.getPrimaryAddress());
+ }
+ if(notUpdated.size()) {
+ self->updateDBInfoEndpoints.insert(notUpdated.begin(), notUpdated.end());
+ self->updateDBInfo.trigger();
+ }
+ }
+ when(wait(dbInfoChange)) {}
+ }
+ }
+}
+
 ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf, Future<Void> leaderFail, ServerCoordinators coordinators, LocalityData locality ) { state ClusterControllerData self( interf, locality ); state Future<Void> coordinationPingDelay = delay( SERVER_KNOBS->WORKER_COORDINATION_PING_DELAY );
@@ -3066,6 +3090,7 @@ ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf, self.addActor.send( monitorDataDistributor(&self) ); self.addActor.send( monitorRatekeeper(&self) ); self.addActor.send( monitorStorageCache(&self) );
+ self.addActor.send( dbInfoUpdater(&self) );
 self.addActor.send( traceCounters("ClusterControllerMetrics", self.id, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &self.clusterControllerMetrics, self.id.toString() + "/ClusterControllerMetrics") ); //printf("%s: I am the cluster controller\n", g_network->getLocalAddress().toString().c_str());
@@ -3103,7 +3128,7 @@ vector<WorkerDetails> workers; for(auto& it : self.id_worker) {
- if ( (req.flags & GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY) && self.db.config.isExcludedServer(it.second.details.interf.address()) ) {
+ if ( (req.flags & GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY) && self.db.config.isExcludedServer(it.second.details.interf.addresses()) ) {
 continue; }
@@ -3138,9 +3163,7 @@ clusterRegisterMaster( &self, req ); } when( GetServerDBInfoRequest req = waitNext( interf.getServerDBInfo.getFuture() ) ) {
- ++self.getServerDBInfoRequests;
- self.addActor.send(
- clusterGetServerInfo(&self.db, req.knownServerInfoID, req.issues, req.incompatiblePeers, req.reply));
+ self.addActor.send( clusterGetServerInfo(&self.db, req.knownServerInfoID, req.reply) );
 } when( wait( leaderFail ) ) { // We are no longer the leader if this has changed.
diff --git a/fdbserver/ClusterRecruitmentInterface.h b/fdbserver/ClusterRecruitmentInterface.h
deleted file mode 100644
index 4e1c753be2..0000000000
--- a/fdbserver/ClusterRecruitmentInterface.h
+++ /dev/null
@@ -1,263 +0,0 @@
-/*
- * ClusterRecruitmentInterface.h
- *
- * This source file is part of the FoundationDB open source project
- *
- * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -#ifndef FDBSERVER_CLUSTERRECRUITMENTINTERFACE_H -#define FDBSERVER_CLUSTERRECRUITMENTINTERFACE_H -#pragma once - -#include - -#include "fdbclient/ClusterInterface.h" -#include "fdbclient/StorageServerInterface.h" -#include "fdbclient/MasterProxyInterface.h" -#include "fdbclient/DatabaseConfiguration.h" -#include "fdbserver/BackupInterface.h" -#include "fdbserver/DataDistributorInterface.h" -#include "fdbserver/MasterInterface.h" -#include "fdbserver/RecoveryState.h" -#include "fdbserver/TLogInterface.h" -#include "fdbserver/WorkerInterface.actor.h" -#include "fdbserver/Knobs.h" - -// This interface and its serialization depend on slicing, since the client will deserialize only the first part of this structure -struct ClusterControllerFullInterface { - constexpr static FileIdentifier file_identifier = - ClusterControllerClientInterface::file_identifier; - ClusterInterface clientInterface; - RequestStream< struct RecruitFromConfigurationRequest > recruitFromConfiguration; - RequestStream< struct RecruitRemoteFromConfigurationRequest > recruitRemoteFromConfiguration; - RequestStream< struct RecruitStorageRequest > recruitStorage; - RequestStream< struct RegisterWorkerRequest > registerWorker; - RequestStream< struct GetWorkersRequest > getWorkers; - RequestStream< struct RegisterMasterRequest > registerMaster; - RequestStream< struct GetServerDBInfoRequest > getServerDBInfo; - - UID id() const { return clientInterface.id(); } - bool operator == (ClusterControllerFullInterface const& r) const { return id() == r.id(); } - bool operator != (ClusterControllerFullInterface const& r) const { return id() != r.id(); } - - bool hasMessage() { - return clientInterface.hasMessage() || - recruitFromConfiguration.getFuture().isReady() || - recruitRemoteFromConfiguration.getFuture().isReady() || - recruitStorage.getFuture().isReady() || - registerWorker.getFuture().isReady() || - getWorkers.getFuture().isReady() || - registerMaster.getFuture().isReady() || - getServerDBInfo.getFuture().isReady(); - } - - void initEndpoints() { - clientInterface.initEndpoints(); - recruitFromConfiguration.getEndpoint( TaskPriority::ClusterControllerRecruit ); - recruitRemoteFromConfiguration.getEndpoint( TaskPriority::ClusterControllerRecruit ); - recruitStorage.getEndpoint( TaskPriority::ClusterController ); - registerWorker.getEndpoint( TaskPriority::ClusterControllerWorker ); - getWorkers.getEndpoint( TaskPriority::ClusterController ); - registerMaster.getEndpoint( TaskPriority::ClusterControllerRegister ); - getServerDBInfo.getEndpoint( TaskPriority::ClusterController ); - } - - template - void serialize(Ar& ar) { - if constexpr (!is_fb_function) { - ASSERT(ar.protocolVersion().isValid()); - } - serializer(ar, clientInterface, recruitFromConfiguration, recruitRemoteFromConfiguration, recruitStorage, - registerWorker, getWorkers, registerMaster, getServerDBInfo); - } -}; - -struct RecruitFromConfigurationReply { - constexpr static FileIdentifier file_identifier = 2224085; - std::vector backupWorkers; - std::vector tLogs; - std::vector satelliteTLogs; - std::vector proxies; - std::vector resolvers; - std::vector storageServers; - std::vector oldLogRouters; - Optional dcId; - bool satelliteFallback; - - RecruitFromConfigurationReply() : satelliteFallback(false) {} - - template - void serialize(Ar& ar) { - serializer(ar, tLogs, satelliteTLogs, proxies, resolvers, storageServers, oldLogRouters, dcId, - satelliteFallback, backupWorkers); - } -}; - -struct RecruitFromConfigurationRequest { - constexpr static 
FileIdentifier file_identifier = 2023046; - DatabaseConfiguration configuration; - bool recruitSeedServers; - int maxOldLogRouters; - ReplyPromise< struct RecruitFromConfigurationReply > reply; - - RecruitFromConfigurationRequest() {} - explicit RecruitFromConfigurationRequest(DatabaseConfiguration const& configuration, bool recruitSeedServers, int maxOldLogRouters) - : configuration(configuration), recruitSeedServers(recruitSeedServers), maxOldLogRouters(maxOldLogRouters) {} - - template - void serialize( Ar& ar ) { - serializer(ar, configuration, recruitSeedServers, maxOldLogRouters, reply); - } -}; - -struct RecruitRemoteFromConfigurationReply { - constexpr static FileIdentifier file_identifier = 9091392; - std::vector remoteTLogs; - std::vector logRouters; - - template - void serialize( Ar& ar ) { - serializer(ar, remoteTLogs, logRouters); - } -}; - -struct RecruitRemoteFromConfigurationRequest { - constexpr static FileIdentifier file_identifier = 3235995; - DatabaseConfiguration configuration; - Optional dcId; - int logRouterCount; - std::vector exclusionWorkerIds; - ReplyPromise< struct RecruitRemoteFromConfigurationReply > reply; - - RecruitRemoteFromConfigurationRequest() {} - RecruitRemoteFromConfigurationRequest(DatabaseConfiguration const& configuration, Optional const& dcId, int logRouterCount, const std::vector &exclusionWorkerIds) : configuration(configuration), dcId(dcId), logRouterCount(logRouterCount), exclusionWorkerIds(exclusionWorkerIds){} - - template - void serialize( Ar& ar ) { - serializer(ar, configuration, dcId, logRouterCount, exclusionWorkerIds, reply); - } -}; - -struct RecruitStorageReply { - constexpr static FileIdentifier file_identifier = 15877089; - WorkerInterface worker; - ProcessClass processClass; - - template - void serialize( Ar& ar ) { - serializer(ar, worker, processClass); - } -}; - -struct RecruitStorageRequest { - constexpr static FileIdentifier file_identifier = 905920; - std::vector>> excludeMachines; //< Don't recruit any of these machines - std::vector excludeAddresses; //< Don't recruit any of these addresses - std::vector>> includeDCs; - bool criticalRecruitment; //< True if machine classes are to be ignored - ReplyPromise< RecruitStorageReply > reply; - - template - void serialize( Ar& ar ) { - serializer(ar, excludeMachines, excludeAddresses, includeDCs, criticalRecruitment, reply); - } -}; - -struct RegisterWorkerReply { - constexpr static FileIdentifier file_identifier = 16475696; - ProcessClass processClass; - ClusterControllerPriorityInfo priorityInfo; - Optional storageCache; - - RegisterWorkerReply() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown) {} - RegisterWorkerReply(ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Optional storageCache) : processClass(processClass), priorityInfo(priorityInfo), storageCache(storageCache) {} - - template - void serialize( Ar& ar ) { - serializer(ar, processClass, priorityInfo, storageCache); - } -}; - -struct RegisterWorkerRequest { - constexpr static FileIdentifier file_identifier = 14332605; - WorkerInterface wi; - ProcessClass initialClass; - ProcessClass processClass; - ClusterControllerPriorityInfo priorityInfo; - Generation generation; - Optional distributorInterf; - Optional ratekeeperInterf; - Optional> storageCacheInterf; - ReplyPromise reply; - bool degraded; - - RegisterWorkerRequest() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown), degraded(false) {} - 
RegisterWorkerRequest(WorkerInterface wi, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Generation generation, Optional ddInterf, Optional rkInterf, Optional> storageCacheInterf, bool degraded) : - wi(wi), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo), generation(generation), distributorInterf(ddInterf), ratekeeperInterf(rkInterf), storageCacheInterf(storageCacheInterf), degraded(degraded) {} - - template - void serialize( Ar& ar ) { - serializer(ar, wi, initialClass, processClass, priorityInfo, generation, distributorInterf, ratekeeperInterf, storageCacheInterf, reply, degraded); - } -}; - -struct GetWorkersRequest { - constexpr static FileIdentifier file_identifier = 1254174; - enum { TESTER_CLASS_ONLY = 0x1, NON_EXCLUDED_PROCESSES_ONLY = 0x2 }; - - int flags; - ReplyPromise> reply; - - GetWorkersRequest() : flags(0) {} - explicit GetWorkersRequest(int fl) : flags(fl) {} - - template - void serialize(Ar& ar) { - serializer(ar, flags, reply); - } -}; - -struct RegisterMasterRequest { - constexpr static FileIdentifier file_identifier = 10773445; - UID id; - LocalityData mi; - LogSystemConfig logSystemConfig; - std::vector proxies; - std::vector resolvers; - DBRecoveryCount recoveryCount; - int64_t registrationCount; - Optional configuration; - std::vector priorCommittedLogServers; - RecoveryState recoveryState; - bool recoveryStalled; - - ReplyPromise reply; - - RegisterMasterRequest() : logSystemConfig(0) {} - - template - void serialize(Ar& ar) { - if constexpr (!is_fb_function) { - ASSERT(ar.protocolVersion().isValid()); - } - serializer(ar, id, mi, logSystemConfig, proxies, resolvers, recoveryCount, registrationCount, configuration, - priorCommittedLogServers, recoveryState, recoveryStalled, reply); - } -}; - -#include "fdbserver/ServerDBInfo.h" // include order hack - -#endif diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index db22f4acdc..646be46282 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -216,7 +216,9 @@ ACTOR Future openDatabase(ClientData* db, int* clientCount, Referenceset(true); - db->clientStatusInfoMap[req.reply.getEndpoint().getPrimaryAddress()] = ClientStatusInfo(req.traceLogGroup, req.supportedVersions, req.issues); + if(req.supportedVersions.size() > 0) { + db->clientStatusInfoMap[req.reply.getEndpoint().getPrimaryAddress()] = ClientStatusInfo(req.traceLogGroup, req.supportedVersions, req.issues); + } while (db->clientInfo->get().read().id == req.knownClientInfoID && !db->clientInfo->get().read().forward.present()) { choose { @@ -225,7 +227,9 @@ ACTOR Future openDatabase(ClientData* db, int* clientCount, ReferenceclientStatusInfoMap.erase(req.reply.getEndpoint().getPrimaryAddress()); + if(req.supportedVersions.size() > 0) { + db->clientStatusInfoMap.erase(req.reply.getEndpoint().getPrimaryAddress()); + } req.reply.send( db->clientInfo->get() ); diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 569b8e1ceb..e79338cebd 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -23,6 +23,7 @@ #include "fdbclient/SystemData.h" #include "fdbclient/DatabaseContext.h" #include "fdbclient/ManagementAPI.actor.h" +#include "fdbclient/RunTransaction.actor.h" #include "fdbrpc/Replication.h" #include "fdbserver/DataDistribution.actor.h" #include "fdbserver/FDBExecHelper.actor.h" @@ -45,8 +46,10 @@ class TCMachineTeamInfo; ACTOR Future 
<Void> checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self);
ACTOR Future<Void> removeWrongStoreType(DDTeamCollection* self);
+
 struct TCServerInfo : public ReferenceCounted<TCServerInfo> {
 UID id;
+ DDTeamCollection* collection;
 StorageServerInterface lastKnownInterface; ProcessClass lastKnownClass; vector<Reference<TCTeamInfo>> teams;
@@ -63,13 +66,14 @@ struct TCServerInfo : public ReferenceCounted<TCServerInfo> { LocalityEntry localityEntry; Promise<Void> updated; AsyncVar<bool> wrongStoreTypeToRemove;
+ AsyncVar<bool> ssVersionTooFarBehind;
 // A storage server's StoreType does not change. // To change storeType for an ip:port, we destroy the old one and create a new one. KeyValueStoreType storeType; // Storage engine type
- TCServerInfo(StorageServerInterface ssi, ProcessClass processClass, bool inDesiredDC,
+ TCServerInfo(StorageServerInterface ssi, DDTeamCollection* collection, ProcessClass processClass, bool inDesiredDC,
 Reference<LocalitySet> storageServerSet)
- : id(ssi.id()), lastKnownInterface(ssi), lastKnownClass(processClass), dataInFlightToServer(0),
+ : id(ssi.id()), collection(collection), lastKnownInterface(ssi), lastKnownClass(processClass), dataInFlightToServer(0),
 onInterfaceChanged(interfaceChanged.getFuture()), onRemoved(removed.getFuture()), inDesiredDC(inDesiredDC), storeType(KeyValueStoreType::END) { localityEntry = ((LocalityMap<UID>*) storageServerSet.getPtr())->add(ssi.locality, &id);
@@ -80,6 +84,7 @@ struct TCServerInfo : public ReferenceCounted<TCServerInfo> { // If a storage server does not reply its storeType, it will be tracked by failure monitor and removed. return (storeType == configStoreType || storeType == KeyValueStoreType::END); }
+ ~TCServerInfo();
 }; struct TCMachineInfo : public ReferenceCounted<TCMachineInfo> {
@@ -109,51 +114,7 @@ struct TCMachineInfo : public ReferenceCounted<TCMachineInfo> { } };
-ACTOR Future<Void> updateServerMetrics( TCServerInfo *server ) {
- state StorageServerInterface ssi = server->lastKnownInterface;
- state Future<ErrorOr<GetStorageMetricsReply>> metricsRequest = ssi.getStorageMetrics.tryGetReply( GetStorageMetricsRequest(), TaskPriority::DataDistributionLaunch );
- state Future<Void> resetRequest = Never();
- state Future<std::pair<StorageServerInterface, ProcessClass>> interfaceChanged( server->onInterfaceChanged );
- state Future<Void> serverRemoved( server->onRemoved );
-
- loop {
- choose {
- when( ErrorOr<GetStorageMetricsReply> rep = wait( metricsRequest ) ) {
- if( rep.present() ) {
- server->serverMetrics = rep;
- if(server->updated.canBeSet()) {
- server->updated.send(Void());
- }
- return Void();
- }
- metricsRequest = Never();
- resetRequest = delay( SERVER_KNOBS->METRIC_DELAY, TaskPriority::DataDistributionLaunch );
- }
- when( std::pair<StorageServerInterface, ProcessClass> _ssi = wait( interfaceChanged ) ) {
- ssi = _ssi.first;
- interfaceChanged = server->onInterfaceChanged;
- resetRequest = Void();
- }
- when( wait( serverRemoved ) ) {
- return Void();
- }
- when( wait( resetRequest ) ) { //To prevent a tight spin loop
- if(IFailureMonitor::failureMonitor().getState(ssi.getStorageMetrics.getEndpoint()).isFailed()) {
- resetRequest = IFailureMonitor::failureMonitor().onStateEqual(ssi.getStorageMetrics.getEndpoint(), FailureStatus(false));
- }
- else {
- resetRequest = Never();
- metricsRequest = ssi.getStorageMetrics.tryGetReply( GetStorageMetricsRequest(), TaskPriority::DataDistributionLaunch );
- }
- }
- }
- }
-}
-
-ACTOR Future<Void> updateServerMetrics( Reference<TCServerInfo> server ) {
- wait( updateServerMetrics( server.getPtr() ) );
- return Void();
-}
+ACTOR Future<Void> updateServerMetrics( Reference<TCServerInfo> server);
 // TeamCollection's machine team information class TCMachineTeamInfo : public ReferenceCounted<TCMachineTeamInfo> {
@@ -596,6 +557,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> { int64_t unhealthyServers; std::map<int, int> priority_teams; std::map<UID, Reference<TCServerInfo>> server_info;
+ std::map<Key, int> lagging_zones; // zone to number of storage servers lagging
+ AsyncVar<bool> disableFailingLaggingServers;
 // machine_info has all machines info; key must be unique across processes on the same machine std::map<Standalone<StringRef>, Reference<TCMachineInfo>> machine_info;
@@ -721,6 +684,23 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> { teamBuilder.cancel(); }
+ void addLaggingStorageServer(Key zoneId) {
+ lagging_zones[zoneId]++;
+ if (lagging_zones.size() > std::max(1, configuration.storageTeamSize - 1) && !disableFailingLaggingServers.get())
+ disableFailingLaggingServers.set(true);
+ }
+
+ void removeLaggingStorageServer(Key zoneId) {
+ auto iter = lagging_zones.find(zoneId);
+ ASSERT(iter != lagging_zones.end());
+ iter->second--;
+ ASSERT(iter->second >= 0);
+ if (iter->second == 0)
+ lagging_zones.erase(iter);
+ if (lagging_zones.size() <= std::max(1, configuration.storageTeamSize - 1) && disableFailingLaggingServers.get())
+ disableFailingLaggingServers.set(false);
+ }
+
 ACTOR static Future<Void> logOnCompletion( Future<Void> signal, DDTeamCollection* self ) { wait(signal); wait(delay(SERVER_KNOBS->LOG_ON_COMPLETION_DELAY, TaskPriority::DataDistribution));
@@ -1040,7 +1020,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> { TraceEvent(SevWarnAlways, "MissingLocality") .detail("Server", i->first.uniqueID) .detail("Locality", i->first.locality.toString());
- auto addr = i->first.address();
+ auto addr = i->first.stableAddress();
 self->invalidLocalityAddr.insert(AddressExclusion(addr.ip, addr.port)); if (self->checkInvalidLocalities.isReady()) { self->checkInvalidLocalities = checkAndRemoveInvalidLocalityAddr(self);
@@ -2255,6 +2235,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> { .detail("DoBuildTeams", self->doBuildTeams) .trackLatest("TeamCollectionInfo"); }
+ } else {
+ self->lastBuildTeamsFailed = true;
 }
 self->evaluateTeamQuality();
@@ -2297,7 +2279,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> { allServers.push_back( newServer.id() ); TraceEvent("AddedStorageServer", distributorId).detail("ServerID", newServer.id()).detail("ProcessClass", processClass.toString()).detail("WaitFailureToken", newServer.waitFailure.getEndpoint().token).detail("Address", newServer.waitFailure.getEndpoint().getPrimaryAddress());
- auto &r = server_info[newServer.id()] = Reference<TCServerInfo>( new TCServerInfo( newServer, processClass, includedDCs.empty() || std::find(includedDCs.begin(), includedDCs.end(), newServer.locality.dcId()) != includedDCs.end(), storageServerSet ) );
+ auto &r = server_info[newServer.id()] = Reference<TCServerInfo>( new TCServerInfo( newServer, this, processClass, includedDCs.empty() || std::find(includedDCs.begin(), includedDCs.end(), newServer.locality.dcId()) != includedDCs.end(), storageServerSet ) );
 // Establish the relation between server and machine checkAndCreateMachine(r);
@@ -2586,6 +2568,80 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> { } };
+
+TCServerInfo::~TCServerInfo() {
+ if (ssVersionTooFarBehind.get()) {
+ collection->removeLaggingStorageServer(lastKnownInterface.locality.zoneId().get());
+ }
+}
+
+ACTOR Future<Void> updateServerMetrics( TCServerInfo *server ) {
+ state StorageServerInterface ssi = server->lastKnownInterface;
+ state Future<ErrorOr<GetStorageMetricsReply>> metricsRequest = ssi.getStorageMetrics.tryGetReply( GetStorageMetricsRequest(), TaskPriority::DataDistributionLaunch );
+ state Future<Void> resetRequest = Never();
+ state Future<std::pair<StorageServerInterface, ProcessClass>> interfaceChanged( server->onInterfaceChanged );
+ state Future<Void> serverRemoved( server->onRemoved );
+
+ loop {
+ choose {
+ when( ErrorOr<GetStorageMetricsReply> rep = wait( metricsRequest ) ) {
+ if(
rep.present() ) { + server->serverMetrics = rep; + if(server->updated.canBeSet()) { + server->updated.send(Void()); + } + break; + } + metricsRequest = Never(); + resetRequest = delay( SERVER_KNOBS->METRIC_DELAY, TaskPriority::DataDistributionLaunch ); + } + when( std::pair _ssi = wait( interfaceChanged ) ) { + ssi = _ssi.first; + interfaceChanged = server->onInterfaceChanged; + resetRequest = Void(); + } + when( wait( serverRemoved ) ) { + return Void(); + } + when( wait( resetRequest ) ) { //To prevent a tight spin loop + if(IFailureMonitor::failureMonitor().getState(ssi.getStorageMetrics.getEndpoint()).isFailed()) { + resetRequest = IFailureMonitor::failureMonitor().onStateEqual(ssi.getStorageMetrics.getEndpoint(), FailureStatus(false)); + } + else { + resetRequest = Never(); + metricsRequest = ssi.getStorageMetrics.tryGetReply( GetStorageMetricsRequest(), TaskPriority::DataDistributionLaunch ); + } + } + } + } + + if ( server->serverMetrics.get().lastUpdate < now() - SERVER_KNOBS->DD_SS_STUCK_TIME_LIMIT ) { + if (server->ssVersionTooFarBehind.get() == false) { + TraceEvent("StorageServerStuck", server->collection->distributorId).detail("ServerId", server->id.toString()).detail("LastUpdate", server->serverMetrics.get().lastUpdate); + server->ssVersionTooFarBehind.set(true); + server->collection->addLaggingStorageServer(server->lastKnownInterface.locality.zoneId().get()); + } + } else if ( server->serverMetrics.get().versionLag > SERVER_KNOBS->DD_SS_FAILURE_VERSIONLAG ) { + if (server->ssVersionTooFarBehind.get() == false) { + TraceEvent("SSVersionDiffLarge", server->collection->distributorId).detail("ServerId", server->id.toString()).detail("VersionLag", server->serverMetrics.get().versionLag); + server->ssVersionTooFarBehind.set(true); + server->collection->addLaggingStorageServer(server->lastKnownInterface.locality.zoneId().get()); + } + } else if ( server->serverMetrics.get().versionLag < SERVER_KNOBS->DD_SS_ALLOWED_VERSIONLAG ) { + if (server->ssVersionTooFarBehind.get() == true) { + TraceEvent("SSVersionDiffNormal", server->collection->distributorId).detail("ServerId", server->id.toString()).detail("VersionLag", server->serverMetrics.get().versionLag); + server->ssVersionTooFarBehind.set(false); + server->collection->removeLaggingStorageServer(server->lastKnownInterface.locality.zoneId().get()); + } + } + return Void(); +} + +ACTOR Future updateServerMetrics( Reference server) { + wait( updateServerMetrics( server.getPtr() ) ); + return Void(); +} + ACTOR Future waitUntilHealthy(DDTeamCollection* self, double extraDelay = 0) { state int waitCount = 0; loop { @@ -2858,6 +2914,14 @@ bool teamContainsFailedServer(DDTeamCollection* self, Reference team self->excludedServers.get(ipaddr) == DDTeamCollection::Status::FAILED) { return true; } + if(ssi.secondaryAddress().present()) { + AddressExclusion saddr(ssi.secondaryAddress().get().ip, ssi.secondaryAddress().get().port); + AddressExclusion sipaddr(ssi.secondaryAddress().get().ip); + if (self->excludedServers.get(saddr) == DDTeamCollection::Status::FAILED || + self->excludedServers.get(sipaddr) == DDTeamCollection::Status::FAILED) { + return true; + } + } } return false; } @@ -3332,7 +3396,7 @@ ACTOR Future waitHealthyZoneChange( DDTeamCollection* self ) { } } -ACTOR Future serverMetricsPolling( TCServerInfo *server) { +ACTOR Future serverMetricsPolling( TCServerInfo *server ) { state double lastUpdate = now(); loop { wait( updateServerMetrics( server ) ); @@ -3479,6 +3543,7 @@ ACTOR Future storageServerTracker( state ServerStatus 
status( false, false, server->lastKnownInterface.locality ); state bool lastIsUnhealthy = false; state Future metricsTracker = serverMetricsPolling( server ); + state Future> interfaceChanged = server->onInterfaceChanged; state Future storeTypeTracker = keyValueStoreTypeTracker(self, server); @@ -3489,7 +3554,7 @@ ACTOR Future storageServerTracker( try { loop { - status.isUndesired = false; + status.isUndesired = !self->disableFailingLaggingServers.get() && server->ssVersionTooFarBehind.get(); status.isWrongConfiguration = false; hasWrongDC = !isCorrectDC(self, server); hasInvalidLocality = @@ -3569,29 +3634,41 @@ ACTOR Future storageServerTracker( // If the storage server is in the excluded servers list, it is undesired NetworkAddress a = server->lastKnownInterface.address(); - state AddressExclusion addr( a.ip, a.port ); - state AddressExclusion ipaddr( a.ip ); - state DDTeamCollection::Status addrStatus = self->excludedServers.get(addr); - state DDTeamCollection::Status ipaddrStatus = self->excludedServers.get(ipaddr); - if (addrStatus != DDTeamCollection::Status::NONE || ipaddrStatus != DDTeamCollection::Status::NONE) { + AddressExclusion worstAddr( a.ip, a.port ); + DDTeamCollection::Status worstStatus = self->excludedServers.get( worstAddr ); + otherChanges.push_back( self->excludedServers.onChange( worstAddr ) ); + + for(int i = 0; i < 3; i++) { + if(i > 0 && !server->lastKnownInterface.secondaryAddress().present()) { + break; + } + AddressExclusion testAddr; + if(i == 0) testAddr = AddressExclusion(a.ip); + else if(i == 1) testAddr = AddressExclusion(server->lastKnownInterface.secondaryAddress().get().ip, server->lastKnownInterface.secondaryAddress().get().port); + else if(i == 2) testAddr = AddressExclusion(server->lastKnownInterface.secondaryAddress().get().ip); + DDTeamCollection::Status testStatus = self->excludedServers.get(testAddr); + if(testStatus > worstStatus) { + worstStatus = testStatus; + worstAddr = testAddr; + } + otherChanges.push_back( self->excludedServers.onChange( testAddr ) ); + } + + if (worstStatus != DDTeamCollection::Status::NONE) { TraceEvent(SevWarn, "UndesiredStorageServer", self->distributorId) - .detail("Server", server->id) - .detail("Excluded", - ipaddrStatus == DDTeamCollection::Status::NONE ? 
addr.toString() : ipaddr.toString()); + .detail("Server", server->id) + .detail("Excluded", worstAddr.toString()); status.isUndesired = true; status.isWrongConfiguration = true; - if (addrStatus == DDTeamCollection::Status::FAILED || - ipaddrStatus == DDTeamCollection::Status::FAILED) { + if (worstStatus == DDTeamCollection::Status::FAILED) { TraceEvent(SevWarn, "FailedServerRemoveKeys", self->distributorId) - .detail("Address", addr.toString()) - .detail("ServerID", server->id); + .detail("Server", server->id) + .detail("Excluded", worstAddr.toString()); wait(removeKeysFromFailedServer(cx, server->id, self->lock)); if (BUGGIFY) wait(delay(5.0)); self->shardsAffectedByTeamFailure->eraseServer(server->id); } } - otherChanges.push_back( self->excludedServers.onChange( addr ) ); - otherChanges.push_back( self->excludedServers.onChange( ipaddr ) ); failureTracker = storageServerFailureTracker(self, server, cx, &status, addedVersion); //We need to recruit new storage servers if the key value store type has changed @@ -3599,6 +3676,7 @@ ACTOR Future storageServerTracker( self->restartRecruiting.trigger(); } + if (lastIsUnhealthy && !status.isUnhealthy() && ( server->teams.size() < targetTeamNumPerServer || self->lastBuildTeamsFailed)) { self->doBuildTeams = true; @@ -3753,6 +3831,8 @@ ACTOR Future storageServerTracker( server->wakeUpTracker = Promise(); } when(wait(storeTypeTracker)) {} + when(wait(server->ssVersionTooFarBehind.onChange())) { } + when(wait(self->disableFailingLaggingServers.onChange())) { } } if (recordTeamCollectionInfo) { @@ -3861,7 +3941,7 @@ ACTOR Future checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self) { int numExistingSSOnAddr(DDTeamCollection* self, const AddressExclusion& addr) { int numExistingSS = 0; for (auto& server : self->server_info) { - const NetworkAddress& netAddr = server.second->lastKnownInterface.address(); + const NetworkAddress& netAddr = server.second->lastKnownInterface.stableAddress(); AddressExclusion usedAddr(netAddr.ip, netAddr.port); if (usedAddr == addr) { ++numExistingSS; @@ -3875,10 +3955,10 @@ ACTOR Future initializeStorage(DDTeamCollection* self, RecruitStorageReply // SOMEDAY: Cluster controller waits for availability, retry quickly if a server's Locality changes self->recruitingStream.set(self->recruitingStream.get() + 1); - const NetworkAddress& netAddr = candidateWorker.worker.address(); + const NetworkAddress& netAddr = candidateWorker.worker.stableAddress(); AddressExclusion workerAddr(netAddr.ip, netAddr.port); if (numExistingSSOnAddr(self, workerAddr) <= 2 && - self->recruitingLocalities.find(candidateWorker.worker.address()) == self->recruitingLocalities.end()) { + self->recruitingLocalities.find(candidateWorker.worker.stableAddress()) == self->recruitingLocalities.end()) { // Only allow at most 2 storage servers on an address, because // too many storage server on the same address (i.e., process) can cause OOM. 
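// Editor's note: an illustrative restatement of the recruitment gate above, as a
// hypothetical helper (not in the patch). The count is keyed on stableAddress() so a
// process reachable over both a TLS and a non-TLS address is only counted once:
//
//   bool canRecruitSSOn(DDTeamCollection* self, const RecruitStorageReply& candidate) {
//       const NetworkAddress& na = candidate.worker.stableAddress();
//       return numExistingSSOnAddr(self, AddressExclusion(na.ip, na.port)) <= 2 &&
//              !self->recruitingLocalities.count(candidate.worker.stableAddress());
//   }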
// Ask the candidateWorker to initialize a SS only if the worker does not have a pending request @@ -3899,7 +3979,7 @@ ACTOR Future initializeStorage(DDTeamCollection* self, RecruitStorageReply .detail("RecruitingStream", self->recruitingStream.get()); self->recruitingIds.insert(interfaceId); - self->recruitingLocalities.insert(candidateWorker.worker.address()); + self->recruitingLocalities.insert(candidateWorker.worker.stableAddress()); state ErrorOr newServer = wait(candidateWorker.worker.storage.tryGetReply(isr, TaskPriority::DataDistribution)); if (newServer.isError()) { @@ -3910,7 +3990,7 @@ ACTOR Future initializeStorage(DDTeamCollection* self, RecruitStorageReply wait(delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY, TaskPriority::DataDistribution)); } self->recruitingIds.erase(interfaceId); - self->recruitingLocalities.erase(candidateWorker.worker.address()); + self->recruitingLocalities.erase(candidateWorker.worker.stableAddress()); TraceEvent("DDRecruiting") .detail("Primary", self->primary) @@ -3956,7 +4036,7 @@ ACTOR Future storageRecruiter( DDTeamCollection* self, Referenceprimary) .detail("Excluding", s->second->lastKnownInterface.address()); - auto addr = s->second->lastKnownInterface.address(); + auto addr = s->second->lastKnownInterface.stableAddress(); AddressExclusion addrExcl(addr.ip, addr.port); exclusions.insert(addrExcl); numSSPerAddr[addrExcl]++; // increase from 0 @@ -4007,8 +4087,8 @@ ACTOR Future storageRecruiter( DDTeamCollection* self, Reference= 2) { TraceEvent(SevWarnAlways, "StorageRecruiterTooManySSOnSameAddr", self->distributorId) @@ -4742,7 +4822,7 @@ ACTOR Future ddExclusionSafetyCheck(DistributorExclusionSafetyCheckRequest // Go through storage server interfaces and translate Address -> server ID (UID) for (const AddressExclusion& excl : req.exclusions) { for (const auto& ssi : ssis) { - if (excl.excludes(ssi.address())) { + if (excl.excludes(ssi.address()) || (ssi.secondaryAddress().present() && excl.excludes(ssi.secondaryAddress().get()))) { excludeServerIDs.push_back(ssi.id()); } } @@ -4844,7 +4924,7 @@ DDTeamCollection* testTeamCollection(int teamSize, Reference interface.locality.set(LiteralStringRef("machineid"), Standalone(std::to_string(id))); interface.locality.set(LiteralStringRef("zoneid"), Standalone(std::to_string(id % 5))); interface.locality.set(LiteralStringRef("data_hall"), Standalone(std::to_string(id % 3))); - collection->server_info[uid] = Reference(new TCServerInfo(interface, ProcessClass(), true, collection->storageServerSet)); + collection->server_info[uid] = Reference(new TCServerInfo(interface, collection, ProcessClass(), true, collection->storageServerSet)); collection->server_status.set(uid, ServerStatus(false, false, interface.locality)); collection->checkAndCreateMachine(collection->server_info[uid]); } @@ -4885,7 +4965,7 @@ DDTeamCollection* testMachineTeamCollection(int teamSize, Reference(std::to_string(data_hall_id))); interface.locality.set(LiteralStringRef("dcid"), Standalone(std::to_string(dc_id))); collection->server_info[uid] = - Reference(new TCServerInfo(interface, ProcessClass(), true, collection->storageServerSet)); + Reference(new TCServerInfo(interface, collection, ProcessClass(), true, collection->storageServerSet)); collection->server_status.set(uid, ServerStatus(false, false, interface.locality)); } diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index dc518f7d2f..f07a15dbfd 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -25,7 
+25,6 @@ #define FDBSERVER_DATA_DISTRIBUTION_ACTOR_H #include "fdbclient/NativeAPI.actor.h" -#include "fdbserver/ClusterRecruitmentInterface.h" #include "fdbserver/MoveKeys.actor.h" #include "fdbserver/LogSystem.h" #include "flow/actorcompiler.h" // This must be the last #include. diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index c07f5c7e78..6e821cb2b4 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -27,14 +27,15 @@ #include typedef uint64_t Word; +// Get the number of prefix bytes that are the same between a and b, up to their common length of cl static inline int commonPrefixLength(uint8_t const* ap, uint8_t const* bp, int cl) { int i = 0; const int wordEnd = cl - sizeof(Word) + 1; - for(; i < wordEnd; i += sizeof(Word)) { - Word a = *(Word *)ap; - Word b = *(Word *)bp; - if(a != b) { + for (; i < wordEnd; i += sizeof(Word)) { + Word a = *(Word*)ap; + Word b = *(Word*)bp; + if (a != b) { return i + ctzll(a ^ b) / 8; } ap += sizeof(Word); @@ -58,31 +59,32 @@ static int commonPrefixLength(StringRef a, StringRef b) { // This appears to be the fastest version static int lessOrEqualPowerOfTwo(int n) { int p; - for (p = 1; p+p <= n; p+=p); + for (p = 1; p + p <= n; p += p) + ; return p; } /* static int _lessOrEqualPowerOfTwo(uint32_t n) { - if(n == 0) - return n; - int trailing = __builtin_ctz(n); - int leading = __builtin_clz(n); - if(trailing + leading == ((sizeof(n) * 8) - 1)) - return n; - return 1 << ( (sizeof(n) * 8) - leading - 1); + if(n == 0) + return n; + int trailing = __builtin_ctz(n); + int leading = __builtin_clz(n); + if(trailing + leading == ((sizeof(n) * 8) - 1)) + return n; + return 1 << ( (sizeof(n) * 8) - leading - 1); } static int __lessOrEqualPowerOfTwo(unsigned int n) { - int p = 1; - for(; p <= n; p <<= 1); - return p >> 1; + int p = 1; + for(; p <= n; p <<= 1); + return p >> 1; } */ static int perfectSubtreeSplitPoint(int subtree_size) { // return the inorder index of the root node in a subtree of the given size - // consistent with the resulting binary search tree being "perfect" (having minimal height + // consistent with the resulting binary search tree being "perfect" (having minimal height // and all missing nodes as far right as possible). // There has to be a simpler way to do this. 
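// Editor's worked example: for subtree_size = 10, (10 - 1) / 2 + 1 = 5 and
// lessOrEqualPowerOfTwo(5) = 4, so s = 3, the size of the largest perfect
// subtree (2^k - 1 nodes) that fits in the smaller half. The 10-node tree
// then ends up rooted at inorder index 6: a perfect 3-node subtree on the
// right, six nodes on the left, and all bottom-level gaps at the far right.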
int s = lessOrEqualPowerOfTwo((subtree_size - 1) / 2 + 1) - 1;
@@ -90,16 +92,14 @@ static int perfectSubtreeSplitPointCached(int subtree_size) {
- static uint16_t *points = nullptr;
+ static uint16_t* points = nullptr;
 static const int max = 500;
- if(points == nullptr) {
+ if (points == nullptr) {
 points = new uint16_t[max];
- for(int i = 0; i < max; ++i)
- points[i] = perfectSubtreeSplitPoint(i);
+ for (int i = 0; i < max; ++i) points[i] = perfectSubtreeSplitPoint(i);
 }
- if(subtree_size < max)
- return points[subtree_size];
+ if (subtree_size < max) return points[subtree_size];
 return perfectSubtreeSplitPoint(subtree_size); }
@@ -128,7 +128,7 @@ static int perfectSubtreeSplitPointCached(int subtree_size) { // int getCommonPrefixLen(const T &base, int skip) const; // // // Returns the size of the delta object needed to make *this from base
-// // TODO: Explain contract required for deltaSize to be used to predict final
+// // TODO: Explain contract required for deltaSize to be used to predict final
 // // balanced tree size incrementally while adding sorted items to a build set // int deltaSize(const T &base) const; //
@@ -146,169 +146,195 @@ static int perfectSubtreeSplitPointCached(int subtree_size) { // // Retrieves the previously stored boolean // bool getPrefixSource() const; //
-#pragma pack(push,1)
-template <typename T, typename DeltaT = typename T::Delta, typename OffsetT = uint16_t>
+#pragma pack(push, 1)
+template <typename T, typename DeltaT = typename T::Delta>
 struct DeltaTree {
-
- static int MaximumTreeSize() {
- return std::numeric_limits<OffsetT>::max();
- };
-
 struct Node {
- OffsetT leftChildOffset;
- OffsetT rightChildOffset;
-
- inline DeltaT & delta() {
- return *(DeltaT *)(this + 1);
+ union {
+ struct {
+ uint32_t left;
+ uint32_t right;
+ } largeOffsets;
+ struct {
+ uint16_t left;
+ uint16_t right;
+ } smallOffsets;
 };
- inline const DeltaT & delta() const {
- return *(const DeltaT *)(this + 1);
+ static int headerSize(bool large) { return large ? sizeof(largeOffsets) : sizeof(smallOffsets); }
+
+ inline DeltaT& delta(bool large) {
+ return large ? *(DeltaT*)(&largeOffsets + 1) : *(DeltaT*)(&smallOffsets + 1);
 };
- Node * rightChild() const {
- //printf("Node(%p): leftOffset=%d rightOffset=%d deltaSize=%d\n", this, (int)leftChildOffset, (int)rightChildOffset, (int)delta().size());
- return rightChildOffset == 0 ? nullptr : (Node *)((uint8_t *)this + rightChildOffset);
+ inline const DeltaT& delta(bool large) const {
+ return large ? *(const DeltaT*)(&largeOffsets + 1) : *(const DeltaT*)(&smallOffsets + 1);
+ };
+
+ Node* resolvePointer(int offset) const { return offset == 0 ? nullptr : (Node*)((uint8_t*)this + offset); }
+
+ Node* rightChild(bool large) const { return resolvePointer(large ? largeOffsets.right : smallOffsets.right); }
+
+ Node* leftChild(bool large) const { return resolvePointer(large ? largeOffsets.left : smallOffsets.left); }
+
+ void setRightChildOffset(bool large, int offset) {
+ if (large) {
+ largeOffsets.right = offset;
+ } else {
+ smallOffsets.right = offset;
+ }
 }
- Node * leftChild() const {
- //printf("Node(%p): leftOffset=%d rightOffset=%d deltaSize=%d\n", this, (int)leftChildOffset, (int)rightChildOffset, (int)delta().size());
- return leftChildOffset == 0 ? nullptr : (Node *)((uint8_t *)this + leftChildOffset);
+ void setLeftChildOffset(bool large, int offset) {
+ if (large) {
+ largeOffsets.left = offset;
+ } else {
+ smallOffsets.left = offset;
+ }
 }
- int size() const {
- return sizeof(Node) + delta().size();
+ int size(bool large) const {
+ return delta(large).size() + (large ? sizeof(largeOffsets) : sizeof(smallOffsets));
 }
 };
+ static constexpr int SmallSizeLimit = std::numeric_limits<uint16_t>::max();
+ static constexpr int LargeTreePerNodeExtraOverhead = sizeof(Node::largeOffsets) - sizeof(Node::smallOffsets);
+
 struct {
- OffsetT numItems; // Number of items in the tree.
- OffsetT nodeBytesUsed; // Bytes in use by tree, exluding overhead
- OffsetT nodeBytesFree; // Bytes left at end of tree to expand into
- OffsetT nodeBytesDeleted; // Delta bytes deleted from tree. Note that some of these bytes could be borrowed by descendents.
- uint8_t initialHeight; // Height of tree as originally built
- uint8_t maxHeight; // Maximum height of tree after any insertion. Value of 0 means no insertions done.
+ uint16_t numItems; // Number of items in the tree.
+ uint32_t nodeBytesUsed; // Bytes used by nodes (everything after the tree header)
+ uint32_t nodeBytesFree; // Bytes left at end of tree to expand into
+ uint32_t nodeBytesDeleted; // Delta bytes deleted from tree. Note that some of these bytes could be borrowed by
+ // descendents.
+ uint8_t initialHeight; // Height of tree as originally built
+ uint8_t maxHeight; // Maximum height of tree after any insertion. Value of 0 means no insertions done.
+ bool largeNodes; // Node size, can be calculated as capacity > SmallSizeLimit but it will be used a lot
 };
 #pragma pack(pop)
- inline Node & root() {
- return *(Node *)(this + 1);
- }
+ inline Node& root() { return *(Node*)(this + 1); }
- inline const Node & root() const {
- return *(const Node *)(this + 1);
- }
+ inline const Node& root() const { return *(const Node*)(this + 1); }
- int size() const {
- return sizeof(DeltaTree) + nodeBytesUsed;
- }
+ int size() const { return sizeof(DeltaTree) + nodeBytesUsed; }
- inline Node & newNode() {
- return *(Node *)((uint8_t *)this + size());
- }
+ int capacity() const { return size() + nodeBytesFree; }
+
+ inline Node& newNode() { return *(Node*)((uint8_t*)this + size()); }
 public:
 // Get count of total overhead bytes (everything but the user-formatted Delta) for a tree given size n
- static inline int GetTreeOverhead(int n = 0) {
- return sizeof(DeltaTree) + (n * sizeof(Node));
- }
+ static int emptyTreeSize() { return sizeof(DeltaTree); }
 struct DecodedNode {
 DecodedNode() {}
 // construct root node
- DecodedNode(Node *raw, const T *prev, const T *next, Arena &arena)
- : raw(raw), parent(nullptr), otherAncestor(nullptr), leftChild(nullptr), rightChild(nullptr), prev(prev), next(next),
- item(raw->delta().apply(raw->delta().getPrefixSource() ? *prev : *next, arena))
- {
- //printf("DecodedNode1 raw=%p delta=%s\n", raw, raw->delta().toString().c_str());
+ DecodedNode(Node* raw, const T* prev, const T* next, Arena& arena, bool large)
+ : raw(raw), parent(nullptr), otherAncestor(nullptr), leftChild(nullptr), rightChild(nullptr), prev(prev),
+ next(next), item(raw->delta(large).apply(raw->delta(large).getPrefixSource() ? *prev : *next, arena)),
+ large(large) {
+ // printf("DecodedNode1 raw=%p delta=%s\n", raw, raw->delta(large).toString().c_str());
 }
-
 // Construct non-root node
- // wentLeft indicates that we've gone left to get to the raw node.
- DecodedNode(Node *raw, DecodedNode *parent, bool wentLeft, Arena &arena)
- : parent(parent), otherAncestor(wentLeft ? parent->getPrevAncestor() : parent->getNextAncestor()),
- prev(wentLeft ? parent->prev : &parent->item),
- next(wentLeft ? &parent->item : parent->next),
- leftChild(nullptr), rightChild(nullptr),
- raw(raw), item(raw->delta().apply(raw->delta().getPrefixSource() ?
*prev : *next, arena)) - { - //printf("DecodedNode2 raw=%p delta=%s\n", raw, raw->delta().toString().c_str()); + // wentLeft indicates that we've gone left to get to the raw node. + DecodedNode(Node* raw, DecodedNode* parent, bool wentLeft, Arena& arena) + : parent(parent), large(parent->large), + otherAncestor(wentLeft ? parent->getPrevAncestor() : parent->getNextAncestor()), + prev(wentLeft ? parent->prev : &parent->item), next(wentLeft ? &parent->item : parent->next), + leftChild(nullptr), rightChild(nullptr), raw(raw), + item(raw->delta(large).apply(raw->delta(large).getPrefixSource() ? *prev : *next, arena)) { + // printf("DecodedNode2 raw=%p delta=%s\n", raw, raw->delta(large).toString().c_str()); } // Returns true if otherAncestor is the previous ("greatest lesser") ancestor - bool otherAncestorPrev() const { - return parent && parent->leftChild == this; - } + bool otherAncestorPrev() const { return parent && parent->leftChild == this; } // Returns true if otherAncestor is the next ("least greator") ancestor - bool otherAncestorNext() const { - return parent && parent->rightChild == this; - } + bool otherAncestorNext() const { return parent && parent->rightChild == this; } - DecodedNode * getPrevAncestor() const { - return otherAncestorPrev() ? otherAncestor : parent; - } + DecodedNode* getPrevAncestor() const { return otherAncestorPrev() ? otherAncestor : parent; } - DecodedNode * getNextAncestor() const { - return otherAncestorNext() ? otherAncestor : parent; - } + DecodedNode* getNextAncestor() const { return otherAncestorNext() ? otherAncestor : parent; } - DecodedNode * jumpNext(DecodedNode *root) const { - if(otherAncestorNext()) { - return (otherAncestor != nullptr) ? otherAncestor : rightChild; + DecodedNode* jumpUpNext(DecodedNode* root, bool& othersChild) const { + if (parent != nullptr) { + if (parent->rightChild == this) { + return otherAncestor; + } + if (otherAncestor != nullptr) { + othersChild = true; + return otherAncestor->rightChild; + } } - else { - if(this == root) { + return parent; + } + + DecodedNode* jumpUpPrev(DecodedNode* root, bool& othersChild) const { + if (parent != nullptr) { + if (parent->leftChild == this) { + return otherAncestor; + } + if (otherAncestor != nullptr) { + othersChild = true; + return otherAncestor->leftChild; + } + } + return parent; + } + + DecodedNode* jumpNext(DecodedNode* root) const { + if (otherAncestorNext()) { + return (otherAncestor != nullptr) ? otherAncestor : rightChild; + } else { + if (this == root) { return rightChild; } return (otherAncestor != nullptr) ? otherAncestor->rightChild : root; } } - DecodedNode * jumpPrev(DecodedNode *root) const { - if(otherAncestorPrev()) { + DecodedNode* jumpPrev(DecodedNode* root) const { + if (otherAncestorPrev()) { return (otherAncestor != nullptr) ? otherAncestor : leftChild; - } - else { - if(this == root) { + } else { + if (this == root) { return leftChild; } return (otherAncestor != nullptr) ? 
otherAncestor->leftChild : root; } } - void setDeleted(bool deleted) { - raw->delta().setDeleted(deleted); - } + void setDeleted(bool deleted) { raw->delta(large).setDeleted(deleted); } - bool isDeleted() const { - return raw->delta().getDeleted(); - } + bool isDeleted() const { return raw->delta(large).getDeleted(); } - Node *raw; - DecodedNode *parent; - DecodedNode *otherAncestor; - DecodedNode *leftChild; - DecodedNode *rightChild; - const T *prev; // greatest ancestor to the left, or tree lower bound - const T *next; // least ancestor to the right, or tree upper bound + bool large; // Node size + Node* raw; + DecodedNode* parent; + DecodedNode* otherAncestor; + DecodedNode* leftChild; + DecodedNode* rightChild; + const T* prev; // greatest ancestor to the left, or tree lower bound + const T* next; // least ancestor to the right, or tree upper bound T item; - DecodedNode *getRightChild(Arena &arena) { - if(rightChild == nullptr) { - Node *n = raw->rightChild(); - if(n != nullptr) { + DecodedNode* getRightChild(Arena& arena) { + if (rightChild == nullptr) { + Node* n = raw->rightChild(large); + if (n != nullptr) { rightChild = new (arena) DecodedNode(n, this, false, arena); } } return rightChild; } - DecodedNode *getLeftChild(Arena &arena) { - if(leftChild == nullptr) { - Node *n = raw->leftChild(); - if(n != nullptr) { + DecodedNode* getLeftChild(Arena& arena) { + if (leftChild == nullptr) { + Node* n = raw->leftChild(large); + if (n != nullptr) { leftChild = new (arena) DecodedNode(n, this, true, arena); } } @@ -325,75 +351,69 @@ public: struct Mirror : FastAllocated { friend class Cursor; - Mirror(const void *treePtr = nullptr, const T *lowerBound = nullptr, const T *upperBound = nullptr) - : tree((DeltaTree *)treePtr), lower(lowerBound), upper(upperBound) - { - // TODO: Remove these copies into arena and require users of Mirror to keep prev and next alive during its lifetime - lower = new(arena) T(arena, *lower); - upper = new(arena) T(arena, *upper); + Mirror(const void* treePtr = nullptr, const T* lowerBound = nullptr, const T* upperBound = nullptr) + : tree((DeltaTree*)treePtr), lower(lowerBound), upper(upperBound) { + // TODO: Remove these copies into arena and require users of Mirror to keep prev and next alive during its + // lifetime + lower = new (arena) T(arena, *lower); + upper = new (arena) T(arena, *upper); - root = (tree->nodeBytesUsed == 0) ? nullptr : new (arena) DecodedNode(&tree->root(), lower, upper, arena); + root = (tree->nodeBytesUsed == 0) ? nullptr + : new (arena) + DecodedNode(&tree->root(), lower, upper, arena, tree->largeNodes); } - const T *lowerBound() const { - return lower; - } + const T* lowerBound() const { return lower; } - const T *upperBound() const { - return upper; - } + const T* upperBound() const { return upper; } -private: + private: Arena arena; - DeltaTree *tree; - DecodedNode *root; - const T *lower; - const T *upper; -public: + DeltaTree* tree; + DecodedNode* root; + const T* lower; + const T* upper; - Cursor getCursor() { - return Cursor(this); - } + public: + Cursor getCursor() { return Cursor(this); } // Try to insert k into the DeltaTree, updating byte counts and initialHeight if they // have changed (they won't if k already exists in the tree but was deleted). // Returns true if successful, false if k does not fit in the space available // or if k is already in the tree (and was not already deleted). 
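//
// Editor's note: an illustrative usage sketch, not part of this patch. It assumes a hypothetical
// type Item implementing the Delta contract described earlier in this file, a buffer with its
// size, and hypothetical helpers makeItem()/process(); the build/insert/Cursor calls mirror the
// interfaces declared here.
//
//   auto* tree = (DeltaTree<Item>*)buffer;
//   tree->build(bufferSize, sortedBegin, sortedEnd, &lowerBound, &upperBound);
//   typename DeltaTree<Item>::Mirror mirror(buffer, &lowerBound, &upperBound);
//   bool fit = mirror.insert(makeItem("x")); // false if out of space or a live duplicate
//   auto c = mirror.getCursor();
//   for (bool ok = c.moveFirst(); ok; ok = c.moveNext()) {
//       process(c.get()); // in-order visit; erased items are skipped
//   }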
- bool insert(const T &k, int skipLen = 0, int maxHeightAllowed = std::numeric_limits::max()) { + bool insert(const T& k, int skipLen = 0, int maxHeightAllowed = std::numeric_limits::max()) { int height = 1; - DecodedNode *n = root; + DecodedNode* n = root; bool addLeftChild = false; - while(n != nullptr) { + while (n != nullptr) { int cmp = k.compare(n->item, skipLen); - if(cmp >= 0) { + if (cmp >= 0) { // If we found an item identical to k then if it is deleted, undeleted it, // otherwise fail - if(cmp == 0) { - auto &d = n->raw->delta(); - if(d.getDeleted()) { + if (cmp == 0) { + auto& d = n->raw->delta(tree->largeNodes); + if (d.getDeleted()) { d.setDeleted(false); ++tree->numItems; return true; - } - else { + } else { return false; } } - DecodedNode *right = n->getRightChild(arena); + DecodedNode* right = n->getRightChild(arena); - if(right == nullptr) { + if (right == nullptr) { break; } n = right; - } - else { - DecodedNode *left = n->getLeftChild(arena); + } else { + DecodedNode* left = n->getLeftChild(arena); - if(left == nullptr) { + if (left == nullptr) { addLeftChild = true; break; } @@ -403,14 +423,14 @@ public: ++height; } - if(height > maxHeightAllowed) { + if (height > maxHeightAllowed) { return false; } // Insert k as the left or right child of n, depending on the value of addLeftChild // First, see if it will fit. - const T *prev = addLeftChild ? n->prev : &n->item; - const T *next = addLeftChild ? &n->item : n->next; + const T* prev = addLeftChild ? n->prev : &n->item; + const T* next = addLeftChild ? &n->item : n->next; int common = prev->getCommonPrefixLen(*next, skipLen); int commonWithPrev = k.getCommonPrefixLen(*prev, common); @@ -418,31 +438,31 @@ public: bool basePrev = commonWithPrev >= commonWithNext; int commonPrefix = basePrev ? commonWithPrev : commonWithNext; - const T *base = basePrev ? prev : next; + const T* base = basePrev ? 
prev : next; - int deltaSize = k.deltaSize(*base, false, commonPrefix); - int nodeSpace = deltaSize + sizeof(Node); - if(nodeSpace > tree->nodeBytesFree) { + int deltaSize = k.deltaSize(*base, commonPrefix, false); + int nodeSpace = deltaSize + Node::headerSize(tree->largeNodes); + if (nodeSpace > tree->nodeBytesFree) { return false; } - DecodedNode *newNode = new (arena) DecodedNode(); - Node *raw = &tree->newNode(); - raw->leftChildOffset = 0; - raw->rightChildOffset = 0; - int newOffset = (uint8_t *)raw - (uint8_t *)n->raw; - //printf("Inserting %s at offset %d\n", k.toString().c_str(), newOffset); + DecodedNode* newNode = new (arena) DecodedNode(); + Node* raw = &tree->newNode(); + raw->setLeftChildOffset(tree->largeNodes, 0); + raw->setRightChildOffset(tree->largeNodes, 0); + int newOffset = (uint8_t*)raw - (uint8_t*)n->raw; + // printf("Inserting %s at offset %d\n", k.toString().c_str(), newOffset); - if(addLeftChild) { + if (addLeftChild) { n->leftChild = newNode; - n->raw->leftChildOffset = newOffset; - } - else { + n->raw->setLeftChildOffset(tree->largeNodes, newOffset); + } else { n->rightChild = newNode; - n->raw->rightChildOffset = newOffset; + n->raw->setRightChildOffset(tree->largeNodes, newOffset); } newNode->parent = n; + newNode->large = tree->largeNodes; newNode->leftChild = nullptr; newNode->rightChild = nullptr; newNode->raw = raw; @@ -450,18 +470,19 @@ public: newNode->prev = prev; newNode->next = next; - ASSERT(deltaSize == k.writeDelta(raw->delta(), *base, commonPrefix)); - raw->delta().setPrefixSource(basePrev); + ASSERT(deltaSize == k.writeDelta(raw->delta(tree->largeNodes), *base, commonPrefix)); + raw->delta(tree->largeNodes).setPrefixSource(basePrev); - // Initialize node's item from the delta (instead of copying into arena) to avoid unnecessary arena space usage - newNode->item = raw->delta().apply(*base, arena); + // Initialize node's item from the delta (instead of copying into arena) to avoid unnecessary arena space + // usage + newNode->item = raw->delta(tree->largeNodes).apply(*base, arena); tree->nodeBytesUsed += nodeSpace; tree->nodeBytesFree -= nodeSpace; ++tree->numItems; // Update max height of the tree if necessary - if(height > tree->maxHeight) { + if (height > tree->maxHeight) { tree->maxHeight = height; } @@ -469,47 +490,37 @@ public: } // Erase k by setting its deleted flag to true. Returns true only if k existed - bool erase(const T &k, int skipLen = 0) { + bool erase(const T& k, int skipLen = 0) { Cursor c = getCursor(); - bool r = c.seek(k); - if(r) { + int cmp = c.seek(k); + // If exactly k is found + if (cmp == 0 && !c.node->isDeleted()) { c.erase(); + return true; } - return r; + return false; } }; // Cursor provides a way to seek into a DeltaTree and iterate over its contents // All Cursors from a Mirror share the same decoded node 'cache' (tree of DecodedNodes) struct Cursor { - Cursor() : mirror(nullptr), node(nullptr) { - } + Cursor() : mirror(nullptr), node(nullptr) {} - Cursor(Mirror *r) : mirror(r), node(mirror->root) { - } + Cursor(Mirror* r) : mirror(r), node(mirror->root) {} - Mirror *mirror; - DecodedNode *node; + Mirror* mirror; + DecodedNode* node; - bool valid() const { - return node != nullptr; - } + bool valid() const { return node != nullptr; } - const T & get() const { - return node->item; - } + const T& get() const { return node->item; } - const T & getOrUpperBound() const { - return valid() ? node->item : *mirror->upperBound(); - } + const T& getOrUpperBound() const { return valid() ? 
node->item : *mirror->upperBound(); } - bool operator==(const Cursor &rhs) const { - return node == rhs.node; - } + bool operator==(const Cursor& rhs) const { return node == rhs.node; } - bool operator!=(const Cursor &rhs) const { - return node != rhs.node; - } + bool operator!=(const Cursor& rhs) const { return node != rhs.node; } void erase() { node->setDeleted(true); @@ -517,87 +528,69 @@ public: moveNext(); } - bool seekLessThanOrEqual(const T &s, int skipLen = 0) { - return seekLessThanOrEqual(s, skipLen, nullptr, 0); - } - - bool seekLessThanOrEqual(const T &s, int skipLen, const Cursor *pHint) { - if(pHint->valid()) { - return seekLessThanOrEqual(s, skipLen, pHint, s.compare(pHint->get(), skipLen)); - } - return seekLessThanOrEqual(s, skipLen, nullptr, 0); - } - - // Moves the cursor to the node with the greatest key less than or equal to s. If successful, - // returns true, otherwise returns false and the cursor position will be invalid. - // If pHint is given then initialCmp must be logically equivalent to s.compare(pHint->get()) - // If hintFwd is omitted, it will be calculated (see other definitions above) - bool seekLessThanOrEqual(const T &s, int skipLen, const Cursor *pHint, int initialCmp) { - DecodedNode *n; + // TODO: Make hint-based seek() use the hint logic in this, which is better and actually improves seek times, + // then remove this function. + bool seekLessThanOrEqualOld(const T& s, int skipLen, const Cursor* pHint, int initialCmp) { + DecodedNode* n; // If there's a hint position, use it // At the end of using the hint, if n is valid it should point to a node which has not yet been compared to. - if(pHint != nullptr && pHint->node != nullptr) { + if (pHint != nullptr && pHint->node != nullptr) { n = pHint->node; - if(initialCmp == 0) { + if (initialCmp == 0) { node = n; return _hideDeletedBackward(); } - if(initialCmp > 0) { + if (initialCmp > 0) { node = n; - while(n != nullptr) { + while (n != nullptr) { n = n->jumpNext(mirror->root); - if(n == nullptr) { + if (n == nullptr) { break; } - int cmp = s.compare(n->item, skipLen); - if(cmp > 0) { + if (cmp > 0) { node = n; continue; } - if(cmp == 0) { + if (cmp == 0) { node = n; n = nullptr; - } - else { + } else { n = n->leftChild; } break; } - } - else { - while(n != nullptr) { + } else { + while (n != nullptr) { n = n->jumpPrev(mirror->root); - if(n == nullptr) { + if (n == nullptr) { break; } int cmp = s.compare(n->item, skipLen); - if(cmp >= 0) { + if (cmp >= 0) { node = n; n = (cmp == 0) ? nullptr : n->rightChild; break; } } } - } - else { + } else { // Start at root, clear current position n = mirror->root; node = nullptr; } - while(n != nullptr) { + while (n != nullptr) { int cmp = s.compare(n->item, skipLen); - if(cmp < 0) { + if (cmp < 0) { n = n->getLeftChild(mirror->arena); - } - else { + } else { // n <= s so store it in node as a potential result node = n; - if(cmp == 0) { + if (cmp == 0) { break; } @@ -608,74 +601,152 @@ public: return _hideDeletedBackward(); } - // Moves the cursor to the node with the lowest key greater than or equal to s. If successful, - // returns true, otherwise returns false and the cursor position will be invalid. 
- bool seekGreaterThanOrEqual(const T &s, int skipLen = 0) {
- DecodedNode *n = mirror->root;
- node = nullptr;
-
- while(n != nullptr) {
- int cmp = s.compare(n->item, skipLen);
-
- if(cmp > 0) {
- n = n->getRightChild(mirror->arena);
- }
- else {
- // n >= s so store it in node as a potential result
- node = n;
-
- if(cmp == 0) {
- break;
- }
-
- n = n->getLeftChild(mirror->arena);
- }
+ // The seek methods, of the form seek[Less|Greater][orEqual](...), are very similar.
+ // They attempt to move the cursor to the [Greatest|Least] item, based on the name of the function.
+ // They will not "see" erased records.
+ // If successful, they return true; if not, they return false and the cursor position will be invalid.
+ // These methods forward arguments to the seek() overloads; see those for argument descriptions.
+ template <typename... Args>
+ bool seekLessThan(Args... args) {
+ int cmp = seek(args...);
+ if (cmp < 0 || (cmp == 0 && node != nullptr)) {
+ movePrev();
}
+ return _hideDeletedBackward();
+ }
+ template <typename... Args>
+ bool seekLessThanOrEqual(Args... args) {
+ int cmp = seek(args...);
+ if (cmp < 0) {
+ movePrev();
+ }
+ return _hideDeletedBackward();
+ }
+
+ template <typename... Args>
+ bool seekGreaterThan(Args... args) {
+ int cmp = seek(args...);
+ if (cmp > 0 || (cmp == 0 && node != nullptr)) {
+ moveNext();
+ }
return _hideDeletedForward();
}
- // Moves the cursor to the node with exactly item s
- // If successful, returns true, otherwise returns false and the cursor position will be invalid.
- bool seek(const T &s, int skipLen = 0) {
- DecodedNode *n = mirror->root;
+ template <typename... Args>
+ bool seekGreaterThanOrEqual(Args... args) {
+ int cmp = seek(args...);
+ if (cmp > 0) {
+ moveNext();
+ }
+ return _hideDeletedForward();
+ }
+
+ // seek() moves the cursor to a node containing s or the node that would be the parent of s if s were to be
+ // added to the tree. If the tree was empty, the cursor will be invalid and the return value will be 0.
+ // Otherwise, returns the result of s.compare(item at cursor position).
+ // Does not skip/avoid deleted nodes.
+ int seek(const T& s, int skipLen = 0) {
+ DecodedNode* n = mirror->root;
node = nullptr;
-
- while(n != nullptr) {
- int cmp = s.compare(n->item, skipLen);
-
- if(cmp == 0) {
- if(n->isDeleted()) {
- return false;
- }
- node = n;
- return true;
+ int cmp = 0;
+ while (n != nullptr) {
+ node = n;
+ cmp = s.compare(n->item, skipLen);
+ if (cmp == 0) {
+ break;
}
n = (cmp > 0) ? n->getRightChild(mirror->arena) : n->getLeftChild(mirror->arena);
}
- return false;
+ return cmp;
+ }
+
+ // Same usage as seek(), but with a cursor hint (which can't be null) whose starting position
+ // should be close to s in the tree to improve seek time.
+ // initialCmp should be logically equivalent to s.compare(pHint->get()), or 0, in which
+ // case the comparison will be done in this method.
+ // TODO: This is broken; it's not faster than not using a hint. Unfortunately, in a microbenchmark
+ // attempting to approximate a common use case, this version of using a cursor hint is actually slower than not
+ // using a hint.
+ int seek(const T& s, int skipLen, const Cursor* pHint, int initialCmp = 0) {
+ DecodedNode* n = mirror->root;
+ node = nullptr;
+ int cmp;
+
+ // If there's a hint position, use it.
+ // At the end of using the hint, if n is valid it should point to a node which has not yet been compared to.
+ if (pHint->node != nullptr) {
+ n = pHint->node;
+ if (initialCmp == 0) {
+ initialCmp = s.compare(pHint->get());
+ }
+ cmp = initialCmp;
+
+ while (true) {
+ node = n;
+ if (cmp == 0) {
+ return cmp;
+ }
+
+ // Attempt to jump up and past s
+ bool othersChild = false;
+ n = (initialCmp > 0) ? n->jumpUpNext(mirror->root, othersChild)
+ : n->jumpUpPrev(mirror->root, othersChild);
+ if (n == nullptr) {
+ n = (cmp > 0) ? node->rightChild : node->leftChild;
+ break;
+ }
+
+ // Compare s to the node jumped to
+ cmp = s.compare(n->item, skipLen);
+
+ // If n is on the opposite side of s from node, then n is too far.
+ if (cmp != 0 && ((initialCmp ^ cmp) < 0)) {
+ if (!othersChild) {
+ n = (cmp < 0) ? node->rightChild : node->leftChild;
+ }
+ break;
+ }
+ }
+ } else {
+ // Start at root, clear current position
+ n = mirror->root;
+ node = nullptr;
+ cmp = 0;
+ }
+
+ // Search starting from n, which is either the root or the result of applying the hint
+ while (n != nullptr) {
+ node = n;
+ cmp = s.compare(n->item, skipLen);
+ if (cmp == 0) {
+ break;
+ }
+
+ n = (cmp > 0) ? n->getRightChild(mirror->arena) : n->getLeftChild(mirror->arena);
+ }
+
+ return cmp;
}
bool moveFirst() {
- DecodedNode *n = mirror->root;
+ DecodedNode* n = mirror->root;
node = n;
- while(n != nullptr) {
+ while (n != nullptr) {
n = n->getLeftChild(mirror->arena);
- if(n != nullptr)
- node = n;
+ if (n != nullptr) node = n;
}
return _hideDeletedForward();
}
bool moveLast() {
- DecodedNode *n = mirror->root;
+ DecodedNode* n = mirror->root;
node = n;
- while(n != nullptr) {
+ while (n != nullptr) {
n = n->getRightChild(mirror->arena);
- if(n != nullptr)
- node = n;
+ if (n != nullptr) node = n;
}
return _hideDeletedBackward();
}
@@ -683,15 +754,14 @@ public:
// Try to move to next node, sees deleted nodes.
void _moveNext() {
// Try to go right
- DecodedNode *n = node->getRightChild(mirror->arena);
+ DecodedNode* n = node->getRightChild(mirror->arena);
// If we couldn't go right, then the answer is our next ancestor
- if(n == nullptr) {
+ if (n == nullptr) {
node = node->getNextAncestor();
- }
- else {
+ } else {
// Go left as far as possible
- while(n != nullptr) {
+ while (n != nullptr) {
node = n;
n = n->getLeftChild(mirror->arena);
}
@@ -701,15 +771,14 @@ public:
// Try to move to previous node, sees deleted nodes.
void _movePrev() { // Try to go left - DecodedNode *n = node->getLeftChild(mirror->arena); + DecodedNode* n = node->getLeftChild(mirror->arena); // If we couldn't go left, then the answer is our prev ancestor - if(n == nullptr) { + if (n == nullptr) { node = node->getPrevAncestor(); - } - else { + } else { // Go right as far as possible - while(n != nullptr) { + while (n != nullptr) { node = n; n = n->getRightChild(mirror->arena); } @@ -728,14 +797,14 @@ public: private: bool _hideDeletedBackward() { - while(node != nullptr && node->isDeleted()) { + while (node != nullptr && node->isDeleted()) { _movePrev(); } return node != nullptr; } bool _hideDeletedForward() { - while(node != nullptr && node->isDeleted()) { + while (node != nullptr && node->isDeleted()) { _moveNext(); } return node != nullptr; @@ -743,8 +812,8 @@ public: }; // Returns number of bytes written - int build(int spaceAvailable, const T *begin, const T *end, const T *prev, const T *next) { - //printf("tree size: %d node size: %d\n", sizeof(DeltaTree), sizeof(Node)); + int build(int spaceAvailable, const T* begin, const T* end, const T* prev, const T* next) { + largeNodes = spaceAvailable > SmallSizeLimit; int count = end - begin; numItems = count; nodeBytesDeleted = 0; @@ -752,10 +821,9 @@ public: maxHeight = 0; // The boundary leading to the new page acts as the last time we branched right - if(begin != end) { - nodeBytesUsed = build(root(), begin, end, prev, next, prev->getCommonPrefixLen(*next, 0)); - } - else { + if (begin != end) { + nodeBytesUsed = buildSubtree(root(), begin, end, prev, next, prev->getCommonPrefixLen(*next, 0)); + } else { nodeBytesUsed = 0; } nodeBytesFree = spaceAvailable - size(); @@ -763,58 +831,56 @@ public: } private: - static OffsetT build(Node &root, const T *begin, const T *end, const T *prev, const T *next, int subtreeCommon) { - //printf("build: %s to %s\n", begin->toString().c_str(), (end - 1)->toString().c_str()); - //printf("build: root at %p sizeof(Node) %d delta at %p \n", &root, sizeof(Node), &root.delta()); + int buildSubtree(Node& node, const T* begin, const T* end, const T* prev, const T* next, int subtreeCommon) { + // printf("build: %s to %s\n", begin->toString().c_str(), (end - 1)->toString().c_str()); + // printf("build: root at %p Node::headerSize %d delta at %p \n", &root, Node::headerSize(largeNodes), + // &node.delta(largeNodes)); ASSERT(end != begin); int count = end - begin; // Find key to be stored in root int mid = perfectSubtreeSplitPointCached(count); - const T &item = begin[mid]; + const T& item = begin[mid]; int commonWithPrev = item.getCommonPrefixLen(*prev, subtreeCommon); int commonWithNext = item.getCommonPrefixLen(*next, subtreeCommon); bool prefixSourcePrev; int commonPrefix; - const T *base; - if(commonWithPrev >= commonWithNext) { + const T* base; + if (commonWithPrev >= commonWithNext) { prefixSourcePrev = true; commonPrefix = commonWithPrev; base = prev; - } - else { + } else { prefixSourcePrev = false; commonPrefix = commonWithNext; base = next; } - int deltaSize = item.writeDelta(root.delta(), *base, commonPrefix); - root.delta().setPrefixSource(prefixSourcePrev); - //printf("Serialized %s to %p\n", item.toString().c_str(), &root.delta()); + int deltaSize = item.writeDelta(node.delta(largeNodes), *base, commonPrefix); + node.delta(largeNodes).setPrefixSource(prefixSourcePrev); + // printf("Serialized %s to %p\n", item.toString().c_str(), &root.delta(largeNodes)); // Continue writing after the serialized Delta. 
- uint8_t *wptr = (uint8_t *)&root.delta() + deltaSize; + uint8_t* wptr = (uint8_t*)&node.delta(largeNodes) + deltaSize; // Serialize left child - if(count > 1) { - wptr += build(*(Node *)wptr, begin, begin + mid, prev, &item, commonWithPrev); - root.leftChildOffset = sizeof(Node) + deltaSize; - } - else { - root.leftChildOffset = 0; + if (count > 1) { + wptr += buildSubtree(*(Node*)wptr, begin, begin + mid, prev, &item, commonWithPrev); + node.setLeftChildOffset(largeNodes, Node::headerSize(largeNodes) + deltaSize); + } else { + node.setLeftChildOffset(largeNodes, 0); } // Serialize right child - if(count > 2) { - root.rightChildOffset = wptr - (uint8_t *)&root; - wptr += build(*(Node *)wptr, begin + mid + 1, end, &item, next, commonWithNext); - } - else { - root.rightChildOffset = 0; + if (count > 2) { + node.setRightChildOffset(largeNodes, wptr - (uint8_t*)&node); + wptr += buildSubtree(*(Node*)wptr, begin + mid + 1, end, &item, next, commonWithNext); + } else { + node.setRightChildOffset(largeNodes, 0); } - return wptr - (uint8_t *)&root; + return wptr - (uint8_t*)&node; } }; diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index 12d23ab089..b3991a025c 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -4,13 +4,13 @@ * This source file is part of the FoundationDB open source project * * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * + * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -29,24 +29,30 @@ #define REDWOOD_DEBUG 0 -#define debug_printf_stream stderr -#define debug_printf_always(...) { fprintf(debug_printf_stream, "%s %f %04d ", g_network->getLocalAddress().toString().c_str(), now(), __LINE__); fprintf(debug_printf_stream, __VA_ARGS__); fflush(debug_printf_stream); } +#define debug_printf_stream stdout +#define debug_printf_always(...) \ + { \ + fprintf(debug_printf_stream, "%s %f %04d ", g_network->getLocalAddress().toString().c_str(), now(), __LINE__); \ + fprintf(debug_printf_stream, __VA_ARGS__); \ + fflush(debug_printf_stream); \ + } #define debug_printf_noop(...) 
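// Editor's illustration, not part of this patch: with REDWOOD_DEBUG set to 1 below, debug_printf
// expands to debug_printf_always, which prefixes each message with the local network address, the
// current time, and the source line, then flushes the stream; with REDWOOD_DEBUG 0 it compiles
// away entirely. The page ID and version arguments here are hypothetical:
//
//   debug_printf("wrote page %u at version %" PRId64 "\n", pageID, version);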
#if defined(NO_INTELLISENSE) - #if REDWOOD_DEBUG - #define debug_printf debug_printf_always - #else - #define debug_printf debug_printf_noop - #endif +#if REDWOOD_DEBUG +#define debug_printf debug_printf_always #else - // To get error-checking on debug_printf statements in IDE - #define debug_printf printf +#define debug_printf debug_printf_noop +#endif +#else +// To get error-checking on debug_printf statements in IDE +#define debug_printf printf #endif #define BEACON debug_printf_always("HERE\n") -#define TRACE debug_printf_always("%s: %s line %d %s\n", __FUNCTION__, __FILE__, __LINE__, platform::get_backtrace().c_str()); +#define TRACE \ + debug_printf_always("%s: %s line %d %s\n", __FUNCTION__, __FILE__, __LINE__, platform::get_backtrace().c_str()); #ifndef VALGRIND #define VALGRIND_MAKE_MEM_UNDEFINED(x, y) @@ -67,12 +73,10 @@ public: // Must return the same size for all pages created by the same pager instance virtual int size() const = 0; - StringRef asStringRef() const { - return StringRef(begin(), size()); - } + StringRef asStringRef() const { return StringRef(begin(), size()); } virtual ~IPage() { - if(userData != nullptr && userDataDestructor != nullptr) { + if (userData != nullptr && userDataDestructor != nullptr) { userDataDestructor(userData); } } @@ -82,8 +86,8 @@ public: virtual void addref() const = 0; virtual void delref() const = 0; - mutable void *userData; - mutable void (*userDataDestructor)(void *); + mutable void* userData; + mutable void (*userDataDestructor)(void*); }; class IPagerSnapshot { diff --git a/fdbserver/IVersionedStore.h b/fdbserver/IVersionedStore.h index 9baf5c4469..b1feb8063c 100644 --- a/fdbserver/IVersionedStore.h +++ b/fdbserver/IVersionedStore.h @@ -4,13 +4,13 @@ * This source file is part of the FoundationDB open source project * * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * + * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -46,28 +46,33 @@ public: class IVersionedStore : public IClosable { public: virtual KeyValueStoreType getType() = 0; - virtual bool supportsMutation(int op) = 0; // If this returns true, then mutate(op, ...) may be called + virtual bool supportsMutation(int op) = 0; // If this returns true, then mutate(op, ...) may be called virtual StorageBytes getStorageBytes() = 0; // Writes are provided in an ordered stream. 
- // A write is considered part of (a change leading to) the version determined by the previous call to setWriteVersion()
- // A write shall not become durable until the following call to commit() begins, and shall be durable once the following call to commit() returns
+ // A write is considered part of (a change leading to) the version determined by the previous call to
+ // setWriteVersion(). A write shall not become durable until the following call to commit() begins, and shall be
+ // durable once the following call to commit() returns.
virtual void set(KeyValueRef keyValue) = 0;
virtual void clear(KeyRangeRef range) = 0;
virtual void mutate(int op, StringRef param1, StringRef param2) = 0;
- virtual void setWriteVersion(Version) = 0; // The write version must be nondecreasing
- virtual void setOldestVersion(Version v) = 0; // Set oldest readable version to be used in next commit
- virtual Version getOldestVersion() = 0; // Get oldest readable version
+ virtual void setWriteVersion(Version) = 0; // The write version must be nondecreasing
+ virtual void setOldestVersion(Version v) = 0; // Set oldest readable version to be used in next commit
+ virtual Version getOldestVersion() = 0; // Get oldest readable version
virtual Future<Void> commit() = 0;
virtual Future<Void> init() = 0;
virtual Version getLatestVersion() = 0;
- // readAtVersion() may only be called on a version which has previously been passed to setWriteVersion() and never previously passed
- // to forgetVersion. The returned results when violating this precondition are unspecified; the store is not required to be able to detect violations.
- // The returned read cursor provides a consistent snapshot of the versioned store, corresponding to all the writes done with write versions less
+ // readAtVersion() may only be called on a version which has previously been passed to setWriteVersion() and never
+ // previously passed
+ // to forgetVersion. The returned results when violating this precondition are unspecified; the store is not
+ // required to be able to detect violations.
+ // The returned read cursor provides a consistent snapshot of the versioned store, corresponding to all the writes
+ // done with write versions less
// than or equal to the given version.
- // If readAtVersion() is called on the *current* write version, the given read cursor MAY reflect subsequent writes at the same
+ // If readAtVersion() is called on the *current* write version, the given read cursor MAY reflect subsequent writes
+ // at the same
// write version, OR it may represent a snapshot as of the call to readAtVersion().
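// Editor's sketch of the write/commit/read protocol described above (illustrative only; the
// store reference, key, value, and version v are assumed, and the flow ACTOR machinery is elided):
//
//   store->setWriteVersion(v);             // v must not decrease across calls
//   store->set(KeyValueRef(key, value));   // this write belongs to version v
//   wait(store->commit());                 // durable once this commit() returns
//   auto cursor = store->readAtVersion(v); // consistent snapshot of writes with version <= v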
virtual Reference readAtVersion(Version) = 0; }; diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 4a1168481a..e1d06665b9 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -88,6 +88,8 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi init( TLOG_IGNORE_POP_AUTO_ENABLE_DELAY, 300.0 ); init( TXS_POPPED_MAX_DELAY, 1.0 ); if ( randomize && BUGGIFY ) TXS_POPPED_MAX_DELAY = deterministicRandom()->random01(); init( TLOG_MAX_CREATE_DURATION, 10.0 ); + init( PEEK_LOGGING_AMOUNT, 5 ); + init( PEEK_LOGGING_DELAY, 5.0 ); // disk snapshot max timeout, to be put in TLog, storage and coordinator nodes init( SNAP_CREATE_MAX_TIMEOUT, 300.0 ); @@ -201,7 +203,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi init( STORAGE_METRICS_POLLING_DELAY, 2.0 ); if( randomize && BUGGIFY ) STORAGE_METRICS_POLLING_DELAY = 15.0; init( STORAGE_METRICS_RANDOM_DELAY, 0.2 ); init( AVAILABLE_SPACE_RATIO_CUTOFF, 0.05 ); - init( DESIRED_TEAMS_PER_SERVER, 5 ); DESIRED_TEAMS_PER_SERVER = deterministicRandom()->randomInt(1, 10); + init( DESIRED_TEAMS_PER_SERVER, 5 ); if( randomize && BUGGIFY ) DESIRED_TEAMS_PER_SERVER = deterministicRandom()->randomInt(1, 10); init( MAX_TEAMS_PER_SERVER, 5*DESIRED_TEAMS_PER_SERVER ); init( DD_SHARD_SIZE_GRANULARITY, 5000000 ); init( DD_SHARD_SIZE_GRANULARITY_SIM, 500000 ); if( randomize && BUGGIFY ) DD_SHARD_SIZE_GRANULARITY_SIM = 0; @@ -219,6 +221,9 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi init( DD_VALIDATE_LOCALITY, true ); if( randomize && BUGGIFY ) DD_VALIDATE_LOCALITY = false; init( DD_CHECK_INVALID_LOCALITY_DELAY, 60 ); if( randomize && BUGGIFY ) DD_CHECK_INVALID_LOCALITY_DELAY = 1 + deterministicRandom()->random01() * 600; init( DD_ENABLE_VERBOSE_TRACING, false ); if( randomize && BUGGIFY ) DD_ENABLE_VERBOSE_TRACING = true; + init( DD_SS_FAILURE_VERSIONLAG, 250000000 ); + init( DD_SS_ALLOWED_VERSIONLAG, 200000000 ); if( randomize && BUGGIFY ) { DD_SS_FAILURE_VERSIONLAG = deterministicRandom()->randomInt(15000000, 500000000); DD_SS_ALLOWED_VERSIONLAG = 0.75 * DD_SS_FAILURE_VERSIONLAG; } + init( DD_SS_STUCK_TIME_LIMIT, 300.0 ); if( randomize && BUGGIFY ) { DD_SS_STUCK_TIME_LIMIT = 200.0 + deterministicRandom()->random01() * 100.0; } // TeamRemover init( TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true @@ -344,6 +349,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi init( MAX_PROXY_COMPUTE, 2.0 ); init( PROXY_COMPUTE_BUCKETS, 20000 ); init( PROXY_COMPUTE_GROWTH_RATE, 0.01 ); + init( TXN_STATE_SEND_AMOUNT, 2 ); // Master Server // masterCommitter() in the master server will allow lower priority tasks (e.g. 
DataDistibution) @@ -411,6 +417,8 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi init( POLICY_RATING_TESTS, 200 ); if( randomize && BUGGIFY ) POLICY_RATING_TESTS = 20; init( POLICY_GENERATIONS, 100 ); if( randomize && BUGGIFY ) POLICY_GENERATIONS = 10; + init( DBINFO_SEND_AMOUNT, 2 ); + init( DBINFO_BATCH_DELAY, 0.1 ); //Move Keys init( SHARD_READY_DELAY, 0.25 ); @@ -522,13 +530,13 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi //Worker init( WORKER_LOGGING_INTERVAL, 5.0 ); - init( INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING, 5.0 ); init( HEAP_PROFILER_INTERVAL, 30.0 ); init( DEGRADED_RESET_INTERVAL, 24*60*60 ); if ( randomize && BUGGIFY ) DEGRADED_RESET_INTERVAL = 10; init( DEGRADED_WARNING_LIMIT, 1 ); init( DEGRADED_WARNING_RESET_DELAY, 7*24*60*60 ); init( TRACE_LOG_FLUSH_FAILURE_CHECK_INTERVAL_SECONDS, 10 ); init( TRACE_LOG_PING_TIMEOUT_SECONDS, 5.0 ); + init( DBINFO_FAILED_DELAY, 1.0 ); // Test harness init( WORKER_POLL_DELAY, 1.0 ); @@ -561,19 +569,19 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi // Fast Restore init( FASTRESTORE_FAILURE_TIMEOUT, 3600 ); init( FASTRESTORE_HEARTBEAT_INTERVAL, 60 ); - init( FASTRESTORE_SAMPLING_PERCENT, 1 ); if( randomize ) { FASTRESTORE_SAMPLING_PERCENT = deterministicRandom()->random01() * 100; } - init( FASTRESTORE_NUM_LOADERS, 3 ); if( randomize ) { FASTRESTORE_NUM_LOADERS = deterministicRandom()->random01() * 10 + 1; } - init( FASTRESTORE_NUM_APPLIERS, 3 ); if( randomize ) { FASTRESTORE_NUM_APPLIERS = deterministicRandom()->random01() * 10 + 1; } - init( FASTRESTORE_TXN_BATCH_MAX_BYTES, 512.0 ); if( randomize ) { FASTRESTORE_TXN_BATCH_MAX_BYTES = deterministicRandom()->random01() * 1024.0 * 1024.0 + 1.0; } - init( FASTRESTORE_VERSIONBATCH_MAX_BYTES, 10.0 * 1024.0 * 1024.0 ); if( randomize ) { FASTRESTORE_VERSIONBATCH_MAX_BYTES = deterministicRandom()->random01() * 10.0 * 1024.0 * 1024.0 * 1024.0; } - init( FASTRESTORE_VB_PARALLELISM, 3 ); if( randomize ) { FASTRESTORE_VB_PARALLELISM = deterministicRandom()->random01() * 20 + 1; } - init( FASTRESTORE_VB_MONITOR_DELAY, 5 ); if( randomize ) { FASTRESTORE_VB_MONITOR_DELAY = deterministicRandom()->random01() * 20 + 1; } - init( FASTRESTORE_VB_LAUNCH_DELAY, 5 ); if( randomize ) { FASTRESTORE_VB_LAUNCH_DELAY = deterministicRandom()->random01() * 60 + 1; } - init( FASTRESTORE_ROLE_LOGGING_DELAY, 5 ); if( randomize ) { FASTRESTORE_ROLE_LOGGING_DELAY = deterministicRandom()->random01() * 60 + 1; } - init( FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL, 5 ); if( randomize ) { FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL = deterministicRandom()->random01() * 60 + 1; } - init( FASTRESTORE_ATOMICOP_WEIGHT, 100 ); if( randomize ) { FASTRESTORE_ATOMICOP_WEIGHT = deterministicRandom()->random01() * 200 + 1; } - init( FASTRESTORE_APPLYING_PARALLELISM, 100 ); if( randomize ) { FASTRESTORE_APPLYING_PARALLELISM = deterministicRandom()->random01() * 10 + 1; } - init( FASTRESTORE_MONITOR_LEADER_DELAY, 5 ); if( randomize ) { FASTRESTORE_MONITOR_LEADER_DELAY = deterministicRandom()->random01() * 100; } + init( FASTRESTORE_SAMPLING_PERCENT, 1 ); if( randomize && BUGGIFY ) { FASTRESTORE_SAMPLING_PERCENT = deterministicRandom()->random01() * 100; } + init( FASTRESTORE_NUM_LOADERS, 3 ); if( randomize && BUGGIFY ) { FASTRESTORE_NUM_LOADERS = deterministicRandom()->random01() * 10 + 1; } + init( FASTRESTORE_NUM_APPLIERS, 3 ); if( randomize && BUGGIFY ) { FASTRESTORE_NUM_APPLIERS = deterministicRandom()->random01() * 10 + 1; } + 
init( FASTRESTORE_TXN_BATCH_MAX_BYTES, 512.0 ); if( randomize && BUGGIFY ) { FASTRESTORE_TXN_BATCH_MAX_BYTES = deterministicRandom()->random01() * 1024.0 * 1024.0 + 1.0; }
+ init( FASTRESTORE_VERSIONBATCH_MAX_BYTES, 10.0 * 1024.0 * 1024.0 ); if( randomize && BUGGIFY ) { FASTRESTORE_VERSIONBATCH_MAX_BYTES = deterministicRandom()->random01() * 10.0 * 1024.0 * 1024.0 * 1024.0; }
+ init( FASTRESTORE_VB_PARALLELISM, 3 ); if( randomize && BUGGIFY ) { FASTRESTORE_VB_PARALLELISM = deterministicRandom()->random01() * 20 + 1; }
+ init( FASTRESTORE_VB_MONITOR_DELAY, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_VB_MONITOR_DELAY = deterministicRandom()->random01() * 20 + 1; }
+ init( FASTRESTORE_VB_LAUNCH_DELAY, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_VB_LAUNCH_DELAY = deterministicRandom()->random01() * 60 + 1; }
+ init( FASTRESTORE_ROLE_LOGGING_DELAY, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_ROLE_LOGGING_DELAY = deterministicRandom()->random01() * 60 + 1; }
+ init( FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL = deterministicRandom()->random01() * 60 + 1; }
+ init( FASTRESTORE_ATOMICOP_WEIGHT, 100 ); if( randomize && BUGGIFY ) { FASTRESTORE_ATOMICOP_WEIGHT = deterministicRandom()->random01() * 200 + 1; }
+ init( FASTRESTORE_APPLYING_PARALLELISM, 100 ); if( randomize && BUGGIFY ) { FASTRESTORE_APPLYING_PARALLELISM = deterministicRandom()->random01() * 10 + 1; }
+ init( FASTRESTORE_MONITOR_LEADER_DELAY, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_MONITOR_LEADER_DELAY = deterministicRandom()->random01() * 100; }
init( FASTRESTORE_STRAGGLER_THRESHOLD_SECONDS, 60 ); if( randomize && BUGGIFY ) { FASTRESTORE_STRAGGLER_THRESHOLD_SECONDS = deterministicRandom()->random01() * 240 + 10; }
init( FASTRESTORE_TRACK_REQUEST_LATENCY, true ); if( randomize && BUGGIFY ) { FASTRESTORE_TRACK_REQUEST_LATENCY = false; }
init( FASTRESTORE_TRACK_LOADER_SEND_REQUESTS, false ); if( randomize && BUGGIFY ) { FASTRESTORE_TRACK_LOADER_SEND_REQUESTS = true; }
diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h
index 694efadd7e..1af0934d1b 100644
--- a/fdbserver/Knobs.h
+++ b/fdbserver/Knobs.h
@@ -86,6 +86,8 @@ public:
int64_t MAX_CACHE_VERSIONS;
double TXS_POPPED_MAX_DELAY;
double TLOG_MAX_CREATE_DURATION;
+ int PEEK_LOGGING_AMOUNT;
+ double PEEK_LOGGING_DELAY;
// Data distribution queue
double HEALTH_POLL_TIME;
@@ -173,7 +175,10 @@ public:
bool DD_VALIDATE_LOCALITY;
int DD_CHECK_INVALID_LOCALITY_DELAY;
bool DD_ENABLE_VERBOSE_TRACING;
-
+ int64_t DD_SS_FAILURE_VERSIONLAG; // Allowed SS version lag from the current read version before marking it as failed.
+ int64_t DD_SS_ALLOWED_VERSIONLAG; // SS will be marked as healthy if its version lag goes below this value.
+ double DD_SS_STUCK_TIME_LIMIT; // If a storage server is not getting new versions for this amount of time, then it becomes undesired.
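// Editor's sketch, not part of this patch, of how the two version-lag knobs above act as a
// hysteresis band; the helper below is hypothetical:
//
//   bool ssFailed(int64_t lag, bool wasFailed) {
//       if (lag > SERVER_KNOBS->DD_SS_FAILURE_VERSIONLAG) return true;  // mark failed
//       if (lag < SERVER_KNOBS->DD_SS_ALLOWED_VERSIONLAG) return false; // healthy again
//       return wasFailed; // between the thresholds, keep the previous state
//   }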
+ // TeamRemover to remove redundant teams bool TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER; // disable the machineTeamRemover actor double TR_REMOVE_MACHINE_TEAM_DELAY; // wait for the specified time before try to remove next machine team @@ -281,6 +286,7 @@ public: double MAX_PROXY_COMPUTE; int PROXY_COMPUTE_BUCKETS; double PROXY_COMPUTE_GROWTH_RATE; + int TXN_STATE_SEND_AMOUNT; // Master Server double COMMIT_SLEEP_TIME; @@ -345,6 +351,8 @@ public: int EXPECTED_PROXY_FITNESS; int EXPECTED_RESOLVER_FITNESS; double RECRUITMENT_TIMEOUT; + int DBINFO_SEND_AMOUNT; + double DBINFO_BATCH_DELAY; //Move Keys double SHARD_READY_DELAY; @@ -457,13 +465,13 @@ public: //Worker double WORKER_LOGGING_INTERVAL; - double INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING; double HEAP_PROFILER_INTERVAL; double DEGRADED_RESET_INTERVAL; double DEGRADED_WARNING_LIMIT; double DEGRADED_WARNING_RESET_DELAY; int64_t TRACE_LOG_FLUSH_FAILURE_CHECK_INTERVAL_SECONDS; double TRACE_LOG_PING_TIMEOUT_SECONDS; + double DBINFO_FAILED_DELAY; // Test harness double WORKER_POLL_DELAY; diff --git a/fdbserver/LeaderElection.actor.cpp b/fdbserver/LeaderElection.actor.cpp index be23f7da8e..a910a3c486 100644 --- a/fdbserver/LeaderElection.actor.cpp +++ b/fdbserver/LeaderElection.actor.cpp @@ -20,7 +20,6 @@ #include "fdbrpc/FailureMonitor.h" #include "fdbrpc/Locality.h" -#include "fdbserver/ClusterRecruitmentInterface.h" #include "fdbserver/CoordinationInterface.h" #include "fdbclient/MonitorLeader.h" #include "flow/actorcompiler.h" // This must be the last #include. diff --git a/fdbserver/MasterInterface.h b/fdbserver/MasterInterface.h index 3129b3f8eb..61ade89cee 100644 --- a/fdbserver/MasterInterface.h +++ b/fdbserver/MasterInterface.h @@ -40,6 +40,7 @@ struct MasterInterface { RequestStream notifyBackupWorkerDone; NetworkAddress address() const { return changeCoordinators.getEndpoint().getPrimaryAddress(); } + NetworkAddressList addresses() const { return changeCoordinators.getEndpoint().addresses; } UID id() const { return changeCoordinators.getEndpoint().token; } template diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index af9823a1fe..d21b9237c5 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -48,6 +48,29 @@ #include "flow/TDMetric.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. 
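// Editor's note: the broadcastTxnRequest actor added below fans a TxnStateRequest out as a tree:
// req.broadcastInfo is split into sendAmount contiguous groups, the request goes directly to the
// first endpoint of each group, and that endpoint receives the remainder of its group in
// req.broadcastInfo to re-broadcast the same way. A standalone sketch of just the partition
// arithmetic (illustrative only, not code from this patch; requires <vector>):
static void broadcastPartitionSketch() {
	std::vector<int> eps = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; // stand-ins for broadcast endpoints
	int sendAmount = 2;
	int cur = 0;
	for (int i = 0; i < sendAmount && cur < (int)eps.size(); i++) {
		int target = eps[cur++]; // this endpoint is contacted directly
		std::vector<int> group;  // these endpoints are forwarded to 'target' for re-broadcast
		while (cur < (int)eps.size() * (i + 1) / sendAmount) {
			group.push_back(eps[cur++]);
		}
		(void)target; // here: i == 0 -> target 0, group {1,2,3,4}; i == 1 -> target 5, group {6,7,8,9}
	}
}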
+ACTOR Future broadcastTxnRequest(TxnStateRequest req, int sendAmount, bool sendReply) { + state ReplyPromise reply = req.reply; + resetReply( req ); + std::vector> replies; + int currentStream = 0; + std::vector broadcastEndpoints = req.broadcastInfo; + for(int i = 0; i < sendAmount && currentStream < broadcastEndpoints.size(); i++) { + std::vector endpoints; + RequestStream cur(broadcastEndpoints[currentStream++]); + while(currentStream < broadcastEndpoints.size()*(i+1)/sendAmount) { + endpoints.push_back(broadcastEndpoints[currentStream++]); + } + req.broadcastInfo = endpoints; + replies.push_back(brokenPromiseToNever( cur.getReply( req ) )); + resetReply( req ); + } + wait( waitForAll(replies) ); + if(sendReply) { + reply.send(Void()); + } + return Void(); +} + struct ProxyStats { CounterCollection cc; Counter txnRequestIn, txnRequestOut, txnRequestErrors; @@ -157,7 +180,7 @@ struct TransactionRateInfo { } void setRate(double rate) { - ASSERT(rate >= 0 && rate != std::numeric_limits::infinity() && !isnan(rate)); + ASSERT(rate >= 0 && rate != std::numeric_limits::infinity() && !std::isnan(rate)); this->rate = rate; if(disabled) { @@ -1954,7 +1977,7 @@ ACTOR Future masterProxyServerCore( when(ExclusionSafetyCheckRequest exclCheckReq = waitNext(proxy.exclusionSafetyCheckReq.getFuture())) { addActor.send(proxyCheckSafeExclusion(db, exclCheckReq)); } - when(TxnStateRequest req = waitNext(proxy.txnState.getFuture())) { + when(state TxnStateRequest req = waitNext(proxy.txnState.getFuture())) { state ReplyPromise reply = req.reply; if(req.last) maxSequence = req.sequence + 1; if (!txnSequences.count(req.sequence)) { @@ -2022,7 +2045,7 @@ ACTOR Future masterProxyServerCore( commitData.txnStateStore->enableSnapshot(); } } - reply.send(Void()); + addActor.send(broadcastTxnRequest(req, SERVER_KNOBS->TXN_STATE_SEND_AMOUNT, true)); wait(yield()); } } diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index aad9a11e6a..eea20328cd 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -774,6 +774,7 @@ ACTOR Future> addStorageServer( Database cx, StorageServ try { state Future> fTagLocalities = tr.getRange( tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY ); state Future> fv = tr.get( serverListKeyFor(server.id()) ); + state Future> fExclProc = tr.get( StringRef(encodeExcludedServersKey( AddressExclusion( server.address().ip, server.address().port ))) ); state Future> fExclIP = tr.get( @@ -782,14 +783,28 @@ ACTOR Future> addStorageServer( Database cx, StorageServ StringRef(encodeFailedServersKey( AddressExclusion( server.address().ip, server.address().port ))) ); state Future> fFailIP = tr.get( StringRef(encodeFailedServersKey( AddressExclusion( server.address().ip ))) ); + + state Future> fExclProc2 = server.secondaryAddress().present() ? tr.get( + StringRef(encodeExcludedServersKey( AddressExclusion( server.secondaryAddress().get().ip, server.secondaryAddress().get().port ))) ) : Future>( Optional() ); + state Future> fExclIP2 = server.secondaryAddress().present() ? tr.get( + StringRef(encodeExcludedServersKey( AddressExclusion( server.secondaryAddress().get().ip ))) ) : Future>( Optional() ); + state Future> fFailProc2 = server.secondaryAddress().present() ? tr.get( + StringRef(encodeFailedServersKey( AddressExclusion( server.secondaryAddress().get().ip, server.secondaryAddress().get().port ))) ) : Future>( Optional() ); + state Future> fFailIP2 = server.secondaryAddress().present() ? 
tr.get( + StringRef(encodeFailedServersKey( AddressExclusion( server.secondaryAddress().get().ip ))) ) : Future>( Optional() ); + state Future> fTags = tr.getRange( serverTagKeys, CLIENT_KNOBS->TOO_MANY, true); state Future> fHistoryTags = tr.getRange( serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY, true); - wait( success(fTagLocalities) && success(fv) && success(fExclProc) && success(fExclIP) && success(fFailProc) && success(fFailIP) && success(fTags) && success(fHistoryTags) ); + wait( success(fTagLocalities) && success(fv) && success(fTags) && success(fHistoryTags) && + success(fExclProc) && success(fExclIP) && success(fFailProc) && success(fFailIP) && + success(fExclProc2) && success(fExclIP2) && success(fFailProc2) && success(fFailIP2) ); // If we have been added to the excluded/failed state servers list, we have to fail - if (fExclProc.get().present() || fExclIP.get().present() || fFailProc.get().present() || fFailIP.get().present() ) + if (fExclProc.get().present() || fExclIP.get().present() || fFailProc.get().present() || fFailIP.get().present() || + fExclProc2.get().present() || fExclIP2.get().present() || fFailProc2.get().present() || fFailIP2.get().present() ) { throw recruitment_failed(); + } if(fTagLocalities.get().more || fTags.get().more || fHistoryTags.get().more) ASSERT(false); diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index d8137fcad2..4754bf6dad 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -385,6 +385,43 @@ struct LogData : NonCopyable, public ReferenceCounted { struct PeekTrackerData { std::map>> sequence_version; double lastUpdate; + + Tag tag; + + double lastLogged; + int64_t totalPeeks; + int64_t replyBytes; + int64_t duplicatePeeks; + double queueTime; + double queueMax; + double blockTime; + double blockMax; + double workTime; + double workMax; + + int64_t unblockedPeeks; + double idleTime; + double idleMax; + + PeekTrackerData() : lastUpdate(0) { + resetMetrics(); + } + + void resetMetrics() { + lastLogged = now(); + totalPeeks = 0; + replyBytes = 0; + duplicatePeeks = 0; + queueTime = 0; + queueMax = 0; + blockTime = 0; + blockMax = 0; + workTime = 0; + workMax = 0; + unblockedPeeks = 0; + idleTime = 0; + idleMax = 0; + } }; std::map peekTracker; @@ -1049,6 +1086,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere state BinaryWriter messages2(Unversioned()); state int sequence = -1; state UID peekId; + state double queueStart = now(); if(req.sequence.present()) { try { @@ -1059,6 +1097,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere } auto& trackerData = logData->peekTracker[peekId]; if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { + trackerData.tag = req.tag; trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled)); } auto seqBegin = trackerData.sequence_version.begin(); @@ -1074,8 +1113,16 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere throw operation_obsolete(); } + Future> fPrevPeekData = trackerData.sequence_version[sequence].getFuture(); + if(fPrevPeekData.isReady()) { + trackerData.unblockedPeeks++; + double t = now() - trackerData.lastUpdate; + if(t > trackerData.idleMax) trackerData.idleMax = t; + trackerData.idleTime += t; + } trackerData.lastUpdate = now(); - std::pair prevPeekData = wait(trackerData.sequence_version[sequence].getFuture()); + std::pair prevPeekData = wait(fPrevPeekData); + 
req.begin = std::max(prevPeekData.first, req.begin); req.onlySpilled = prevPeekData.second; wait(yield()); @@ -1089,6 +1136,8 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere } } + state double blockStart = now(); + if( req.returnIfBlocked && logData->version.get() < req.begin ) { req.reply.sendError(end_of_stream()); if(req.sequence.present()) { @@ -1123,6 +1172,8 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere wait(delay(0, TaskPriority::TLogSpilledPeekReply)); } + state double workStart = now(); + Version poppedVer = poppedVersion(logData, req.tag); if(poppedVer > req.begin) { TLogPeekReply rep; @@ -1211,6 +1262,22 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere if(req.sequence.present()) { auto& trackerData = logData->peekTracker[peekId]; trackerData.lastUpdate = now(); + + double queueT = blockStart-queueStart; + double blockT = workStart-blockStart; + double workT = now()-workStart; + + trackerData.totalPeeks++; + trackerData.replyBytes += reply.messages.size(); + + if(queueT > trackerData.queueMax) trackerData.queueMax = queueT; + if(blockT > trackerData.blockMax) trackerData.blockMax = blockT; + if(workT > trackerData.workMax) trackerData.workMax = workT; + + trackerData.queueTime += queueT; + trackerData.blockTime += blockT; + trackerData.workTime += workT; + auto& sequenceData = trackerData.sequence_version[sequence+1]; if(trackerData.sequence_version.size() && sequence+1 < trackerData.sequence_version.begin()->first) { req.reply.sendError(operation_obsolete()); @@ -1219,6 +1286,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere return Void(); } if(sequenceData.isSet()) { + trackerData.duplicatePeeks++; if(sequenceData.getFuture().get().first != reply.end) { TEST(true); //tlog peek second attempt ended at a different version req.reply.sendError(operation_obsolete()); @@ -1542,6 +1610,47 @@ ACTOR Future cleanupPeekTrackers( LogData* logData ) { } } +ACTOR Future logPeekTrackers( LogData* logData ) { + loop { + int64_t logThreshold = 1; + if(logData->peekTracker.size() > SERVER_KNOBS->PEEK_LOGGING_AMOUNT) { + std::vector peekCounts; + peekCounts.reserve(logData->peekTracker.size()); + for( auto& it : logData->peekTracker ) { + peekCounts.push_back(it.second.totalPeeks); + } + size_t pivot = peekCounts.size()-SERVER_KNOBS->PEEK_LOGGING_AMOUNT; + std::nth_element(peekCounts.begin(), peekCounts.begin()+pivot, peekCounts.end()); + logThreshold = std::max(1,peekCounts[pivot]); + } + int logCount = 0; + for( auto& it : logData->peekTracker ) { + if(it.second.totalPeeks >= logThreshold) { + logCount++; + TraceEvent("PeekMetrics", logData->logId) + .detail("Tag", it.second.tag.toString()) + .detail("Elapsed", now() - it.second.lastLogged) + .detail("MeanReplyBytes", it.second.replyBytes/it.second.totalPeeks) + .detail("TotalPeeks", it.second.totalPeeks) + .detail("UnblockedPeeks", it.second.unblockedPeeks) + .detail("DuplicatePeeks", it.second.duplicatePeeks) + .detail("Sequence", it.second.sequence_version.size() ? 
it.second.sequence_version.begin()->first : -1) + .detail("IdleSeconds", it.second.idleTime) + .detail("IdleMax", it.second.idleMax) + .detail("QueueSeconds", it.second.queueTime) + .detail("QueueMax", it.second.queueMax) + .detail("BlockSeconds", it.second.blockTime) + .detail("BlockMax", it.second.blockMax) + .detail("WorkSeconds", it.second.workTime) + .detail("WorkMax", it.second.workMax); + it.second.resetMetrics(); + } + } + + wait( delay(SERVER_KNOBS->PEEK_LOGGING_DELAY * std::max(1,logCount)) ); + } +} + void getQueuingMetrics( TLogData* self, Reference logData, TLogQueuingMetricsRequest const& req ) { TLogQueuingMetricsReply reply; reply.localTime = now(); @@ -1880,6 +1989,7 @@ ACTOR Future tLogCore( TLogData* self, Reference logData, TLogInt logData->addActor.send( traceCounters("TLogMetrics", logData->logId, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &logData->cc, logData->logId.toString() + "/TLogMetrics")); logData->addActor.send( serveTLogInterface(self, tli, logData, warningCollectorInput) ); logData->addActor.send( cleanupPeekTrackers(logData.getPtr()) ); + logData->addActor.send( logPeekTrackers(logData.getPtr()) ); if(!logData->isPrimary) { std::vector tags; diff --git a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp index 1dbb486e1d..4a9f9c9158 100644 --- a/fdbserver/OldTLogServer_6_2.actor.cpp +++ b/fdbserver/OldTLogServer_6_2.actor.cpp @@ -495,6 +495,44 @@ struct LogData : NonCopyable, public ReferenceCounted { struct PeekTrackerData { std::map>> sequence_version; double lastUpdate; + + Tag tag; + + double lastLogged; + int64_t totalPeeks; + int64_t replyBytes; + int64_t duplicatePeeks; + double queueTime; + double queueMax; + double blockTime; + double blockMax; + double workTime; + double workMax; + + int64_t unblockedPeeks; + double idleTime; + double idleMax; + + PeekTrackerData() : lastUpdate(0) { + resetMetrics(); + } + + void resetMetrics() { + lastLogged = now(); + totalPeeks = 0; + replyBytes = 0; + duplicatePeeks = 0; + queueTime = 0; + queueMax = 0; + blockTime = 0; + blockMax = 0; + workTime = 0; + workMax = 0; + unblockedPeeks = 0; + idleTime = 0; + idleMax = 0; + } + }; std::map peekTracker; @@ -1352,6 +1390,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere state BinaryWriter messages2(Unversioned()); state int sequence = -1; state UID peekId; + state double queueStart = now(); if(req.sequence.present()) { try { @@ -1362,6 +1401,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere } auto& trackerData = logData->peekTracker[peekId]; if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { + trackerData.tag = req.tag; trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled)); } auto seqBegin = trackerData.sequence_version.begin(); @@ -1378,8 +1418,15 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere throw operation_obsolete(); } + Future> fPrevPeekData = trackerData.sequence_version[sequence].getFuture(); + if(fPrevPeekData.isReady()) { + trackerData.unblockedPeeks++; + double t = now() - trackerData.lastUpdate; + if(t > trackerData.idleMax) trackerData.idleMax = t; + trackerData.idleTime += t; + } trackerData.lastUpdate = now(); - std::pair prevPeekData = wait(trackerData.sequence_version[sequence].getFuture()); + std::pair prevPeekData = wait(fPrevPeekData); req.begin = std::max(prevPeekData.first, req.begin); req.onlySpilled = prevPeekData.second; wait(yield()); @@ -1393,6 +1440,8 
@@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere } } + state double blockStart = now(); + if( req.returnIfBlocked && logData->version.get() < req.begin ) { req.reply.sendError(end_of_stream()); if(req.sequence.present()) { @@ -1427,6 +1476,8 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere wait(delay(0, TaskPriority::TLogSpilledPeekReply)); } + state double workStart = now(); + Version poppedVer = poppedVersion(logData, req.tag); if(poppedVer > req.begin) { TLogPeekReply rep; @@ -1603,6 +1654,22 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere if(req.sequence.present()) { auto& trackerData = logData->peekTracker[peekId]; trackerData.lastUpdate = now(); + + double queueT = blockStart-queueStart; + double blockT = workStart-blockStart; + double workT = now()-workStart; + + trackerData.totalPeeks++; + trackerData.replyBytes += reply.messages.size(); + + if(queueT > trackerData.queueMax) trackerData.queueMax = queueT; + if(blockT > trackerData.blockMax) trackerData.blockMax = blockT; + if(workT > trackerData.workMax) trackerData.workMax = workT; + + trackerData.queueTime += queueT; + trackerData.blockTime += blockT; + trackerData.workTime += workT; + auto& sequenceData = trackerData.sequence_version[sequence+1]; if(trackerData.sequence_version.size() && sequence+1 < trackerData.sequence_version.begin()->first) { req.reply.sendError(operation_obsolete()); @@ -1611,6 +1678,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere return Void(); } if(sequenceData.isSet()) { + trackerData.duplicatePeeks++; if(sequenceData.getFuture().get().first != reply.end) { TEST(true); //tlog peek second attempt ended at a different version req.reply.sendError(operation_obsolete()); @@ -1934,6 +2002,47 @@ ACTOR Future cleanupPeekTrackers( LogData* logData ) { } } +ACTOR Future logPeekTrackers( LogData* logData ) { + loop { + int64_t logThreshold = 1; + if(logData->peekTracker.size() > SERVER_KNOBS->PEEK_LOGGING_AMOUNT) { + std::vector peekCounts; + peekCounts.reserve(logData->peekTracker.size()); + for( auto& it : logData->peekTracker ) { + peekCounts.push_back(it.second.totalPeeks); + } + size_t pivot = peekCounts.size()-SERVER_KNOBS->PEEK_LOGGING_AMOUNT; + std::nth_element(peekCounts.begin(), peekCounts.begin()+pivot, peekCounts.end()); + logThreshold = std::max(1,peekCounts[pivot]); + } + int logCount = 0; + for( auto& it : logData->peekTracker ) { + if(it.second.totalPeeks >= logThreshold) { + logCount++; + TraceEvent("PeekMetrics", logData->logId) + .detail("Tag", it.second.tag.toString()) + .detail("Elapsed", now() - it.second.lastLogged) + .detail("MeanReplyBytes", it.second.replyBytes/it.second.totalPeeks) + .detail("TotalPeeks", it.second.totalPeeks) + .detail("UnblockedPeeks", it.second.unblockedPeeks) + .detail("DuplicatePeeks", it.second.duplicatePeeks) + .detail("Sequence", it.second.sequence_version.size() ? 
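logPeekTrackers, added to each TLog implementation in this change, avoids flooding the trace log by only emitting PeekMetrics for roughly the busiest PEEK_LOGGING_AMOUNT tags. It picks the cutoff with std::nth_element, which partitions in average O(n) instead of fully sorting. A sketch of just that selection step (the knob value here is made up):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // Stand-ins for logData->peekTracker peek counts and the knob value.
    std::vector<int64_t> peekCounts = { 4, 97, 1, 12, 55, 3, 8 };
    const size_t PEEK_LOGGING_AMOUNT = 3; // hypothetical knob value

    int64_t logThreshold = 1;
    if (peekCounts.size() > PEEK_LOGGING_AMOUNT) {
        // Partition so the element at 'pivot' is the smallest count that
        // still falls within the top PEEK_LOGGING_AMOUNT trackers.
        size_t pivot = peekCounts.size() - PEEK_LOGGING_AMOUNT;
        std::nth_element(peekCounts.begin(), peekCounts.begin() + pivot, peekCounts.end());
        logThreshold = std::max<int64_t>(1, peekCounts[pivot]);
    }
    // Only trackers with totalPeeks >= logThreshold would emit a PeekMetrics event.
    std::printf("logThreshold=%lld\n", (long long)logThreshold);
}
```

The wait at the bottom of the loop is then scaled by the number of events emitted, so a log-heavy pass also backs off longer before the next one.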
it.second.sequence_version.begin()->first : -1) + .detail("IdleSeconds", it.second.idleTime) + .detail("IdleMax", it.second.idleMax) + .detail("QueueSeconds", it.second.queueTime) + .detail("QueueMax", it.second.queueMax) + .detail("BlockSeconds", it.second.blockTime) + .detail("BlockMax", it.second.blockMax) + .detail("WorkSeconds", it.second.workTime) + .detail("WorkMax", it.second.workMax); + it.second.resetMetrics(); + } + } + + wait( delay(SERVER_KNOBS->PEEK_LOGGING_DELAY * std::max(1,logCount)) ); + } +} + void getQueuingMetrics( TLogData* self, Reference logData, TLogQueuingMetricsRequest const& req ) { TLogQueuingMetricsReply reply; reply.localTime = now(); @@ -2283,6 +2392,7 @@ ACTOR Future tLogCore( TLogData* self, Reference logData, TLogInt logData->addActor.send( traceCounters("TLogMetrics", logData->logId, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &logData->cc, logData->logId.toString() + "/TLogMetrics")); logData->addActor.send( serveTLogInterface(self, tli, logData, warningCollectorInput) ); logData->addActor.send( cleanupPeekTrackers(logData.getPtr()) ); + logData->addActor.send( logPeekTrackers(logData.getPtr()) ); if(!logData->isPrimary) { std::vector tags; diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index 0fead10ef3..72f5de6148 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -829,7 +829,7 @@ ACTOR static Future _parseRangeFileToMutationsOnLoader( // The set of key value version is rangeFile.version. the key-value set in the same range file has the same version Reference inFile = wait(bc->readFile(asset.filename)); - state VectorRef blockData; + state Standalone> blockData; try { Standalone> kvs = wait(fileBackup::decodeRangeFileBlock(inFile, asset.offset, asset.len)); diff --git a/fdbserver/RestoreMaster.actor.cpp b/fdbserver/RestoreMaster.actor.cpp index a13bf9e54b..9e0d749a5c 100644 --- a/fdbserver/RestoreMaster.actor.cpp +++ b/fdbserver/RestoreMaster.actor.cpp @@ -717,7 +717,7 @@ ACTOR static Future collectBackupFiles(Reference bc, ACTOR static Future insertRangeVersion(KeyRangeMap* pRangeVersions, RestoreFileFR* file, Reference bc) { TraceEvent("FastRestoreMasterDecodeRangeVersion").detail("File", file->toString()); - RangeFile rangeFile(file->version, file->blockSize, file->fileName, file->fileSize); + RangeFile rangeFile = { file->version, (uint32_t)file->blockSize, file->fileName, file->fileSize }; // First and last key are the range for this file: endKey is exclusive KeyRange fileRange = wait(bc->getSnapshotFileKeyRange(rangeFile)); diff --git a/fdbserver/RestoreUtil.h b/fdbserver/RestoreUtil.h index 4605077874..2cef2ca539 100644 --- a/fdbserver/RestoreUtil.h +++ b/fdbserver/RestoreUtil.h @@ -38,22 +38,6 @@ #define SevFRMutationInfo SevVerbose //#define SevFRMutationInfo SevInfo -struct VersionedMutation { - MutationRef mutation; - LogMessageVersion version; - - VersionedMutation() = default; - explicit VersionedMutation(MutationRef mutation, LogMessageVersion version) - : mutation(mutation), version(version) {} - explicit VersionedMutation(Arena& to, const VersionedMutation& from) - : mutation(to, from.mutation), version(from.version) {} - - template - void serialize(Ar& ar) { - serializer(ar, mutation, version); - } -}; - using MutationsVec = Standalone>; using LogMessageVersionVec = Standalone>; using VersionedMutationsVec = Standalone>; diff --git a/fdbserver/RestoreWorker.actor.cpp b/fdbserver/RestoreWorker.actor.cpp index 77494c1fcf..6d78dad9ef 100644 --- 
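The RestoreLoader hunk above replaces a bare VectorRef with a Standalone wrapper. A VectorRef is a non-owning view into an arena, so holding one in actor state across a wait() risks reading freed memory once the arena that backs it goes away; Standalone bundles the arena with the view. A rough analogy using only standard types (string_view standing in for VectorRef, string for Standalone):

```cpp
#include <cstdio>
#include <string>
#include <string_view>

// string_view is to std::string roughly what VectorRef is to Standalone<VectorRef>:
// a non-owning view that is only valid while its backing storage lives.
std::string_view dangling() {
    std::string backing = "decoded block data"; // owns the bytes (like an Arena)
    return std::string_view(backing);           // view outlives the owner: BUG
}

std::string safe() {
    std::string backing = "decoded block data";
    return backing; // returning the owner keeps the bytes alive (like Standalone)
}

int main() {
    std::string ok = safe();
    std::printf("%s\n", ok.c_str());
    // dangling() would compile but read freed memory, which is the class of bug
    // the Standalone<VectorRef<...>> change guards against across wait() points.
}
```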
a/fdbserver/RestoreWorker.actor.cpp +++ b/fdbserver/RestoreWorker.actor.cpp @@ -147,7 +147,10 @@ ACTOR Future collectRestoreWorkerInterface(Reference se } break; } - TraceEvent("FastRestore").suppressFor(10.0).detail("NotEnoughWorkers", agentValues.size()); + TraceEvent("FastRestore") + .suppressFor(10.0) + .detail("NotEnoughWorkers", agentValues.size()) + .detail("MinWorkers", min_num_workers); wait(delay(5.0)); } catch (Error& e) { wait(tr.onError(e)); diff --git a/fdbserver/ServerDBInfo.h b/fdbserver/ServerDBInfo.h index b067211aef..a28c6323ae 100644 --- a/fdbserver/ServerDBInfo.h +++ b/fdbserver/ServerDBInfo.h @@ -22,13 +22,13 @@ #define FDBSERVER_SERVERDBINFO_H #pragma once -#include "fdbserver/ClusterRecruitmentInterface.h" #include "fdbserver/DataDistributorInterface.h" #include "fdbserver/MasterInterface.h" #include "fdbserver/LogSystemConfig.h" #include "fdbserver/RatekeeperInterface.h" #include "fdbserver/RecoveryState.h" #include "fdbserver/LatencyBandConfig.h" +#include "fdbserver/WorkerInterface.actor.h" struct ServerDBInfo { constexpr static FileIdentifier file_identifier = 13838807; @@ -51,29 +51,45 @@ struct ServerDBInfo { std::vector priorCommittedLogServers; // If !fullyRecovered and logSystemConfig refers to a new log system which may not have been committed to the coordinated state yet, then priorCommittedLogServers are the previous, fully committed generation which need to stay alive in case this recovery fails Optional latencyBandConfig; std::vector> storageCaches; + int64_t infoGeneration; - explicit ServerDBInfo() : recoveryCount(0), recoveryState(RecoveryState::UNINITIALIZED), logSystemConfig(0) {} + ServerDBInfo() : recoveryCount(0), recoveryState(RecoveryState::UNINITIALIZED), logSystemConfig(0), infoGeneration(0) {} bool operator == (ServerDBInfo const& r) const { return id == r.id; } bool operator != (ServerDBInfo const& r) const { return id != r.id; } template void serialize( Ar& ar ) { - serializer(ar, id, clusterInterface, client, distributor, master, ratekeeper, resolvers, recoveryCount, recoveryState, masterLifetime, logSystemConfig, priorCommittedLogServers, latencyBandConfig, storageCaches); + serializer(ar, id, clusterInterface, client, distributor, master, ratekeeper, resolvers, recoveryCount, recoveryState, masterLifetime, logSystemConfig, priorCommittedLogServers, latencyBandConfig, storageCaches, infoGeneration); + } +}; + +struct UpdateServerDBInfoRequest { + constexpr static FileIdentifier file_identifier = 9467438; + Standalone serializedDbInfo; + std::vector broadcastInfo; + ReplyPromise> reply; + + template + void serialize(Ar& ar) { + serializer(ar, serializedDbInfo, broadcastInfo, reply); } }; struct GetServerDBInfoRequest { - constexpr static FileIdentifier file_identifier = 9467438; + constexpr static FileIdentifier file_identifier = 9467439; UID knownServerInfoID; - Standalone> issues; - std::vector incompatiblePeers; - ReplyPromise< CachedSerialization > reply; + ReplyPromise reply; template void serialize(Ar& ar) { - serializer(ar, knownServerInfoID, issues, incompatiblePeers, reply); + serializer(ar, knownServerInfoID, reply); } }; + +Future broadcastTxnRequest(TxnStateRequest const& req, int const& sendAmount, bool const& sendReply); + +Future> broadcastDBInfoRequest(UpdateServerDBInfoRequest const& req, int const& sendAmount, Optional const& sender, bool const& sendReply); + #endif diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index f89a97c59e..c4be587c1f 100644 --- 
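The ServerDBInfo changes above add an infoGeneration counter and an UpdateServerDBInfoRequest so serialized info can be fanned out through workers (broadcastDBInfoRequest takes a sendAmount and returns the addresses it could not reach). The hunk does not spell out what infoGeneration is for; a plausible reading, sketched here purely as an assumption, is that receivers use it to discard stale or reordered broadcasts:

```cpp
#include <cstdint>
#include <cstdio>
#include <string>

struct DbInfoState {
    int64_t infoGeneration = 0;
    std::string serialized; // stands in for the serializedDbInfo payload

    // Apply an update only if it is newer than what we already have; relayed
    // broadcasts can arrive out of order, so older generations are ignored.
    bool apply(int64_t generation, const std::string& payload) {
        if (generation <= infoGeneration) return false; // stale, ignore
        infoGeneration = generation;
        serialized = payload;
        return true;
    }
};

int main() {
    DbInfoState s;
    s.apply(2, "gen2");
    bool accepted = s.apply(1, "gen1"); // late arrival of an older broadcast
    std::printf("accepted=%d generation=%lld\n", accepted, (long long)s.infoGeneration);
}
```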
a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -25,7 +25,6 @@ #include "fdbserver/WorkerInterface.actor.h" #include "fdbclient/ClusterInterface.h" #include "fdbserver/Knobs.h" -#include "fdbserver/ClusterRecruitmentInterface.h" #include "fdbserver/CoordinationInterface.h" #include "fdbmonitor/SimpleIni.h" #include "fdbrpc/AsyncFileNonDurable.actor.h" @@ -737,7 +736,7 @@ void SimulationConfig::generateNormalConfig(int minimumReplication, int minimumR if (deterministicRandom()->random01() < 0.25) db.desiredTLogCount = deterministicRandom()->randomInt(1,7); if (deterministicRandom()->random01() < 0.25) db.masterProxyCount = deterministicRandom()->randomInt(1,7); if (deterministicRandom()->random01() < 0.25) db.resolverCount = deterministicRandom()->randomInt(1,7); - int storage_engine_type = deterministicRandom()->randomInt(0, 3); + int storage_engine_type = deterministicRandom()->randomInt(0, 4); switch (storage_engine_type) { case 0: { TEST(true); // Simulated cluster using ssd storage engine diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 73c78b3f4f..c524ed946f 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -25,7 +25,6 @@ #include "fdbclient/SystemData.h" #include "fdbclient/ReadYourWrites.h" #include "fdbserver/WorkerInterface.actor.h" -#include "fdbserver/ClusterRecruitmentInterface.h" #include #include "fdbserver/CoordinationInterface.h" #include "fdbserver/DataDistribution.actor.h" @@ -35,28 +34,6 @@ #include "fdbclient/JsonBuilder.h" #include "flow/actorcompiler.h" // This must be the last #include. -void setIssues(ProcessIssuesMap& issueMap, NetworkAddress const& addr, VectorRef const& issues, - Optional& issueID) { - if (issues.size()) { - auto& e = issueMap[addr]; - e.first = issues; - e.second = deterministicRandom()->randomUniqueID(); - issueID = e.second; - } else { - issueMap.erase(addr); - issueID = Optional(); - } -} - -void removeIssues(ProcessIssuesMap& issueMap, NetworkAddress const& addr, Optional& issueID) { - if (!issueID.present()) { - return; - } - if (issueMap.count(addr) && issueMap[addr].second == issueID.get()) { - issueMap.erase( addr ); - } -} - const char* RecoveryStatus::names[] = { "reading_coordinated_state", "locking_coordinated_state", "locking_old_transaction_servers", "reading_transaction_system_state", "configuration_missing", "configuration_never_created", "configuration_invalid", @@ -364,7 +341,10 @@ static JsonBuilderObject machineStatusFetcher(WorkerEvents mMetrics, vectorfirst)) + //FIXME: this will not catch if the secondary address of the process was excluded + NetworkAddressList tempList; + tempList.address = it->first; + if (configuration.present() && !configuration.get().isExcludedServer(tempList)) notExcludedMap[machineId] = false; workerContribMap[machineId] ++; } @@ -569,7 +549,7 @@ struct RolesInfo { }; ACTOR static Future processStatusFetcher( - Reference>> db, std::vector workers, WorkerEvents pMetrics, + Reference> db, std::vector workers, WorkerEvents pMetrics, WorkerEvents mMetrics, WorkerEvents nMetrics, WorkerEvents errors, WorkerEvents traceFileOpenErrors, WorkerEvents programStarts, std::map> processIssues, vector> storageServers, @@ -627,18 +607,18 @@ ACTOR static Future processStatusFetcher( state RolesInfo roles; - roles.addRole("master", db->get().read().master); - roles.addRole("cluster_controller", db->get().read().clusterInterface.clientInterface); + roles.addRole("master", db->get().master); + roles.addRole("cluster_controller", 
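machineStatusFetcher above wraps the bare NetworkAddress in a NetworkAddressList because isExcludedServer now takes the full list, and the new FIXME notes that a process excluded only by its secondary address still slips through at that call site. A minimal sketch of the intended both-addresses check (simplified, hypothetical types):

```cpp
#include <cstdio>
#include <optional>
#include <set>
#include <string>

struct NetworkAddressList {
    std::string address;                  // primary
    std::optional<std::string> secondary; // e.g. a TLS listener
};

// An exclusion should match if *either* address of the process is excluded,
// which a single-address check silently misses for the secondary.
bool isExcludedServer(const std::set<std::string>& excluded, const NetworkAddressList& a) {
    if (excluded.count(a.address)) return true;
    return a.secondary && excluded.count(*a.secondary);
}

int main() {
    std::set<std::string> excluded = { "10.0.0.5:4600" };
    NetworkAddressList proc{ "10.0.0.5:4500", std::string("10.0.0.5:4600") };
    std::printf("excluded=%d\n", isExcludedServer(excluded, proc));
}
```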
db->get().clusterInterface.clientInterface); - if (db->get().read().distributor.present()) { - roles.addRole("data_distributor", db->get().read().distributor.get()); + if (db->get().distributor.present()) { + roles.addRole("data_distributor", db->get().distributor.get()); } - if (db->get().read().ratekeeper.present()) { - roles.addRole("ratekeeper", db->get().read().ratekeeper.get()); + if (db->get().ratekeeper.present()) { + roles.addRole("ratekeeper", db->get().ratekeeper.get()); } - for(auto& tLogSet : db->get().read().logSystemConfig.tLogs) { + for(auto& tLogSet : db->get().logSystemConfig.tLogs) { for(auto& it : tLogSet.logRouters) { if(it.present()) { roles.addRole("router", it.interf()); @@ -646,7 +626,7 @@ ACTOR static Future processStatusFetcher( } } - for(auto& old : db->get().read().logSystemConfig.oldTLogs) { + for(auto& old : db->get().logSystemConfig.oldTLogs) { for(auto& tLogSet : old.tLogs) { for(auto& it : tLogSet.logRouters) { if(it.present()) { @@ -689,7 +669,7 @@ ACTOR static Future processStatusFetcher( } state std::vector::const_iterator res; - state std::vector resolvers = db->get().read().resolvers; + state std::vector resolvers = db->get().resolvers; for(res = resolvers.begin(); res != resolvers.end(); ++res) { roles.addRole( "resolver", *res ); wait(yield()); @@ -850,7 +830,7 @@ ACTOR static Future processStatusFetcher( statusObj["roles"] = roles.getStatusForAddress(address); if (configuration.present()){ - statusObj["excluded"] = configuration.get().isExcludedServer(address); + statusObj["excluded"] = configuration.get().isExcludedServer(workerItr->interf.addresses()); } statusObj["class_type"] = workerItr->processClass.toString(); @@ -1551,17 +1531,17 @@ ACTOR static Future>> getStor return results; } -ACTOR static Future>> getTLogsAndMetrics(Reference>> db, std::unordered_map address_workers) { - vector servers = db->get().read().logSystemConfig.allPresentLogs(); +ACTOR static Future>> getTLogsAndMetrics(Reference> db, std::unordered_map address_workers) { + vector servers = db->get().logSystemConfig.allPresentLogs(); vector> results = wait(getServerMetrics(servers, address_workers, std::vector{ "TLogMetrics" })); return results; } -ACTOR static Future>> getProxiesAndMetrics(Reference>> db, std::unordered_map address_workers) { +ACTOR static Future>> getProxiesAndMetrics(Reference> db, std::unordered_map address_workers) { vector> results = wait(getServerMetrics( - db->get().read().client.proxies, address_workers, std::vector{ "GRVLatencyMetrics", "CommitLatencyMetrics" })); + db->get().client.proxies, address_workers, std::vector{ "GRVLatencyMetrics", "CommitLatencyMetrics" })); return results; } @@ -1571,7 +1551,7 @@ static int getExtraTLogEligibleZones(const vector& workers, const std::map> dcId_zone; for(auto const& worker : workers) { if(worker.processClass.machineClassFitness(ProcessClass::TLog) < ProcessClass::NeverAssign - && !configuration.isExcludedServer(worker.interf.address())) + && !configuration.isExcludedServer(worker.interf.addresses())) { allZones.insert(worker.interf.locality.zoneId().get()); if(worker.interf.locality.dcId().present()) { @@ -1629,7 +1609,7 @@ JsonBuilderObject getPerfLimit(TraceEventFields const& ratekeeper, double transP return perfLimit; } -ACTOR static Future workloadStatusFetcher(Reference>> db, vector workers, WorkerDetails mWorker, WorkerDetails rkWorker, +ACTOR static Future workloadStatusFetcher(Reference> db, vector workers, WorkerDetails mWorker, WorkerDetails rkWorker, JsonBuilderObject *qos, JsonBuilderObject 
*data_overlay, std::set *incomplete_reasons, Future>>> storageServerFuture) { state JsonBuilderObject statusObj; @@ -1644,7 +1624,7 @@ ACTOR static Future workloadStatusFetcher(Referenceget().read().client.proxies) { + for (auto &p : db->get().client.proxies) { auto worker = getWorker(workersMap, p.address()); if (worker.present()) proxyStatFutures.push_back(timeoutError(worker.get().interf.eventLogRequest.getReply(EventLogRequest(LiteralStringRef("ProxyMetrics"))), 1.0)); @@ -1859,11 +1839,11 @@ ACTOR static Future clusterSummaryStatisticsFetcher(WorkerEve return statusObj; } -static JsonBuilderArray oldTlogFetcher(int* oldLogFaultTolerance, Reference>> db, std::unordered_map const& address_workers) { +static JsonBuilderArray oldTlogFetcher(int* oldLogFaultTolerance, Reference> db, std::unordered_map const& address_workers) { JsonBuilderArray oldTlogsArray; - if(db->get().read().recoveryState >= RecoveryState::ACCEPTING_COMMITS) { - for(auto it : db->get().read().logSystemConfig.oldTLogs) { + if(db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS) { + for(auto it : db->get().logSystemConfig.oldTLogs) { JsonBuilderObject statusObj; JsonBuilderArray logsObj; Optional sat_log_replication_factor, sat_log_write_anti_quorum, sat_log_fault_tolerance, log_replication_factor, log_write_anti_quorum, log_fault_tolerance, remote_log_replication_factor, remote_log_fault_tolerance; @@ -1986,15 +1966,14 @@ static std::string getIssueDescription(std::string name) { } static std::map> getProcessIssuesAsMessages( - ProcessIssuesMap const& _issues) { + std::vector const& issues) { std::map> issuesMap; try { - ProcessIssuesMap issues = _issues; for (auto processIssues : issues) { - for (auto issue : processIssues.second.first) { + for (auto issue : processIssues.issues) { std::string issueStr = issue.toString(); - issuesMap[processIssues.first.toString()].push_back( + issuesMap[processIssues.address.toString()].push_back( JsonString::makeMessage(issueStr.c_str(), getIssueDescription(issueStr).c_str())); } } @@ -2109,7 +2088,7 @@ ACTOR Future layerStatusFetcher(Database cx, JsonBuilderArray return statusObj; } -ACTOR Future lockedStatusFetcher(Reference>> db, JsonBuilderArray *messages, std::set *incomplete_reasons) { +ACTOR Future lockedStatusFetcher(Reference> db, JsonBuilderArray *messages, std::set *incomplete_reasons) { state JsonBuilderObject statusObj; state Database cx = openDBOnServer(db, TaskPriority::DefaultEndpoint, true, false); // Open a new database connection that isn't lock-aware @@ -2181,10 +2160,10 @@ ACTOR Future> getActivePrimaryDC(Database cx, JsonBuilderArray* // constructs the cluster section of the json status output ACTOR Future clusterGetStatus( - Reference>> db, + Reference> db, Database cx, vector workers, - ProcessIssuesMap workerIssues, + std::vector workerIssues, std::map>* clientStatus, ServerCoordinators coordinators, std::vector incompatibleConnections, @@ -2201,7 +2180,7 @@ ACTOR Future clusterGetStatus( try { // Get the master Worker interface - Optional _mWorker = getWorker( workers, db->get().read().master.address() ); + Optional _mWorker = getWorker( workers, db->get().master.address() ); if (_mWorker.present()) { mWorker = _mWorker.get(); } else { @@ -2209,11 +2188,11 @@ ACTOR Future clusterGetStatus( } // Get the DataDistributor worker interface Optional _ddWorker; - if (db->get().read().distributor.present()) { - _ddWorker = getWorker( workers, db->get().read().distributor.get().address() ); + if (db->get().distributor.present()) { + _ddWorker = getWorker( 
workers, db->get().distributor.get().address() ); } - if (!db->get().read().distributor.present() || !_ddWorker.present()) { + if (!db->get().distributor.present() || !_ddWorker.present()) { messages.push_back(JsonString::makeMessage("unreachable_dataDistributor_worker", "Unable to locate the data distributor worker.")); } else { ddWorker = _ddWorker.get(); @@ -2221,11 +2200,11 @@ ACTOR Future clusterGetStatus( // Get the Ratekeeper worker interface Optional _rkWorker; - if (db->get().read().ratekeeper.present()) { - _rkWorker = getWorker( workers, db->get().read().ratekeeper.get().address() ); + if (db->get().ratekeeper.present()) { + _rkWorker = getWorker( workers, db->get().ratekeeper.get().address() ); } - if (!db->get().read().ratekeeper.present() || !_rkWorker.present()) { + if (!db->get().ratekeeper.present() || !_rkWorker.present()) { messages.push_back(JsonString::makeMessage("unreachable_ratekeeper_worker", "Unable to locate the ratekeeper worker.")); } else { rkWorker = _rkWorker.get(); @@ -2283,8 +2262,8 @@ ACTOR Future clusterGetStatus( state WorkerEvents programStarts = workerEventsVec[5].present() ? workerEventsVec[5].get().first : WorkerEvents(); state JsonBuilderObject statusObj; - if(db->get().read().recoveryCount > 0) { - statusObj["generation"] = db->get().read().recoveryCount; + if(db->get().recoveryCount > 0) { + statusObj["generation"] = db->get().recoveryCount; } state std::map> processIssues = @@ -2367,7 +2346,7 @@ ACTOR Future clusterGetStatus( state std::vector workerStatuses = wait(getAll(futures2)); int oldLogFaultTolerance = 100; - if(db->get().read().recoveryState >= RecoveryState::ACCEPTING_COMMITS && db->get().read().logSystemConfig.oldTLogs.size() > 0) { + if(db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS && db->get().logSystemConfig.oldTLogs.size() > 0) { statusObj["old_logs"] = oldTlogFetcher(&oldLogFaultTolerance, db, address_workers); } diff --git a/fdbserver/Status.h b/fdbserver/Status.h index 7a6537e94e..ac863b9c39 100644 --- a/fdbserver/Status.h +++ b/fdbserver/Status.h @@ -27,14 +27,14 @@ #include "fdbserver/MasterInterface.h" #include "fdbclient/ClusterInterface.h" -typedef Standalone> ProcessIssues; -typedef std::map> ProcessIssuesMap; +struct ProcessIssues { + NetworkAddress address; + Standalone> issues; -void setIssues(ProcessIssuesMap& issueMap, NetworkAddress const& addr, VectorRef const& issues, Optional& issueID); + ProcessIssues(NetworkAddress address, Standalone> issues) : address(address), issues(issues) {} +}; -void removeIssues(ProcessIssuesMap& issueMap, NetworkAddress const& addr, Optional& issueID); - -Future clusterGetStatus( Reference>> const& db, Database const& cx, vector const& workers, - ProcessIssuesMap const& workerIssues, std::map>* const& clientStatus, ServerCoordinators const& coordinators, std::vector const& incompatibleConnections, Version const& datacenterVersionDifference ); +Future clusterGetStatus( Reference> const& db, Database const& cx, vector const& workers, std::vector const& workerIssues, + std::map>* const& clientStatus, ServerCoordinators const& coordinators, std::vector const& incompatibleConnections, Version const& datacenterVersionDifference ); #endif diff --git a/fdbserver/StorageMetrics.actor.h b/fdbserver/StorageMetrics.actor.h index 5a0777b5ff..13e8d5fa87 100644 --- a/fdbserver/StorageMetrics.actor.h +++ b/fdbserver/StorageMetrics.actor.h @@ -378,7 +378,7 @@ struct StorageServerMetrics { } } - void getStorageMetrics( GetStorageMetricsRequest req, StorageBytes sb, double bytesInputRate ){ 
+ void getStorageMetrics( GetStorageMetricsRequest req, StorageBytes sb, double bytesInputRate, int64_t versionLag, double lastUpdate ){ GetStorageMetricsReply rep; // SOMEDAY: make bytes dynamic with hard disk space @@ -405,6 +405,9 @@ struct StorageServerMetrics { rep.bytesInputRate = bytesInputRate; + rep.versionLag = versionLag; + rep.lastUpdate = lastUpdate; + req.reply.send(rep); } diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 7ce0bb5d79..ae53590cba 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -491,6 +491,43 @@ struct LogData : NonCopyable, public ReferenceCounted { struct PeekTrackerData { std::map>> sequence_version; double lastUpdate; + + Tag tag; + + double lastLogged; + int64_t totalPeeks; + int64_t replyBytes; + int64_t duplicatePeeks; + double queueTime; + double queueMax; + double blockTime; + double blockMax; + double workTime; + double workMax; + + int64_t unblockedPeeks; + double idleTime; + double idleMax; + + PeekTrackerData() : lastUpdate(0) { + resetMetrics(); + } + + void resetMetrics() { + lastLogged = now(); + totalPeeks = 0; + replyBytes = 0; + duplicatePeeks = 0; + queueTime = 0; + queueMax = 0; + blockTime = 0; + blockMax = 0; + workTime = 0; + workMax = 0; + unblockedPeeks = 0; + idleTime = 0; + idleMax = 0; + } }; std::map peekTracker; @@ -1366,6 +1403,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere state BinaryWriter messages2(Unversioned()); state int sequence = -1; state UID peekId; + state double queueStart = now(); if(req.sequence.present()) { try { @@ -1376,6 +1414,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere } auto& trackerData = logData->peekTracker[peekId]; if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { + trackerData.tag = req.tag; trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled)); } auto seqBegin = trackerData.sequence_version.begin(); @@ -1392,8 +1431,15 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere throw operation_obsolete(); } + Future> fPrevPeekData = trackerData.sequence_version[sequence].getFuture(); + if(fPrevPeekData.isReady()) { + trackerData.unblockedPeeks++; + double t = now() - trackerData.lastUpdate; + if(t > trackerData.idleMax) trackerData.idleMax = t; + trackerData.idleTime += t; + } trackerData.lastUpdate = now(); - std::pair prevPeekData = wait(trackerData.sequence_version[sequence].getFuture()); + std::pair prevPeekData = wait(fPrevPeekData); req.begin = std::max(prevPeekData.first, req.begin); req.onlySpilled = prevPeekData.second; wait(yield()); @@ -1407,6 +1453,8 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere } } + state double blockStart = now(); + if( req.returnIfBlocked && logData->version.get() < req.begin ) { req.reply.sendError(end_of_stream()); if(req.sequence.present()) { @@ -1442,6 +1490,8 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere wait(delay(0, TaskPriority::TLogSpilledPeekReply)); } + state double workStart = now(); + Version poppedVer = poppedVersion(logData, req.tag); if(poppedVer > req.begin) { TLogPeekReply rep; @@ -1617,8 +1667,24 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere if(req.sequence.present()) { auto& trackerData = logData->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequence+1]; trackerData.lastUpdate = now(); + + double 
queueT = blockStart-queueStart; + double blockT = workStart-blockStart; + double workT = now()-workStart; + + trackerData.totalPeeks++; + trackerData.replyBytes += reply.messages.size(); + + if(queueT > trackerData.queueMax) trackerData.queueMax = queueT; + if(blockT > trackerData.blockMax) trackerData.blockMax = blockT; + if(workT > trackerData.workMax) trackerData.workMax = workT; + + trackerData.queueTime += queueT; + trackerData.blockTime += blockT; + trackerData.workTime += workT; + + auto& sequenceData = trackerData.sequence_version[sequence+1]; if(trackerData.sequence_version.size() && sequence+1 < trackerData.sequence_version.begin()->first) { req.reply.sendError(operation_obsolete()); if(!sequenceData.isSet()) { @@ -1631,6 +1697,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere return Void(); } if(sequenceData.isSet()) { + trackerData.duplicatePeeks++; if(sequenceData.getFuture().get().first != reply.end) { TEST(true); //tlog peek second attempt ended at a different version req.reply.sendError(operation_obsolete()); @@ -1956,6 +2023,47 @@ ACTOR Future cleanupPeekTrackers( LogData* logData ) { } } +ACTOR Future logPeekTrackers( LogData* logData ) { + loop { + int64_t logThreshold = 1; + if(logData->peekTracker.size() > SERVER_KNOBS->PEEK_LOGGING_AMOUNT) { + std::vector peekCounts; + peekCounts.reserve(logData->peekTracker.size()); + for( auto& it : logData->peekTracker ) { + peekCounts.push_back(it.second.totalPeeks); + } + size_t pivot = peekCounts.size()-SERVER_KNOBS->PEEK_LOGGING_AMOUNT; + std::nth_element(peekCounts.begin(), peekCounts.begin()+pivot, peekCounts.end()); + logThreshold = std::max(1,peekCounts[pivot]); + } + int logCount = 0; + for( auto& it : logData->peekTracker ) { + if(it.second.totalPeeks >= logThreshold) { + logCount++; + TraceEvent("PeekMetrics", logData->logId) + .detail("Tag", it.second.tag.toString()) + .detail("Elapsed", now() - it.second.lastLogged) + .detail("MeanReplyBytes", it.second.replyBytes/it.second.totalPeeks) + .detail("TotalPeeks", it.second.totalPeeks) + .detail("UnblockedPeeks", it.second.unblockedPeeks) + .detail("DuplicatePeeks", it.second.duplicatePeeks) + .detail("Sequence", it.second.sequence_version.size() ? 
it.second.sequence_version.begin()->first : -1) + .detail("IdleSeconds", it.second.idleTime) + .detail("IdleMax", it.second.idleMax) + .detail("QueueSeconds", it.second.queueTime) + .detail("QueueMax", it.second.queueMax) + .detail("BlockSeconds", it.second.blockTime) + .detail("BlockMax", it.second.blockMax) + .detail("WorkSeconds", it.second.workTime) + .detail("WorkMax", it.second.workMax); + it.second.resetMetrics(); + } + } + + wait( delay(SERVER_KNOBS->PEEK_LOGGING_DELAY * std::max(1,logCount)) ); + } +} + void getQueuingMetrics( TLogData* self, Reference logData, TLogQueuingMetricsRequest const& req ) { TLogQueuingMetricsReply reply; reply.localTime = now(); @@ -2302,6 +2410,7 @@ ACTOR Future tLogCore( TLogData* self, Reference logData, TLogInt logData->addActor.send( traceCounters("TLogMetrics", logData->logId, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &logData->cc, logData->logId.toString() + "/TLogMetrics")); logData->addActor.send( serveTLogInterface(self, tli, logData, warningCollectorInput) ); logData->addActor.send( cleanupPeekTrackers(logData.getPtr()) ); + logData->addActor.send( logPeekTrackers(logData.getPtr()) ); if(!logData->isPrimary) { std::vector tags; diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index bee56642d9..084fead508 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -4,13 +4,13 @@ * This source file is part of the FoundationDB open source project * * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * + * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
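The VersionedBTree hunks from here on are mostly clang-format churn, but the toString helpers the next hunk touches are a small pattern worth seeing whole: one primary template defers to a member toString(), and overloads extend the same entry point to containers and optionals, which is what lets debug_printf lines stringify nearly anything. The pattern in isolation, using only standard types:

```cpp
#include <cstdio>
#include <optional>
#include <string>
#include <vector>

// Primary template: any type exposing std::string toString() const works.
template <typename T>
std::string toString(const T& o) {
    return o.toString();
}

// Overloads extend the same debugging entry point to containers.
template <typename T>
std::string toString(const std::vector<T>& v) {
    std::string r = "{";
    for (size_t i = 0; i < v.size(); ++i) r += (i ? ", " : "") + toString(v[i]);
    return r + "}";
}

template <typename T>
std::string toString(const std::optional<T>& o) {
    return o ? toString(*o) : "<not present>";
}

struct PageID {
    long id;
    std::string toString() const { return "PageID{" + std::to_string(id) + "}"; }
};

int main() {
    std::vector<PageID> v{ { 1 }, { 7 } };
    std::printf("%s\n", toString(v).c_str());
}
```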
@@ -42,8 +42,8 @@ // Some convenience functions for debugging to stringify various structures // Classes can add compatibility by either specializing toString or implementing // std::string toString() const; -template -std::string toString(const T &o) { +template +std::string toString(const T& o) { return o.toString(); } @@ -52,27 +52,26 @@ std::string toString(StringRef s) { } std::string toString(LogicalPageID id) { - if(id == invalidLogicalPageID) { + if (id == invalidLogicalPageID) { return "LogicalPageID{invalid}"; } return format("LogicalPageID{%" PRId64 "}", id); } -template -std::string toString(const Standalone &s) { +template +std::string toString(const Standalone& s) { return toString((T)s); } -template -std::string toString(const T *begin, const T *end) { +template +std::string toString(const T* begin, const T* end) { std::string r = "{"; bool comma = false; - while(begin != end) { - if(comma) { + while (begin != end) { + if (comma) { r += ", "; - } - else { + } else { comma = true; } r += toString(*begin++); @@ -82,25 +81,25 @@ std::string toString(const T *begin, const T *end) { return r; } -template -std::string toString(const std::vector &v) { +template +std::string toString(const std::vector& v) { return toString(&v.front(), &v.back() + 1); } -template -std::string toString(const VectorRef &v) { +template +std::string toString(const VectorRef& v) { return toString(v.begin(), v.end()); } -template -std::string toString(const Optional &o) { - if(o.present()) { +template +std::string toString(const Optional& o) { + if (o.present()) { return toString(o.get()); } return ""; } -// A FIFO queue of T stored as a linked list of pages. +// A FIFO queue of T stored as a linked list of pages. // Main operations are pop(), pushBack(), pushFront(), and flush(). 
// // flush() will ensure all queue pages are written to the pager and move the unflushed @@ -133,64 +132,54 @@ std::string toString(const Optional &o) { // // Serialize *this to dst, return number of bytes written to dst // int writeToBytes(uint8_t *dst) const; // - must be supported by toString(object) (see above) -template +template struct FIFOQueueCodec { - static T readFromBytes(const uint8_t *src, int &bytesRead) { + static T readFromBytes(const uint8_t* src, int& bytesRead) { T x; bytesRead = x.readFromBytes(src); return x; } - static int bytesNeeded(const T &x) { - return x.bytesNeeded(); - } - static int writeToBytes(uint8_t *dst, const T &x) { - return x.writeToBytes(dst); - } + static int bytesNeeded(const T& x) { return x.bytesNeeded(); } + static int writeToBytes(uint8_t* dst, const T& x) { return x.writeToBytes(dst); } }; -template +template struct FIFOQueueCodec::value>::type> { static_assert(std::is_trivially_copyable::value); - static T readFromBytes(const uint8_t *src, int &bytesRead) { + static T readFromBytes(const uint8_t* src, int& bytesRead) { bytesRead = sizeof(T); - return *(T *)src; + return *(T*)src; } - static int bytesNeeded(const T &x) { - return sizeof(T); - } - static int writeToBytes(uint8_t *dst, const T &x) { - *(T *)dst = x; + static int bytesNeeded(const T& x) { return sizeof(T); } + static int writeToBytes(uint8_t* dst, const T& x) { + *(T*)dst = x; return sizeof(T); } }; -template> +template > class FIFOQueue { public: #pragma pack(push, 1) struct QueueState { - bool operator==(const QueueState &rhs) const { - return memcmp(this, &rhs, sizeof(QueueState)) == 0; - } + bool operator==(const QueueState& rhs) const { return memcmp(this, &rhs, sizeof(QueueState)) == 0; } LogicalPageID headPageID = invalidLogicalPageID; LogicalPageID tailPageID = invalidLogicalPageID; uint16_t headOffset; - // Note that there is no tail index because the tail page is always never-before-written and its index will start at 0 + // Note that there is no tail index because the tail page is always never-before-written and its index will + // start at 0 int64_t numPages; int64_t numEntries; std::string toString() const { - return format("{head: %s:%d tail: %s numPages: %" PRId64 " numEntries: %" PRId64 "}", ::toString(headPageID).c_str(), (int)headOffset, ::toString(tailPageID).c_str(), numPages, numEntries); + return format("{head: %s:%d tail: %s numPages: %" PRId64 " numEntries: %" PRId64 "}", + ::toString(headPageID).c_str(), (int)headOffset, ::toString(tailPageID).c_str(), numPages, + numEntries); } }; #pragma pack(pop) struct Cursor { - enum Mode { - NONE, - POP, - READONLY, - WRITE - }; + enum Mode { NONE, POP, READONLY, WRITE }; // The current page being read or written to LogicalPageID pageID; @@ -198,23 +187,23 @@ public: // The first page ID to be written to the pager, if this cursor has written anything LogicalPageID firstPageIDWritten; - // Offset after RawPage header to next read from or write to + // Offset after RawPage header to next read from or write to int offset; // A read cursor will not read this page (or beyond) LogicalPageID endPageID; Reference page; - FIFOQueue *queue; + FIFOQueue* queue; Future operation; Mode mode; - Cursor() : mode(NONE) { - } + Cursor() : mode(NONE) {} - // Initialize a cursor. - void init(FIFOQueue *q = nullptr, Mode m = NONE, LogicalPageID initialPageID = invalidLogicalPageID, int readOffset = 0, LogicalPageID endPage = invalidLogicalPageID) { - if(operation.isValid()) { + // Initialize a cursor. 
+ void init(FIFOQueue* q = nullptr, Mode m = NONE, LogicalPageID initialPageID = invalidLogicalPageID, + int readOffset = 0, LogicalPageID endPage = invalidLogicalPageID) { + if (operation.isValid()) { operation.cancel(); } queue = q; @@ -224,44 +213,45 @@ public: endPageID = endPage; page.clear(); - if(mode == POP || mode == READONLY) { + if (mode == POP || mode == READONLY) { // If cursor is not pointed at the end page then start loading it. // The end page will not have been written to disk yet. pageID = initialPageID; operation = (pageID == endPageID) ? Void() : loadPage(); - } - else { + } else { pageID = invalidLogicalPageID; - ASSERT(mode == WRITE || (initialPageID == invalidLogicalPageID && readOffset == 0 && endPage == invalidLogicalPageID)); + ASSERT(mode == WRITE || + (initialPageID == invalidLogicalPageID && readOffset == 0 && endPage == invalidLogicalPageID)); operation = Void(); } debug_printf("FIFOQueue::Cursor(%s) initialized\n", toString().c_str()); - if(mode == WRITE && initialPageID != invalidLogicalPageID) { + if (mode == WRITE && initialPageID != invalidLogicalPageID) { addNewPage(initialPageID, 0, true); } } // Since cursors can have async operations pending which modify their state they can't be copied cleanly - Cursor(const Cursor &other) = delete; + Cursor(const Cursor& other) = delete; // A read cursor can be initialized from a pop cursor - void initReadOnly(const Cursor &c) { + void initReadOnly(const Cursor& c) { ASSERT(c.mode == READONLY || c.mode == POP); init(c.queue, READONLY, c.pageID, c.offset, c.endPageID); } - ~Cursor() { - operation.cancel(); - } + ~Cursor() { operation.cancel(); } std::string toString() const { - if(mode == WRITE) { - return format("{WriteCursor %s:%p pos=%s:%d endOffset=%d}", queue->name.c_str(), this, ::toString(pageID).c_str(), offset, page ? raw()->endOffset : -1); + if (mode == WRITE) { + return format("{WriteCursor %s:%p pos=%s:%d endOffset=%d}", queue->name.c_str(), this, + ::toString(pageID).c_str(), offset, page ? raw()->endOffset : -1); } - if(mode == POP || mode == READONLY) { - return format("{ReadCursor %s:%p pos=%s:%d endOffset=%d endPage=%s}", queue->name.c_str(), this, ::toString(pageID).c_str(), offset, page ? raw()->endOffset : -1, ::toString(endPageID).c_str()); + if (mode == POP || mode == READONLY) { + return format("{ReadCursor %s:%p pos=%s:%d endOffset=%d endPage=%s}", queue->name.c_str(), this, + ::toString(pageID).c_str(), offset, page ? 
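Cursor::write_impl below pushes items through the queue's Codec parameter, whose two forms appeared earlier: a primary template that defers to the item's own readFromBytes/bytesNeeded/writeToBytes, and an enable_if specialization that handles trivially copyable types as raw bytes. A self-contained version of that codec pair (memcpy in place of the original's pointer casts, which keeps the sketch strictly defined):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <type_traits>

template <typename T, typename Enable = void>
struct FIFOQueueCodec {
    // Primary template: T provides its own serialization.
    static T readFromBytes(const uint8_t* src, int& bytesRead) {
        T x;
        bytesRead = x.readFromBytes(src);
        return x;
    }
    static int bytesNeeded(const T& x) { return x.bytesNeeded(); }
    static int writeToBytes(uint8_t* dst, const T& x) { return x.writeToBytes(dst); }
};

template <typename T>
struct FIFOQueueCodec<T, typename std::enable_if<std::is_trivially_copyable<T>::value>::type> {
    // Trivially copyable types can be moved byte-for-byte.
    static T readFromBytes(const uint8_t* src, int& bytesRead) {
        bytesRead = sizeof(T);
        T x;
        std::memcpy(&x, src, sizeof(T));
        return x;
    }
    static int bytesNeeded(const T&) { return sizeof(T); }
    static int writeToBytes(uint8_t* dst, const T& x) {
        std::memcpy(dst, &x, sizeof(T));
        return sizeof(T);
    }
};

int main() {
    uint8_t page[16];
    int n = FIFOQueueCodec<uint64_t>::writeToBytes(page, 42u);
    int bytesRead = 0;
    uint64_t v = FIFOQueueCodec<uint64_t>::readFromBytes(page, bytesRead);
    std::printf("wrote=%d read=%d value=%llu\n", n, bytesRead, (unsigned long long)v);
}
```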
raw()->endOffset : -1, + ::toString(endPageID).c_str()); } ASSERT(mode == NONE); return format("{NullCursor=%p}", this); @@ -272,28 +262,20 @@ public: LogicalPageID nextPageID; uint16_t nextOffset; uint16_t endOffset; - uint8_t * begin() { - return (uint8_t *)(this + 1); - } + uint8_t* begin() { return (uint8_t*)(this + 1); } }; #pragma pack(pop) - Future notBusy() { - return operation; - } + Future notBusy() { return operation; } // Returns true if any items have been written to the last page - bool pendingWrites() const { - return mode == WRITE && offset != 0; - } + bool pendingWrites() const { return mode == WRITE && offset != 0; } - RawPage * raw() const { - return ((RawPage *)(page->begin())); - } + RawPage* raw() const { return ((RawPage*)(page->begin())); } void setNext(LogicalPageID pageID, int offset) { ASSERT(mode == WRITE); - RawPage *p = raw(); + RawPage* p = raw(); p->nextPageID = pageID; p->nextOffset = offset; } @@ -314,21 +296,22 @@ public: VALGRIND_MAKE_MEM_DEFINED(raw()->begin(), offset); VALGRIND_MAKE_MEM_DEFINED(raw()->begin() + offset, queue->dataBytesPerPage - raw()->endOffset); queue->pager->updatePage(pageID, page); - if(firstPageIDWritten == invalidLogicalPageID) { + if (firstPageIDWritten == invalidLogicalPageID) { firstPageIDWritten = pageID; } } // Link the current page to newPageID:newOffset and then write it to the pager. - // If initializeNewPage is true a page buffer will be allocated for the new page and it will be initialized + // If initializeNewPage is true a page buffer will be allocated for the new page and it will be initialized // as a new tail page. void addNewPage(LogicalPageID newPageID, int newOffset, bool initializeNewPage) { ASSERT(mode == WRITE); ASSERT(newPageID != invalidLogicalPageID); - debug_printf("FIFOQueue::Cursor(%s) Adding page %s init=%d\n", toString().c_str(), ::toString(newPageID).c_str(), initializeNewPage); + debug_printf("FIFOQueue::Cursor(%s) Adding page %s init=%d\n", toString().c_str(), + ::toString(newPageID).c_str(), initializeNewPage); // Update existing page and write, if it exists - if(page) { + if (page) { setNext(newPageID, newOffset); debug_printf("FIFOQueue::Cursor(%s) Linked new page\n", toString().c_str()); writePage(); @@ -337,21 +320,20 @@ public: pageID = newPageID; offset = newOffset; - if(initializeNewPage) { + if (initializeNewPage) { debug_printf("FIFOQueue::Cursor(%s) Initializing new page\n", toString().c_str()); page = queue->pager->newPageBuffer(); setNext(0, 0); auto p = raw(); ASSERT(newOffset == 0); p->endOffset = 0; - } - else { + } else { page.clear(); } } // Write item to the next position in the current page or, if it won't fit, add a new page and write it there. 
- ACTOR static Future write_impl(Cursor *self, T item, Future start) { + ACTOR static Future write_impl(Cursor* self, T item, Future start) { ASSERT(self->mode == WRITE); // Wait for the previous operation to finish @@ -360,14 +342,16 @@ public: wait(previous); state int bytesNeeded = Codec::bytesNeeded(item); - if(self->pageID == invalidLogicalPageID || self->offset + bytesNeeded > self->queue->dataBytesPerPage) { - debug_printf("FIFOQueue::Cursor(%s) write(%s) page is full, adding new page\n", self->toString().c_str(), ::toString(item).c_str()); + if (self->pageID == invalidLogicalPageID || self->offset + bytesNeeded > self->queue->dataBytesPerPage) { + debug_printf("FIFOQueue::Cursor(%s) write(%s) page is full, adding new page\n", + self->toString().c_str(), ::toString(item).c_str()); LogicalPageID newPageID = wait(self->queue->pager->newPageID()); self->addNewPage(newPageID, 0, true); ++self->queue->numPages; wait(yield()); } - debug_printf("FIFOQueue::Cursor(%s) before write(%s)\n", self->toString().c_str(), ::toString(item).c_str()); + debug_printf("FIFOQueue::Cursor(%s) before write(%s)\n", self->toString().c_str(), + ::toString(item).c_str()); auto p = self->raw(); Codec::writeToBytes(p->begin() + self->offset, item); self->offset += bytesNeeded; @@ -376,14 +360,15 @@ public: return Void(); } - void write(const T &item) { + void write(const T& item) { Promise p; operation = write_impl(this, item, p.getFuture()); p.send(Void()); } - // Read the next item at the cursor (if <= upperBound), moving to a new page first if the current page is exhausted - ACTOR static Future> readNext_impl(Cursor *self, Optional upperBound, Future start) { + // Read the next item at the cursor (if <= upperBound), moving to a new page first if the current page is + // exhausted + ACTOR static Future> readNext_impl(Cursor* self, Optional upperBound, Future start) { ASSERT(self->mode == POP || self->mode == READONLY); // Wait for the previous operation to finish @@ -392,13 +377,13 @@ public: wait(previous); debug_printf("FIFOQueue::Cursor(%s) readNext begin\n", self->toString().c_str()); - if(self->pageID == invalidLogicalPageID || self->pageID == self->endPageID) { + if (self->pageID == invalidLogicalPageID || self->pageID == self->endPageID) { debug_printf("FIFOQueue::Cursor(%s) readNext returning nothing\n", self->toString().c_str()); return Optional(); } // We now know we are pointing to PageID and it should be read and used, but it may not be loaded yet. 
- if(!self->page) { + if (!self->page) { wait(self->loadPage()); wait(yield()); } @@ -409,46 +394,50 @@ public: int bytesRead; T result = Codec::readFromBytes(p->begin() + self->offset, bytesRead); - if(upperBound.present() && upperBound.get() < result) { - debug_printf("FIFOQueue::Cursor(%s) not popping %s, exceeds upper bound %s\n", - self->toString().c_str(), ::toString(result).c_str(), ::toString(upperBound.get()).c_str()); + if (upperBound.present() && upperBound.get() < result) { + debug_printf("FIFOQueue::Cursor(%s) not popping %s, exceeds upper bound %s\n", self->toString().c_str(), + ::toString(result).c_str(), ::toString(upperBound.get()).c_str()); return Optional(); } self->offset += bytesRead; - if(self->mode == POP) { + if (self->mode == POP) { --self->queue->numEntries; } - debug_printf("FIFOQueue::Cursor(%s) after read of %s\n", self->toString().c_str(), ::toString(result).c_str()); + debug_printf("FIFOQueue::Cursor(%s) after read of %s\n", self->toString().c_str(), + ::toString(result).c_str()); ASSERT(self->offset <= p->endOffset); - if(self->offset == p->endOffset) { + if (self->offset == p->endOffset) { debug_printf("FIFOQueue::Cursor(%s) Page exhausted\n", self->toString().c_str()); LogicalPageID oldPageID = self->pageID; self->pageID = p->nextPageID; self->offset = p->nextOffset; - if(self->mode == POP) { + if (self->mode == POP) { --self->queue->numPages; } self->page.clear(); - debug_printf("FIFOQueue::Cursor(%s) readNext page exhausted, moved to new page\n", self->toString().c_str()); + debug_printf("FIFOQueue::Cursor(%s) readNext page exhausted, moved to new page\n", + self->toString().c_str()); - if(self->mode == POP) { - // Freeing the old page must happen after advancing the cursor and clearing the page reference because - // freePage() could cause a push onto a queue that causes a newPageID() call which could pop() from this - // very same queue. - // Queue pages are freed at page 0 because they can be reused after the next commit. + if (self->mode == POP) { + // Freeing the old page must happen after advancing the cursor and clearing the page reference + // because freePage() could cause a push onto a queue that causes a newPageID() call which could + // pop() from this very same queue. Queue pages are freed at page 0 because they can be reused after + // the next commit. self->queue->pager->freePage(oldPageID, 0); } } - debug_printf("FIFOQueue(%s) %s(upperBound=%s) -> %s\n", self->queue->name.c_str(), (self->mode == POP ? "pop" : "peek"), ::toString(upperBound).c_str(), ::toString(result).c_str()); + debug_printf("FIFOQueue(%s) %s(upperBound=%s) -> %s\n", self->queue->name.c_str(), + (self->mode == POP ? 
"pop" : "peek"), ::toString(upperBound).c_str(), + ::toString(result).c_str()); return result; } // Read and move past the next item if is <= upperBound or if upperBound is not present - Future> readNext(const Optional &upperBound = {}) { - if(mode == NONE) { + Future> readNext(const Optional& upperBound = {}) { + if (mode == NONE) { return Optional(); } Promise p; @@ -460,18 +449,15 @@ public: }; public: - FIFOQueue() : pager(nullptr) { - } + FIFOQueue() : pager(nullptr) {} - ~FIFOQueue() { - newTailPage.cancel(); - } + ~FIFOQueue() { newTailPage.cancel(); } - FIFOQueue(const FIFOQueue &other) = delete; - void operator=(const FIFOQueue &rhs) = delete; + FIFOQueue(const FIFOQueue& other) = delete; + void operator=(const FIFOQueue& rhs) = delete; // Create a new queue at newPageID - void create(IPager2 *p, LogicalPageID newPageID, std::string queueName) { + void create(IPager2* p, LogicalPageID newPageID, std::string queueName) { debug_printf("FIFOQueue(%s) create from page %s\n", queueName.c_str(), toString(newPageID).c_str()); pager = p; name = queueName; @@ -486,7 +472,7 @@ public: } // Load an existing queue from its queue state - void recover(IPager2 *p, const QueueState &qs, std::string queueName) { + void recover(IPager2* p, const QueueState& qs, std::string queueName) { debug_printf("FIFOQueue(%s) recover from queue state %s\n", queueName.c_str(), qs.toString().c_str()); pager = p; name = queueName; @@ -500,7 +486,7 @@ public: debug_printf("FIFOQueue(%s) recovered\n", queueName.c_str()); } - ACTOR static Future>> peekAll_impl(FIFOQueue *self) { + ACTOR static Future>> peekAll_impl(FIFOQueue* self) { state Standalone> results; state Cursor c; c.initReadOnly(self->headReader); @@ -508,7 +494,7 @@ public: loop { Optional x = wait(c.readNext()); - if(!x.present()) { + if (!x.present()) { break; } results.push_back(results.arena(), x.get()); @@ -517,14 +503,10 @@ public: return results; } - Future>> peekAll() { - return peekAll_impl(this); - } + Future>> peekAll() { return peekAll_impl(this); } // Pop the next item on front of queue if it is <= upperBound or if upperBound is not present - Future> pop(Optional upperBound = {}) { - return headReader.readNext(upperBound); - } + Future> pop(Optional upperBound = {}) { return headReader.readNext(upperBound); } QueueState getState() const { QueueState s; @@ -538,12 +520,12 @@ public: return s; } - void pushBack(const T &item) { + void pushBack(const T& item) { debug_printf("FIFOQueue(%s) pushBack(%s)\n", name.c_str(), toString(item).c_str()); tailWriter.write(item); } - void pushFront(const T &item) { + void pushFront(const T& item) { debug_printf("FIFOQueue(%s) pushFront(%s)\n", name.c_str(), toString(item).c_str()); headWriter.write(item); } @@ -555,7 +537,8 @@ public: // Returns true if any most recently started operations on any cursors are not ready bool busy() { - return !headWriter.notBusy().isReady() || !headReader.notBusy().isReady() || !tailWriter.notBusy().isReady() || !newTailPage.isReady(); + return !headWriter.notBusy().isReady() || !headReader.notBusy().isReady() || !tailWriter.notBusy().isReady() || + !newTailPage.isReady(); } // preFlush() prepares this queue to be flushed to disk, but doesn't actually do it so the queue can still @@ -571,7 +554,7 @@ public: // - queue push() can call pager->newPageID() which can call pop() on the same or another queue // This creates a circular dependency with 1 or more queues when those queues are used by the pager // to manage free page IDs. 
- ACTOR static Future preFlush_impl(FIFOQueue *self) { + ACTOR static Future preFlush_impl(FIFOQueue* self) { debug_printf("FIFOQueue(%s) preFlush begin\n", self->name.c_str()); wait(self->notBusy()); @@ -579,14 +562,15 @@ public: // so see if any work is pending now. bool workPending = self->busy(); - if(!workPending) { + if (!workPending) { // A newly created or flushed queue starts out in a state where its tail page to be written to is empty. - // After pushBack() is called, this is no longer the case and never will be again until the queue is flushed. - // Before the non-empty tail page is written it must be linked to a new empty page for use after the next - // flush. (This is explained more at the top of FIFOQueue but it is because queue pages can only be written - // once because once they contain durable data a second write to link to a new page could corrupt the existing - // data if the subsequent commit never succeeds.) - if(self->newTailPage.isReady() && self->newTailPage.get() == invalidLogicalPageID && self->tailWriter.pendingWrites()) { + // After pushBack() is called, this is no longer the case and never will be again until the queue is + // flushed. Before the non-empty tail page is written it must be linked to a new empty page for use after + // the next flush. (This is explained more at the top of FIFOQueue but it is because queue pages can only + // be written once because once they contain durable data a second write to link to a new page could corrupt + // the existing data if the subsequent commit never succeeds.) + if (self->newTailPage.isReady() && self->newTailPage.get() == invalidLogicalPageID && + self->tailWriter.pendingWrites()) { self->newTailPage = self->pager->newPageID(); workPending = true; } @@ -596,16 +580,14 @@ public: return workPending; } - Future preFlush() { - return preFlush_impl(this); - } + Future preFlush() { return preFlush_impl(this); } void finishFlush() { debug_printf("FIFOQueue(%s) finishFlush start\n", name.c_str()); ASSERT(!busy()); // If a new tail page was allocated, link the last page of the tail writer to it. 
- if(newTailPage.get() != invalidLogicalPageID) { + if (newTailPage.get() != invalidLogicalPageID) { tailWriter.addNewPage(newTailPage.get(), 0, false); // The flush sequence allocated a page and added it to the queue so increment numPages ++numPages; @@ -618,7 +600,7 @@ public: // If the headWriter wrote anything, link its tail page to the headReader position and point the headReader // to the start of the headWriter - if(headWriter.pendingWrites()) { + if (headWriter.pendingWrites()) { headWriter.addNewPage(headReader.pageID, headReader.offset, false); headReader.pageID = headWriter.firstPageIDWritten; headReader.offset = 0; @@ -635,10 +617,10 @@ public: debug_printf("FIFOQueue(%s) finishFlush end\n", name.c_str()); } - ACTOR static Future flush_impl(FIFOQueue *self) { + ACTOR static Future flush_impl(FIFOQueue* self) { loop { bool notDone = wait(self->preFlush()); - if(!notDone) { + if (!notDone) { break; } } @@ -646,15 +628,13 @@ public: return Void(); } - Future flush() { - return flush_impl(this); - } + Future flush() { return flush_impl(this); } - IPager2 *pager; + IPager2* pager; int64_t numPages; int64_t numEntries; int dataBytesPerPage; - + Cursor headReader; Cursor tailWriter; Cursor headWriter; @@ -673,63 +653,44 @@ class FastAllocatedPage : public IPage, public FastAllocated, public: // Create a fast-allocated page with size total bytes INCLUDING checksum FastAllocatedPage(int size, int bufferSize) : logicalSize(size), bufferSize(bufferSize) { - buffer = (uint8_t *)allocateFast(bufferSize); + buffer = (uint8_t*)allocateFast(bufferSize); // Mark any unused page portion defined VALGRIND_MAKE_MEM_DEFINED(buffer + logicalSize, bufferSize - logicalSize); }; - virtual ~FastAllocatedPage() { - freeFast(bufferSize, buffer); - } + virtual ~FastAllocatedPage() { freeFast(bufferSize, buffer); } virtual Reference clone() const { - FastAllocatedPage *p = new FastAllocatedPage(logicalSize, bufferSize); + FastAllocatedPage* p = new FastAllocatedPage(logicalSize, bufferSize); memcpy(p->buffer, buffer, logicalSize); return Reference(p); } // Usable size, without checksum - int size() const { - return logicalSize - sizeof(Checksum); - } + int size() const { return logicalSize - sizeof(Checksum); } - uint8_t const* begin() const { - return buffer; - } + uint8_t const* begin() const { return buffer; } - uint8_t* mutate() { - return buffer; - } + uint8_t* mutate() { return buffer; } - void addref() const { - ReferenceCounted::addref(); - } + void addref() const { ReferenceCounted::addref(); } + + void delref() const { ReferenceCounted::delref(); } - void delref() const { - ReferenceCounted::delref(); - } - typedef uint32_t Checksum; - Checksum & getChecksum() { - return *(Checksum *)(buffer + size()); - } + Checksum& getChecksum() { return *(Checksum*)(buffer + size()); } - Checksum calculateChecksum(LogicalPageID pageID) { - return crc32c_append(pageID, buffer, size()); - } + Checksum calculateChecksum(LogicalPageID pageID) { return crc32c_append(pageID, buffer, size()); } - void updateChecksum(LogicalPageID pageID) { - getChecksum() = calculateChecksum(pageID); - } + void updateChecksum(LogicalPageID pageID) { getChecksum() = calculateChecksum(pageID); } + + bool verifyChecksum(LogicalPageID pageID) { return getChecksum() == calculateChecksum(pageID); } - bool verifyChecksum(LogicalPageID pageID) { - return getChecksum() == calculateChecksum(pageID); - } private: int logicalSize; int bufferSize; - uint8_t *buffer; + uint8_t* buffer; }; // Holds an index of recently used objects. 
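The hunks below reformat ObjectCache, an LRU keyed by an index type, with one twist: the oldest entry is only evicted if its item reports evictable(); otherwise it is rotated to the back and eviction stops for that insert, letting the cache temporarily exceed its limit. A condensed model using std::list in place of boost::intrusive::list (hypothetical Item type):

```cpp
#include <cstdio>
#include <iterator>
#include <list>
#include <unordered_map>
#include <utility>

struct Item {
    int value = 0;
    bool pinned = false; // stands in for !evictable()
    bool evictable() const { return !pinned; }
};

struct Cache {
    size_t sizeLimit;
    std::list<int> order; // front = oldest
    std::unordered_map<int, std::pair<Item, std::list<int>::iterator>> map;

    explicit Cache(size_t n) : sizeLimit(n) {}

    Item& get(int key) {
        auto it = map.find(key);
        if (it != map.end()) { // hit: move to back of eviction order
            order.splice(order.end(), order, it->second.second);
            return it->second.first;
        }
        order.push_back(key); // miss: insert at back
        auto& slot = map[key];
        slot.second = std::prev(order.end());
        while (map.size() > sizeLimit) {
            int victim = order.front();
            Item& v = map[victim].first;
            if (!v.evictable()) { // rotate the unevictable entry and stop
                order.splice(order.end(), order, order.begin());
                break;
            }
            order.pop_front();
            map.erase(victim);
        }
        return slot.first;
    }
};

int main() {
    Cache c(2);
    c.get(1).pinned = true; // unevictable, like a page still being written
    c.get(2);
    c.get(3); // would evict 1, but it is pinned; 1 rotates to the back instead
    std::printf("size=%zu\n", c.map.size());
}
```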
@@ -737,12 +698,11 @@ private:
 //   bool evictable() const; // return true if the entry can be evicted
 //   Future<Void> onEvictable() const; // ready when entry can be evicted
 // indicating if it is safe to evict.
-template<class IndexType, class ObjectType>
+template <class IndexType, class ObjectType>
 class ObjectCache : NonCopyable {
 	struct Entry : public boost::intrusive::list_base_hook<> {
-		Entry() : hits(0) {
-		}
+		Entry() : hits(0) {}
 		IndexType index;
 		ObjectType item;
 		int hits;
 	};
 	typedef boost::intrusive::list<Entry> EvictionOrderT;

public:
-	ObjectCache(int sizeLimit = 1) : sizeLimit(sizeLimit), cacheHits(0), cacheMisses(0), noHitEvictions(0), failedEvictions(0) {
-	}
+	ObjectCache(int sizeLimit = 1)
+	  : sizeLimit(sizeLimit), cacheHits(0), cacheMisses(0), noHitEvictions(0), failedEvictions(0) {}

 	void setSizeLimit(int n) {
 		ASSERT(n > 0);
@@ -762,9 +722,9 @@ public:
 	// Get the object for i if it exists, else return nullptr.
 	// If the object exists, its eviction order will NOT change as this is not a cache hit.
-	ObjectType * getIfExists(const IndexType &index) {
+	ObjectType* getIfExists(const IndexType& index) {
 		auto i = cache.find(index);
-		if(i != cache.end()) {
+		if (i != cache.end()) {
 			++i->second.hits;
 			return &i->second.item;
 		}
@@ -773,20 +733,19 @@ public:
 	// Get the object for i or create a new one.
 	// After a get(), the object for i is the last in evictionOrder.
-	ObjectType & get(const IndexType &index, bool noHit = false) {
-		Entry &entry = cache[index];
+	ObjectType& get(const IndexType& index, bool noHit = false) {
+		Entry& entry = cache[index];

 		// If entry is linked into evictionOrder then move it to the back of the order
-		if(entry.is_linked()) {
-			if(!noHit) {
+		if (entry.is_linked()) {
+			if (!noHit) {
 				++entry.hits;
 				++cacheHits;
 			}
 			// Move the entry to the back of the eviction order
 			evictionOrder.erase(evictionOrder.iterator_to(entry));
 			evictionOrder.push_back(entry);
-		}
-		else {
+		} else {
 			++cacheMisses;
 			// Finish initializing entry
 			entry.index = index;
@@ -795,25 +754,27 @@ public:
 			evictionOrder.push_back(entry);

 			// While the cache is too big, evict the oldest entry until the oldest entry can't be evicted.
-			while(cache.size() > sizeLimit) {
-				Entry &toEvict = evictionOrder.front();
-				debug_printf("Trying to evict %s to make room for %s\n", toString(toEvict.index).c_str(), toString(index).c_str());
+			while (cache.size() > sizeLimit) {
+				Entry& toEvict = evictionOrder.front();
+				debug_printf("Trying to evict %s to make room for %s\n", toString(toEvict.index).c_str(),
+				             toString(index).c_str());

-				// It's critical that we do not evict the item we just added (or the reference we return would be invalid) but
-				// since sizeLimit must be > 0, entry was just added to the end of the evictionOrder, and this loop will end
-				// if we move anything to the end of the eviction order, we can be guaranteed that entry != toEvict, so we
-				// do not need to check.
-				// If the item is not evictable then move it to the back of the eviction order and stop.
-				if(!toEvict.item.evictable()) {
+				// It's critical that we do not evict the item we just added (or the reference we return would be
+				// invalid) but since sizeLimit must be > 0, entry was just added to the end of the evictionOrder, and
+				// this loop will end if we move anything to the end of the eviction order, we can be guaranteed that
+				// entry != toEvict, so we do not need to check. If the item is not evictable then move it to the back
+				// of the eviction order and stop.
+ if (!toEvict.item.evictable()) { evictionOrder.erase(evictionOrder.iterator_to(toEvict)); evictionOrder.push_back(toEvict); ++failedEvictions; break; } else { - if(toEvict.hits == 0) { + if (toEvict.hits == 0) { ++noHitEvictions; } - debug_printf("Evicting %s to make room for %s\n", toString(toEvict.index).c_str(), toString(index).c_str()); + debug_printf("Evicting %s to make room for %s\n", toString(toEvict.index).c_str(), + toString(index).c_str()); evictionOrder.pop_front(); cache.erase(toEvict.index); } @@ -825,12 +786,12 @@ public: // Clears the cache, saving the entries, and then waits for each item to be evictable and evicts it. - ACTOR static Future<Void> clear_impl(ObjectCache *self) { + ACTOR static Future<Void> clear_impl(ObjectCache* self) { state ObjectCache::CacheT cache; state EvictionOrderT evictionOrder; // Swap cache contents to local state vars - // After this, no more entries will be added to or read from these + // After this, no more entries will be added to or read from these // structures so we know for sure that no page will become unevictable // after it is either evictable or onEvictable() is ready. cache.swap(self->cache); @@ -839,8 +800,8 @@ public: state typename EvictionOrderT::iterator i = evictionOrder.begin(); state typename EvictionOrderT::iterator iEnd = evictionOrder.begin(); - while(i != iEnd) { - if(!i->item.evictable()) { + while (i != iEnd) { + if (!i->item.evictable()) { wait(i->item.onEvictable()); } ++i; @@ -852,9 +813,7 @@ public: return Void(); } - Future<Void> clear() { - return clear_impl(this); - } + Future<Void> clear() { return clear_impl(this); } int count() const { ASSERT(evictionOrder.size() == cache.size()); @@ -872,13 +831,13 @@ private: EvictionOrderT evictionOrder; }; -ACTOR template <class T> Future<T> forwardError(Future<T> f, Promise<Void> target) { +ACTOR template <class T> +Future<T> forwardError(Future<T> f, Promise<Void> target) { try { T x = wait(f); return x; - } - catch(Error &e) { - if(e.code() != error_code_actor_cancelled && target.canBeSet()) { + } catch (Error& e) { + if (e.code() != error_code_actor_cancelled && target.canBeSet()) { target.sendError(e); } @@ -892,7 +851,7 @@ class DWALPagerSnapshot; // It does this internally mapping the original page ID to alternate page IDs by write version. // The page id remaps are kept in memory and also logged to a "remap queue" which must be reloaded on cold start. // To prevent the set of remaps from growing unboundedly, once a remap is old enough to be at or before the - // oldest pager version being maintained the remap can be "undone" by popping it from the remap queue, + // oldest pager version being maintained the remap can be "undone" by popping it from the remap queue, // copying the alternate page ID's data over top of the original page ID's data, and deleting the remap from memory.
// This process basically describes a "Delayed" Write-Ahead-Log (DWAL) because the remap queue and the newly allocated // alternate pages it references basically serve as a write ahead log for pages that will eventially be copied @@ -907,9 +866,7 @@ public: Version version; LogicalPageID pageID; - bool operator<(const DelayedFreePage &rhs) const { - return version < rhs.version; - } + bool operator<(const DelayedFreePage& rhs) const { return version < rhs.version; } std::string toString() const { return format("DelayedFreePage{%s @%" PRId64 "}", ::toString(pageID).c_str(), version); @@ -921,12 +878,11 @@ public: LogicalPageID originalPageID; LogicalPageID newPageID; - bool operator<(const RemappedPage &rhs) { - return version < rhs.version; - } + bool operator<(const RemappedPage& rhs) { return version < rhs.version; } std::string toString() const { - return format("RemappedPage(%s -> %s @%" PRId64 "}", ::toString(originalPageID).c_str(), ::toString(newPageID).c_str(), version); + return format("RemappedPage(%s -> %s @%" PRId64 "}", ::toString(originalPageID).c_str(), + ::toString(newPageID).c_str(), version); } }; @@ -938,10 +894,11 @@ public: // If the file already exists, pageSize might be different than desiredPageSize // Use pageCacheSizeBytes == 0 for default DWALPager(int desiredPageSize, std::string filename, int64_t pageCacheSizeBytes) - : desiredPageSize(desiredPageSize), filename(filename), pHeader(nullptr), pageCacheBytes(pageCacheSizeBytes) - { - if(pageCacheBytes == 0) { - pageCacheBytes = g_network->isSimulated() ? (BUGGIFY ? FLOW_KNOBS->BUGGIFY_SIM_PAGE_CACHE_4K : FLOW_KNOBS->SIM_PAGE_CACHE_4K) : FLOW_KNOBS->PAGE_CACHE_4K; + : desiredPageSize(desiredPageSize), filename(filename), pHeader(nullptr), pageCacheBytes(pageCacheSizeBytes) { + if (pageCacheBytes == 0) { + pageCacheBytes = g_network->isSimulated() + ? (BUGGIFY ? 
FLOW_KNOBS->BUGGIFY_SIM_PAGE_CACHE_4K : FLOW_KNOBS->SIM_PAGE_CACHE_4K) + : FLOW_KNOBS->PAGE_CACHE_4K; } commitFuture = Void(); recoverFuture = forwardError(recover(this), errorPromise); @@ -950,10 +907,10 @@ public: void setPageSize(int size) { logicalPageSize = size; physicalPageSize = smallestPhysicalBlock; - while(logicalPageSize > physicalPageSize) { + while (logicalPageSize > physicalPageSize) { physicalPageSize += smallestPhysicalBlock; } - if(pHeader != nullptr) { + if (pHeader != nullptr) { pHeader->pageSize = logicalPageSize; } pageCache.setSizeLimit(pageCacheBytes / physicalPageSize); @@ -963,14 +920,15 @@ public: memcpy(lastCommittedHeaderPage->mutate(), headerPage->begin(), smallestPhysicalBlock); } - ACTOR static Future recover(DWALPager *self) { + ACTOR static Future recover(DWALPager* self) { ASSERT(!self->recoverFuture.isValid()); self->remapUndoFuture = Void(); - int64_t flags = IAsyncFile::OPEN_UNCACHED | IAsyncFile::OPEN_UNBUFFERED | IAsyncFile::OPEN_READWRITE | IAsyncFile::OPEN_LOCK; + int64_t flags = IAsyncFile::OPEN_UNCACHED | IAsyncFile::OPEN_UNBUFFERED | IAsyncFile::OPEN_READWRITE | + IAsyncFile::OPEN_LOCK; state bool exists = fileExists(self->filename); - if(!exists) { + if (!exists) { flags |= IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE | IAsyncFile::OPEN_CREATE; } @@ -979,19 +937,20 @@ public: // Header page is always treated as having a page size of smallestPhysicalBlock self->setPageSize(smallestPhysicalBlock); self->lastCommittedHeaderPage = self->newPageBuffer(); - self->pLastCommittedHeader = (Header *)self->lastCommittedHeaderPage->begin(); + self->pLastCommittedHeader = (Header*)self->lastCommittedHeaderPage->begin(); state int64_t fileSize = 0; - if(exists) { + if (exists) { wait(store(fileSize, self->pageFile->size())); } - debug_printf("DWALPager(%s) recover exists=%d fileSize=%" PRId64 "\n", self->filename.c_str(), exists, fileSize); + debug_printf("DWALPager(%s) recover exists=%d fileSize=%" PRId64 "\n", self->filename.c_str(), exists, + fileSize); // TODO: If the file exists but appears to never have been successfully committed is this an error or // should recovery proceed with a new pager instance? // If there are at least 2 pages then try to recover the existing file - if(exists && fileSize >= (self->smallestPhysicalBlock * 2)) { + if (exists && fileSize >= (self->smallestPhysicalBlock * 2)) { debug_printf("DWALPager(%s) recovering using existing file\n"); state bool recoveredHeader = false; @@ -1000,44 +959,42 @@ public: wait(store(self->headerPage, self->readHeaderPage(self, 0))); // If the checksum fails for the header page, try to recover committed header backup from page 1 - if(!self->headerPage.castTo()->verifyChecksum(0)) { + if (!self->headerPage.castTo()->verifyChecksum(0)) { TraceEvent(SevWarn, "DWALPagerRecoveringHeader").detail("Filename", self->filename); - + wait(store(self->headerPage, self->readHeaderPage(self, 1))); - if(!self->headerPage.castTo()->verifyChecksum(1)) { - if(g_network->isSimulated()) { + if (!self->headerPage.castTo()->verifyChecksum(1)) { + if (g_network->isSimulated()) { // TODO: Detect if process is being restarted and only throw injected if so? 
throw io_error().asInjectedFault(); } Error e = checksum_failed(); - TraceEvent(SevError, "DWALPagerRecoveryFailed") - .detail("Filename", self->filename) - .error(e); + TraceEvent(SevError, "DWALPagerRecoveryFailed").detail("Filename", self->filename).error(e); throw e; } recoveredHeader = true; } - self->pHeader = (Header *)self->headerPage->begin(); + self->pHeader = (Header*)self->headerPage->begin(); - if(self->pHeader->formatVersion != Header::FORMAT_VERSION) { - Error e = internal_error(); // TODO: Something better? + if (self->pHeader->formatVersion != Header::FORMAT_VERSION) { + Error e = internal_error(); // TODO: Something better? TraceEvent(SevError, "DWALPagerRecoveryFailedWrongVersion") - .detail("Filename", self->filename) - .detail("Version", self->pHeader->formatVersion) - .detail("ExpectedVersion", Header::FORMAT_VERSION) - .error(e); + .detail("Filename", self->filename) + .detail("Version", self->pHeader->formatVersion) + .detail("ExpectedVersion", Header::FORMAT_VERSION) + .error(e); throw e; } self->setPageSize(self->pHeader->pageSize); - if(self->logicalPageSize != self->desiredPageSize) { + if (self->logicalPageSize != self->desiredPageSize) { TraceEvent(SevWarn, "DWALPagerPageSizeNotDesired") - .detail("Filename", self->filename) - .detail("ExistingPageSize", self->logicalPageSize) - .detail("DesiredPageSize", self->desiredPageSize); + .detail("Filename", self->filename) + .detail("ExistingPageSize", self->logicalPageSize) + .detail("DesiredPageSize", self->desiredPageSize); } self->freeList.recover(self, self->pHeader->freeList, "FreeListRecovered"); @@ -1045,15 +1002,15 @@ public: self->remapQueue.recover(self, self->pHeader->remapQueue, "RemapQueueRecovered"); Standalone> remaps = wait(self->remapQueue.peekAll()); - for(auto &r : remaps) { - if(r.newPageID != invalidLogicalPageID) { + for (auto& r : remaps) { + if (r.newPageID != invalidLogicalPageID) { self->remappedPages[r.originalPageID][r.version] = r.newPageID; } } // If the header was recovered from the backup at Page 1 then write and sync it to Page 0 before continuing. // If this fails, the backup header is still in tact for the next recovery attempt. - if(recoveredHeader) { + if (recoveredHeader) { // Write the header to page 0 wait(self->writeHeaderPage(0, self->headerPage)); @@ -1065,19 +1022,19 @@ public: debug_printf("DWALPager(%s) Header recovery complete.\n", self->filename.c_str()); } - // Update the last committed header with the one that was recovered (which is the last known committed header) + // Update the last committed header with the one that was recovered (which is the last known committed + // header) self->updateCommittedHeader(); self->addLatestSnapshot(); - } - else { - // Note: If the file contains less than 2 pages but more than 0 bytes then the pager was never successfully committed. - // A new pager will be created in its place. + } else { + // Note: If the file contains less than 2 pages but more than 0 bytes then the pager was never successfully + // committed. A new pager will be created in its place. // TODO: Is the right behavior? 
debug_printf("DWALPager(%s) creating new pager\n"); self->headerPage = self->newPageBuffer(); - self->pHeader = (Header *)self->headerPage->begin(); + self->pHeader = (Header*)self->headerPage->begin(); // Now that the header page has been allocated, set page size to desired self->setPageSize(self->desiredPageSize); @@ -1107,7 +1064,8 @@ public: self->pHeader->remapQueue = self->remapQueue.getState(); // Set remaining header bytes to \xff - memset(self->headerPage->mutate() + self->pHeader->size(), 0xff, self->headerPage->size() - self->pHeader->size()); + memset(self->headerPage->mutate() + self->pHeader->size(), 0xff, + self->headerPage->size() - self->pHeader->size()); // Since there is no previously committed header use the initial header for the initial commit. self->updateCommittedHeader(); @@ -1115,7 +1073,9 @@ public: wait(self->commit()); } - debug_printf("DWALPager(%s) recovered. committedVersion=%" PRId64 " logicalPageSize=%d physicalPageSize=%d\n", self->filename.c_str(), self->pHeader->committedVersion, self->logicalPageSize, self->physicalPageSize); + debug_printf("DWALPager(%s) recovered. committedVersion=%" PRId64 " logicalPageSize=%d physicalPageSize=%d\n", + self->filename.c_str(), self->pHeader->committedVersion, self->logicalPageSize, + self->physicalPageSize); return Void(); } @@ -1125,31 +1085,34 @@ public: // Returns the usable size of pages returned by the pager (i.e. the size of the page that isn't pager overhead). // For a given pager instance, separate calls to this function must return the same value. - int getUsablePageSize() override { - return logicalPageSize - sizeof(FastAllocatedPage::Checksum); - } + int getUsablePageSize() override { return logicalPageSize - sizeof(FastAllocatedPage::Checksum); } // Get a new, previously available page ID. 
The page will be considered in-use after the next commit // regardless of whether or not it was written to, until it is returned to the pager via freePage() - ACTOR static Future newPageID_impl(DWALPager *self) { + ACTOR static Future newPageID_impl(DWALPager* self) { // First try the free list Optional freePageID = wait(self->freeList.pop()); - if(freePageID.present()) { - debug_printf("DWALPager(%s) newPageID() returning %s from free list\n", self->filename.c_str(), toString(freePageID.get()).c_str()); + if (freePageID.present()) { + debug_printf("DWALPager(%s) newPageID() returning %s from free list\n", self->filename.c_str(), + toString(freePageID.get()).c_str()); return freePageID.get(); } - // Try to reuse pages up to the earlier of the oldest version set by the user or the oldest snapshot still in the snapshots list + // Try to reuse pages up to the earlier of the oldest version set by the user or the oldest snapshot still in + // the snapshots list ASSERT(!self->snapshots.empty()); - Optional delayedFreePageID = wait(self->delayedFreeList.pop(DelayedFreePage{self->effectiveOldestVersion(), 0})); - if(delayedFreePageID.present()) { - debug_printf("DWALPager(%s) newPageID() returning %s from delayed free list\n", self->filename.c_str(), toString(delayedFreePageID.get()).c_str()); + Optional delayedFreePageID = + wait(self->delayedFreeList.pop(DelayedFreePage{ self->effectiveOldestVersion(), 0 })); + if (delayedFreePageID.present()) { + debug_printf("DWALPager(%s) newPageID() returning %s from delayed free list\n", self->filename.c_str(), + toString(delayedFreePageID.get()).c_str()); return delayedFreePageID.get().pageID; } // Lastly, add a new page to the pager LogicalPageID id = self->newLastPageID(); - debug_printf("DWALPager(%s) newPageID() returning %s at end of file\n", self->filename.c_str(), toString(id).c_str()); + debug_printf("DWALPager(%s) newPageID() returning %s at end of file\n", self->filename.c_str(), + toString(id).c_str()); return id; }; @@ -1160,22 +1123,24 @@ public: return id; } - Future newPageID() override { - return newPageID_impl(this); - } + Future newPageID() override { return newPageID_impl(this); } Future writePhysicalPage(PhysicalPageID pageID, Reference page, bool header = false) { - debug_printf("DWALPager(%s) op=%s %s ptr=%p\n", filename.c_str(), (header ? "writePhysicalHeader" : "writePhysical"), toString(pageID).c_str(), page->begin()); + debug_printf("DWALPager(%s) op=%s %s ptr=%p\n", filename.c_str(), + (header ? "writePhysicalHeader" : "writePhysical"), toString(pageID).c_str(), page->begin()); VALGRIND_MAKE_MEM_DEFINED(page->begin(), page->size()); - ((Page *)page.getPtr())->updateChecksum(pageID); + ((Page*)page.getPtr())->updateChecksum(pageID); // Note: Not using forwardError here so a write error won't be discovered until commit time. int blockSize = header ? smallestPhysicalBlock : physicalPageSize; - Future f = holdWhile(page, map(pageFile->write(page->begin(), blockSize, (int64_t)pageID * blockSize), [=](Void) { - debug_printf("DWALPager(%s) op=%s %s ptr=%p\n", filename.c_str(), (header ? "writePhysicalHeaderComplete" : "writePhysicalComplete"), toString(pageID).c_str(), page->begin()); - return Void(); - })); + Future f = + holdWhile(page, map(pageFile->write(page->begin(), blockSize, (int64_t)pageID * blockSize), [=](Void) { + debug_printf("DWALPager(%s) op=%s %s ptr=%p\n", filename.c_str(), + (header ? 
"writePhysicalHeaderComplete" : "writePhysicalComplete"), + toString(pageID).c_str(), page->begin()); + return Void(); + })); operations.add(f); return f; } @@ -1186,8 +1151,11 @@ public: void updatePage(LogicalPageID pageID, Reference data) override { // Get the cache entry for this page, without counting it as a cache hit as we're replacing its contents now - PageCacheEntry &cacheEntry = pageCache.get(pageID, true); - debug_printf("DWALPager(%s) op=write %s cached=%d reading=%d writing=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.initialized(), cacheEntry.initialized() && cacheEntry.reading(), cacheEntry.initialized() && cacheEntry.writing()); + PageCacheEntry& cacheEntry = pageCache.get(pageID, true); + debug_printf("DWALPager(%s) op=write %s cached=%d reading=%d writing=%d\n", filename.c_str(), + toString(pageID).c_str(), cacheEntry.initialized(), + cacheEntry.initialized() && cacheEntry.reading(), + cacheEntry.initialized() && cacheEntry.writing()); // If the page is still being read then it's not also being written because a write places // the new content into readFuture when the write is launched, not when it is completed. @@ -1195,25 +1163,23 @@ public: // is necessary for remap erasure to work correctly since the oldest version of a page, located // at the original page ID, could have a pending read when that version is expired and the write // of the next newest version over top of the original page begins. - if(!cacheEntry.initialized()) { + if (!cacheEntry.initialized()) { cacheEntry.writeFuture = writePhysicalPage(pageID, data); - } - else if(cacheEntry.reading()) { + } else if (cacheEntry.reading()) { // Wait for the read to finish, then start the write. cacheEntry.writeFuture = map(success(cacheEntry.readFuture), [=](Void) { writePhysicalPage(pageID, data); return Void(); }); - } + } // If the page is being written, wait for this write before issuing the new write to ensure the // writes happen in the correct order - else if(cacheEntry.writing()) { + else if (cacheEntry.writing()) { cacheEntry.writeFuture = map(cacheEntry.writeFuture, [=](Void) { writePhysicalPage(pageID, data); return Void(); }); - } - else { + } else { cacheEntry.writeFuture = writePhysicalPage(pageID, data); } @@ -1227,7 +1193,7 @@ public: Future f = map(newPageID(), [=](LogicalPageID newPageID) { updatePage(newPageID, data); // TODO: Possibly limit size of remap queue since it must be recovered on cold start - RemappedPage r{v, pageID, newPageID}; + RemappedPage r{ v, pageID, newPageID }; remapQueue.pushBack(r); remappedPages[pageID][v] = newPageID; debug_printf("DWALPager(%s) pushed %s\n", filename.c_str(), RemappedPage(r).toString().c_str()); @@ -1239,62 +1205,71 @@ public: } void freePage(LogicalPageID pageID, Version v) override { - // If pageID has been remapped, then it can't be freed until all existing remaps for that page have been undone, so queue it for later deletion - if(remappedPages.find(pageID) != remappedPages.end()) { - debug_printf("DWALPager(%s) op=freeRemapped %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); - remapQueue.pushBack(RemappedPage{v, pageID, invalidLogicalPageID}); + // If pageID has been remapped, then it can't be freed until all existing remaps for that page have been undone, + // so queue it for later deletion + if (remappedPages.find(pageID) != remappedPages.end()) { + debug_printf("DWALPager(%s) op=freeRemapped %s @%" PRId64 " oldestVersion=%" PRId64 "\n", 
filename.c_str(), + toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); + remapQueue.pushBack(RemappedPage{ v, pageID, invalidLogicalPageID }); return; } // If v is older than the oldest version still readable then mark pageID as free as of the next commit - if(v < effectiveOldestVersion()) { - debug_printf("DWALPager(%s) op=freeNow %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); + if (v < effectiveOldestVersion()) { + debug_printf("DWALPager(%s) op=freeNow %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), + toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); freeList.pushBack(pageID); - } - else { + } else { // Otherwise add it to the delayed free list - debug_printf("DWALPager(%s) op=freeLater %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); - delayedFreeList.pushBack({v, pageID}); + debug_printf("DWALPager(%s) op=freeLater %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), + toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); + delayedFreeList.pushBack({ v, pageID }); } }; // Read a physical page from the page file. Note that header pages use a page size of smallestPhysicalBlock // If the user chosen physical page size is larger, then there will be a gap of unused space after the header pages // and before the user-chosen sized pages. - ACTOR static Future> readPhysicalPage(DWALPager *self, PhysicalPageID pageID, bool header = false) { - if(g_network->getCurrentTask() > TaskPriority::DiskRead) { + ACTOR static Future> readPhysicalPage(DWALPager* self, PhysicalPageID pageID, + bool header = false) { + if (g_network->getCurrentTask() > TaskPriority::DiskRead) { wait(delay(0, TaskPriority::DiskRead)); } - state Reference page = header ? Reference(new FastAllocatedPage(smallestPhysicalBlock, smallestPhysicalBlock)) : self->newPageBuffer(); - debug_printf("DWALPager(%s) op=readPhysicalStart %s ptr=%p\n", self->filename.c_str(), toString(pageID).c_str(), page->begin()); + state Reference page = + header ? Reference(new FastAllocatedPage(smallestPhysicalBlock, smallestPhysicalBlock)) + : self->newPageBuffer(); + debug_printf("DWALPager(%s) op=readPhysicalStart %s ptr=%p\n", self->filename.c_str(), toString(pageID).c_str(), + page->begin()); int blockSize = header ? smallestPhysicalBlock : self->physicalPageSize; // TODO: Could a dispatched read try to write to page after it has been destroyed if this actor is cancelled? 
int readBytes = wait(self->pageFile->read(page->mutate(), blockSize, (int64_t)pageID * blockSize)); - debug_printf("DWALPager(%s) op=readPhysicalComplete %s ptr=%p bytes=%d\n", self->filename.c_str(), toString(pageID).c_str(), page->begin(), readBytes); + debug_printf("DWALPager(%s) op=readPhysicalComplete %s ptr=%p bytes=%d\n", self->filename.c_str(), + toString(pageID).c_str(), page->begin(), readBytes); // Header reads are checked explicitly during recovery - if(!header) { - Page *p = (Page *)page.getPtr(); - if(!p->verifyChecksum(pageID)) { - debug_printf("DWALPager(%s) checksum failed for %s\n", self->filename.c_str(), toString(pageID).c_str()); + if (!header) { + Page* p = (Page*)page.getPtr(); + if (!p->verifyChecksum(pageID)) { + debug_printf("DWALPager(%s) checksum failed for %s\n", self->filename.c_str(), + toString(pageID).c_str()); Error e = checksum_failed(); TraceEvent(SevError, "DWALPagerChecksumFailed") - .detail("Filename", self->filename.c_str()) - .detail("PageID", pageID) - .detail("PageSize", self->physicalPageSize) - .detail("Offset", pageID * self->physicalPageSize) - .detail("CalculatedChecksum", p->calculateChecksum(pageID)) - .detail("ChecksumInPage", p->getChecksum()) - .error(e); + .detail("Filename", self->filename.c_str()) + .detail("PageID", pageID) + .detail("PageSize", self->physicalPageSize) + .detail("Offset", pageID * self->physicalPageSize) + .detail("CalculatedChecksum", p->calculateChecksum(pageID)) + .detail("ChecksumInPage", p->getChecksum()) + .error(e); throw e; } } return page; } - static Future> readHeaderPage(DWALPager *self, PhysicalPageID pageID) { + static Future> readHeaderPage(DWALPager* self, PhysicalPageID pageID) { return readPhysicalPage(self, pageID, true); } @@ -1302,10 +1277,10 @@ public: Future> readPage(LogicalPageID pageID, bool cacheable, bool noHit = false) override { // Use cached page if present, without triggering a cache hit. 
// Otherwise, read the page and return it but don't add it to the cache - if(!cacheable) { + if (!cacheable) { debug_printf("DWALPager(%s) op=readUncached %s\n", filename.c_str(), toString(pageID).c_str()); - PageCacheEntry *pCacheEntry = pageCache.getIfExists(pageID); - if(pCacheEntry != nullptr) { + PageCacheEntry* pCacheEntry = pageCache.getIfExists(pageID); + if (pCacheEntry != nullptr) { debug_printf("DWALPager(%s) op=readUncachedHit %s\n", filename.c_str(), toString(pageID).c_str()); return pCacheEntry->readFuture; } @@ -1314,10 +1289,13 @@ public: return forwardError(readPhysicalPage(this, (PhysicalPageID)pageID), errorPromise); } - PageCacheEntry &cacheEntry = pageCache.get(pageID, noHit); - debug_printf("DWALPager(%s) op=read %s cached=%d reading=%d writing=%d noHit=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.initialized(), cacheEntry.initialized() && cacheEntry.reading(), cacheEntry.initialized() && cacheEntry.writing(), noHit); + PageCacheEntry& cacheEntry = pageCache.get(pageID, noHit); + debug_printf("DWALPager(%s) op=read %s cached=%d reading=%d writing=%d noHit=%d\n", filename.c_str(), + toString(pageID).c_str(), cacheEntry.initialized(), + cacheEntry.initialized() && cacheEntry.reading(), cacheEntry.initialized() && cacheEntry.writing(), + noHit); - if(!cacheEntry.initialized()) { + if (!cacheEntry.initialized()) { debug_printf("DWALPager(%s) issuing actual read of %s\n", filename.c_str(), toString(pageID).c_str()); cacheEntry.readFuture = readPhysicalPage(this, (PhysicalPageID)pageID); cacheEntry.writeFuture = Void(); @@ -1330,16 +1308,17 @@ public: Future> readPageAtVersion(LogicalPageID pageID, Version v, bool cacheable, bool noHit) { auto i = remappedPages.find(pageID); - if(i != remappedPages.end()) { + if (i != remappedPages.end()) { auto j = i->second.upper_bound(v); - if(j != i->second.begin()) { + if (j != i->second.begin()) { --j; - debug_printf("DWALPager(%s) read %s @%" PRId64 " -> %s\n", filename.c_str(), toString(pageID).c_str(), v, toString(j->second).c_str()); + debug_printf("DWALPager(%s) read %s @%" PRId64 " -> %s\n", filename.c_str(), toString(pageID).c_str(), + v, toString(j->second).c_str()); pageID = j->second; } - } - else { - debug_printf("DWALPager(%s) read %s @%" PRId64 " (not remapped)\n", filename.c_str(), toString(pageID).c_str(), v); + } else { + debug_printf("DWALPager(%s) read %s @%" PRId64 " (not remapped)\n", filename.c_str(), + toString(pageID).c_str(), v); } return readPage(pageID, cacheable, noHit); @@ -1359,9 +1338,7 @@ public: // Get the oldest *readable* version, which is not the same as the oldest retained version as the version // returned could have been set as the oldest version in the pending commit - Version getOldestVersion() override { - return pHeader->oldestVersion; - }; + Version getOldestVersion() override { return pHeader->oldestVersion; }; // Calculate the *effective* oldest version, which can be older than the one set in the last commit since we // are allowing active snapshots to temporarily delay page reuse. 
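The remappedPages lookup in readPageAtVersion above is the heart of the read path: each original page ID maps to a version-ordered map of alternate page IDs, and a read at version v follows the newest remap at or before v. Below is a self-contained sketch of just that lookup, with simplified types; resolve() is a hypothetical helper standing in for the inline code above.

```cpp
#include <cstdint>
#include <cstdio>
#include <map>
#include <unordered_map>

using LogicalPageID = uint32_t;
using Version = int64_t;

// originalPageID -> (version -> alternate pageID), as in DWALPager::remappedPages
std::unordered_map<LogicalPageID, std::map<Version, LogicalPageID>> remappedPages;

LogicalPageID resolve(LogicalPageID pageID, Version v) {
	auto i = remappedPages.find(pageID);
	if (i != remappedPages.end()) {
		// upper_bound(v) is the first remap strictly newer than v; stepping back
		// one entry yields the newest remap at or before v, if one exists.
		auto j = i->second.upper_bound(v);
		if (j != i->second.begin()) {
			--j;
			return j->second;
		}
	}
	return pageID; // not remapped as of version v
}

int main() {
	// Page 7 was rewritten (remapped) at versions 100 and 200
	remappedPages[7] = { { 100, 21 }, { 200, 35 } };
	printf("read page 7 @v50  -> page %u\n", resolve(7, 50));  // 7 (before any remap)
	printf("read page 7 @v150 -> page %u\n", resolve(7, 150)); // 21
	printf("read page 7 @v250 -> page %u\n", resolve(7, 250)); // 35
	return 0;
}
```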
@@ -1369,27 +1346,28 @@ public: return std::min(pLastCommittedHeader->oldestVersion, snapshots.front().version); } - ACTOR static Future undoRemaps(DWALPager *self) { + ACTOR static Future undoRemaps(DWALPager* self) { state RemappedPage cutoff; cutoff.version = self->effectiveOldestVersion(); // TODO: Use parallel reads - // TODO: One run of this actor might write to the same original page more than once, in which case just unmap the latest + // TODO: One run of this actor might write to the same original page more than once, in which case just unmap + // the latest loop { - if(self->remapUndoStop) { + if (self->remapUndoStop) { break; } state Optional p = wait(self->remapQueue.pop(cutoff)); - if(!p.present()) { + if (!p.present()) { break; } debug_printf("DWALPager(%s) undoRemaps popped %s\n", self->filename.c_str(), p.get().toString().c_str()); - if(p.get().newPageID == invalidLogicalPageID) { - debug_printf("DWALPager(%s) undoRemaps freeing %s\n", self->filename.c_str(), p.get().toString().c_str()); + if (p.get().newPageID == invalidLogicalPageID) { + debug_printf("DWALPager(%s) undoRemaps freeing %s\n", self->filename.c_str(), + p.get().toString().c_str()); self->freePage(p.get().originalPageID, p.get().version); - } - else { + } else { // Read the data from the page that the original was mapped to Reference data = wait(self->readPage(p.get().newPageID, false)); @@ -1398,24 +1376,25 @@ public: // Remove the remap from this page, deleting the entry for the pageID if its map becomes empty auto i = self->remappedPages.find(p.get().originalPageID); - if(i->second.size() == 1) { + if (i->second.size() == 1) { self->remappedPages.erase(i); - } - else { + } else { i->second.erase(p.get().version); } - // Now that the remap has been undone nothing will read this page so it can be freed as of the next commit. + // Now that the remap has been undone nothing will read this page so it can be freed as of the next + // commit. self->freePage(p.get().newPageID, 0); } } - debug_printf("DWALPager(%s) undoRemaps stopped, remapQueue size is %d\n", self->filename.c_str(), self->remapQueue.numEntries); + debug_printf("DWALPager(%s) undoRemaps stopped, remapQueue size is %d\n", self->filename.c_str(), + self->remapQueue.numEntries); return Void(); } // Flush all queues so they have no operations pending. - ACTOR static Future flushQueues(DWALPager *self) { + ACTOR static Future flushQueues(DWALPager* self) { ASSERT(self->remapUndoFuture.isReady()); // Flush remap queue separately, it's not involved in free page management @@ -1429,7 +1408,7 @@ public: // Once preFlush() returns false for both queues then there are no more operations pending // on either queue. If preFlush() returns true for either queue in one loop execution then // it could have generated new work for itself or the other queue. 
- if(!freeBusy && !delayedFreeBusy) { + if (!freeBusy && !delayedFreeBusy) { break; } } @@ -1439,7 +1418,7 @@ public: return Void(); } - ACTOR static Future commit_impl(DWALPager *self) { + ACTOR static Future commit_impl(DWALPager* self) { debug_printf("DWALPager(%s) commit begin\n", self->filename.c_str()); // Write old committed header to Page 1 @@ -1461,19 +1440,21 @@ public: debug_printf("DWALPager(%s) Syncing\n", self->filename.c_str()); // Sync everything except the header - if(g_network->getCurrentTask() > TaskPriority::DiskWrite) { + if (g_network->getCurrentTask() > TaskPriority::DiskWrite) { wait(delay(0, TaskPriority::DiskWrite)); } wait(self->pageFile->sync()); - debug_printf("DWALPager(%s) commit version %" PRId64 " sync 1\n", self->filename.c_str(), self->pHeader->committedVersion); + debug_printf("DWALPager(%s) commit version %" PRId64 " sync 1\n", self->filename.c_str(), + self->pHeader->committedVersion); // Update header on disk and sync again. wait(self->writeHeaderPage(0, self->headerPage)); - if(g_network->getCurrentTask() > TaskPriority::DiskWrite) { + if (g_network->getCurrentTask() > TaskPriority::DiskWrite) { wait(delay(0, TaskPriority::DiskWrite)); } wait(self->pageFile->sync()); - debug_printf("DWALPager(%s) commit version %" PRId64 " sync 2\n", self->filename.c_str(), self->pHeader->committedVersion); + debug_printf("DWALPager(%s) commit version %" PRId64 " sync 2\n", self->filename.c_str(), + self->pHeader->committedVersion); // Update the last committed header for use in the next commit. self->updateCommittedHeader(); @@ -1497,19 +1478,13 @@ public: return commitFuture; } - Key getMetaKey() const override { - return pHeader->getMetaKey(); - } + Key getMetaKey() const override { return pHeader->getMetaKey(); } - void setCommitVersion(Version v) override { - pHeader->committedVersion = v; - } + void setCommitVersion(Version v) override { pHeader->committedVersion = v; } - void setMetaKey(KeyRef metaKey) override { - pHeader->setMetaKey(metaKey); - } - - ACTOR void shutdown(DWALPager *self, bool dispose) { + void setMetaKey(KeyRef metaKey) override { pHeader->setMetaKey(metaKey); } + + ACTOR void shutdown(DWALPager* self, bool dispose) { debug_printf("DWALPager(%s) shutdown cancel recovery\n", self->filename.c_str()); self->recoverFuture.cancel(); debug_printf("DWALPager(%s) shutdown cancel commit\n", self->filename.c_str()); @@ -1517,9 +1492,9 @@ public: debug_printf("DWALPager(%s) shutdown cancel remap\n", self->filename.c_str()); self->remapUndoFuture.cancel(); - if(self->errorPromise.canBeSet()) { + if (self->errorPromise.canBeSet()) { debug_printf("DWALPager(%s) shutdown sending error\n", self->filename.c_str()); - self->errorPromise.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress + self->errorPromise.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress } // Must wait for pending operations to complete, canceling them can cause a crash because the underlying @@ -1532,7 +1507,7 @@ public: // Unreference the file and clear self->pageFile.clear(); - if(dispose) { + if (dispose) { debug_printf("DWALPager(%s) shutdown deleting file\n", self->filename.c_str()); wait(IAsyncFileSystem::filesystem()->incrementalDeleteFile(self->filename, true)); } @@ -1541,21 +1516,13 @@ public: delete self; } - void dispose() override { - shutdown(this, true); - } + void dispose() override { shutdown(this, true); } - void close() override { - shutdown(this, false); - } + void close() override { shutdown(this, false); } - Future 
getError() override { - return errorPromise.getFuture(); - } - - Future onClosed() override { - return closedPromise.getFuture(); - } + Future getError() override { return errorPromise.getFuture(); } + + Future onClosed() override { return closedPromise.getFuture(); } StorageBytes getStorageBytes() override { ASSERT(recoverFuture.isReady()); @@ -1564,41 +1531,42 @@ public: g_network->getDiskBytes(parentDirectory(filename), free, total); int64_t pagerSize = pHeader->pageCount * physicalPageSize; - // It is not exactly known how many pages on the delayed free list are usable as of right now. It could be known, - // if each commit delayed entries that were freeable were shuffled from the delayed free queue to the free queue, - // but this doesn't seem necessary. + // It is not exactly known how many pages on the delayed free list are usable as of right now. It could be + // known, if each commit delayed entries that were freeable were shuffled from the delayed free queue to the + // free queue, but this doesn't seem necessary. int64_t reusable = (freeList.numEntries + delayedFreeList.numEntries) * physicalPageSize; return StorageBytes(free, total, pagerSize - reusable, free + reusable); } - ACTOR static Future getUserPageCount_cleanup(DWALPager *self) { + ACTOR static Future getUserPageCount_cleanup(DWALPager* self) { // Wait for the remap eraser to finish all of its work (not triggering stop) wait(self->remapUndoFuture); // Flush queues so there are no pending freelist operations wait(flushQueues(self)); - + return Void(); } // Get the number of pages in use by the pager's user Future getUserPageCount() override { return map(getUserPageCount_cleanup(this), [=](Void) { - int64_t userPages = pHeader->pageCount - 2 - freeList.numPages - freeList.numEntries - delayedFreeList.numPages - delayedFreeList.numEntries - remapQueue.numPages; - debug_printf("DWALPager(%s) userPages=%" PRId64 " totalPageCount=%" PRId64 " freeQueuePages=%" PRId64 " freeQueueCount=%" PRId64 " delayedFreeQueuePages=%" PRId64 " delayedFreeQueueCount=%" PRId64 " remapQueuePages=%" PRId64 " remapQueueCount=%" PRId64 "\n", - filename.c_str(), userPages, pHeader->pageCount, freeList.numPages, freeList.numEntries, delayedFreeList.numPages, delayedFreeList.numEntries, remapQueue.numPages, remapQueue.numEntries); + int64_t userPages = pHeader->pageCount - 2 - freeList.numPages - freeList.numEntries - + delayedFreeList.numPages - delayedFreeList.numEntries - remapQueue.numPages; + debug_printf("DWALPager(%s) userPages=%" PRId64 " totalPageCount=%" PRId64 " freeQueuePages=%" PRId64 + " freeQueueCount=%" PRId64 " delayedFreeQueuePages=%" PRId64 " delayedFreeQueueCount=%" PRId64 + " remapQueuePages=%" PRId64 " remapQueueCount=%" PRId64 "\n", + filename.c_str(), userPages, pHeader->pageCount, freeList.numPages, freeList.numEntries, + delayedFreeList.numPages, delayedFreeList.numEntries, remapQueue.numPages, + remapQueue.numEntries); return userPages; }); } - Future init() override { - return recoverFuture; - } + Future init() override { return recoverFuture; } - Version getLatestVersion() override { - return pLastCommittedHeader->committedVersion; - } + Version getLatestVersion() override { return pLastCommittedHeader->committedVersion; } private: ~DWALPager() {} @@ -1617,12 +1585,10 @@ private: FIFOQueue::QueueState delayedFreeList; FIFOQueue::QueueState remapQueue; Version committedVersion; - Version oldestVersion; + Version oldestVersion; int32_t metaKeySize; - KeyRef getMetaKey() const { - return KeyRef((const uint8_t *)(this + 1), 
metaKeySize); - } + KeyRef getMetaKey() const { return KeyRef((const uint8_t*)(this + 1), metaKeySize); } void setMetaKey(StringRef key) { ASSERT(key.size() < (smallestPhysicalBlock - sizeof(Header))); @@ -1632,9 +1598,7 @@ private: } } - int size() const { - return sizeof(Header) + metaKeySize; - } + int size() const { return sizeof(Header) + metaKeySize; } private: Header(); @@ -1645,26 +1609,18 @@ private: Future> readFuture; Future writeFuture; - bool initialized() const { - return readFuture.isValid(); - } + bool initialized() const { return readFuture.isValid(); } - bool reading() const { - return !readFuture.isReady(); - } + bool reading() const { return !readFuture.isReady(); } - bool writing() const { - return !writeFuture.isReady(); - } + bool writing() const { return !writeFuture.isReady(); } bool evictable() const { // Don't evict if a page is still being read or written return !reading() && !writing(); } - Future onEvictable() const { - return ready(readFuture) && writeFuture; - } + Future onEvictable() const { return ready(readFuture) && writeFuture; } }; // Physical page sizes will always be a multiple of 4k because AsyncFileNonDurable requires @@ -1672,18 +1628,18 @@ private: // Allowing a smaller 'logical' page size is very useful for testing. static constexpr int smallestPhysicalBlock = 4096; int physicalPageSize; - int logicalPageSize; // In simulation testing it can be useful to use a small logical page size + int logicalPageSize; // In simulation testing it can be useful to use a small logical page size int64_t pageCacheBytes; // The header will be written to / read from disk as a smallestPhysicalBlock sized chunk. Reference headerPage; - Header *pHeader; + Header* pHeader; int desiredPageSize; Reference lastCommittedHeaderPage; - Header *pLastCommittedHeader; + Header* pLastCommittedHeader; std::string filename; @@ -1691,7 +1647,7 @@ private: PageCacheT pageCache; Promise closedPromise; - Promise errorPromise; + Promise errorPromise; Future commitFuture; SignalableActorCollection operations; Future recoverFuture; @@ -1715,13 +1671,9 @@ private: }; struct SnapshotEntryLessThanVersion { - bool operator() (Version v, const SnapshotEntry &snapshot) { - return v < snapshot.version; - } + bool operator()(Version v, const SnapshotEntry& snapshot) { return v < snapshot.version; } - bool operator() (const SnapshotEntry &snapshot, Version v) { - return snapshot.version < v; - } + bool operator()(const SnapshotEntry& snapshot, Version v) { return snapshot.version < v; } }; // TODO: Better data structure @@ -1733,46 +1685,38 @@ private: // Prevents pager from reusing freed pages from version until the snapshot is destroyed class DWALPagerSnapshot : public IPagerSnapshot, public ReferenceCounted { public: - DWALPagerSnapshot(DWALPager *pager, Key meta, Version version, Future expiredFuture) : pager(pager), metaKey(meta), version(version), expired(expiredFuture) { - } - virtual ~DWALPagerSnapshot() { - } + DWALPagerSnapshot(DWALPager* pager, Key meta, Version version, Future expiredFuture) + : pager(pager), metaKey(meta), version(version), expired(expiredFuture) {} + virtual ~DWALPagerSnapshot() {} Future> getPhysicalPage(LogicalPageID pageID, bool cacheable, bool noHit) override { - if(expired.isError()) { + if (expired.isError()) { throw expired.getError(); } - return map(pager->readPageAtVersion(pageID, version, cacheable, noHit), [=](Reference p) { - return Reference(p); - }); + return map(pager->readPageAtVersion(pageID, version, cacheable, noHit), + [=](Reference p) { return 
Reference(p); }); } - Key getMetaKey() const override { - return metaKey; - } + Key getMetaKey() const override { return metaKey; } - Version getVersion() const override { - return version; - } + Version getVersion() const override { return version; } - void addref() override { - ReferenceCounted::addref(); - } + void addref() override { ReferenceCounted::addref(); } - void delref() override { - ReferenceCounted::delref(); - } + void delref() override { ReferenceCounted::delref(); } - DWALPager *pager; + DWALPager* pager; Future expired; Version version; Key metaKey; }; void DWALPager::expireSnapshots(Version v) { - debug_printf("DWALPager(%s) expiring snapshots through %" PRId64 " snapshot count %d\n", filename.c_str(), v, (int)snapshots.size()); - while(snapshots.size() > 1 && snapshots.front().version < v && snapshots.front().snapshot->isSoleOwner()) { - debug_printf("DWALPager(%s) expiring snapshot for %" PRId64 " soleOwner=%d\n", filename.c_str(), snapshots.front().version, snapshots.front().snapshot->isSoleOwner()); + debug_printf("DWALPager(%s) expiring snapshots through %" PRId64 " snapshot count %d\n", filename.c_str(), v, + (int)snapshots.size()); + while (snapshots.size() > 1 && snapshots.front().version < v && snapshots.front().snapshot->isSoleOwner()) { + debug_printf("DWALPager(%s) expiring snapshot for %" PRId64 " soleOwner=%d\n", filename.c_str(), + snapshots.front().version, snapshots.front().snapshot->isSoleOwner()); // The snapshot contract could be made such that the expired promise isn't need anymore. In practice it // probably is already not needed but it will gracefully handle the case where a user begins a page read // with a snapshot reference, keeps the page read future, and drops the snapshot reference. @@ -1785,7 +1729,7 @@ Reference DWALPager::getReadSnapshot(Version v) { ASSERT(!snapshots.empty()); auto i = std::upper_bound(snapshots.begin(), snapshots.end(), v, SnapshotEntryLessThanVersion()); - if(i == snapshots.begin()) { + if (i == snapshots.begin()) { throw version_invalid(); } --i; @@ -1794,35 +1738,30 @@ Reference DWALPager::getReadSnapshot(Version v) { void DWALPager::addLatestSnapshot() { Promise expired; - snapshots.push_back({ - pLastCommittedHeader->committedVersion, - expired, - Reference(new DWALPagerSnapshot(this, pLastCommittedHeader->getMetaKey(), pLastCommittedHeader->committedVersion, expired.getFuture())) - }); + snapshots.push_back({ pLastCommittedHeader->committedVersion, expired, + Reference(new DWALPagerSnapshot(this, pLastCommittedHeader->getMetaKey(), + pLastCommittedHeader->committedVersion, + expired.getFuture())) }); } - // TODO: Move this to a flow header once it is mature. 
struct SplitStringRef { StringRef a; StringRef b; - SplitStringRef(StringRef a = StringRef(), StringRef b = StringRef()) : a(a), b(b) { - } + SplitStringRef(StringRef a = StringRef(), StringRef b = StringRef()) : a(a), b(b) {} - SplitStringRef(Arena &arena, const SplitStringRef &toCopy) - : a(toStringRef(arena)), b() { - } + SplitStringRef(Arena& arena, const SplitStringRef& toCopy) : a(toStringRef(arena)), b() {} SplitStringRef prefix(int len) const { - if(len <= a.size()) { + if (len <= a.size()) { return SplitStringRef(a.substr(0, len)); } len -= a.size(); return SplitStringRef(a, b.substr(0, len)); } - StringRef toStringRef(Arena &arena) const { + StringRef toStringRef(Arena& arena) const { StringRef c = makeString(size(), arena); memcpy(mutateString(c), a.begin(), a.size()); memcpy(mutateString(c) + a.size(), b.begin(), b.size()); @@ -1834,82 +1773,66 @@ struct SplitStringRef { return Standalone(toStringRef(a), a); } - int size() const { - return a.size() + b.size(); - } + int size() const { return a.size() + b.size(); } - int expectedSize() const { - return size(); - } + int expectedSize() const { return size(); } - std::string toString() const { - return format("%s%s", a.toString().c_str(), b.toString().c_str()); - } + std::string toString() const { return format("%s%s", a.toString().c_str(), b.toString().c_str()); } - std::string toHexString() const { - return format("%s%s", a.toHexString().c_str(), b.toHexString().c_str()); - } + std::string toHexString() const { return format("%s%s", a.toHexString().c_str(), b.toHexString().c_str()); } struct const_iterator { - const uint8_t *ptr; - const uint8_t *end; - const uint8_t *next; + const uint8_t* ptr; + const uint8_t* end; + const uint8_t* next; - inline bool operator==(const const_iterator &rhs) const { - return ptr == rhs.ptr; - } + inline bool operator==(const const_iterator& rhs) const { return ptr == rhs.ptr; } - inline const_iterator & operator++() { + inline const_iterator& operator++() { ++ptr; - if(ptr == end) { + if (ptr == end) { ptr = next; } return *this; } - inline const_iterator & operator+(int n) { + inline const_iterator& operator+(int n) { ptr += n; - if(ptr >= end) { + if (ptr >= end) { ptr = next + (ptr - end); } return *this; } - inline uint8_t operator *() const { - return *ptr; - } + inline uint8_t operator*() const { return *ptr; } }; - inline const_iterator begin() const { - return {a.begin(), a.end(), b.begin()}; - } + inline const_iterator begin() const { return { a.begin(), a.end(), b.begin() }; } - inline const_iterator end() const { - return {b.end()}; - } + inline const_iterator end() const { return { b.end() }; } - template - int compare(const StringT &rhs) const { + template + int compare(const StringT& rhs) const { auto j = begin(); auto k = rhs.begin(); auto jEnd = end(); auto kEnd = rhs.end(); - while(j != jEnd && k != kEnd) { + while (j != jEnd && k != kEnd) { int cmp = *j - *k; - if(cmp != 0) { + if (cmp != 0) { return cmp; } } - // If we've reached the end of *this, then values are equal if rhs is also exhausted, otherwise *this is less than rhs - if(j == jEnd) { + // If we've reached the end of *this, then values are equal if rhs is also exhausted, otherwise *this is less + // than rhs + if (j == jEnd) { return k == kEnd ? 0 : -1; } return 1; } - }; // A BTree "page id" is actually a list of LogicalPageID's whose contents should be concatenated together. 
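SplitStringRef above lets the BTree treat two discontiguous fragments as one logical byte string, deferring any copy until toStringRef() is called with an Arena. Here is a rough usage sketch of the same idea using std::string_view in place of FDB's StringRef; SplitString is an illustrative stand-in, not the real struct (which also provides iterators and comparison).

```cpp
#include <cstdio>
#include <string>
#include <string_view>

struct SplitString {
	std::string_view a;
	std::string_view b;

	int size() const { return (int)(a.size() + b.size()); }

	// Byte at logical position i, crossing from fragment a into b as needed
	char at(int i) const { return i < (int)a.size() ? a[i] : b[i - a.size()]; }

	// First len logical bytes; the cut may land inside either fragment
	SplitString prefix(int len) const {
		if (len <= (int)a.size()) {
			return { a.substr(0, len), {} };
		}
		return { a, b.substr(0, len - a.size()) };
	}

	// Materialize into one contiguous string (the real struct copies into an Arena)
	std::string toString() const { return std::string(a) + std::string(b); }
};

int main() {
	SplitString s{ "hello, ", "world" };
	printf("%s (%d bytes)\n", s.toString().c_str(), s.size());  // hello, world (12 bytes)
	printf("prefix(9) = %s\n", s.prefix(9).toString().c_str()); // hello, wo
	printf("at(7) = %c\n", s.at(7));                            // w
	return 0;
}
```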
@@ -1924,577 +1847,497 @@ std::string toString(BTreePageID id) { struct RedwoodRecordRef { typedef uint8_t byte; - RedwoodRecordRef(KeyRef key = KeyRef(), Version ver = 0, Optional value = {}, uint32_t chunkTotal = 0, uint32_t chunkStart = 0) - : key(key), version(ver), value(value), chunk({chunkTotal, chunkStart}) - { - } + RedwoodRecordRef(KeyRef key = KeyRef(), Version ver = 0, Optional value = {}) + : key(key), version(ver), value(value) {} - RedwoodRecordRef(Arena &arena, const RedwoodRecordRef &toCopy) - : key(arena, toCopy.key), version(toCopy.version), chunk(toCopy.chunk) - { - if(toCopy.value.present()) { + RedwoodRecordRef(Arena& arena, const RedwoodRecordRef& toCopy) : key(arena, toCopy.key), version(toCopy.version) { + if (toCopy.value.present()) { value = ValueRef(arena, toCopy.value.get()); } } - RedwoodRecordRef(KeyRef key, Optional value, const byte intFields[14]) - : key(key), value(value) - { - deserializeIntFields(intFields); - } + KeyValueRef toKeyValueRef() const { return KeyValueRef(key, value.get()); } // RedwoodRecordRefs are used for both internal and leaf pages of the BTree. // Boundary records in internal pages are made from leaf records. // These functions make creating and working with internal page records more convenient. inline BTreePageID getChildPage() const { ASSERT(value.present()); - return BTreePageID((LogicalPageID *)value.get().begin(), value.get().size() / sizeof(LogicalPageID)); + return BTreePageID((LogicalPageID*)value.get().begin(), value.get().size() / sizeof(LogicalPageID)); } inline void setChildPage(BTreePageID id) { - value = ValueRef((const uint8_t *)id.begin(), id.size() * sizeof(LogicalPageID)); + value = ValueRef((const uint8_t*)id.begin(), id.size() * sizeof(LogicalPageID)); } - inline void setChildPage(Arena &arena, BTreePageID id) { - value = ValueRef(arena, (const uint8_t *)id.begin(), id.size() * sizeof(LogicalPageID)); + inline void setChildPage(Arena& arena, BTreePageID id) { + value = ValueRef(arena, (const uint8_t*)id.begin(), id.size() * sizeof(LogicalPageID)); } inline RedwoodRecordRef withPageID(BTreePageID id) const { - return RedwoodRecordRef(key, version, ValueRef((const uint8_t *)id.begin(), id.size() * sizeof(LogicalPageID)), chunk.total, chunk.start); + return RedwoodRecordRef(key, version, ValueRef((const uint8_t*)id.begin(), id.size() * sizeof(LogicalPageID))); } - inline RedwoodRecordRef withoutValue() const { - return RedwoodRecordRef(key, version, {}, chunk.total, chunk.start); - } + inline RedwoodRecordRef withoutValue() const { return RedwoodRecordRef(key, version); } - // Returns how many bytes are in common between the integer fields of *this and other, assuming that - // all values are BigEndian, version is 64 bits, chunk total is 24 bits, and chunk start is 24 bits - int getCommonIntFieldPrefix(const RedwoodRecordRef &other) const { - if(version != other.version) { - return clzll(version ^ other.version) >> 3; - } - - if(chunk.total != other.chunk.total) { - // the -1 is because we are only considering the lower 3 bytes - return 8 + (clz(chunk.total ^ other.chunk.total) >> 3) - 1; - } - - if(chunk.start != other.chunk.start) { - // the -1 is because we are only considering the lower 3 bytes - return 11 + (clz(chunk.start ^ other.chunk.start) >> 3) - 1; - } - - return 14; - } - - // Truncate (key, version, chunk.total, chunk.start) tuple to len bytes. + // Truncate (key, version, part) tuple to len bytes. 
void truncate(int len) { - if(len <= key.size()) { - key = key.substr(0, len); - version = 0; - chunk.total = 0; - chunk.start = 0; - } - else { - byte fields[intFieldArraySize]; - serializeIntFields(fields); - int end = len - key.size(); - for(int i = intFieldArraySize - 1; i >= end; --i) { - fields[i] = 0; - } - } + ASSERT(len <= key.size()); + key = key.substr(0, len); + version = 0; } - // Find the common prefix between two records, assuming that the first - // skip bytes are the same. - inline int getCommonPrefixLen(const RedwoodRecordRef &other, int skip = 0) const { - int skipStart = std::min(skip, key.size()); - int common = skipStart + commonPrefixLength(key.begin() + skipStart, other.key.begin() + skipStart, std::min(other.key.size(), key.size()) - skipStart); - - if(common == key.size() && key.size() == other.key.size()) { - common += getCommonIntFieldPrefix(other); - } - - return common; + // Find the common key prefix between two records, assuming that the first skipLen bytes are the same + inline int getCommonPrefixLen(const RedwoodRecordRef& other, int skipLen = 0) const { + int skipStart = std::min(skipLen, key.size()); + return skipStart + commonPrefixLength(key.begin() + skipStart, other.key.begin() + skipStart, + std::min(other.key.size(), key.size()) - skipStart); } - // Compares and orders by key, version, chunk.start, chunk.total, value - int compare(const RedwoodRecordRef &rhs, int skip = 0) const { + // Compares and orders by key, version, chunk.total, chunk.start, value + // This is the same order that delta compression uses for prefix borrowing + int compare(const RedwoodRecordRef& rhs, int skip = 0) const { int keySkip = std::min(skip, key.size()); int cmp = key.substr(keySkip).compare(rhs.key.substr(keySkip)); - if(cmp == 0) { + if (cmp == 0) { cmp = version - rhs.version; - if(cmp == 0) { - // It is assumed that in any data set there will never be more than one - // unique chunk total size for the same key and version, so sort by start, total - // Chunked (represented by chunk.total > 0) sorts higher than whole - cmp = chunk.start - rhs.chunk.start; - if(cmp == 0) { - cmp = chunk.total - rhs.chunk.total; - if(cmp == 0) { - cmp = value.compare(rhs.value); - } - } + if (cmp == 0) { + cmp = value.compare(rhs.value); } } return cmp; } - bool sameUserKey(const StringRef &k, int skipLen) const { - // Keys are the same if the sizes are the same and either the skipLen is longer or the non-skipped suffixes are the same. - return key.size() == k.size() && ( skipLen > key.size() || key.substr(skipLen) == k.substr(skipLen) ); + bool sameUserKey(const StringRef& k, int skipLen) const { + // Keys are the same if the sizes are the same and either the skipLen is longer or the non-skipped suffixes are + // the same. 
+ return (key.size() == k.size()) && (key.substr(skipLen) == k.substr(skipLen)); } - bool sameExceptValue(const RedwoodRecordRef &rhs, int skipLen = 0) const { - return sameUserKey(rhs.key, skipLen) && version == rhs.version && chunk.total == rhs.chunk.total && chunk.start == rhs.chunk.start; - } - - static const int intFieldArraySize = 14; - - // Write big endian values of version (64 bits), total (24 bits), and start (24 bits) fields - // to an array of 14 bytes - void serializeIntFields(byte *dst) const { - *(uint32_t *)(dst + 10) = bigEndian32(chunk.start); - *(uint32_t *)(dst + 7) = bigEndian32(chunk.total); - *(uint64_t *)dst = bigEndian64(version); - } - - // Initialize int fields from the array format that serializeIntFields produces - void deserializeIntFields(const byte *src) { - version = bigEndian64(*(uint64_t *)src); - chunk.total = bigEndian32(*(uint32_t *)(src + 7)) & 0xffffff; - chunk.start = bigEndian32(*(uint32_t *)(src + 10)) & 0xffffff; + bool sameExceptValue(const RedwoodRecordRef& rhs, int skipLen = 0) const { + return sameUserKey(rhs.key, skipLen) && version == rhs.version; } // TODO: Use SplitStringRef (unless it ends up being slower) KeyRef key; Optional value; Version version; - struct { - uint32_t total; - // TODO: Change start to chunk number? - uint32_t start; - } chunk; - int expectedSize() const { - return key.expectedSize() + value.expectedSize(); - } - - bool isMultiPart() const { - return chunk.total != 0; - } - - // Generate a kv shard from a complete kv - RedwoodRecordRef split(int start, int len) { - ASSERT(!isMultiPart()); - return RedwoodRecordRef(key, version, value.get().substr(start, len), value.get().size(), start); - } - - class Writer { - public: - Writer(byte *ptr) : wptr(ptr) {} - - byte *wptr; - - template void write(const T &in) { - *(T *)wptr = in; - wptr += sizeof(T); - } - - // Write a big endian 1 or 2 byte integer using the high bit of the first byte as an "extension" bit. - // Values > 15 bits in length are not valid input but this is not checked for. - void writeVarInt(int x) { - if(x >= 128) { - *wptr++ = (uint8_t)( (x >> 8) | 0x80 ); - } - *wptr++ = (uint8_t)x; - } - - void writeString(StringRef s) { - memcpy(wptr, s.begin(), s.size()); - wptr += s.size(); - } - - }; + int expectedSize() const { return key.expectedSize() + value.expectedSize(); } class Reader { public: - Reader(const void *ptr) : rptr((const byte *)ptr) {} + Reader(const void* ptr) : rptr((const byte*)ptr) {} - const byte *rptr; - - template T read() { - T r = *(const T *)rptr; - rptr += sizeof(T); - return r; - } - - // Read a big endian 1 or 2 byte integer using the high bit of the first byte as an "extension" bit. - int readVarInt() { - int x = *rptr++; - // If the high bit is set - if(x & 0x80) { - // Clear the high bit - x &= 0x7f; - // Shift low byte left - x <<= 8; - // Read the new low byte and OR it in - x |= *rptr++; - } - - return x; - } + const byte* rptr; StringRef readString(int len) { StringRef s(rptr, len); rptr += len; return s; } - - const byte * readBytes(int len) { - const byte *b = rptr; - rptr += len; - return b; - } }; -#pragma pack(push,1) +#pragma pack(push, 1) struct Delta { + uint8_t flags; + + // Four field sizing schemes ranging from 3 to 8 bytes, with 3 being the most common. 
+ union { + struct { + uint8_t prefixLength; + uint8_t suffixLength; + uint8_t valueLength; + } LengthFormat0; + + struct { + uint8_t prefixLength; + uint8_t suffixLength; + uint16_t valueLength; + } LengthFormat1; + + struct { + uint8_t prefixLength; + uint8_t suffixLength; + uint32_t valueLength; + } LengthFormat2; + + struct { + uint16_t prefixLength; + uint16_t suffixLength; + uint32_t valueLength; + } LengthFormat3; + }; + + struct int48_t { + static constexpr int64_t MASK = 0xFFFFFFFFFFFFLL; + int32_t high; + int16_t low; + }; + + static constexpr int LengthFormatSizes[] = { sizeof(LengthFormat0), sizeof(LengthFormat1), + sizeof(LengthFormat2), sizeof(LengthFormat3) }; + static constexpr int VersionDeltaSizes[] = { 0, sizeof(int32_t), sizeof(int48_t), sizeof(int64_t) }; + // Serialized Format // - // TODO: Optimize this format better. Non-versioned non-multipart records should have the lowest overhead. - // - // Byte 1 + // Flags - 1 byte // 1 bit - borrow source is prev ancestor (otherwise next ancestor) - // 1 bit - is deleted - // 1 bit - has value (this is different from having a zero-length value) - // 1 bit - has version - // 4 bits - length of suffix bytes for optional integer fields version, total bytes, start offset + // 1 bit - item is deleted + // 1 bit - has value (different from zero-length value, if 0 value len will be 0) + // 1 bits - has nonzero version + // 2 bits - version delta integer size code, maps to 0, 4, 6, 8 + // 2 bits - length fields format // - // Remaining field sizes are variable length. 1-2 byte ints use high byte to indicate use of second byte. - // 1-2 bytes - prefix length to borrow - // 1-2 bytes - key suffix length - // 1 byte - value length, if has_value + // Length fields using 3 to 8 bytes total depending on length fields format // - // Data bytes, variable length based on values above + // Byte strings // Key suffix bytes - // Optional int field suffix bytes // Value bytes + // Version delta bytes // - // For a series of RedwoodRecordRef's containing shards of the same KV pair where the key size is < 114 bytes (single byte prefixLen) - // the overhead per middle chunk is 9 bytes: - // 4 bytes of child pointers in the DeltaTree Node - // 1 flag byte - // 1 prefix borrow length byte - // 1 suffix length byte (which will be zero) - // 1 value length byte - // ~1 optional int field suffix byte describing chunk start position (higher bytes will be borrowed as part of prefix len)) - + enum EFlags { PREFIX_SOURCE_PREV = 0x80, IS_DELETED = 0x40, HAS_VALUE = 0x20, HAS_VERSION = 0x10, - INT_FIELD_SUFFIX_BITS = 0x0f + VERSION_DELTA_SIZE = 0xC, + LENGTHS_FORMAT = 0x03 }; - uint8_t flags; - - inline byte * data() { - return (byte *)(this + 1); + static inline int determineLengthFormat(int prefixLength, int suffixLength, int valueLength) { + // Large prefix or suffix length, which should be rare, is format 3 + if (prefixLength > 0xFF || suffixLength > 0xFF) { + return 3; + } else if (valueLength < 0x100) { + return 0; + } else if (valueLength < 0x10000) { + return 1; + } else { + return 2; + } } - inline const byte * data() const { - return (const byte *)(this + 1); + // Large prefix or suffix length, which should be rare, is format 3 + byte* data() const { + switch (flags & LENGTHS_FORMAT) { + case 0: + return (byte*)(&LengthFormat0 + 1); + case 1: + return (byte*)(&LengthFormat1 + 1); + case 2: + return (byte*)(&LengthFormat2 + 1); + case 3: + default: + return (byte*)(&LengthFormat3 + 1); + } } + int getKeyPrefixLength() const { + switch (flags & 
+			case 0:
+				return LengthFormat0.prefixLength;
+			case 1:
+				return LengthFormat1.prefixLength;
+			case 2:
+				return LengthFormat2.prefixLength;
+			case 3:
+			default:
+				return LengthFormat3.prefixLength;
+			}
+		}
+
+		int getKeySuffixLength() const {
+			switch (flags & LENGTHS_FORMAT) {
+			case 0:
+				return LengthFormat0.suffixLength;
+			case 1:
+				return LengthFormat1.suffixLength;
+			case 2:
+				return LengthFormat2.suffixLength;
+			case 3:
+			default:
+				return LengthFormat3.suffixLength;
+			}
+		}
+
+		int getValueLength() const {
+			switch (flags & LENGTHS_FORMAT) {
+			case 0:
+				return LengthFormat0.valueLength;
+			case 1:
+				return LengthFormat1.valueLength;
+			case 2:
+				return LengthFormat2.valueLength;
+			case 3:
+			default:
+				return LengthFormat3.valueLength;
+			}
+		}
+
+		StringRef getKeySuffix() const { return StringRef(data(), getKeySuffixLength()); }
+
+		StringRef getValue() const { return StringRef(data() + getKeySuffixLength(), getValueLength()); }
+
+		bool hasVersion() const { return flags & HAS_VERSION; }
+
+		int getVersionDeltaSizeBytes() const {
+			int code = (flags & VERSION_DELTA_SIZE) >> 2;
+			return VersionDeltaSizes[code];
+		}
+
+		static int getVersionDeltaSizeBytes(Version d) {
+			if (d == 0) {
+				return 0;
+			} else if (d == (int32_t)d) {
+				return sizeof(int32_t);
+			} else if (d == (d & int48_t::MASK)) {
+				return sizeof(int48_t);
+			}
+			return sizeof(int64_t);
+		}
+
+		// Decode the version delta at r using the size code in flags.  Returns Version
+		// rather than int because the 48 and 64 bit encodings do not fit in an int.
+		Version getVersionDelta(const uint8_t* r) const {
+			int code = (flags & VERSION_DELTA_SIZE) >> 2;
+			switch (code) {
+			case 0:
+				return 0;
+			case 1:
+				return *(int32_t*)r;
+			case 2:
+				return (((int64_t)((int48_t*)r)->high) << 16) | (((int48_t*)r)->low & 0xFFFF);
+			case 3:
+			default:
+				return *(int64_t*)r;
+			}
+		}
+
+		// Version delta size should be 0 before calling
+		int setVersionDelta(Version d, uint8_t* w) {
+			flags |= HAS_VERSION;
+			if (d == 0) {
+				return 0;
+			} else if (d == (int32_t)d) {
+				flags |= 1 << 2;
+				*(int32_t*)w = d;
+				return sizeof(int32_t);
+			} else if (d == (d & int48_t::MASK)) {
+				flags |= 2 << 2;
+				((int48_t*)w)->high = d >> 16;
+				((int48_t*)w)->low = d;
+				return sizeof(int48_t);
+			} else {
+				flags |= 3 << 2;
+				*(int64_t*)w = d;
+				return sizeof(int64_t);
+			}
+		}
+
+		bool hasValue() const { return flags & HAS_VALUE; }
+
 		void setPrefixSource(bool val) {
-			if(val) {
+			if (val) {
 				flags |= PREFIX_SOURCE_PREV;
-			}
-			else {
+			} else {
 				flags &= ~PREFIX_SOURCE_PREV;
 			}
 		}

-		bool getPrefixSource() const {
-			return flags & PREFIX_SOURCE_PREV;
-		}
+		bool getPrefixSource() const { return flags & PREFIX_SOURCE_PREV; }

 		void setDeleted(bool val) {
-			if(val) {
+			if (val) {
 				flags |= IS_DELETED;
-			}
-			else {
+			} else {
 				flags &= ~IS_DELETED;
 			}
 		}

-		bool getDeleted() const {
-			return flags & IS_DELETED;
-		}
+		bool getDeleted() const { return flags & IS_DELETED; }

-		RedwoodRecordRef apply(const RedwoodRecordRef &base, Arena &arena) const {
-			Reader r(data());
-
-			int intFieldSuffixLen = flags & INT_FIELD_SUFFIX_BITS;
-			int prefixLen = r.readVarInt();
-			int keySuffixLen = r.readVarInt();
-			int valueLen = (flags & HAS_VALUE) ? r.read<uint8_t>() : 0;
+		RedwoodRecordRef apply(const RedwoodRecordRef& base, Arena& arena) const {
+			int keyPrefixLen = getKeyPrefixLength();
+			int keySuffixLen = getKeySuffixLength();
+			int valueLen = hasValue() ? getValueLength() : 0;
 			StringRef k;

-			// Separate the borrowed key string byte count from the borrowed int field byte count
-			int keyPrefixLen = std::min(prefixLen, base.key.size());
-			int intFieldPrefixLen = prefixLen - keyPrefixLen;
-
+			Reader r(data());
 			// If there is a key suffix, reconstitute the complete key into a contiguous string
-			if(keySuffixLen > 0) {
+			if (keySuffixLen > 0) {
+				StringRef keySuffix = r.readString(keySuffixLen);
 				k = makeString(keyPrefixLen + keySuffixLen, arena);
 				memcpy(mutateString(k), base.key.begin(), keyPrefixLen);
-				memcpy(mutateString(k) + keyPrefixLen, r.readString(keySuffixLen).begin(), keySuffixLen);
-			}
-			else {
+				memcpy(mutateString(k) + keyPrefixLen, keySuffix.begin(), keySuffixLen);
+			} else {
+				// Otherwise just reference the base key's memory
 				k = base.key.substr(0, keyPrefixLen);
 			}

-			// Now decode the optional integer fields
-			const byte *intFieldSuffix = r.readBytes(intFieldSuffixLen);
-
-			// Create big endian array in which to reassemble the integer fields from prefix and suffix bytes
-			byte intFields[intFieldArraySize];
-
-			// If borrowing any bytes, get the source's integer field array
-			if(intFieldPrefixLen > 0) {
-				base.serializeIntFields(intFields);
-			}
-			else {
-				memset(intFields, 0, intFieldArraySize);
+			Optional<ValueRef> value;
+			if (hasValue()) {
+				value = r.readString(valueLen);
 			}

-			// Version offset is used to skip the version bytes in the int field array when version is missing (aka 0)
-			int versionOffset = ( (intFieldPrefixLen == 0) && (~flags & HAS_VERSION) ) ? 8 : 0;
-
-			// If there are suffix bytes, copy those into place after the prefix
-			if(intFieldSuffixLen > 0) {
-				memcpy(intFields + versionOffset + intFieldPrefixLen, intFieldSuffix, intFieldSuffixLen);
+			Version v = 0;
+			if (hasVersion()) {
+				v = base.version + getVersionDelta(r.rptr);
 			}

-			// Zero out any remaining bytes if the array was initialized from base
-			if(intFieldPrefixLen > 0) {
-				for(int i = versionOffset + intFieldPrefixLen + intFieldSuffixLen; i < intFieldArraySize; ++i) {
-					intFields[i] = 0;
-				}
-			}
-
-			return RedwoodRecordRef(k, flags & HAS_VALUE ? r.readString(valueLen) : Optional<ValueRef>(), intFields);
+			return RedwoodRecordRef(k, v, value);
 		}

 		int size() const {
-			Reader r(data());
-
-			int intFieldSuffixLen = flags & INT_FIELD_SUFFIX_BITS;
-			r.readVarInt();  // skip prefix length
-			int keySuffixLen = r.readVarInt();
-			int valueLen = (flags & HAS_VALUE) ? r.read<uint8_t>() : 0;
-
-			return sizeof(Delta) + r.rptr - data() + intFieldSuffixLen + valueLen + keySuffixLen;
+			int size = 1 + getVersionDeltaSizeBytes();
+			switch (flags & LENGTHS_FORMAT) {
+			case 0:
+				return size + sizeof(LengthFormat0) + LengthFormat0.suffixLength + LengthFormat0.valueLength;
+			case 1:
+				return size + sizeof(LengthFormat1) + LengthFormat1.suffixLength + LengthFormat1.valueLength;
+			case 2:
+				return size + sizeof(LengthFormat2) + LengthFormat2.suffixLength + LengthFormat2.valueLength;
+			case 3:
+			default:
+				return size + sizeof(LengthFormat3) + LengthFormat3.suffixLength + LengthFormat3.valueLength;
+			}
 		}

-		// Delta can't be determined without the RedwoodRecordRef upon which the Delta is based.
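+		// Worked example (editorial annotation, not part of the original patch):
+		// a delta with a 4 byte key suffix, a 10 byte value, and a version delta
+		// that fits in 32 bits uses LengthFormat0 and version size code 1, so
+		//   size() = 1 (flags) + 3 (LengthFormat0) + 4 (suffix) + 10 (value) + 4 (version delta) = 22 bytes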
 		std::string toString() const {
-			Reader r(data());
-
 			std::string flagString = " ";
-			if(flags & PREFIX_SOURCE_PREV) flagString += "PrefixSource ";
-			if(flags & IS_DELETED) flagString += "IsDeleted ";
-			if(flags & HAS_VERSION) flagString += "Version ";
-			if(flags & HAS_VALUE) flagString += "HasValue ";
+			if (flags & PREFIX_SOURCE_PREV) {
+				flagString += "PrefixSource|";
+			}
+			if (flags & IS_DELETED) {
+				flagString += "IsDeleted|";
+			}
+			if (hasValue()) {
+				flagString += "HasValue|";
+			}
+			if (hasVersion()) {
+				flagString += "HasVersion|";
+			}
+			int lengthFormat = flags & LENGTHS_FORMAT;

-			int intFieldSuffixLen = flags & INT_FIELD_SUFFIX_BITS;
-			int prefixLen = r.readVarInt();
-			int keySuffixLen = r.readVarInt();
-			int valueLen = (flags & HAS_VALUE) ? r.read<uint8_t>() : 0;
+			Reader r(data());
+			int prefixLen = getKeyPrefixLength();
+			int keySuffixLen = getKeySuffixLength();
+			int valueLen = getValueLength();

-			return format("len: %d  flags: %s  prefixLen: %d  keySuffixLen: %d  intFieldSuffix: %d  valueLen %d  raw: %s",
-				size(), flagString.c_str(), prefixLen, keySuffixLen, intFieldSuffixLen, valueLen, StringRef((const uint8_t *)this, size()).toHexString().c_str());
+			return format("lengthFormat: %d  totalDeltaSize: %d  flags: %s  prefixLen: %d  keySuffixLen: %d  "
+			              "versionDeltaSizeBytes: %d  valueLen %d  raw: %s",
+			              lengthFormat, size(), flagString.c_str(), prefixLen, keySuffixLen, getVersionDeltaSizeBytes(),
+			              valueLen, StringRef((const uint8_t*)this, size()).toHexString().c_str());
 		}
 	};

 	// Using this class as an alternative for Delta enables reading a DeltaTree while only decoding
 	// its values, so the Reader does not require the original prev/next ancestors.
 	struct DeltaValueOnly : Delta {
-		RedwoodRecordRef apply(const RedwoodRecordRef &base, Arena &arena) const {
-			Reader r(data());
+		RedwoodRecordRef apply(const RedwoodRecordRef& base, Arena& arena) const {
+			Optional<ValueRef> value;

-			// Skip prefix length
-			r.readVarInt();
-
-			int keySuffixLen = r.readVarInt();
+			if (hasValue()) {
+				value = getValue();
+			}

-			// Get value length
-			int valueLen = (flags & HAS_VALUE) ? r.read<uint8_t>() : 0;
-
-			// Skip key suffix bytes and int field suffix bytes
-			r.readString(keySuffixLen);
-			r.readBytes(flags & INT_FIELD_SUFFIX_BITS);
-
-			// Return record with only the optional value populated
-			return RedwoodRecordRef(StringRef(), 0, (flags & HAS_VALUE ? r.readString(valueLen) : Optional<ValueRef>()));
+			return RedwoodRecordRef(StringRef(), 0, value);
 		}
 	};
 #pragma pack(pop)

-	bool operator==(const RedwoodRecordRef &rhs) const {
-		return compare(rhs) == 0;
-	}
+	bool operator==(const RedwoodRecordRef& rhs) const { return compare(rhs) == 0; }

-	bool operator!=(const RedwoodRecordRef &rhs) const {
-		return compare(rhs) != 0;
-	}
+	bool operator!=(const RedwoodRecordRef& rhs) const { return compare(rhs) != 0; }

-	bool operator<(const RedwoodRecordRef &rhs) const {
-		return compare(rhs) < 0;
-	}
+	bool operator<(const RedwoodRecordRef& rhs) const { return compare(rhs) < 0; }

-	bool operator>(const RedwoodRecordRef &rhs) const {
-		return compare(rhs) > 0;
-	}
+	bool operator>(const RedwoodRecordRef& rhs) const { return compare(rhs) > 0; }

-	bool operator<=(const RedwoodRecordRef &rhs) const {
-		return compare(rhs) <= 0;
-	}
+	bool operator<=(const RedwoodRecordRef& rhs) const { return compare(rhs) <= 0; }

-	bool operator>=(const RedwoodRecordRef &rhs) const {
-		return compare(rhs) >= 0;
-	}
+	bool operator>=(const RedwoodRecordRef& rhs) const { return compare(rhs) >= 0; }

-	int deltaSize(const RedwoodRecordRef &base, bool worstCase = true, int skipLen = 0) const {
-		int size = sizeof(Delta);
-
-		if(value.present()) {
-			// value size byte
-			++size;
-			// value bytes
-			size += value.get().size();
-		}
-
-		// Size of prefix length
+	// Worst case overhead means to assume that the prefix and suffix lengths could each be
+	// as large as the whole key, and that the version delta could need one extra bit of
+	// magnitude, since the actual byte distribution in the final balanced tree is unknown.
+	int deltaSize(const RedwoodRecordRef& base, int skipLen, bool worstCaseOverhead) const {
 		int prefixLen = getCommonPrefixLen(base, skipLen);
-		size += (worstCase || prefixLen >= 128) ? 2 : 1;
+		int keySuffixLen = key.size() - prefixLen;
+		int valueLen = value.present() ? value.get().size() : 0;

-		int intFieldPrefixLen;
-
-		// First byte of suffix len
-		++size;
-
-		// Currently using a worst-guess guess where int fields in suffix are stored in their entirety if nonzero.
-		if(prefixLen < key.size()) {
-			int keySuffixLen = key.size() - prefixLen;
-			if(worstCase || keySuffixLen >= 128) {
-				// Second byte of suffix len
-				++size;
-			}
-			size += keySuffixLen;
-			intFieldPrefixLen = 0;
-		}
-		else {
-			intFieldPrefixLen = prefixLen - key.size();
-			if(worstCase) {
-				// Second byte of suffix len
-				++size;
-			}
+		int formatType;
+		int versionBytes;
+		if (worstCaseOverhead) {
+			formatType = Delta::determineLengthFormat(key.size(), key.size(), valueLen);
+			versionBytes = version == 0 ? 0 : Delta::getVersionDeltaSizeBytes(version << 1);
+		} else {
+			formatType = Delta::determineLengthFormat(prefixLen, keySuffixLen, valueLen);
+			versionBytes = version == 0 ? 0 : Delta::getVersionDeltaSizeBytes(version - base.version);
 		}

-		if(version == 0 && chunk.total == 0 && chunk.start == 0) {
-			// No int field suffix needed
-		}
-		else {
-			byte fields[intFieldArraySize];
-			serializeIntFields(fields);
-
-			const byte *end = fields + intFieldArraySize - 1;
-			int trailingNulls = 0;
-			while(*end-- == 0) {
-				++trailingNulls;
-			}
-
-			size += std::max(0, intFieldArraySize - intFieldPrefixLen - trailingNulls);
-			if(intFieldPrefixLen == 0 && version == 0) {
-				size -= 8;
-			}
-		}
-
-		return size;
+		return 1 + Delta::LengthFormatSizes[formatType] + keySuffixLen + valueLen + versionBytes;
 	}

 	// commonPrefix between *this and base can be passed if known
-	int writeDelta(Delta &d, const RedwoodRecordRef &base, int commonPrefix = -1) const {
-		d.flags = version == 0 ? 0 : Delta::HAS_VERSION;
+	int writeDelta(Delta& d, const RedwoodRecordRef& base, int keyPrefixLen = -1) const {
+		d.flags = value.present() ? Delta::HAS_VALUE : 0;

-		if(commonPrefix < 0) {
-			commonPrefix = getCommonPrefixLen(base, 0);
+		if (keyPrefixLen < 0) {
+			keyPrefixLen = getCommonPrefixLen(base, 0);
 		}

-		Writer w(d.data());
+		StringRef keySuffix = key.substr(keyPrefixLen);
+		int valueLen = value.present() ? value.get().size() : 0;

-		// prefix len
-		w.writeVarInt(commonPrefix);
+		int formatType = Delta::determineLengthFormat(keyPrefixLen, keySuffix.size(), valueLen);
+		d.flags |= formatType;

-		// key suffix len
-		StringRef keySuffix( (key.size() > commonPrefix) ? key.substr(commonPrefix) : StringRef());
-		w.writeVarInt(keySuffix.size());
-
-		// value len
-		if(value.present()) {
-			d.flags |= Delta::HAS_VALUE;
-			w.write<uint8_t>(value.get().size());
+		switch (formatType) {
+		case 0:
+			d.LengthFormat0.prefixLength = keyPrefixLen;
+			d.LengthFormat0.suffixLength = keySuffix.size();
+			d.LengthFormat0.valueLength = valueLen;
+			break;
+		case 1:
+			d.LengthFormat1.prefixLength = keyPrefixLen;
+			d.LengthFormat1.suffixLength = keySuffix.size();
+			d.LengthFormat1.valueLength = valueLen;
+			break;
+		case 2:
+			d.LengthFormat2.prefixLength = keyPrefixLen;
+			d.LengthFormat2.suffixLength = keySuffix.size();
+			d.LengthFormat2.valueLength = valueLen;
+			break;
+		case 3:
+		default:
+			d.LengthFormat3.prefixLength = keyPrefixLen;
+			d.LengthFormat3.suffixLength = keySuffix.size();
+			d.LengthFormat3.valueLength = valueLen;
+			break;
 		}

-		// key suffix bytes
-		w.writeString(keySuffix);
+		uint8_t* wptr = d.data();
+		// Write key suffix string
+		wptr = keySuffix.copyTo(wptr);

-		// extra int fields suffix
-		// This is a common case, where no int suffix is needed
-		if(version == 0 && chunk.total == 0 && chunk.start == 0) {
-			// The suffixLen bits in flags are already zero, so nothing to do here.
-		}
-		else {
-			byte fields[intFieldArraySize];
-			serializeIntFields(fields);
-
-			// Find the position of the first null byte from the right
-			// This for loop has no endPos > 0 check because it is known that the array contains non-null bytes
-			int endPos;
-			for(endPos = intFieldArraySize; fields[endPos - 1] == 0; --endPos);
-
-			// Start copying after any prefix bytes that matched the int fields of the base
-			int intFieldPrefixLen = std::max(0, commonPrefix - key.size());
-			int startPos = intFieldPrefixLen + (intFieldPrefixLen == 0 && version == 0 ? 8 : 0);
-			int suffixLen = std::max(0, endPos - startPos);
-
-			if(suffixLen > 0) {
-				w.writeString(StringRef(fields + startPos, suffixLen));
-				d.flags |= suffixLen;
-			}
+		// Write value bytes
+		if (value.present()) {
+			wptr = value.get().copyTo(wptr);
 		}

-		// value
-		if(value.present()) {
-			w.writeString(value.get());
+		if (version != 0) {
+			wptr += d.setVersionDelta(version - base.version, wptr);
 		}

-		return w.wptr - d.data() + sizeof(Delta);
+		return wptr - (uint8_t*)&d;
 	}

-	template<typename StringRefT>
-	static std::string kvformat(StringRefT s, int hexLimit = -1) {
+	static std::string kvformat(StringRef s, int hexLimit = -1) {
 		bool hex = false;

-		for(auto c : s) {
-			if(!isprint(c)) {
+		for (auto c : s) {
+			if (!isprint(c)) {
 				hex = true;
 				break;
 			}
@@ -2503,21 +2346,17 @@ struct RedwoodRecordRef {
 		return hex ? s.toHexString(hexLimit) : s.toString();
 	}

-	std::string toString(int hexLimit = 15) const {
+	std::string toString(bool leaf = true) const {
 		std::string r;
-		r += format("'%s'@%" PRId64, kvformat(key, hexLimit).c_str(), version);
-		r += format("[%u/%u]->", chunk.start, chunk.total);
-		if(value.present()) {
-			// Assume that values the size of a page ID are page IDs.  It's not perfect but it's just for debugging.
-			if(value.get().size() == sizeof(LogicalPageID)) {
+		r += format("'%s'@%" PRId64 " => ", kvformat(key).c_str(), version);
+		if (value.present()) {
+			if (leaf) {
+				r += format("'%s'", kvformat(value.get()).c_str());
+			} else {
 				r += format("[%s]", ::toString(getChildPage()).c_str());
 			}
-			else {
-				r += format("'%s'", kvformat(value.get(), hexLimit).c_str());
-			}
-		}
-		else {
-			r += "null";
+		} else {
+			r += "(absent)";
 		}
 		return r;
 	}
@@ -2527,7 +2366,7 @@ struct BTreePage {
 	typedef DeltaTree<RedwoodRecordRef> BinaryTree;
 	typedef DeltaTree<RedwoodRecordRef, RedwoodRecordRef::DeltaValueOnly> ValueTree;

-#pragma pack(push,1)
+#pragma pack(push, 1)
 	struct {
 		uint8_t height;
 		uint32_t kvBytes;
@@ -2535,33 +2374,27 @@ struct BTreePage {
 #pragma pack(pop)

 	int size() const {
-		const BinaryTree *t = &tree();
-		return (uint8_t *)t - (uint8_t *)this + t->size();
+		const BinaryTree* t = &tree();
+		return (uint8_t*)t - (uint8_t*)this + t->size();
 	}

-	bool isLeaf() const {
-		return height == 1;
-	}
+	bool isLeaf() const { return height == 1; }

-	BinaryTree & tree() {
-		return *(BinaryTree *)(this + 1);
-	}
+	BinaryTree& tree() { return *(BinaryTree*)(this + 1); }

-	const BinaryTree & tree() const {
-		return *(const BinaryTree *)(this + 1);
-	}
+	const BinaryTree& tree() const { return *(const BinaryTree*)(this + 1); }

-	const ValueTree & valueTree() const {
-		return *(const ValueTree *)(this + 1);
-	}
+	const ValueTree& valueTree() const { return *(const ValueTree*)(this + 1); }

-	std::string toString(bool write, BTreePageID id, Version ver, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound) const {
+	std::string toString(bool write, BTreePageID id, Version ver, const RedwoodRecordRef* lowerBound,
+	                     const RedwoodRecordRef* upperBound) const {
 		std::string r;
-		r += format("BTreePage op=%s %s @%" PRId64 " ptr=%p height=%d count=%d kvBytes=%d\n  lowerBound: %s\n  upperBound: %s\n",
-			write ? "write" : "read", ::toString(id).c_str(), ver, this, height, (int)tree().numItems, (int)kvBytes,
-			lowerBound->toString().c_str(), upperBound->toString().c_str());
+		r += format("BTreePage op=%s %s @%" PRId64
+		            " ptr=%p height=%d count=%d kvBytes=%d\n  lowerBound: %s\n  upperBound: %s\n",
"write" : "read", ::toString(id).c_str(), ver, this, height, (int)tree().numItems, + (int)kvBytes, lowerBound->toString(false).c_str(), upperBound->toString(false).c_str()); try { - if(tree().numItems > 0) { + if (tree().numItems > 0) { // This doesn't use the cached reader for the page but it is only for debugging purposes BinaryTree::Mirror reader(&tree(), lowerBound, upperBound); BinaryTree::Cursor c = reader.getCursor(); @@ -2572,22 +2405,22 @@ struct BTreePage { bool anyOutOfRange = false; do { r += " "; - r += c.get().toString(); + r += c.get().toString(height == 1); - bool tooLow = c.get().key < lowerBound->key; - bool tooHigh = c.get().key > upperBound->key; - if(tooLow || tooHigh) { + bool tooLow = c.get().withoutValue() < lowerBound->withoutValue(); + bool tooHigh = c.get().withoutValue() >= upperBound->withoutValue(); + if (tooLow || tooHigh) { anyOutOfRange = true; - if(tooLow) { + if (tooLow) { r += " (too low)"; } - if(tooHigh) { + if (tooHigh) { r += " (too high)"; } } r += "\n"; - } while(c.moveNext()); + } while (c.moveNext()); ASSERT(!anyOutOfRange); } } catch (Error& e) { @@ -2601,14 +2434,14 @@ struct BTreePage { }; static void makeEmptyRoot(Reference page) { - BTreePage *btpage = (BTreePage *)page->begin(); + BTreePage* btpage = (BTreePage*)page->begin(); btpage->height = 1; btpage->kvBytes = 0; btpage->tree().build(page->size(), nullptr, nullptr, nullptr, nullptr); } -BTreePage::BinaryTree::Cursor getCursor(const Reference &page) { - return ((BTreePage::BinaryTree::Mirror *)page->userData)->getCursor(); +BTreePage::BinaryTree::Cursor getCursor(const Reference& page) { + return ((BTreePage::BinaryTree::Mirror*)page->userData)->getCursor(); } struct BoundaryRefAndPage { @@ -2621,32 +2454,23 @@ struct BoundaryRefAndPage { } }; -#define NOT_IMPLEMENTED { UNSTOPPABLE_ASSERT(false); } +#define NOT_IMPLEMENTED \ + { UNSTOPPABLE_ASSERT(false); } #pragma pack(push, 1) -template +template struct InPlaceArray { SizeT count; - const T * begin() const { - return (T *)(this + 1); - } - - T * begin() { - return (T *)(this + 1); - } + const T* begin() const { return (T*)(this + 1); } - const T * end() const { - return begin() + count; - } - - T * end() { - return begin() + count; - } + T* begin() { return (T*)(this + 1); } - VectorRef get() { - return VectorRef(begin(), count); - } + const T* end() const { return begin() + count; } + + T* end() { return begin() + count; } + + VectorRef get() { return VectorRef(begin(), count); } void set(VectorRef v, int availableSpace) { ASSERT(sizeof(T) * v.size() <= availableSpace); @@ -2654,9 +2478,7 @@ struct InPlaceArray { memcpy(begin(), v.begin(), sizeof(T) * v.size()); } - int extraSize() const { - return count * sizeof(T); - } + int extraSize() const { return count * sizeof(T); } }; #pragma pack(pop) @@ -2671,49 +2493,41 @@ public: Version version; Standalone pageID; - bool operator< (const LazyDeleteQueueEntry &rhs) const { - return version < rhs.version; - } + bool operator<(const LazyDeleteQueueEntry& rhs) const { return version < rhs.version; } - int readFromBytes(const uint8_t *src) { - version = *(Version *)src; + int readFromBytes(const uint8_t* src) { + version = *(Version*)src; src += sizeof(Version); int count = *src++; - pageID = BTreePageID((LogicalPageID *)src, count); + pageID = BTreePageID((LogicalPageID*)src, count); return bytesNeeded(); } - int bytesNeeded() const { - return sizeof(Version) + 1 + (pageID.size() * sizeof(LogicalPageID)); - } + int bytesNeeded() const { return sizeof(Version) + 1 + (pageID.size() * 
+		int bytesNeeded() const { return sizeof(Version) + 1 + (pageID.size() * sizeof(LogicalPageID)); }

-		int writeToBytes(uint8_t *dst) const {
-			*(Version *)dst = version;
+		int writeToBytes(uint8_t* dst) const {
+			*(Version*)dst = version;
 			dst += sizeof(Version);
 			*dst++ = pageID.size();
 			memcpy(dst, pageID.begin(), pageID.size() * sizeof(LogicalPageID));
 			return bytesNeeded();
 		}

-		std::string toString() const {
-			return format("{%s @%" PRId64 "}", ::toString(pageID).c_str(), version);
-		}
+		std::string toString() const { return format("{%s @%" PRId64 "}", ::toString(pageID).c_str(), version); }
 	};

 	typedef FIFOQueue<LazyDeleteQueueEntry> LazyDeleteQueueT;

 #pragma pack(push, 1)
 	struct MetaKey {
-		static constexpr int FORMAT_VERSION = 4;
+		static constexpr int FORMAT_VERSION = 7;
 		// This serves as the format version for the entire tree, individual pages will not be versioned
 		uint16_t formatVersion;
 		uint8_t height;
 		LazyDeleteQueueT::QueueState lazyDeleteQueue;
 		InPlaceArray<LogicalPageID> root;

-		KeyRef asKeyRef() const {
-			return KeyRef((uint8_t *)this, sizeof(MetaKey) + root.extraSize());
-		}
+		KeyRef asKeyRef() const { return KeyRef((uint8_t*)this, sizeof(MetaKey) + root.extraSize()); }

 		void fromKeyRef(KeyRef k) {
 			memcpy(this, k.begin(), k.size());
@@ -2721,9 +2535,9 @@ public:
 		}

 		std::string toString() {
-			return format("{height=%d  formatVersion=%d  root=%s  lazyDeleteQueue=%s}", (int)height, (int)formatVersion, ::toString(root.get()).c_str(), lazyDeleteQueue.toString().c_str());
+			return format("{height=%d  formatVersion=%d  root=%s  lazyDeleteQueue=%s}", (int)height, (int)formatVersion,
+			              ::toString(root.get()).c_str(), lazyDeleteQueue.toString().c_str());
 		}
-
 	};
 #pragma pack(pop)

@@ -2733,9 +2547,7 @@ public:
 			startTime = g_network ? now() : 0;
 		}

-		void clear() {
-			*this = Counts();
-		}
+		void clear() { *this = Counts(); }

 		int64_t pageReads;
 		int64_t extPageReads;
@@ -2756,16 +2568,23 @@ public:
 		double startTime;

 		std::string toString(bool clearAfter = false) {
-			const char *labels[] = {"set", "clear", "clearSingleKey", "get", "getRange", "commit", "pageReads", "extPageRead", "pagePreloads", "extPagePreloads", "pageWrite", "extPageWrite", "commitPage", "commitPageStart", "pageUpdates"};
-			const int64_t values[] = {sets, clears, clearSingleKey, gets, getRanges, commits, pageReads, extPageReads, pagePreloads, extPagePreloads, pageWrites, extPageWrites, commitToPage, commitToPageStart, pageUpdates};
+			const char* labels[] = { "set",          "clear",           "clearSingleKey", "get",
+				                     "getRange",     "commit",          "pageReads",      "extPageRead",
+				                     "pagePreloads", "extPagePreloads", "pageWrite",      "extPageWrite",
+				                     "commitPage",   "commitPageStart", "pageUpdates" };
+			const int64_t values[] = {
+				sets,         clears,       clearSingleKey,  gets,       getRanges,     commits,      pageReads,
+				extPageReads, pagePreloads, extPagePreloads, pageWrites, extPageWrites, commitToPage, commitToPageStart,
+				pageUpdates
+			};

 			double elapsed = now() - startTime;
 			std::string s;
-			for(int i = 0; i < sizeof(values) / sizeof(int64_t); ++i) {
+			for (int i = 0; i < sizeof(values) / sizeof(int64_t); ++i) {
 				s += format("%s=%" PRId64 " (%d/s)  ", labels[i], values[i], int(values[i] / elapsed));
 			}

-			if(clearAfter) {
+			if (clearAfter) {
 				clear();
 			}

@@ -2778,40 +2597,32 @@ public:

 	// All async opts on the btree are based on pager reads, writes, and commits, so
 	// we can mostly forward these next few functions to the pager
-	Future<Void> getError() {
-		return m_pager->getError();
-	}
+	Future<Void> getError() { return m_pager->getError(); }

-	Future<Void> onClosed() {
-		return m_pager->onClosed();
-	}
+	Future<Void> onClosed() { return m_pager->onClosed(); }

 	void close_impl(bool dispose) {
-		auto *pager = m_pager;
+		auto* pager = m_pager;
 		delete this;
-		if(dispose)
+		if (dispose)
 			pager->dispose();
 		else
 			pager->close();
 	}

-	void dispose() {
-		return close_impl(true);
-	}
+	void dispose() { return close_impl(true); }

-	void close() {
-		return close_impl(false);
-	}
+	void close() { return close_impl(false); }

-	KeyValueStoreType getType() NOT_IMPLEMENTED
-	bool supportsMutation(int op) NOT_IMPLEMENTED
-	StorageBytes getStorageBytes() {
+	KeyValueStoreType getType() NOT_IMPLEMENTED bool supportsMutation(int op) NOT_IMPLEMENTED StorageBytes
+	getStorageBytes() {
 		return m_pager->getStorageBytes();
 	}

 	// Writes are provided in an ordered stream.
-	// A write is considered part of (a change leading to) the version determined by the previous call to setWriteVersion()
-	// A write shall not become durable until the following call to commit() begins, and shall be durable once the following call to commit() returns
+	// A write is considered part of (a change leading to) the version determined by the previous call to
+	// setWriteVersion().  A write shall not become durable until the following call to commit() begins, and shall be
+	// durable once the following call to commit() returns.
 	void set(KeyValueRef keyValue) {
 		++counts.sets;
 		m_pBuffer->insert(keyValue.key).mutation().setBoundaryValue(m_pBuffer->copyToArena(keyValue.value));
@@ -2819,10 +2630,8 @@ public:

 	void clear(KeyRangeRef clearedRange) {
 		// Optimization for single key clears to create just one mutation boundary instead of two
-		if(clearedRange.begin.size() == clearedRange.end.size() - 1
-			&& clearedRange.end[clearedRange.end.size() - 1] == 0
-			&& clearedRange.end.startsWith(clearedRange.begin)
-		) {
+		if (clearedRange.begin.size() == clearedRange.end.size() - 1 &&
+		    clearedRange.end[clearedRange.end.size() - 1] == 0 && clearedRange.end.startsWith(clearedRange.begin)) {
 			++counts.clears;
 			++counts.clearSingleKey;
 			m_pBuffer->insert(clearedRange.begin).mutation().clearBoundary();
@@ -2840,40 +2649,31 @@ public:

 	void mutate(int op, StringRef param1, StringRef param2) NOT_IMPLEMENTED

-	void setOldestVersion(Version v) {
-		m_newOldestVersion = v;
-	}
+	void setOldestVersion(Version v) { m_newOldestVersion = v; }

-	Version getOldestVersion() {
-		return m_pager->getOldestVersion();
-	}
+	Version getOldestVersion() { return m_pager->getOldestVersion(); }

 	Version getLatestVersion() {
-		if(m_writeVersion != invalidVersion)
-			return m_writeVersion;
+		if (m_writeVersion != invalidVersion) return m_writeVersion;
 		return m_pager->getLatestVersion();
 	}

-	Version getWriteVersion() {
-		return m_writeVersion;
-	}
+	Version getWriteVersion() { return m_writeVersion; }

-	Version getLastCommittedVersion() {
-		return m_lastCommittedVersion;
-	}
+	Version getLastCommittedVersion() { return m_lastCommittedVersion; }

-	VersionedBTree(IPager2 *pager, std::string name)
-		: m_pager(pager),
-		  m_writeVersion(invalidVersion),
-		  m_lastCommittedVersion(invalidVersion),
-		  m_pBuffer(nullptr),
-		  m_name(name)
-	{
+	VersionedBTree(IPager2* pager, std::string name)
+	  : m_pager(pager), m_writeVersion(invalidVersion), m_lastCommittedVersion(invalidVersion), m_pBuffer(nullptr),
+	    m_name(name) {
 		m_init = init_impl(this);
 		m_latestCommit = m_init;
 	}

-	ACTOR static Future<int> incrementalSubtreeClear(VersionedBTree *self, bool *pStop = nullptr, int batchSize = 10, unsigned int minPages = 0, int maxPages = std::numeric_limits<int>::max()) {
+	ACTOR static Future<int> incrementalSubtreeClear(VersionedBTree* self, bool* pStop = nullptr, int batchSize = 10,
+	                                                 unsigned int minPages = 0,
+	                                                 int maxPages = std::numeric_limits<int>::max()) {
 		// TODO: Is it contractually okay to always read at the latest version?
 		state Reference<IPagerSnapshot> snapshot = self->m_pager->getReadSnapshot(self->m_pager->getLatestVersion());
 		state int freedPages = 0;
@@ -2882,52 +2682,52 @@ public:
 			state std::vector<std::pair<LazyDeleteQueueEntry, Future<Reference<const IPage>>>> entries;

 			// Take up to batchSize pages from front of queue
-			while(entries.size() < batchSize) {
+			while (entries.size() < batchSize) {
 				Optional<LazyDeleteQueueEntry> q = wait(self->m_lazyDeleteQueue.pop());
 				debug_printf("LazyDelete: popped %s\n", toString(q).c_str());
-				if(!q.present()) {
+				if (!q.present()) {
 					break;
 				}

 				// Start reading the page, without caching
-				entries.push_back(std::make_pair(q.get(), self->readPage(snapshot, q.get().pageID, nullptr, nullptr, true)));
+				entries.push_back(
+				    std::make_pair(q.get(), self->readPage(snapshot, q.get().pageID, nullptr, nullptr, true)));
 			}

-			if(entries.empty()) {
+			if (entries.empty()) {
 				break;
 			}

 			state int i;
-			for(i = 0; i < entries.size(); ++i) {
+			for (i = 0; i < entries.size(); ++i) {
 				Reference<const IPage> p = wait(entries[i].second);
-				const LazyDeleteQueueEntry &entry = entries[i].first;
-				const BTreePage &btPage = *(BTreePage *)p->begin();
+				const LazyDeleteQueueEntry& entry = entries[i].first;
+				const BTreePage& btPage = *(BTreePage*)p->begin();
 				debug_printf("LazyDelete: processing %s\n", toString(entry).c_str());

 				// Level 1 (leaf) nodes should never be in the lazy delete queue
 				ASSERT(btPage.height > 1);

 				// Iterate over page entries, skipping key decoding using BTreePage::ValueTree which uses
 				// RedwoodRecordRef::DeltaValueOnly as the delta type
 				BTreePage::ValueTree::Mirror reader(&btPage.valueTree(), &dbBegin, &dbEnd);
 				auto c = reader.getCursor();
 				ASSERT(c.moveFirst());
 				Version v = entry.version;
-				while(1) {
-					if(c.get().value.present()) {
+				while (1) {
+					if (c.get().value.present()) {
 						BTreePageID btChildPageID = c.get().getChildPage();
 						// If this page is height 2, then the children are leaves so free
-						if(btPage.height == 2) {
+						if (btPage.height == 2) {
 							debug_printf("LazyDelete: freeing child %s\n", toString(btChildPageID).c_str());
 							self->freeBtreePage(btChildPageID, v);
 							freedPages += btChildPageID.size();
-						}
-						else {
+						} else {
 							// Otherwise, queue them for lazy delete.
 							debug_printf("LazyDelete: queuing child %s\n", toString(btChildPageID).c_str());
-							self->m_lazyDeleteQueue.pushFront(LazyDeleteQueueEntry{v, btChildPageID});
+							self->m_lazyDeleteQueue.pushFront(LazyDeleteQueueEntry{ v, btChildPageID });
 						}
 					}
-					if(!c.moveNext()) {
+					if (!c.moveNext()) {
 						break;
 					}
 				}
@@ -2939,28 +2739,30 @@ public:
 			}

 			// If stop is set and we've freed the minimum number of pages required, or the maximum is exceeded, return.
-			if((freedPages >= minPages && pStop != nullptr && *pStop) || freedPages >= maxPages) {
+			if ((freedPages >= minPages && pStop != nullptr && *pStop) || freedPages >= maxPages) {
 				break;
 			}
 		}

-		debug_printf("LazyDelete: freed %d pages, %s has %" PRId64 " entries\n", freedPages, self->m_lazyDeleteQueue.name.c_str(), self->m_lazyDeleteQueue.numEntries);
+		debug_printf("LazyDelete: freed %d pages, %s has %" PRId64 " entries\n", freedPages,
+		             self->m_lazyDeleteQueue.name.c_str(), self->m_lazyDeleteQueue.numEntries);
 		return freedPages;
 	}

-	ACTOR static Future<Void> init_impl(VersionedBTree *self) {
+	ACTOR static Future<Void> init_impl(VersionedBTree* self) {
 		wait(self->m_pager->init());

 		state Version latest = self->m_pager->getLatestVersion();
 		self->m_newOldestVersion = self->m_pager->getOldestVersion();

-		debug_printf("Recovered pager to version %" PRId64 ", oldest version is %" PRId64 "\n", self->m_newOldestVersion);
+		debug_printf("Recovered pager to version %" PRId64 ", oldest version is %" PRId64 "\n", latest,
+		             self->m_newOldestVersion);

 		state Key meta = self->m_pager->getMetaKey();
-		if(meta.size() == 0) {
+		if (meta.size() == 0) {
 			self->m_header.formatVersion = MetaKey::FORMAT_VERSION;
 			LogicalPageID id = wait(self->m_pager->newPageID());
-			BTreePageID newRoot((LogicalPageID *)&id, 1);
+			BTreePageID newRoot((LogicalPageID*)&id, 1);
 			debug_printf("new root %s\n", toString(newRoot).c_str());
 			self->m_header.root.set(newRoot, sizeof(headerSpace) - sizeof(m_header));
 			self->m_header.height = 1;
@@ -2976,26 +2778,22 @@ public:
 			self->m_pager->setMetaKey(self->m_header.asKeyRef());
 			wait(self->m_pager->commit());
 			debug_printf("Committed initial commit.\n");
-		}
-		else {
+		} else {
 			self->m_header.fromKeyRef(meta);
 			self->m_lazyDeleteQueue.recover(self->m_pager, self->m_header.lazyDeleteQueue, "LazyDeleteQueueRecovered");
 		}

 		debug_printf("Recovered btree at version %" PRId64 ": %s\n", latest, self->m_header.toString().c_str());

-		self->m_maxPartSize = std::min(255, self->m_pager->getUsablePageSize() / 5);
-
 		self->m_lastCommittedVersion = latest;
 		return Void();
 	}

-	Future<Void> init() override {
-		return m_init;
-	}
+	Future<Void> init() override { return m_init; }

 	virtual ~VersionedBTree() {
 		// This probably shouldn't be called directly (meaning deleting an instance directly) but it should be safe,
-		// it will cancel init and commit and leave the pager alive but with potentially an incomplete set of
+		// it will cancel init and commit and leave the pager alive but with potentially an incomplete set of
 		// uncommitted writes so it should not be committed.
 		m_init.cancel();
 		m_latestCommit.cancel();
@@ -3010,19 +2808,18 @@ public:
 		KeyRef m = snapshot->getMetaKey();

 		// Currently all internal records generated in the write path are at version 0
-		return Reference<IStoreCursor>(new Cursor(snapshot, ((MetaKey *)m.begin())->root.get(), (Version)0));
+		return Reference<IStoreCursor>(new Cursor(snapshot, ((MetaKey*)m.begin())->root.get(), (Version)0));
 	}

 	// Must be nondecreasing
 	void setWriteVersion(Version v) {
 		ASSERT(v > m_lastCommittedVersion);
 		// If there was no current mutation buffer, create one in the buffer map and update m_pBuffer
-		if(m_pBuffer == nullptr) {
+		if (m_pBuffer == nullptr) {
 			// When starting a new mutation buffer its start version must be greater than the last write version
 			ASSERT(v > m_writeVersion);
 			m_pBuffer = &m_mutationBuffers[v];
-		}
-		else {
+		} else {
 			// It's OK to set the write version to the same version repeatedly so long as m_pBuffer is not null
 			ASSERT(v >= m_writeVersion);
 		}
@@ -3030,12 +2827,11 @@ public:
 	}

 	Future<Void> commit() {
-		if(m_pBuffer == nullptr)
-			return m_latestCommit;
+		if (m_pBuffer == nullptr) return m_latestCommit;
 		return commit_impl(this);
 	}

-	ACTOR static Future<Void> destroyAndCheckSanity_impl(VersionedBTree *self) {
+	ACTOR static Future<Void> destroyAndCheckSanity_impl(VersionedBTree* self) {
 		ASSERT(g_network->isSimulated());

 		debug_printf("Clearing tree.\n");
@@ -3046,7 +2842,7 @@ public:
 			state int freedPages = wait(self->incrementalSubtreeClear(self));
 			wait(self->commit());
 			// Keep looping until the last commit doesn't do anything at all
-			if(self->m_lazyDeleteQueue.numEntries == 0 && freedPages == 0) {
+			if (self->m_lazyDeleteQueue.numEntries == 0 && freedPages == 0) {
 				break;
 			}
 			self->setWriteVersion(self->getLatestVersion() + 1);
@@ -3076,29 +2872,22 @@ public:
 		return Void();
 	}

-	Future<Void> destroyAndCheckSanity() {
-		return destroyAndCheckSanity_impl(this);
-	}
+	Future<Void> destroyAndCheckSanity() { return destroyAndCheckSanity_impl(this); }

 private:
 	struct ChildLinksRef {
 		ChildLinksRef() = default;

 		ChildLinksRef(VectorRef<RedwoodRecordRef> children, RedwoodRecordRef upperBound)
-			: children(children), upperBound(upperBound) {
-		}
+		  : children(children), upperBound(upperBound) {}

-		ChildLinksRef(const RedwoodRecordRef *child, const RedwoodRecordRef *upperBound)
-			: children((RedwoodRecordRef *)child, 1), upperBound(*upperBound) {
-		}
+		ChildLinksRef(const RedwoodRecordRef* child, const RedwoodRecordRef* upperBound)
+		  : children((RedwoodRecordRef*)child, 1), upperBound(*upperBound) {}

-		ChildLinksRef(Arena &arena, const ChildLinksRef &toCopy)
-			: children(arena, toCopy.children), upperBound(arena, toCopy.upperBound) {
-		}
+		ChildLinksRef(Arena& arena, const ChildLinksRef& toCopy)
+		  : children(arena, toCopy.children), upperBound(arena, toCopy.upperBound) {}

-		int expectedSize() const {
-			return children.expectedSize() + upperBound.expectedSize();
-		}
+		int expectedSize() const { return children.expectedSize() + upperBound.expectedSize(); }

 		std::string toString() const {
 			return format("{children=%s upperbound=%s}", ::toString(children).c_str(), upperBound.toString().c_str());
@@ -3115,38 +2904,36 @@ private:
 	// boundaries of consecutive entries.
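+	// Editorial example (not part of the original patch): if child pages were
+	// rebuilt for ["a","f") and the next rebuilt set begins at "m" rather than
+	// "f", a value-less boundary record for "f" must remain in the parent so
+	// that a reader can still decode the child covering ["a","f") with its
+	// original upper boundary.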
 	struct InternalPageBuilder {
 		// Cursor must be at first entry in page
-		InternalPageBuilder(const BTreePage::BinaryTree::Cursor &c)
-			: cursor(c), modified(false), childPageCount(0)
-		{
-		}
+		InternalPageBuilder(const BTreePage::BinaryTree::Cursor& c) : cursor(c), modified(false), childPageCount(0) {}

 	private:
 		// This must be called internally, on records whose arena has already been added to the entries arena
-		inline void addEntry(const RedwoodRecordRef &rec) {
-			if(rec.value.present()) {
+		inline void addEntry(const RedwoodRecordRef& rec) {
+			if (rec.value.present()) {
 				++childPageCount;
 			}

 			// If no modification detected yet then check that this record is identical to the next
 			// record from the original page which is at the current cursor position.
-			if(!modified) {
-				if(cursor.valid()) {
-					if(rec != cursor.get()) {
-						debug_printf("InternalPageBuilder: Found internal page difference.  new: %s  old: %s\n", rec.toString().c_str(), cursor.get().toString().c_str());
+			if (!modified) {
+				if (cursor.valid()) {
+					if (rec != cursor.get()) {
+						debug_printf("InternalPageBuilder: Found internal page difference.  new: %s  old: %s\n",
+						             rec.toString().c_str(), cursor.get().toString().c_str());
 						modified = true;
-					}
-					else {
+					} else {
 						cursor.moveNext();
 					}
-				}
-				else {
-					debug_printf("InternalPageBuilder: Found internal page difference.  new: %s  old: <end>\n", rec.toString().c_str());
+				} else {
+					debug_printf("InternalPageBuilder: Found internal page difference.  new: %s  old: <end>\n",
+					             rec.toString().c_str());
 					modified = true;
 				}
 			}

 			entries.push_back(entries.arena(), rec);
 		}

+	public:
 		// Add the child entries from newSet into entries
 		void addEntries(ChildLinksRef newSet) {
@@ -3154,14 +2941,14 @@ private:
 			// as the first lowerBound in newSet (or newSet is empty, as the next newSet is necessarily greater)
 			// then add the upper bound of the previous set as a value-less record so that on future reads
 			// the previous child page can be decoded correctly.
-			if(!entries.empty() && entries.back().value.present()
-				&& (newSet.children.empty() || !newSet.children.front().sameExceptValue(lastUpperBound)))
-			{
-				debug_printf("InternalPageBuilder: Added placeholder %s\n", lastUpperBound.withoutValue().toString().c_str());
+			if (!entries.empty() && entries.back().value.present() &&
+			    (newSet.children.empty() || !newSet.children.front().sameExceptValue(lastUpperBound))) {
+				debug_printf("InternalPageBuilder: Added placeholder %s\n",
+				             lastUpperBound.withoutValue().toString().c_str());
 				addEntry(lastUpperBound.withoutValue());
 			}

-			for(auto &child : newSet.children) {
+			for (auto& child : newSet.children) {
 				debug_printf("InternalPageBuilder: Adding child entry %s\n", child.toString().c_str());
 				addEntry(child);
 			}
@@ -3178,32 +2965,40 @@ private:
 		// This is only done if modified is set to avoid rewriting this page for this purpose only.
 		//
 		// After this call, lastUpperBound is the internal page's upper bound.
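+		// Editorial example (not part of the original patch): if this page's
+		// last child entry was encoded against decodeUpperBound 'x' but the
+		// rebuilt page must now end at upperBound 'y', finalize() appends a
+		// value-less record for 'x' and then adopts 'y' as lastUpperBound so
+		// the last child can still be decoded correctly.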
-		void finalize(const RedwoodRecordRef &upperBound, const RedwoodRecordRef &decodeUpperBound) {
-			debug_printf("InternalPageBuilder::end  modified=%d  upperBound=%s  decodeUpperBound=%s  lastUpperBound=%s\n", modified, upperBound.toString().c_str(), decodeUpperBound.toString().c_str(), lastUpperBound.toString().c_str());
+		void finalize(const RedwoodRecordRef& upperBound, const RedwoodRecordRef& decodeUpperBound) {
+			debug_printf(
+			    "InternalPageBuilder::end  modified=%d  upperBound=%s  decodeUpperBound=%s  lastUpperBound=%s\n",
+			    modified, upperBound.toString().c_str(), decodeUpperBound.toString().c_str(),
+			    lastUpperBound.toString().c_str());
 			modified = modified || cursor.valid();
 			debug_printf("InternalPageBuilder::end  modified=%d after cursor check\n", modified);

-			// If there are boundary key entries and the last one has a child page then the
+			// If there are boundary key entries and the last one has a child page then the
 			// upper bound for this internal page must match the required upper bound for
 			// the last child entry.
-			if(!entries.empty() && entries.back().value.present()) {
+			if (!entries.empty() && entries.back().value.present()) {
 				debug_printf("InternalPageBuilder::end  last entry is not null\n");

 				// If the page contents were not modified so far and the upper bound required
 				// for the last child page (lastUpperBound) does not match what the page
 				// was encoded with then the page must be modified.
-				if(!modified && !lastUpperBound.sameExceptValue(decodeUpperBound)) {
-					debug_printf("InternalPageBuilder::end  modified set true because lastUpperBound does not match decodeUpperBound\n");
+				if (!modified && !lastUpperBound.sameExceptValue(decodeUpperBound)) {
+					debug_printf("InternalPageBuilder::end  modified set true because lastUpperBound does not match "
+					             "decodeUpperBound\n");
 					modified = true;
 				}

-				if(modified && !lastUpperBound.sameExceptValue(upperBound)) {
-					debug_printf("InternalPageBuilder::end  Modified is true but lastUpperBound does not match upperBound so adding placeholder\n");
+				if (modified && !lastUpperBound.sameExceptValue(upperBound)) {
+					debug_printf("InternalPageBuilder::end  Modified is true but lastUpperBound does not match "
+					             "upperBound so adding placeholder\n");
 					addEntry(lastUpperBound.withoutValue());
 					lastUpperBound = upperBound;
 				}
 			}

-			debug_printf("InternalPageBuilder::end  exit.  modified=%d  upperBound=%s  decodeUpperBound=%s  lastUpperBound=%s\n", modified, upperBound.toString().c_str(), decodeUpperBound.toString().c_str(), lastUpperBound.toString().c_str());
+			debug_printf(
+			    "InternalPageBuilder::end  exit.  modified=%d  upperBound=%s  decodeUpperBound=%s  lastUpperBound=%s\n",
+			    modified, upperBound.toString().c_str(), decodeUpperBound.toString().c_str(),
+			    lastUpperBound.toString().c_str());
 		}

 		BTreePage::BinaryTree::Cursor cursor;
@@ -3235,33 +3030,25 @@ private:
 			// No point in serializing an atomic op, it needs to be coalesced to a real value.
 			ASSERT(!isAtomicOp());

-			if(isClear())
-				return RedwoodRecordRef(userKey, version);
+			if (isClear()) return RedwoodRecordRef(userKey, version);

 			return RedwoodRecordRef(userKey, version, value);
 		}

-		std::string toString() const {
-			return format("op=%d val='%s'", op, printable(value).c_str());
-		}
+		std::string toString() const { return format("op=%d val='%s'", op, printable(value).c_str()); }
 	};

 	struct RangeMutation {
-		RangeMutation() : boundaryChanged(false), clearAfterBoundary(false) {
-		}
+		RangeMutation() : boundaryChanged(false), clearAfterBoundary(false) {}

 		bool boundaryChanged;
-		Optional<ValueRef> boundaryValue;  // Not present means cleared
+		Optional<ValueRef> boundaryValue; // Not present means cleared
 		bool clearAfterBoundary;

-		bool boundaryCleared() const {
-			return boundaryChanged && !boundaryValue.present();
-		}
+		bool boundaryCleared() const { return boundaryChanged && !boundaryValue.present(); }

 		// Returns true if this RangeMutation doesn't actually mutate anything
-		bool noChanges() const {
-			return !boundaryChanged && !clearAfterBoundary;
-		}
+		bool noChanges() const { return !boundaryChanged && !clearAfterBoundary; }

 		void clearBoundary() {
 			boundaryChanged = true;
@@ -3272,24 +3059,21 @@ private:
 			clearBoundary();
 			clearAfterBoundary = true;
 		}

 		void setBoundaryValue(ValueRef v) {
 			boundaryChanged = true;
 			boundaryValue = v;
 		}

-		bool boundarySet() const {
-			return boundaryChanged && boundaryValue.present();
-		}
+		bool boundarySet() const { return boundaryChanged && boundaryValue.present(); }

 		std::string toString() const {
-			return format("boundaryChanged=%d clearAfterBoundary=%d boundaryValue=%s", boundaryChanged, clearAfterBoundary, ::toString(boundaryValue).c_str());
+			return format("boundaryChanged=%d clearAfterBoundary=%d boundaryValue=%s", boundaryChanged,
+			              clearAfterBoundary, ::toString(boundaryValue).c_str());
 		}
 	};

 public:
-
 #include "ArtMutationBuffer.h"
 	struct MutationBufferStdMap {
 		MutationBufferStdMap() {
@@ -3310,52 +3094,36 @@ public:
 		struct iterator : public MutationsT::iterator {
 			typedef MutationsT::iterator Base;
 			iterator() = default;
-			iterator(const MutationsT::iterator &i) : Base(i) {
-			}
+			iterator(const MutationsT::iterator& i) : Base(i) {}

-			const KeyRef & key() {
-				return (*this)->first;
-			}
+			const KeyRef& key() { return (*this)->first; }

-			RangeMutation & mutation() {
-				return (*this)->second;
-			}
+			RangeMutation& mutation() { return (*this)->second; }
 		};

 		struct const_iterator : public MutationsT::const_iterator {
 			typedef MutationsT::const_iterator Base;
 			const_iterator() = default;
-			const_iterator(const MutationsT::const_iterator &i) : Base(i) {
-			}
-			const_iterator(const MutationsT::iterator &i) : Base(i) {
-			}
+			const_iterator(const MutationsT::const_iterator& i) : Base(i) {}
+			const_iterator(const MutationsT::iterator& i) : Base(i) {}

-			const KeyRef & key() {
-				return (*this)->first;
-			}
+			const KeyRef& key() { return (*this)->first; }

-			const RangeMutation & mutation() {
-				return (*this)->second;
-			}
+			const RangeMutation& mutation() { return (*this)->second; }
 		};

 		// Return a T constructed in arena
-		template<typename T> T copyToArena(const T &object) {
+		template <typename T>
+		T copyToArena(const T& object) {
 			return T(arena, object);
 		}

-		const_iterator upper_bound(const KeyRef &k) const {
-			return mutations.upper_bound(k);
-		}
+		const_iterator upper_bound(const KeyRef& k) const { return mutations.upper_bound(k); }

-		const_iterator lower_bound(const KeyRef &k) const {
-			return mutations.lower_bound(k);
-		}
+		const_iterator lower_bound(const KeyRef& k) const { return
mutations.lower_bound(k); }

 		// erase [begin, end) from the mutation map
-		void erase(const const_iterator &begin, const const_iterator &end) {
-			mutations.erase(begin, end);
-		}
+		void erase(const const_iterator& begin, const const_iterator& end) { mutations.erase(begin, end); }

 		// Find or create a mutation buffer boundary for bound and return an iterator to it
 		iterator insert(KeyRef boundary) {
@@ -3366,34 +3134,34 @@ public:
 			iterator ib = mutations.lower_bound(boundary);

 			// If we found the boundary we are looking for, return its iterator
-			if(ib.key() == boundary) {
+			if (ib.key() == boundary) {
 				return ib;
 			}

 			// ib is our insert hint.  Copy boundary into arena and insert boundary into buffer
 			boundary = KeyRef(arena, boundary);
-			ib = mutations.insert(ib, {boundary, RangeMutation()});
+			ib = mutations.insert(ib, { boundary, RangeMutation() });

 			// ib is certainly > begin() because it is guaranteed that the empty string
 			// boundary exists and the only way to have found that is to look explicitly
 			// for it in which case we would have returned above.
 			iterator iPrevious = ib;
 			--iPrevious;
-			// If the range we just divided was being cleared, then the dividing boundary key and range after it must also be cleared
-			if(iPrevious.mutation().clearAfterBoundary) {
+			// If the range we just divided was being cleared, then the dividing boundary key and range after it must
+			// also be cleared
+			if (iPrevious.mutation().clearAfterBoundary) {
 				ib.mutation().clearAll();
 			}

 			return ib;
 		}
-
 	};

 #define USE_ART_MUTATION_BUFFER 1

 #ifdef USE_ART_MUTATION_BUFFER
-	typedef struct MutationBufferART MutationBuffer;
+	typedef struct MutationBufferART MutationBuffer;
 #else
-	typedef struct MutationBufferStdMap MutationBuffer;
+	typedef struct MutationBufferStdMap MutationBuffer;
 #endif

 private:
@@ -3402,10 +3170,10 @@
 	 * This structure's organization is meant to put pending updates for the btree in an order
 	 * that makes it efficient to query all pending mutations across all pending versions which are
 	 * relevant to a particular subtree of the btree.
-	 *
+	 *
	 * At the top level, it is a map of the start of a range being modified to a RangeMutation.
 	 * The end of the range is map key (which is the next range start in the map).
-	 *
+	 *
	 * - The buffer starts out with keys '' and endKVV.key already populated.
 	 *
	 * - When a new key is inserted into the buffer map, it is by definition
@@ -3446,8 +3214,8 @@
	 * to be sorted later just before being merged into the existing leaf page.
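+	 * Editorial example (not part of the original patch): after clear(["b","d"))
+	 * followed by set("c", "v"), the buffer holds boundary "b" as
+	 * {cleared, clearAfterBoundary=true} and boundary "c" as
+	 * {boundaryValue="v", clearAfterBoundary=true}, so the portion of the clear
+	 * after "c" is preserved while "c" itself receives the new value.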
	 */

-	IPager2 *m_pager;
-	MutationBuffer *m_pBuffer;
+	IPager2* m_pager;
+	MutationBuffer* m_pBuffer;
 	std::map<Version, MutationBuffer> m_mutationBuffers;

 	Version m_writeVersion;
@@ -3459,282 +3227,310 @@ private:

 	// MetaKey changes size so allocate space for it to expand into
 	union {
-		uint8_t headerSpace[sizeof(MetaKey) + sizeof(LogicalPageID) * 20];
+		uint8_t headerSpace[sizeof(MetaKey) + sizeof(LogicalPageID) * 30];
 		MetaKey m_header;
 	};

 	LazyDeleteQueueT m_lazyDeleteQueue;
-	int m_maxPartSize;

 	// Writes entries to 1 or more pages and return a vector of boundary keys with their IPage(s)
-	ACTOR static Future<Standalone<VectorRef<RedwoodRecordRef>>> writePages(VersionedBTree *self, bool minimalBoundaries, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound, VectorRef<RedwoodRecordRef> entries, int height, Version v, BTreePageID previousID) {
+	ACTOR static Future<Standalone<VectorRef<RedwoodRecordRef>>> writePages(
+	    VersionedBTree* self, const RedwoodRecordRef* lowerBound, const RedwoodRecordRef* upperBound,
+	    VectorRef<RedwoodRecordRef> entries, int height, Version v, BTreePageID previousID) {
 		ASSERT(entries.size() > 0);
 		state Standalone<VectorRef<RedwoodRecordRef>> records;

 		// This is how much space for the binary tree exists in the page, after the header
 		state int blockSize = self->m_pager->getUsablePageSize();
 		state int pageSize = blockSize - sizeof(BTreePage);
+		state float fillFactor = 0.66; // TODO: Make this a knob
+		state int pageFillTarget = pageSize * fillFactor;
 		state int blockCount = 1;

 		state int kvBytes = 0;
-		state int compressedBytes = BTreePage::BinaryTree::GetTreeOverhead();
+		state int compressedBytes = BTreePage::BinaryTree::emptyTreeSize();
+		state bool largeTree = false;

 		state int start = 0;
 		state int i = 0;
-		state bool end;
+		// The common prefix length between the first and last records is common to all records
+		state int skipLen = entries.front().getCommonPrefixLen(entries.back());
+
+		// Leaves can have just one record if it's large, but internal pages should have at least 4
+		state int minimumEntries = (height == 1 ? 1 : 4);

-		// For leaf level where minimal boundaries are used require at least 1 entry, otherwise require 4 to enforce a minimum branching factor
-		state int minimumEntries = minimalBoundaries ? 1 : 4;
-
 		// Lower bound of the page being added to
 		state RedwoodRecordRef pageLowerBound = lowerBound->withoutValue();
 		state RedwoodRecordRef pageUpperBound;

-		while(i <= entries.size()) {
-			end = i == entries.size();
-			bool flush = end;
+		while (1) {
+			// While there are still entries to add and the page isn't full enough, add an entry
+			while (i < entries.size() && (i - start < minimumEntries || compressedBytes < pageFillTarget)) {
+				const RedwoodRecordRef& entry = entries[i];

+				// Get delta from previous record or page lower boundary if this is the first item in a page
+				const RedwoodRecordRef& base = (i == start) ? pageLowerBound : entries[i - 1];

+				// All record pairs in entries have skipLen bytes in common with each other, but for i == 0 the base is
+				// lowerBound
+				int skip = i == 0 ? 0 : skipLen;

+				// In a delta tree, all common prefix bytes that can be borrowed, will be, but not necessarily
+				// by the same records during the linear estimate of the built page size.  Since the key suffix bytes
+				// and therefore the key prefix lengths can be distributed differently in the balanced tree, worst case
+				// overhead for the delta size must be assumed.
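+				// Editorial illustration (not part of the original patch): with
+				// worst-case sizing a 20 byte key with a small value is still
+				// costed as determineLengthFormat(20, 20, valueLen), i.e.
+				// LengthFormat0's 3 byte header, while a 300 byte key is costed
+				// with LengthFormat3's 8 byte header because 300 > 0xFF.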
+				int deltaSize = entry.deltaSize(base, skip, true);

-			// If not the end, add i to the page if necessary
-			if(end) {
-				pageUpperBound = upperBound->withoutValue();
-			}
-			else {
-				// Get delta from previous record
-				const RedwoodRecordRef &entry = entries[i];
-				int deltaSize = entry.deltaSize((i == start) ? pageLowerBound : entries[i - 1]);
 				int keySize = entry.key.size();
 				int valueSize = entry.value.present() ? entry.value.get().size() : 0;

-				int spaceNeeded = sizeof(BTreePage::BinaryTree::Node) + deltaSize;
-
-				debug_printf("Trying to add record %3d of %3lu (i=%3d) klen %4d  vlen %3d  deltaSize %4d  spaceNeeded %4d  compressed %4d / page %4d bytes %s\n",
-					i + 1, entries.size(), i, keySize, valueSize, deltaSize,
-					spaceNeeded, compressedBytes, pageSize, entry.toString().c_str());
+				int nodeSize = BTreePage::BinaryTree::Node::headerSize(largeTree) + deltaSize;
+				debug_printf("Adding %3d of %3lu (i=%3d) klen %4d  vlen %5d  nodeSize %5d  deltaSize %5d  page usage: "
+				             "%d/%d (%.2f%%)  record=%s\n",
+				             i + 1, entries.size(), i, keySize, valueSize, nodeSize, deltaSize, compressedBytes,
+				             pageSize, (float)compressedBytes / pageSize * 100, entry.toString(height == 1).c_str());

+				// While the node doesn't fit, expand the page.
+				// This is a loop because if the page size moves into "large" range for DeltaTree
+				// then the overhead will increase, which could require another page expansion.
 				int spaceAvailable = pageSize - compressedBytes;
+				if (nodeSize > spaceAvailable) {
+					// Figure out how many additional whole or partial blocks are needed
+					// newBlocks = ceil( additional space needed / block size)
+					int newBlocks = 1 + (nodeSize - spaceAvailable - 1) / blockSize;
+					int newPageSize = pageSize + (newBlocks * blockSize);

-				// Does it fit?
-				bool fits = spaceAvailable >= spaceNeeded;
+					// If we've moved into "large" page range for the delta tree then add additional overhead required
+					if (!largeTree && newPageSize > BTreePage::BinaryTree::SmallSizeLimit) {
+						largeTree = true;
+						// Add increased overhead for the current node to nodeSize
+						nodeSize += BTreePage::BinaryTree::LargeTreePerNodeExtraOverhead;
+						// Add increased overhead for all previously added nodes
+						compressedBytes += (i - start) * BTreePage::BinaryTree::LargeTreePerNodeExtraOverhead;

-				// If it doesn't fit, either end the current page or increase the page size
-				if(!fits) {
-					int count = i - start;
-
-					// If not enough entries or page less than half full, increase page size to make the entry fit
-					if(count < minimumEntries || spaceAvailable > pageSize / 2) {
-						// Figure out how many additional whole or partial blocks are needed
-						// newBlocks = ceil ( additional space needed / block size)
-						int newBlocks = 1 + (spaceNeeded - spaceAvailable - 1) / blockSize;
-						int newPageSize = pageSize + (newBlocks * blockSize);
-						if(newPageSize <= BTreePage::BinaryTree::MaximumTreeSize()) {
-							blockCount += newBlocks;
-							pageSize = newPageSize;
-							fits = true;
-						}
-					}
-					if(!fits) {
-						pageUpperBound = entry.withoutValue();
+						// Update calculations above made with previous overhead sizes
+						spaceAvailable = pageSize - compressedBytes;
+						newBlocks = 1 + (nodeSize - spaceAvailable - 1) / blockSize;
+						newPageSize = pageSize + (newBlocks * blockSize);
 					}
+
+					blockCount += newBlocks;
+					pageSize = newPageSize;
+					pageFillTarget = pageSize * fillFactor;
 				}

-				// If the record fits then add it to the page set
-				if(fits) {
-					kvBytes += keySize + valueSize;
-					compressedBytes += spaceNeeded;
-					++i;
-				}
-
-				flush = !fits;
-			}
+				kvBytes += keySize + valueSize;
+				compressedBytes += nodeSize;
+				++i;
 			}

-			// If flush then write a page using records from start to i.  It's guaranteed that pageUpperBound has been set above.
-			if(flush) {
-				int remaining = entries.size() - i;
-				end = remaining == 0;  // i could have been moved above
-				int count = i - start;
+			// Flush the accumulated records to a page
+			state int nextStart = i;
+			// If we are building internal pages and there is a record after this page (index nextStart) but it has an
+			// empty childPage value then skip it.  It only exists to serve as an upper boundary for a child page that
+			// has not been rewritten in the current commit, and that purpose will now be served by the upper bound of
+			// the page we are now building.
+			if (height != 1 && nextStart < entries.size() && !entries[nextStart].value.present()) {
+				++nextStart;
+			}

-				// If
-				//   - this is not the last page
-				//   - the number of entries remaining after this page is less than the count of the current page
-				//   - the page that would be written ends on a user key boundary
-				// Then adjust the current page item count to half the amount remaining after the start position.
-				if(!end && remaining < count && !entries[i - 1].sameUserKey(entries[i].key, 0)) {
-					i = (start + entries.size()) / 2;
-					pageUpperBound = entries[i].withoutValue();
-				}
+			// Use the next entry as the upper bound, or upperBound if there are no more entries beyond this page
+			pageUpperBound = (i == entries.size()) ? upperBound->withoutValue() : entries[i].withoutValue();

-				// If this isn't the final page, shorten the upper boundary
-				if(!end && minimalBoundaries) {
-					int commonPrefix = pageUpperBound.getCommonPrefixLen(entries[i - 1], 0);
-					pageUpperBound.truncate(commonPrefix + 1);
-				}
+			// If this is a leaf page, and not the last one to be written, shorten the upper boundary
+			state bool isLastPage = (nextStart == entries.size());
+			if (!isLastPage && height == 1) {
+				int commonPrefix = pageUpperBound.getCommonPrefixLen(entries[i - 1], 0);
+				pageUpperBound.truncate(commonPrefix + 1);
+			}

-				state std::vector<Reference<IPage>> pages;
-				BTreePage *btPage;
+			state std::vector<Reference<IPage>> pages;
+			BTreePage* btPage;

+			if (blockCount == 1) {
+				Reference<IPage> page = self->m_pager->newPageBuffer();
+				btPage = (BTreePage*)page->mutate();
+				pages.push_back(std::move(page));
+			} else {
+				ASSERT(blockCount > 1);
+				int size = blockSize * blockCount;
+				btPage = (BTreePage*)new uint8_t[size];
+			}
+
+			btPage->height = height;
+			btPage->kvBytes = kvBytes;
+
+			debug_printf(
+			    "Building tree.  start=%d  i=%d  count=%d  page usage: %d/%d (%.2f%%) bytes\nlower: %s\nupper: %s\n",
+			    start, i, i - start, compressedBytes, pageSize, (float)compressedBytes / pageSize * 100,
+			    pageLowerBound.toString(false).c_str(), pageUpperBound.toString(false).c_str());
+
+			int written =
+			    btPage->tree().build(pageSize, &entries[start], &entries[i], &pageLowerBound, &pageUpperBound);
+			if (written > pageSize) {
+				debug_printf("ERROR:  Wrote %d bytes to %d byte page (%d blocks). recs %d  kvBytes %d  compressed %d\n",
+				             written, pageSize, blockCount, i - start, kvBytes, compressedBytes);
+				fprintf(stderr,
+				        "ERROR:  Wrote %d bytes to %d byte page (%d blocks). recs %d  kvBytes %d  compressed %d\n",
+				        written, pageSize, blockCount, i - start, kvBytes, compressedBytes);
+				ASSERT(false);
+			}
+
+			// Create chunked pages
+			// TODO: Avoid copying page bytes, but this is not trivial due to how pager checksums are currently handled.
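+			// Editorial note (not part of the original patch): a multi-block
+			// btree page is built in one contiguous blockCount * blockSize
+			// buffer, then copied block by block into pager pages below, e.g.
+			// a 10000 byte tree with 4096 byte blocks uses 3 blocks.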
+		if (blockCount != 1) {
+			// Mark the slack in the page buffer as defined
+			VALGRIND_MAKE_MEM_DEFINED(((uint8_t*)btPage) + written, (blockCount * blockSize) - written);
+			const uint8_t* rptr = (const uint8_t*)btPage;
+			for (int b = 0; b < blockCount; ++b) {
 				Reference<IPage> page = self->m_pager->newPageBuffer();
-				btPage = (BTreePage *)page->mutate();
+				memcpy(page->mutate(), rptr, blockSize);
+				rptr += blockSize;
 				pages.push_back(std::move(page));
 			}
-			else {
-				ASSERT(blockCount > 1);
-				int size = blockSize * blockCount;
-				btPage = (BTreePage *)new uint8_t[size];
-			}
-
-			btPage->height = height;
-			btPage->kvBytes = kvBytes;
-
-			int written = btPage->tree().build(pageSize, &entries[start], &entries[i], &pageLowerBound, &pageUpperBound);
-			if(written > pageSize) {
-				fprintf(stderr, "ERROR: Wrote %d bytes to %d byte page (%d blocks). recs %d kvBytes %d compressed %d\n", written, pageSize, blockCount, i - start, kvBytes, compressedBytes);
-				ASSERT(false);
-			}
-
-			// Create chunked pages
-			// TODO: Avoid copying page bytes, but this is not trivial due to how pager checksums are currently handled.
-			if(blockCount != 1) {
-				// Mark the slack in the page buffer as defined
-				VALGRIND_MAKE_MEM_DEFINED(((uint8_t *)btPage) + written, (blockCount * blockSize) - written);
-				const uint8_t *rptr = (const uint8_t *)btPage;
-				for(int b = 0; b < blockCount; ++b) {
-					Reference<IPage> page = self->m_pager->newPageBuffer();
-					memcpy(page->mutate(), rptr, blockSize);
-					rptr += blockSize;
-					pages.push_back(std::move(page));
-				}
-				delete [] (uint8_t *)btPage;
-			}
-
-			// Write this btree page, which is made of 1 or more pager pages.
-			state int p;
-			state BTreePageID childPageID;
-
-			// If we are only writing 1 page and it has the same BTreePageID size as the original they try to reuse the
-			// LogicalPageIDs in previousID and try to update them atomically.
-			if(end && records.empty() && previousID.size() == pages.size()) {
-				for(p = 0; p < pages.size(); ++p) {
-					LogicalPageID id = wait(self->m_pager->atomicUpdatePage(previousID[p], pages[p], v));
-					childPageID.push_back(records.arena(), id);
-				}
-			}
-			else {
-				// Either the original page is being split, or it's not but it has changed BTreePageID size.
-				// Either way, there is no point in reusing any of the original page IDs because the parent
-				// must be rewritten anyway to count for the change in child count or child links.
-				// Free the old IDs, but only once (before the first output record is added).
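The chunking step above is a plain copy of a multi-block buffer into page-sized pieces. A self-contained sketch of the same pattern, using `std::vector` as a stand-in for the pager's `IPage` buffers (the helper name is hypothetical):

```cpp
#include <cstdint>
#include <vector>

// Split a buffer of blockCount * blockSize bytes into blockCount pages,
// one block per page, in order. Mirrors the memcpy loop in the hunk above.
std::vector<std::vector<uint8_t>> chunkIntoBlocks(const uint8_t* buf, int blockCount, int blockSize) {
    std::vector<std::vector<uint8_t>> pages;
    const uint8_t* rptr = buf;
    for (int b = 0; b < blockCount; ++b) {
        pages.emplace_back(rptr, rptr + blockSize); // copy one block
        rptr += blockSize;
    }
    return pages;
}
```

The patch's TODO notes the copy is avoidable in principle, but the pager's checksum handling currently requires each block to live in its own pager-owned buffer.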
-				if(records.empty()) {
-					self->freeBtreePage(previousID, v);
-				}
-				for(p = 0; p < pages.size(); ++p) {
-					LogicalPageID id = wait(self->m_pager->newPageID());
-					self->m_pager->updatePage(id, pages[p]);
-					childPageID.push_back(records.arena(), id);
-				}
-			}
-			wait(yield());
-
-			// Update activity counts
-			++counts.pageWrites;
-			if(pages.size() > 1) {
-				counts.extPageWrites += pages.size() - 1;
-			}
-
-			debug_printf("Flushing %s original=%s start=%d i=%d count=%d\nlower: %s\nupper: %s\n", toString(childPageID).c_str(), toString(previousID).c_str(), start, i, i - start, pageLowerBound.toString().c_str(), pageUpperBound.toString().c_str());
-			if(REDWOOD_DEBUG) {
-				for(int j = start; j < i; ++j) {
-					debug_printf(" %3d: %s\n", j, entries[j].toString().c_str());
-				}
-				ASSERT(pageLowerBound.key <= pageUpperBound.key);
-			}
-
-			// Push a new record onto the results set, without the child page, copying it into the records arena
-			records.push_back_deep(records.arena(), pageLowerBound.withoutValue());
-			// Set the child page value of the inserted record to childPageID, which has already been allocated in records.arena() above
-			records.back().setChildPage(childPageID);
-
-			if(end) {
-				break;
-			}
-
-			start = i;
-			kvBytes = 0;
-			compressedBytes = BTreePage::BinaryTree::GetTreeOverhead();
-			pageLowerBound = pageUpperBound.withoutValue();
+			delete[](uint8_t*) btPage;
 		}
+
+		// Write this btree page, which is made of 1 or more pager pages.
+		state int p;
+		state BTreePageID childPageID;
+
+		// If we are only writing 1 page and it has the same BTreePageID size as the original then try to reuse the
+		// LogicalPageIDs in previousID and try to update them atomically.
+		bool isOnlyPage = isLastPage && (start == 0);
+		if (isOnlyPage && previousID.size() == pages.size()) {
+			for (p = 0; p < pages.size(); ++p) {
+				LogicalPageID id = wait(self->m_pager->atomicUpdatePage(previousID[p], pages[p], v));
+				childPageID.push_back(records.arena(), id);
+			}
+		} else {
+			// Either the original page is being split, or it's not but it has changed BTreePageID size.
+			// Either way, there is no point in reusing any of the original page IDs because the parent
+			// must be rewritten anyway to account for the change in child count or child links.
+			// Free the old IDs, but only once (before the first output record is added).
+			if (records.empty()) {
+				self->freeBtreePage(previousID, v);
+			}
+			for (p = 0; p < pages.size(); ++p) {
+				LogicalPageID id = wait(self->m_pager->newPageID());
+				self->m_pager->updatePage(id, pages[p]);
+				childPageID.push_back(records.arena(), id);
+			}
+		}
+
+		wait(yield());
+
+		// Update activity counts
+		++counts.pageWrites;
+		if (pages.size() > 1) {
+			counts.extPageWrites += pages.size() - 1;
+		}
+
+		debug_printf("Flushing %s lastPage=%d original=%s start=%d i=%d count=%d page usage: %d/%d (%.2f%%) "
+		             "bytes\nlower: %s\nupper: %s\n",
+		             toString(childPageID).c_str(), isLastPage, toString(previousID).c_str(), start, i, i - start,
+		             compressedBytes, pageSize, (float)compressedBytes / pageSize * 100,
+		             pageLowerBound.toString(false).c_str(), pageUpperBound.toString(false).c_str());
+
+		if (REDWOOD_DEBUG) {
+			for (int j = start; j < i; ++j) {
+				debug_printf(" %3d: %s\n", j, entries[j].toString(height == 1).c_str());
+			}
+			ASSERT(pageLowerBound.key <= pageUpperBound.key);
+		}
+
+		// Push a new record onto the results set, without the child page, copying it into the records arena
+		records.push_back_deep(records.arena(), pageLowerBound.withoutValue());
+		// Set the child page value of the inserted record to childPageID, which has already been allocated in
+		// records.arena() above
+		records.back().setChildPage(childPageID);
+
+		if (isLastPage) {
+			break;
+		}
+
+		start = nextStart;
+		kvBytes = 0;
+		compressedBytes = BTreePage::BinaryTree::emptyTreeSize();
+		pageLowerBound = pageUpperBound;
+	}
+
+	// If we're writing internal pages and the last entry was the start of a new page and had an empty child link
+	// then it would not be written to a page. This means that the upper boundary for the page set being built
+	// is not the upper bound of the final page in that set, so it must be added to the output set to preserve the
+	// decodability of the subtree to its left. Fortunately, this is easy to detect because the loop above would
+	// exit before i has reached the item count.
+	if (height != 1 && i != entries.size()) {
+		debug_printf("Adding dummy record to avoid writing useless page: %s\n",
+		             pageUpperBound.toString(false).c_str());
+		records.push_back_deep(records.arena(), pageUpperBound);
 	}

 	return records;
 }

-	ACTOR static Future<Standalone<VectorRef<RedwoodRecordRef>>> buildNewRoot(VersionedBTree *self, Version version, Standalone<VectorRef<RedwoodRecordRef>> records, int height) {
+	ACTOR static Future<Standalone<VectorRef<RedwoodRecordRef>>> buildNewRoot(
+	    VersionedBTree* self, Version version, Standalone<VectorRef<RedwoodRecordRef>> records, int height) {
 		debug_printf("buildNewRoot start version %" PRId64 ", %lu records\n", version, records.size());

 		// While there are multiple child pages for this version we must write new tree levels.
-		while(records.size() > 1) {
+		while (records.size() > 1) {
 			self->m_header.height = ++height;
-			Standalone<VectorRef<RedwoodRecordRef>> newRecords = wait(writePages(self, false, &dbBegin, &dbEnd, records, height, version, BTreePageID()));
-			debug_printf("Wrote a new root level at version %" PRId64 " height %d size %lu pages\n", version, height, newRecords.size());
+			Standalone<VectorRef<RedwoodRecordRef>> newRecords =
+			    wait(writePages(self, &dbBegin, &dbEnd, records, height, version, BTreePageID()));
+			debug_printf("Wrote a new root level at version %" PRId64 " height %d size %lu pages\n", version, height,
+			             newRecords.size());
 			records = newRecords;
 		}

 		return records;
 	}

-	class SuperPage : public IPage, ReferenceCounted<SuperPage>, public FastAllocated<SuperPage>{
+	class SuperPage : public IPage, ReferenceCounted<SuperPage>, public FastAllocated<SuperPage> {
 	public:
 		SuperPage(std::vector<Reference<const IPage>> pages) {
 			int blockSize = pages.front()->size();
 			m_size = blockSize * pages.size();
 			m_data = new uint8_t[m_size];
-			uint8_t *wptr = m_data;
-			for(auto &p : pages) {
+			uint8_t* wptr = m_data;
+			for (auto& p : pages) {
 				ASSERT(p->size() == blockSize);
 				memcpy(wptr, p->begin(), blockSize);
 				wptr += blockSize;
 			}
 		}

-		virtual ~SuperPage() {
-			delete [] m_data;
-		}
+		virtual ~SuperPage() { delete[] m_data; }

 		virtual Reference<IPage> clone() const {
-			return Reference<IPage>(new SuperPage({Reference<const IPage>::addRef(this)}));
+			return Reference<IPage>(new SuperPage({ Reference<const IPage>::addRef(this) }));
 		}

-		void addref() const {
-			ReferenceCounted<SuperPage>::addref();
-		}
+		void addref() const { ReferenceCounted<SuperPage>::addref(); }

-		void delref() const {
-			ReferenceCounted<SuperPage>::delref();
-		}
+		void delref() const { ReferenceCounted<SuperPage>::delref(); }

-		int size() const {
-			return m_size;
-		}
+		int size() const { return m_size; }

-		uint8_t const* begin() const {
-			return m_data;
-		}
+		uint8_t const* begin() const { return m_data; }

-		uint8_t* mutate() {
-			return m_data;
-		}
+		uint8_t* mutate() { return m_data; }

 	private:
-		uint8_t *m_data;
+		uint8_t* m_data;
 		int m_size;
 	};

-	ACTOR static Future<Reference<const IPage>> readPage(Reference<IPagerSnapshot> snapshot, BTreePageID id, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound, bool forLazyDelete = false) {
-		if(!forLazyDelete) {
-			debug_printf("readPage() op=read %s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str());
-		}
-		else {
-			debug_printf("readPage() op=readForDeferredClear %s @%" PRId64 " \n", toString(id).c_str(), snapshot->getVersion());
+	ACTOR static Future<Reference<const IPage>> readPage(Reference<IPagerSnapshot> snapshot, BTreePageID id,
+	                                                     const RedwoodRecordRef* lowerBound,
+	                                                     const RedwoodRecordRef* upperBound,
+	                                                     bool forLazyDelete = false) {
+		if (!forLazyDelete) {
+			debug_printf("readPage() op=read %s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(),
+			             snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str());
+		} else {
+			debug_printf("readPage() op=readForDeferredClear %s @%" PRId64 " \n", toString(id).c_str(),
+			             snapshot->getVersion());
 		}

 		wait(yield());
@@ -3742,15 +3538,14 @@ private:
 		state Reference<const IPage> page;

 		++counts.pageReads;
-		if(id.size() == 1) {
+		if (id.size() == 1) {
 			Reference<const IPage> p = wait(snapshot->getPhysicalPage(id.front(), !forLazyDelete, false));
 			page = p;
-		}
-		else {
+		} else {
 			ASSERT(!id.empty());
 			counts.extPageReads += (id.size() - 1);
 			std::vector<Future<Reference<const IPage>>> reads;
-			for(auto &pageID : id) {
+			for (auto& pageID : id) {
 				reads.push_back(snapshot->getPhysicalPage(pageID, !forLazyDelete, false));
 			}
 			std::vector<Reference<const IPage>> pages = wait(getAll(reads));
@@ -3759,52 +3554,54 @@ private:
 		}

 		debug_printf("readPage() op=readComplete %s @%" PRId64 " \n", toString(id).c_str(), snapshot->getVersion());
-		const BTreePage *pTreePage = (const BTreePage *)page->begin();
+		const BTreePage* pTreePage = (const BTreePage*)page->begin();

-		if(!forLazyDelete && page->userData == nullptr) {
-			debug_printf("readPage() Creating Reader for %s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str());
+		if (!forLazyDelete && page->userData == nullptr) {
+			debug_printf("readPage() Creating Reader for %s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(),
+			             snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str());
 			page->userData = new BTreePage::BinaryTree::Mirror(&pTreePage->tree(), lowerBound, upperBound);
-			page->userDataDestructor = [](void *ptr) { delete (BTreePage::BinaryTree::Mirror *)ptr; };
+			page->userDataDestructor = [](void* ptr) { delete (BTreePage::BinaryTree::Mirror*)ptr; };
 		}

-		if(!forLazyDelete) {
-			debug_printf("readPage() %s\n", pTreePage->toString(false, id, snapshot->getVersion(), lowerBound, upperBound).c_str());
+		if (!forLazyDelete) {
+			debug_printf("readPage() %s\n",
+			             pTreePage->toString(false, id, snapshot->getVersion(), lowerBound, upperBound).c_str());
 		}

 		return page;
 	}

-	static void preLoadPage(IPagerSnapshot *snapshot, BTreePageID id) {
+	static void preLoadPage(IPagerSnapshot* snapshot, BTreePageID id) {
 		++counts.pagePreloads;
 		counts.extPagePreloads += (id.size() - 1);
-
-		for(auto pageID : id) {
+
+		for (auto pageID : id) {
 			snapshot->getPhysicalPage(pageID, true, true);
 		}
 	}

 	void freeBtreePage(BTreePageID btPageID, Version v) {
 		// Free individual pages at v
-		for(LogicalPageID id : btPageID) {
+		for (LogicalPageID id : btPageID) {
 			m_pager->freePage(id, v);
 		}
 	}

 	// Write new version of pageID at version v using page as its data.
 	// Attempts to reuse original id(s) in btPageID, returns BTreePageID.
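`SuperPage` in the hunk above reassembles a multi-block B-tree page by concatenating equal-sized blocks into one contiguous buffer. A minimal standalone sketch of the same idea, with `std::vector` standing in for the pager's page buffers:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Concatenate equal-sized blocks into one buffer so a multi-block page can be
// read as if it were a single page, as SuperPage's constructor does above.
std::vector<uint8_t> concatenateBlocks(const std::vector<std::vector<uint8_t>>& blocks) {
    size_t blockSize = blocks.front().size();
    std::vector<uint8_t> data;
    data.reserve(blockSize * blocks.size());
    for (const auto& b : blocks) {
        assert(b.size() == blockSize); // all blocks must be the same size
        data.insert(data.end(), b.begin(), b.end());
    }
    return data;
}
```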
-	ACTOR static Future<BTreePageID> updateBtreePage(VersionedBTree *self, BTreePageID oldID, Arena *arena, Reference<IPage> page, Version writeVersion) {
+	ACTOR static Future<BTreePageID> updateBtreePage(VersionedBTree* self, BTreePageID oldID, Arena* arena,
+	                                                 Reference<IPage> page, Version writeVersion) {
 		state BTreePageID newID;
 		newID.resize(*arena, oldID.size());

-		if(oldID.size() == 1) {
+		if (oldID.size() == 1) {
 			LogicalPageID id = wait(self->m_pager->atomicUpdatePage(oldID.front(), page, writeVersion));
 			newID.front() = id;
-		}
-		else {
+		} else {
 			state std::vector<Reference<IPage>> pages;
-			const uint8_t *rptr = page->begin();
+			const uint8_t* rptr = page->begin();
 			int bytesLeft = page->size();
-			while(bytesLeft > 0) {
+			while (bytesLeft > 0) {
 				Reference<IPage> p = self->m_pager->newPageBuffer();
 				int blockSize = p->size();
 				memcpy(p->mutate(), rptr, blockSize);
@@ -3816,7 +3613,7 @@ private:

 			// Write pages, trying to reuse original page IDs
 			state int i = 0;
-			for(; i < pages.size(); ++i) {
+			for (; i < pages.size(); ++i) {
 				LogicalPageID id = wait(self->m_pager->atomicUpdatePage(oldID[i], pages[i], writeVersion));
 				newID[i] = id;
 			}
@@ -3824,7 +3621,7 @@ private:

 		// Update activity counts
 		++counts.pageWrites;
-		if(newID.size() > 1) {
+		if (newID.size() > 1) {
 			counts.extPageWrites += newID.size() - 1;
 		}

@@ -3835,11 +3632,12 @@ private:
 	Reference<IPage> cloneForUpdate(Reference<const IPage> page) {
 		Reference<IPage> newPage = page->clone();

-		auto oldMirror = (const BTreePage::BinaryTree::Mirror *)page->userData;
-		auto newBTPage = (BTreePage *)newPage->mutate();
+		auto oldMirror = (const BTreePage::BinaryTree::Mirror*)page->userData;
+		auto newBTPage = (BTreePage*)newPage->mutate();

-		newPage->userData = new BTreePage::BinaryTree::Mirror(&newBTPage->tree(), oldMirror->lowerBound(), oldMirror->upperBound());
-		newPage->userDataDestructor = [](void *ptr) { delete (BTreePage::BinaryTree::Mirror *)ptr; };
+		newPage->userData =
+		    new BTreePage::BinaryTree::Mirror(&newBTPage->tree(), oldMirror->lowerBound(), oldMirror->upperBound());
+		newPage->userDataDestructor = [](void* ptr) { delete (BTreePage::BinaryTree::Mirror*)ptr; };
 		return newPage;
 	}

@@ -3847,30 +3645,26 @@ private:
 	// iMutationBoundary is greatest boundary <= lowerBound->key
 	// iMutationBoundaryEnd is least boundary >= upperBound->key
 	ACTOR static Future<Standalone<ChildLinksRef>> commitSubtree(
-		VersionedBTree *self,
-		MutationBuffer *mutationBuffer,
-		//MutationBuffer::const_iterator iMutationBoundary, // = mutationBuffer->upper_bound(lowerBound->key); --iMutationBoundary;
-		//MutationBuffer::const_iterator iMutationBoundaryEnd, // = mutationBuffer->lower_bound(upperBound->key);
-		Reference<IPagerSnapshot> snapshot,
-		BTreePageID rootID,
-		bool isLeaf,
-		const RedwoodRecordRef *lowerBound,
-		const RedwoodRecordRef *upperBound,
-		const RedwoodRecordRef *decodeLowerBound,
-		const RedwoodRecordRef *decodeUpperBound,
-		int skipLen = 0
-	) {
-		//skipLen = lowerBound->getCommonPrefixLen(*upperBound, skipLen);
+	    VersionedBTree* self, MutationBuffer* mutationBuffer,
+	    // MutationBuffer::const_iterator iMutationBoundary, // = mutationBuffer->upper_bound(lowerBound->key);
+	    // --iMutationBoundary; MutationBuffer::const_iterator iMutationBoundaryEnd, // =
+	    // mutationBuffer->lower_bound(upperBound->key);
+	    Reference<IPagerSnapshot> snapshot, BTreePageID rootID, bool isLeaf, const RedwoodRecordRef* lowerBound,
+	    const RedwoodRecordRef* upperBound, const RedwoodRecordRef* decodeLowerBound,
+	    const RedwoodRecordRef* decodeUpperBound, int skipLen = 0) {
+		// skipLen = lowerBound->getCommonPrefixLen(*upperBound, skipLen);
 		state std::string context;
-		if(REDWOOD_DEBUG) {
+		if (REDWOOD_DEBUG) {
 			context = format("CommitSubtree(root=%s): ", toString(rootID).c_str());
 		}

 		state Version writeVersion = self->getLastCommittedVersion() + 1;
 		state Standalone<ChildLinksRef> result;

-		debug_printf("%s lower=%s upper=%s\n", context.c_str(), lowerBound->toString().c_str(), upperBound->toString().c_str());
-		debug_printf("%s decodeLower=%s decodeUpper=%s\n", context.c_str(), decodeLowerBound->toString().c_str(), decodeUpperBound->toString().c_str());
+		debug_printf("%s lower=%s upper=%s\n", context.c_str(), lowerBound->toString().c_str(),
+		             upperBound->toString().c_str());
+		debug_printf("%s decodeLower=%s decodeUpper=%s\n", context.c_str(), decodeLowerBound->toString().c_str(),
+		             decodeUpperBound->toString().c_str());
 		self->counts.commitToPageStart++;

 		// Find the slice of the mutation buffer that is relevant to this subtree
@@ -3878,12 +3672,13 @@ private:
 		--iMutationBoundary;
 		state MutationBuffer::const_iterator iMutationBoundaryEnd = mutationBuffer->lower_bound(upperBound->key);

-		if(REDWOOD_DEBUG) {
+		if (REDWOOD_DEBUG) {
 			debug_printf("%s ---------MUTATION BUFFER SLICE ---------------------\n", context.c_str());
 			auto begin = iMutationBoundary;
-			while(1) {
-				debug_printf("%s Mutation: '%s': %s\n", context.c_str(), printable(begin.key()).c_str(), begin.mutation().toString().c_str());
-				if(begin == iMutationBoundaryEnd) {
+			while (1) {
+				debug_printf("%s Mutation: '%s': %s\n", context.c_str(), printable(begin.key()).c_str(),
+				             begin.mutation().toString().c_str());
+				if (begin == iMutationBoundaryEnd) {
 					break;
 				}
 				++begin;
@@ -3894,33 +3689,11 @@ private:
 		// iMutationBoundary is greatest boundary <= lowerBound->key
 		// iMutationBoundaryEnd is least boundary >= upperBound->key

-		// If the boundary range iterators are the same then this subtree only has one unique key, which is the same key as the boundary
-		// record the iterators are pointing to.  There only two outcomes possible:  Clearing the subtree or leaving it alone.
-		// If there are any changes to the one key then the entire subtree should be deleted as the changes for the key
-		// do not go into this subtree.
-		if(iMutationBoundary == iMutationBoundaryEnd) {
-			if(iMutationBoundary.mutation().boundaryChanged) {
-				debug_printf("%s lower and upper bound key/version match and key is modified so deleting page, returning %s\n", context.c_str(), toString(result).c_str());
-				if(isLeaf) {
-					self->freeBtreePage(rootID, writeVersion);
-				}
-				else {
-					self->m_lazyDeleteQueue.pushBack(LazyDeleteQueueEntry{writeVersion, rootID});
-				}
-				return result;
-			}
-
-			// Otherwise, no changes to this subtree
-			result.contents() = ChildLinksRef(decodeLowerBound, decodeUpperBound);
-			debug_printf("%s page contains a single key '%s' which is not changing, returning %s\n", context.c_str(), lowerBound->key.toString().c_str(), toString(result).c_str());
-			return result;
-		}
-
 		// If one mutation range covers the entire subtree, then check if the entire subtree is modified,
 		// unmodified, or possibly/partially modified.
 		MutationBuffer::const_iterator iMutationBoundaryNext = iMutationBoundary;
 		++iMutationBoundaryNext;
-		if(iMutationBoundaryNext == iMutationBoundaryEnd) {
+		if (iMutationBoundaryNext == iMutationBoundaryEnd) {
 			// Cleared means the entire range covering the subtree was cleared.  It is assumed true
 			// if the range starting after the lower mutation boundary was cleared, and then proven false
 			// below if possible.
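The slice-selection above is the classic ordered-map bracketing trick: the greatest boundary <= the lower bound comes from stepping back from `upper_bound`, and the least boundary >= the upper bound comes from `lower_bound`. A sketch using `std::map` in place of the actual `MutationBuffer` type; it assumes, as the mutation buffer does, that entries for the minimal and maximal possible keys always exist so the decrement is safe:

```cpp
#include <map>
#include <string>
#include <utility>

// Return [begin, end) iterators bracketing the mutations relevant to a
// subtree spanning [lower, upper). Mutation is a placeholder value type.
template <typename Mutation>
std::pair<typename std::map<std::string, Mutation>::const_iterator,
          typename std::map<std::string, Mutation>::const_iterator>
sliceForSubtree(const std::map<std::string, Mutation>& buffer, const std::string& lower,
                const std::string& upper) {
    auto begin = buffer.upper_bound(lower); // first boundary > lower
    --begin;                                // greatest boundary <= lower
    auto end = buffer.lower_bound(upper);   // least boundary >= upper
    return { begin, end };
}
```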
@@ -3932,29 +3705,30 @@ private:

 			// If the lower mutation boundary key is the same as the subtree lower bound then whether or not
 			// that key is being changed or cleared affects this subtree.
-			if(iMutationBoundary.key() == lowerBound->key) {
-				// If subtree will be cleared (so far) but the lower boundary key is not cleared then the subtree is not cleared
-				if(cleared && !iMutationBoundary.mutation().boundaryCleared()) {
+			if (iMutationBoundary.key() == lowerBound->key) {
+				// If subtree will be cleared (so far) but the lower boundary key is not cleared then the subtree is not
+				// cleared
+				if (cleared && !iMutationBoundary.mutation().boundaryCleared()) {
 					cleared = false;
 					debug_printf("%s cleared=%d unchanged=%d\n", context.c_str(), cleared, unchanged);
 				}

-				// If the subtree looked unchanged (so far) but the lower boundary is is changed then the subtree is changed
-				if(unchanged && iMutationBoundary.mutation().boundaryChanged) {
+				// If the subtree looked unchanged (so far) but the lower boundary is changed then the subtree is
+				// changed
+				if (unchanged && iMutationBoundary.mutation().boundaryChanged) {
 					unchanged = false;
 					debug_printf("%s cleared=%d unchanged=%d\n", context.c_str(), cleared, unchanged);
 				}
 			}

-			// If the higher mutation boundary key is the same as the subtree upper bound key then whether
+			// If the higher mutation boundary key is the same as the subtree upper bound key then whether
 			// or not it is being changed or cleared affects this subtree.
-			if((cleared || unchanged) && iMutationBoundaryEnd.key() == upperBound->key) {
+			if ((cleared || unchanged) && iMutationBoundaryEnd.key() == upperBound->key) {
 				// If the key is being changed then the records in this subtree with the same key must be removed
 				// so the subtree is definitely not unchanged, though it may be cleared to achieve the same effect.
-				if(iMutationBoundaryEnd.mutation().boundaryChanged) {
+				if (iMutationBoundaryEnd.mutation().boundaryChanged) {
 					unchanged = false;
 					debug_printf("%s cleared=%d unchanged=%d\n", context.c_str(), cleared, unchanged);
-				}
-				else {
+				} else {
 					// If the key is not being changed then the records in this subtree can't be removed so the
 					// subtree is not being cleared.
 					cleared = false;
@@ -3966,20 +3740,21 @@ private:
 			ASSERT(!(cleared && unchanged));

 			// If no changes in subtree
-			if(unchanged) {
+			if (unchanged) {
 				result.contents() = ChildLinksRef(decodeLowerBound, decodeUpperBound);
-				debug_printf("%s no changes on this subtree, returning %s\n", context.c_str(), toString(result).c_str());
+				debug_printf("%s no changes on this subtree, returning %s\n", context.c_str(),
+				             toString(result).c_str());
 				return result;
 			}

 			// If subtree is cleared
-			if(cleared) {
-				debug_printf("%s %s cleared, deleting it, returning %s\n", context.c_str(), isLeaf ? "Page" : "Subtree", toString(result).c_str());
-				if(isLeaf) {
+			if (cleared) {
+				debug_printf("%s %s cleared, deleting it, returning %s\n", context.c_str(), isLeaf ? "Page" : "Subtree",
+				             toString(result).c_str());
+				if (isLeaf) {
 					self->freeBtreePage(rootID, writeVersion);
-				}
-				else {
-					self->m_lazyDeleteQueue.pushBack(LazyDeleteQueueEntry{writeVersion, rootID});
+				} else {
+					self->m_lazyDeleteQueue.pushBack(LazyDeleteQueueEntry{ writeVersion, rootID });
 				}
 				return result;
 			}
@@ -3987,18 +3762,21 @@ private:
 		self->counts.commitToPage++;

 		state Reference<const IPage> page = wait(readPage(snapshot, rootID, decodeLowerBound, decodeUpperBound));
-		state BTreePage *btPage = (BTreePage *)page->begin();
+		state BTreePage* btPage = (BTreePage*)page->begin();
 		ASSERT(isLeaf == btPage->isLeaf());
-		debug_printf("%s commitSubtree(): %s\n", context.c_str(), btPage->toString(false, rootID, snapshot->getVersion(), decodeLowerBound, decodeUpperBound).c_str());
+		debug_printf(
+		    "%s commitSubtree(): %s\n", context.c_str(),
+		    btPage->toString(false, rootID, snapshot->getVersion(), decodeLowerBound, decodeUpperBound).c_str());

 		state BTreePage::BinaryTree::Cursor cursor;

-		if(REDWOOD_DEBUG) {
+		if (REDWOOD_DEBUG) {
 			debug_printf("%s ---------MUTATION BUFFER SLICE ---------------------\n", context.c_str());
 			auto begin = iMutationBoundary;
-			while(1) {
-				debug_printf("%s Mutation: '%s': %s\n", context.c_str(), printable(begin.key()).c_str(), begin.mutation().toString().c_str());
-				if(begin == iMutationBoundaryEnd) {
+			while (1) {
+				debug_printf("%s Mutation: '%s': %s\n", context.c_str(), printable(begin.key()).c_str(),
+				             begin.mutation().toString().c_str());
+				if (begin == iMutationBoundaryEnd) {
 					break;
 				}
 				++begin;
@@ -4007,33 +3785,33 @@ private:
 		}

 		// Leaf Page
-		if(isLeaf) {
+		if (isLeaf) {
 			// Try to update page unless it's an oversized page or empty or the boundaries have changed
 			// TODO: Caller already knows if boundaries are the same.
-			bool updating = btPage->tree().numItems > 0 && !(*decodeLowerBound != *lowerBound || *decodeUpperBound != *upperBound);
+			bool updating =
+			    btPage->tree().numItems > 0 && !(*decodeLowerBound != *lowerBound || *decodeUpperBound != *upperBound);

-			state Reference<IPage> newPage;
-			// If replacement pages are written they will be at the minimum version seen in the mutations for this leaf
+			state Reference<IPage> newPage;
+			// If replacement pages are written they will be at the minimum version seen in the mutations for this leaf
 			bool changesMade = false;

 			// If attempting an in-place page update, clone the page and read/modify the copy
-			if(updating) {
+			if (updating) {
 				newPage = self->cloneForUpdate(page);
-				cursor = getCursor(newPage);
-			}
-			else {
+				cursor = getCursor(newPage);
+			} else {
 				// Otherwise read the old page
 				cursor = getCursor(page);
 			}

-			// Couldn't make changes in place, so now do a linear merge and build new pages.
-			state Standalone<VectorRef<RedwoodRecordRef>> merged;
+			// Couldn't make changes in place, so now do a linear merge and build new pages.
+			state Standalone<VectorRef<RedwoodRecordRef>> merged;

 			auto switchToLinearMerge = [&]() {
 				updating = false;
 				auto c = cursor;
 				c.moveFirst();
-				while(c != cursor) {
+				while (c != cursor) {
 					debug_printf("%s catch-up adding %s\n", context.c_str(), c.get().toString().c_str());
 					merged.push_back(merged.arena(), c.get());
 					c.moveNext();
@@ -4042,40 +3820,46 @@ private:

 			// The first mutation buffer boundary has a key <= the first key in the page.

-			cursor.moveFirst();
-			debug_printf("%s Leaf page, applying changes.\n", context.c_str());
+			cursor.moveFirst();
+			debug_printf("%s Leaf page, applying changes.\n", context.c_str());

 			// Now, process each mutation range and merge changes with existing data.
 			bool firstMutationBoundary = true;
-			while(iMutationBoundary != iMutationBoundaryEnd) {
-				debug_printf("%s New mutation boundary: '%s': %s\n", context.c_str(), printable(iMutationBoundary.key()).c_str(), iMutationBoundary.mutation().toString().c_str());
+			while (iMutationBoundary != iMutationBoundaryEnd) {
+				debug_printf("%s New mutation boundary: '%s': %s\n", context.c_str(),
+				             printable(iMutationBoundary.key()).c_str(),
+				             iMutationBoundary.mutation().toString().c_str());

 				// Apply the change to the mutation buffer start boundary key only if
 				//  - there actually is a change (whether a set or a clear, old records are to be removed)
 				//  - either this is not the first boundary or it is but its key matches our lower bound key
-				bool applyBoundaryChange = iMutationBoundary.mutation().boundaryChanged && (!firstMutationBoundary || iMutationBoundary.key() >= lowerBound->key);
+				bool applyBoundaryChange = iMutationBoundary.mutation().boundaryChanged &&
+				                           (!firstMutationBoundary || iMutationBoundary.key() >= lowerBound->key);
 				firstMutationBoundary = false;
-
-				// Iterate over records for the mutation boundary key, keep them unless the boundary key was changed or we are not applying it
-				while(cursor.valid() && cursor.get().key == iMutationBoundary.key()) {
+
+				// Iterate over records for the mutation boundary key, keep them unless the boundary key was changed or
+				// we are not applying it
+				while (cursor.valid() && cursor.get().key == iMutationBoundary.key()) {
 					// If there were no changes to the key or we're not applying it
-					if(!applyBoundaryChange) {
-						// If not updating, add to the output set, otherwise skip ahead past the records for the mutation boundary
-						if(!updating) {
+					if (!applyBoundaryChange) {
+						// If not updating, add to the output set, otherwise skip ahead past the records for the
+						// mutation boundary
+						if (!updating) {
 							merged.push_back(merged.arena(), cursor.get());
-							debug_printf("%s Added %s [existing, boundary start]\n", context.c_str(), cursor.get().toString().c_str());
+							debug_printf("%s Added %s [existing, boundary start]\n", context.c_str(),
+							             cursor.get().toString().c_str());
 						}
 						cursor.moveNext();
-					}
-					else {
+					} else {
 						changesMade = true;
 						// If updating, erase from the page, otherwise do not add to the output set
-						if(updating) {
-							debug_printf("%s Erasing %s [existing, boundary start]\n", context.c_str(), cursor.get().toString().c_str());
+						if (updating) {
+							debug_printf("%s Erasing %s [existing, boundary start]\n", context.c_str(),
+							             cursor.get().toString().c_str());
 							cursor.erase();
-						}
-						else {
-							debug_printf("%s Skipped %s [existing, boundary start]\n", context.c_str(), cursor.get().toString().c_str());
+						} else {
+							debug_printf("%s Skipped %s [existing, boundary start]\n", context.c_str(),
+							             cursor.get().toString().c_str());
 							cursor.moveNext();
 						}
 					}
@@ -4084,54 +3868,28 @@ private:
 				constexpr int maxHeightAllowed = 8;

 				// Write the new record(s) for the mutation boundary start key if its value has been set
-				// Clears of this key will have been processed above by not being erased from the updated page or excluded from the merge output
-				if(applyBoundaryChange && iMutationBoundary.mutation().boundarySet()) {
+				// Clears of this key will have been processed above by not being erased from the updated page or
+				// excluded from the merge output
+				if (applyBoundaryChange && iMutationBoundary.mutation().boundarySet()) {
 					RedwoodRecordRef rec(iMutationBoundary.key(), 0, iMutationBoundary.mutation().boundaryValue.get());
 					changesMade = true;

-					if(rec.value.get().size() <= self->m_maxPartSize) {
-						// If updating, add to the page, else add to the output set
-						if(updating) {
-							if(cursor.mirror->insert(rec, skipLen, maxHeightAllowed)) {
-								debug_printf("%s Inserted non-split %s [mutation, boundary start]\n", context.c_str(), rec.toString().c_str());
-							}
-							else {
-								debug_printf("%s Inserted failed for non-split %s [mutation, boundary start]\n", context.c_str(), rec.toString().c_str());
-								switchToLinearMerge();
-							}
+					// If updating, add to the page, else add to the output set
+					if (updating) {
+						if (cursor.mirror->insert(rec, skipLen, maxHeightAllowed)) {
+							debug_printf("%s Inserted %s [mutation, boundary start]\n", context.c_str(),
+							             rec.toString().c_str());
+						} else {
+							debug_printf("%s Inserted failed for %s [mutation, boundary start]\n", context.c_str(),
+							             rec.toString().c_str());
+							switchToLinearMerge();
 						}
-						if(!updating) {
-							merged.push_back(merged.arena(), rec);
-							debug_printf("%s Added non-split %s [mutation, boundary start]\n", context.c_str(), rec.toString().c_str());
-						}
-					}
-					else {
-						int bytesLeft = rec.value.get().size();
-						int start = 0;
-						while(bytesLeft > 0) {
-							int partSize = std::min(bytesLeft, self->m_maxPartSize);
-							// Don't copy the value chunk because mutation buffer will stay in memory until after the new page is written
-							RedwoodRecordRef part = rec.split(start, partSize);
-							bytesLeft -= partSize;
-							if(updating) {
-								if(cursor.mirror->insert(part, skipLen, maxHeightAllowed)) {
-									debug_printf("%s Inserted split %s [mutation, boundary start] bytesLeft %d\n", context.c_str(), rec.toString().c_str(), bytesLeft);
-								}
-								else {
-									debug_printf("%s Inserted failed for split %s [mutation, boundary start] bytesLeft %d\n", context.c_str(), rec.toString().c_str(), bytesLeft);
-									switchToLinearMerge();
-								}
-							}
-
-							if(!updating) {
-								merged.push_back(merged.arena(), part);
-								debug_printf("%s Added split %s [mutation, boundary start] bytesLeft %d\n", context.c_str(), rec.toString().c_str(), bytesLeft);
-							}
-
-							start += partSize;
-						}
+					}
+					if (!updating) {
+						merged.push_back(merged.arena(), rec);
+						debug_printf("%s Added %s [mutation, boundary start]\n", context.c_str(),
+						             rec.toString().c_str());
 					}
 				}
@@ -4139,39 +3897,41 @@ private:
 				bool remove = iMutationBoundary.mutation().clearAfterBoundary;
 				// Advance to the next boundary because we need to know the end key for the current range.
 				++iMutationBoundary;
-				if(iMutationBoundary == iMutationBoundaryEnd) {
+				if (iMutationBoundary == iMutationBoundaryEnd) {
 					skipLen = 0;
 				}

-				debug_printf("%s Mutation range end: '%s'\n", context.c_str(), printable(iMutationBoundary.key()).c_str());
+				debug_printf("%s Mutation range end: '%s'\n", context.c_str(),
+				             printable(iMutationBoundary.key()).c_str());

 				// Now handle the records up through but not including the next mutation boundary key
 				RedwoodRecordRef end(iMutationBoundary.key());

 				// If the records are being removed and we're not doing an in-place update
 				// OR if we ARE doing an update but the records are NOT being removed, then just skip them.
-				if(remove != updating) {
-					// If not updating, then the records, if any exist, are being removed.  We don't know if there actually are any
-					// but we must assume there are.
-					if(!updating) {
+				if (remove != updating) {
+					// If not updating, then the records, if any exist, are being removed.  We don't know if there
+					// actually are any but we must assume there are.
+					if (!updating) {
 						changesMade = true;
 					}

-					debug_printf("%s Seeking forward to next boundary (remove=%d updating=%d) %s\n", context.c_str(), remove, updating, iMutationBoundary.key().toString().c_str());
+					debug_printf("%s Seeking forward to next boundary (remove=%d updating=%d) %s\n", context.c_str(),
+					             remove, updating, iMutationBoundary.key().toString().c_str());
 					cursor.seekGreaterThanOrEqual(end, skipLen);
-				}
-				else {
-					// Otherwise we must visit the records.  If updating, the visit is to erase them, and if doing a
+				} else {
+					// Otherwise we must visit the records.  If updating, the visit is to erase them, and if doing a
 					// linear merge then the visit is to add them to the output set.
-					while(cursor.valid() && cursor.get().compare(end, skipLen) < 0) {
-						if(updating) {
-							debug_printf("%s Erasing %s [existing, boundary start]\n", context.c_str(), cursor.get().toString().c_str());
+					while (cursor.valid() && cursor.get().compare(end, skipLen) < 0) {
+						if (updating) {
+							debug_printf("%s Erasing %s [existing, boundary start]\n", context.c_str(),
+							             cursor.get().toString().c_str());
 							cursor.erase();
 							changesMade = true;
-						}
-						else {
-							merged.push_back(merged.arena(), cursor.get());
-							debug_printf("%s Added %s [existing, middle]\n", context.c_str(), merged.back().toString().c_str());
+						} else {
+							merged.push_back(merged.arena(), cursor.get());
+							debug_printf("%s Added %s [existing, middle]\n", context.c_str(),
+							             merged.back().toString().c_str());
 							cursor.moveNext();
 						}
 					}
@@ -4179,87 +3939,92 @@ private:
 			}

 			// If there are still more records, they have the same key as the end boundary
-			if(cursor.valid()) {
+			if (cursor.valid()) {
 				// If the end boundary is changing, we must remove the remaining records in this page
 				bool remove = iMutationBoundaryEnd.mutation().boundaryChanged;
-				if(remove) {
+				if (remove) {
 					changesMade = true;
 				}

 				// If we don't have to remove the records and we are updating, do nothing.
 				// If we do have to remove the records and we are not updating, do nothing.
-				if(remove != updating) {
-					debug_printf("%s Ignoring remaining records, remove=%d updating=%d\n", context.c_str(), remove, updating);
-				}
-				else {
+				if (remove != updating) {
+					debug_printf("%s Ignoring remaining records, remove=%d updating=%d\n", context.c_str(), remove,
+					             updating);
+				} else {
 					// If updating and the key is changing, we must visit the records to erase them.
-					// If not updating and the key is not changing, we must visit the records to add them to the output set.
-					while(cursor.valid()) {
-						if(updating) {
-							debug_printf("%s Erasing %s and beyond [existing, matches changed upper mutation boundary]\n", context.c_str(), cursor.get().toString().c_str());
+					// If not updating and the key is not changing, we must visit the records to add them to the output
+					// set.
+					while (cursor.valid()) {
+						if (updating) {
+							debug_printf(
+							    "%s Erasing %s and beyond [existing, matches changed upper mutation boundary]\n",
+							    context.c_str(), cursor.get().toString().c_str());
 							cursor.erase();
-						}
-						else {
+						} else {
 							merged.push_back(merged.arena(), cursor.get());
-							debug_printf("%s Added %s [existing, tail]\n", context.c_str(), merged.back().toString().c_str());
+							debug_printf("%s Added %s [existing, tail]\n", context.c_str(),
+							             merged.back().toString().c_str());
 							cursor.moveNext();
 						}
 					}
 				}
-			}
-			else {
+			} else {
 				debug_printf("%s No records matching mutation buffer end boundary key\n", context.c_str());
 			}

-			// No changes were actually made.  This could happen if the only mutations are clear ranges which do not match any records.
-			if(!changesMade) {
+			// No changes were actually made.  This could happen if the only mutations are clear ranges which do not
+			// match any records.
+			if (!changesMade) {
 				result.contents() = ChildLinksRef(decodeLowerBound, decodeUpperBound);
-				debug_printf("%s No changes were made during mutation merge, returning %s\n", context.c_str(), toString(result).c_str());
+				debug_printf("%s No changes were made during mutation merge, returning %s\n", context.c_str(),
+				             toString(result).c_str());
 				return result;
-			}
-			else {
+			} else {
 				debug_printf("%s Changes were made, writing.\n", context.c_str());
 			}

 			writeVersion = self->getLastCommittedVersion() + 1;

-			if(updating) {
-				const BTreePage::BinaryTree &deltaTree = ((const BTreePage *)newPage->begin())->tree();
-				if(deltaTree.numItems == 0) {
-					debug_printf("%s Page updates cleared all entries, returning %s\n", context.c_str(), toString(result).c_str());
+			if (updating) {
+				const BTreePage::BinaryTree& deltaTree = ((const BTreePage*)newPage->begin())->tree();
+				if (deltaTree.numItems == 0) {
+					debug_printf("%s Page updates cleared all entries, returning %s\n", context.c_str(),
+					             toString(result).c_str());
 					self->freeBtreePage(rootID, writeVersion);
 					return result;
-				}
-				else {
-					// Otherwise update it.
-					BTreePageID newID = wait(self->updateBtreePage(self, rootID, &result.arena(), newPage, writeVersion));
+				} else {
+					// Otherwise update it.
+					BTreePageID newID =
+					    wait(self->updateBtreePage(self, rootID, &result.arena(), newPage, writeVersion));

-					// Set the child page ID, which has already been allocated in result.arena()
-					RedwoodRecordRef *rec = new (result.arena()) RedwoodRecordRef(decodeLowerBound->withoutValue());
-					rec->setChildPage(newID);
+					// Set the child page ID, which has already been allocated in result.arena()
+					RedwoodRecordRef* rec = new (result.arena()) RedwoodRecordRef(decodeLowerBound->withoutValue());
+					rec->setChildPage(newID);

-					result.contents() = ChildLinksRef(rec, decodeUpperBound);
-					debug_printf("%s Page updated in-place, returning %s\n", context.c_str(), toString(result).c_str());
+					result.contents() = ChildLinksRef(rec, decodeUpperBound);
+					debug_printf("%s Page updated in-place, returning %s\n", context.c_str(), toString(result).c_str());
 					++counts.pageUpdates;
-					return result;
-				}
+					return result;
+				}
 			}

 			// If everything in the page was deleted then this page should be deleted as of the new version
 			// Note that if a single range clear covered the entire page then we should not get this far
-			if(merged.empty()) {
-				debug_printf("%s All leaf page contents were cleared, returning %s\n", context.c_str(), toString(result).c_str());
+			if (merged.empty()) {
+				debug_printf("%s All leaf page contents were cleared, returning %s\n", context.c_str(),
+				             toString(result).c_str());
 				self->freeBtreePage(rootID, writeVersion);
 				return result;
 			}

-			state Standalone<VectorRef<RedwoodRecordRef>> entries = wait(writePages(self, true, lowerBound, upperBound, merged, btPage->height, writeVersion, rootID));
+			state Standalone<VectorRef<RedwoodRecordRef>> entries =
+			    wait(writePages(self, lowerBound, upperBound, merged, btPage->height, writeVersion, rootID));
 			result.arena().dependsOn(entries.arena());
 			result.contents() = ChildLinksRef(entries, *upperBound);
 			debug_printf("%s Merge complete, returning %s\n", context.c_str(), toString(result).c_str());
 			return result;
-		}
-		else {
+		} else {
 			// Internal Page
 			ASSERT(!isLeaf);
 			state std::vector<Future<Standalone<ChildLinksRef>>> futureChildren;
@@ -4268,67 +4033,70 @@ private:
 			cursor.moveFirst();

 			bool first = true;
-			while(cursor.valid()) {
+			while (cursor.valid()) {
 				// The lower bound for the first child is the lowerBound arg
-				const RedwoodRecordRef &childLowerBound = first ? *lowerBound : cursor.get();
+				const RedwoodRecordRef& childLowerBound = first ? *lowerBound : cursor.get();
 				first = false;

-				// Skip over any children that do not link to a page.  They exist to preserve the ancestors from
-				// which adjacent children can borrow prefix bytes.
-				// If there are any, then the first valid child page will incur a boundary change to move
-				// its lower bound to the left so we can delete the non-linking entry from this page to free up space.
-				while(!cursor.get().value.present()) {
-					// There should never be an internal page written that has no valid child pages. This loop will find
-					// the first valid child link, and if there are no more then execution will not return to this loop.
-					ASSERT(cursor.moveNext());
-				}
+				// At this point we should never be at a null child page entry because the first entry of a page
+				// can't be null and this loop will skip over null entries that come after non-null entries.
+				ASSERT(cursor.get().value.present());

-				ASSERT(cursor.valid());
-
-				const RedwoodRecordRef &decodeChildLowerBound = cursor.get();
+				// The decode lower bound is always the key of the child link record
+				const RedwoodRecordRef& decodeChildLowerBound = cursor.get();

 				BTreePageID pageID = cursor.get().getChildPage();
 				ASSERT(!pageID.empty());

-				const RedwoodRecordRef &decodeChildUpperBound = cursor.moveNext() ? cursor.get() : *decodeUpperBound;
+				// The decode upper bound is always the next key after the child link, or the decode upper bound for
+				// this page
+				const RedwoodRecordRef& decodeChildUpperBound = cursor.moveNext() ? cursor.get() : *decodeUpperBound;

-				// Skip over any next-children which do not actually link to child pages
-				while(cursor.valid() && !cursor.get().value.present()) {
-					cursor.moveNext();
+				// But the decode upper bound might be a placeholder record with a null child link because
+				// the subtree was previously deleted but the key needed to exist to enable decoding of the
+				// previous child page which has not since been rewritten.
+				if (cursor.valid() && !cursor.get().value.present()) {
+					// There should only be one null child link entry, followed by a present link or the end of the page
+					ASSERT(!cursor.moveNext() || cursor.get().value.present());
 				}

-				const RedwoodRecordRef &childUpperBound = cursor.valid() ? cursor.get() : *upperBound;
+				const RedwoodRecordRef& childUpperBound = cursor.valid() ? cursor.get() : *upperBound;

-				debug_printf("%s recursing to %s lower=%s upper=%s decodeLower=%s decodeUpper=%s\n",
-					context.c_str(), toString(pageID).c_str(), childLowerBound.toString().c_str(), childUpperBound.toString().c_str(), decodeChildLowerBound.toString().c_str(), decodeChildUpperBound.toString().c_str());
+				debug_printf("%s recursing to %s lower=%s upper=%s decodeLower=%s decodeUpper=%s\n", context.c_str(),
+				             toString(pageID).c_str(), childLowerBound.toString().c_str(),
+				             childUpperBound.toString().c_str(), decodeChildLowerBound.toString().c_str(),
+				             decodeChildUpperBound.toString().c_str());

 				// If this page has height of 2 then its children are leaf nodes
-				futureChildren.push_back(self->commitSubtree(self, mutationBuffer, snapshot, pageID, btPage->height == 2, &childLowerBound, &childUpperBound, &decodeChildLowerBound, &decodeChildUpperBound));
+				futureChildren.push_back(self->commitSubtree(self, mutationBuffer, snapshot, pageID,
+				                                             btPage->height == 2, &childLowerBound, &childUpperBound,
+				                                             &decodeChildLowerBound, &decodeChildUpperBound));
 			}

 			// Waiting one at a time makes debugging easier
 			// TODO:  Is it better to use waitForAll()?
 			state int k;
-			for(k = 0; k < futureChildren.size(); ++k) {
+			for (k = 0; k < futureChildren.size(); ++k) {
 				wait(success(futureChildren[k]));
 			}

-			if(REDWOOD_DEBUG) {
- 				debug_printf("%s Subtree update results\n", context.c_str());
-				for(int i = 0; i < futureChildren.size(); ++i) {
+			if (REDWOOD_DEBUG) {
+				debug_printf("%s Subtree update results\n", context.c_str());
+				for (int i = 0; i < futureChildren.size(); ++i) {
 					debug_printf("%s  subtree result %s\n", context.c_str(), toString(futureChildren[i].get()).c_str());
 				}
 			}

-			// All of the things added to pageBuilder will exist in the arenas inside futureChildren or will be upperBound
+			// All of the things added to pageBuilder will exist in the arenas inside futureChildren or will be
+			// upperBound
 			BTreePage::BinaryTree::Cursor c = getCursor(page);
 			c.moveFirst();
 			InternalPageBuilder pageBuilder(c);

-			for(int i = 0; i < futureChildren.size(); ++i) {
+			for (int i = 0; i < futureChildren.size(); ++i) {
 				ChildLinksRef c = futureChildren[i].get();

-				if(!c.children.empty()) {
+				if (!c.children.empty()) {
 					pageBuilder.addEntries(c);
 				}
 			}
@@ -4336,28 +4104,33 @@ private:
 			pageBuilder.finalize(*upperBound, *decodeUpperBound);

 			// If page contents have changed
-			if(pageBuilder.modified) {
+			if (pageBuilder.modified) {
 				// If the page now has no children
-				if(pageBuilder.childPageCount == 0) {
-					debug_printf("%s All internal page children were deleted so deleting this page too, returning %s\n", context.c_str(), toString(result).c_str());
+				if (pageBuilder.childPageCount == 0) {
+					debug_printf("%s All internal page children were deleted so deleting this page too, returning %s\n",
+					             context.c_str(), toString(result).c_str());
 					self->freeBtreePage(rootID, writeVersion);
 					return result;
-				}
-				else {
+				} else {
 					debug_printf("%s Internal page modified, creating replacements.\n", context.c_str());
-					debug_printf("%s newChildren=%s  lastUpperBound=%s  upperBound=%s\n", context.c_str(), toString(pageBuilder.entries).c_str(), pageBuilder.lastUpperBound.toString().c_str(), upperBound->toString().c_str());
+					debug_printf("%s newChildren=%s  lastUpperBound=%s  upperBound=%s\n", context.c_str(),
+					             toString(pageBuilder.entries).c_str(), pageBuilder.lastUpperBound.toString().c_str(),
+					             upperBound->toString().c_str());
+					debug_printf("pagebuilder entries: %s\n", ::toString(pageBuilder.entries).c_str());

-					ASSERT(pageBuilder.lastUpperBound.sameExceptValue(*upperBound));
+					ASSERT(!pageBuilder.entries.back().value.present() ||
+					       pageBuilder.lastUpperBound.sameExceptValue(*upperBound));

-					Standalone<VectorRef<RedwoodRecordRef>> childEntries = wait(holdWhile(pageBuilder.entries, writePages(self, false, lowerBound, upperBound, pageBuilder.entries, btPage->height, writeVersion, rootID)));
+					Standalone<VectorRef<RedwoodRecordRef>> childEntries = wait(
+					    holdWhile(pageBuilder.entries, writePages(self, lowerBound, upperBound, pageBuilder.entries,
+					                                              btPage->height, writeVersion, rootID)));

 					result.arena().dependsOn(childEntries.arena());
 					result.contents() = ChildLinksRef(childEntries, *upperBound);
 					debug_printf("%s Internal modified, returning %s\n", context.c_str(), toString(result).c_str());
 					return result;
 				}
-			}
-			else {
+			} else {
 				result.contents() = ChildLinksRef(decodeLowerBound, decodeUpperBound);
 				debug_printf("%s Page has no changes, returning %s\n", context.c_str(), toString(result).c_str());
 				return result;
@@ -4365,8 +4138,8 @@ private:
 		}
 	}

-	ACTOR static Future<Void> commit_impl(VersionedBTree *self) {
-		state MutationBuffer *mutations = self->m_pBuffer;
+	ACTOR static Future<Void> commit_impl(VersionedBTree* self) {
+		state MutationBuffer* mutations = self->m_pBuffer;

 		// No more mutations are allowed to be written to this mutation buffer we will commit
 		// at m_writeVersion, which we must save locally because it could change during commit.
@@ -4385,7 +4158,8 @@ private:
 		wait(previousCommit);

 		self->m_pager->setOldestVersion(self->m_newOldestVersion);
-		debug_printf("%s: Beginning commit of version %" PRId64 ", new oldest version set to %" PRId64 "\n", self->m_name.c_str(), writeVersion, self->m_newOldestVersion);
+		debug_printf("%s: Beginning commit of version %" PRId64 ", new oldest version set to %" PRId64 "\n",
+		             self->m_name.c_str(), writeVersion, self->m_newOldestVersion);

 		state bool lazyDeleteStop = false;
 		state Future<int> lazyDelete = incrementalSubtreeClear(self, &lazyDeleteStop);
@@ -4396,27 +4170,29 @@ private:
 		state Standalone<BTreePageID> rootPageID = self->m_header.root.get();
 		state RedwoodRecordRef lowerBound = dbBegin.withPageID(rootPageID);
-		Standalone<ChildLinksRef> newRootChildren = wait(commitSubtree(self, mutations, self->m_pager->getReadSnapshot(latestVersion), rootPageID, self->m_header.height == 1, &lowerBound, &dbEnd, &lowerBound, &dbEnd));
-		debug_printf("CommitSubtree(root %s) returned %s\n", toString(rootPageID).c_str(), toString(newRootChildren).c_str());
+		Standalone<ChildLinksRef> newRootChildren =
+		    wait(commitSubtree(self, mutations, self->m_pager->getReadSnapshot(latestVersion), rootPageID,
+		                       self->m_header.height == 1, &lowerBound, &dbEnd, &lowerBound, &dbEnd));
+		debug_printf("CommitSubtree(root %s) returned %s\n", toString(rootPageID).c_str(),
+		             toString(newRootChildren).c_str());

 		// If the old root was deleted, write a new empty tree root node and free the old roots
-		if(newRootChildren.children.empty()) {
+		if (newRootChildren.children.empty()) {
 			debug_printf("Writing new empty root.\n");
 			LogicalPageID newRootID = wait(self->m_pager->newPageID());
 			Reference<IPage> page = self->m_pager->newPageBuffer();
 			makeEmptyRoot(page);
 			self->m_header.height = 1;
 			self->m_pager->updatePage(newRootID, page);
-			rootPageID = BTreePageID((LogicalPageID *)&newRootID, 1);
-		}
-		else {
-			Standalone<VectorRef<RedwoodRecordRef>> newRootLevel(newRootChildren.children, newRootChildren.arena());
-			if(newRootLevel.size() == 1) {
+			rootPageID = BTreePageID((LogicalPageID*)&newRootID, 1);
+		} else {
+			Standalone<VectorRef<RedwoodRecordRef>> newRootLevel(newRootChildren.children, newRootChildren.arena());
+			if (newRootLevel.size() == 1) {
 				rootPageID = newRootLevel.front().getChildPage();
-			}
-			else {
+			} else {
 				// If the new root level's size is not 1 then build new root level(s)
-				Standalone<VectorRef<RedwoodRecordRef>> newRootPage = wait(buildNewRoot(self, latestVersion, newRootLevel, self->m_header.height));
+				Standalone<VectorRef<RedwoodRecordRef>> newRootPage =
+				    wait(buildNewRoot(self, latestVersion, newRootLevel, self->m_header.height));
 				rootPageID = newRootPage.front().getChildPage();
 			}
 		}
@@ -4451,8 +4227,10 @@ private:
 		return Void();
 	}

-	// InternalCursor is for seeking to and iterating over the 'internal' records (not user-visible) in the Btree.
-	// These records are versioned and they can represent deletedness or partial values.
+public:
+	// InternalCursor is for seeking to and iterating over the leaf-level RedwoodRecordRef records in the tree.
+	// The records could represent multiple values for the same key at different versions, including a non-present value
+	// representing a clear.  Currently, however, all records are at version 0 and no clears are present in the tree.
 	struct InternalCursor {
 	private:
 		// Each InternalCursor's position is represented by a reference counted PageCursor, which links
 		// to its parent page cursor, up to a PageCursor representing a cursor on the root page.
 		// PageCursors can be shared by many InternalCursors, making InternalCursor copying low overhead
 		struct PageCursor : ReferenceCounted<PageCursor>, FastAllocated<PageCursor> {
 			Reference<PageCursor> parent;
-			BTreePageID pageID; // Only needed for debugging purposes
+			BTreePageID pageID; // Only needed for debugging purposes
 			Reference<const IPage> page;
 			BTreePage::BinaryTree::Cursor cursor;

 			// id will normally reference memory owned by the parent, which is okay because a reference to the parent
 			// will be held in the cursor
 			PageCursor(BTreePageID id, Reference<const IPage> page, Reference<PageCursor> parent = {})
-				: pageID(id), page(page), parent(parent), cursor(getCursor(page))
-			{
-			}
+			  : pageID(id), page(page), parent(parent), cursor(getCursor(page)) {}

-			PageCursor(const PageCursor &toCopy) : parent(toCopy.parent), pageID(toCopy.pageID), page(toCopy.page), cursor(toCopy.cursor) {
-			}
+			PageCursor(const PageCursor& toCopy)
+			  : parent(toCopy.parent), pageID(toCopy.pageID), page(toCopy.page), cursor(toCopy.cursor) {}

 			// Convenience method for copying a PageCursor
-			Reference<PageCursor> copy() const {
-				return Reference<PageCursor>(new PageCursor(*this));
-			}
+			Reference<PageCursor> copy() const { return Reference<PageCursor>(new PageCursor(*this)); }

-			const BTreePage * btPage() const {
-				return (const BTreePage *)page->begin();
-			}
+			const BTreePage* btPage() const { return (const BTreePage*)page->begin(); }

-			bool isLeaf() const {
-				return btPage()->isLeaf();
-			}
+			bool isLeaf() const { return btPage()->isLeaf(); }

 			Future<Reference<PageCursor>> getChild(Reference<IPagerSnapshot> pager, int readAheadBytes = 0) {
 				ASSERT(!isLeaf());
 				BTreePage::BinaryTree::Cursor next = cursor;
 				next.moveNext();
-				const RedwoodRecordRef &rec = cursor.get();
+				const RedwoodRecordRef& rec = cursor.get();
 				BTreePageID id = rec.getChildPage();
 				Future<Reference<const IPage>> child = readPage(pager, id, &rec, &next.getOrUpperBound());

 				// Read ahead siblings at level 2
+				// TODO:  Application of readAheadBytes is not taking into account the size of the current page or any
+				// of the adjacent pages it is preloading.
-				if(readAheadBytes > 0 && btPage()->height == 2 && next.valid()) {
+				if (readAheadBytes > 0 && btPage()->height == 2 && next.valid()) {
 					do {
-						debug_printf("preloading %s %d bytes left\n", ::toString(next.get().getChildPage()).c_str(), readAheadBytes);
+						debug_printf("preloading %s %d bytes left\n", ::toString(next.get().getChildPage()).c_str(),
+						             readAheadBytes);
 						// If any part of the page was already loaded then stop
-						if(next.get().value.present()) {
+						if (next.get().value.present()) {
 							preLoadPage(pager.getPtr(), next.get().getChildPage());
 							readAheadBytes -= page->size();
 						}
-					} while(readAheadBytes > 0 && next.moveNext());
+					} while (readAheadBytes > 0 && next.moveNext());
 				}

 				return map(child, [=](Reference<const IPage> page) {
@@ -4513,7 +4286,8 @@ private:
 			}

 			std::string toString() const {
-				return format("%s, %s", ::toString(pageID).c_str(), cursor.valid() ? cursor.get().toString().c_str() : "<invalid>");
+				return format("%s, %s", ::toString(pageID).c_str(),
+				              cursor.valid() ? cursor.get().toString().c_str() : "<invalid>");
 			}
 		};

@@ -4522,26 +4296,23 @@ private:
 		Reference<PageCursor> pageCursor;

 	public:
-		InternalCursor() {
-		}
+		InternalCursor() {}

-		InternalCursor(Reference<IPagerSnapshot> pager, BTreePageID root)
-			: pager(pager), rootPageID(root) {
-		}
+		InternalCursor(Reference<IPagerSnapshot> pager, BTreePageID root) : pager(pager), rootPageID(root) {}

 		std::string toString() const {
 			std::string r;

 			Reference<PageCursor> c = pageCursor;
 			int maxDepth = 0;
-			while(c) {
+			while (c) {
 				c = c->parent;
 				++maxDepth;
 			}

 			c = pageCursor;
 			int depth = maxDepth;
-			while(c) {
+			while (c) {
 				r = format("[%d/%d: %s] ", depth--, maxDepth, c->toString().c_str()) + r;
 				c = c->parent;
 			}
@@ -4549,48 +4320,35 @@ private:
 		}

 		// Returns true if cursor position is a valid leaf page record
-		bool valid() const {
-			return pageCursor && pageCursor->isLeaf() && pageCursor->cursor.valid();
-		}
+		bool valid() const { return pageCursor && pageCursor->isLeaf() && pageCursor->cursor.valid(); }

 		// Returns true if cursor position is valid() and has a present record value
-		bool present() {
-			return valid() && pageCursor->cursor.get().value.present();
-		}
+		bool present() const { return valid() && pageCursor->cursor.get().value.present(); }

 		// Returns true if cursor position is present() and has an effective version <= v
-		bool presentAtVersion(Version v) {
-			return present() && pageCursor->cursor.get().version <= v;
-		}
+		bool presentAtVersion(Version v) { return present() && pageCursor->cursor.get().version <= v; }

 		// This is to enable an optimization for the case where all internal records are at the
 		// same version and there are no implicit clears
 		// *this MUST be valid()
-		bool presentAtExactVersionUnsharded(Version v) const {
-			auto const &rec = pageCursor->cursor.get();
-			return rec.value.present() && rec.version == v && rec.chunk.total == 0;
-		}
+		bool presentAtExactVersion(Version v) const { return present() && pageCursor->cursor.get().version == v; }

 		// Returns true if cursor position is present() and has an effective version <= v
-		bool validAtVersion(Version v) {
-			return valid() && pageCursor->cursor.get().version <= v;
-		}
+		bool validAtVersion(Version v) { return valid() && pageCursor->cursor.get().version <= v; }

-		const RedwoodRecordRef & get() const {
-			return pageCursor->cursor.get();
-		}
+		const RedwoodRecordRef& get() const { return pageCursor->cursor.get(); }

 		// Ensure that pageCursor is not shared with other cursors so we can modify it
 		void ensureUnshared() {
-			if(!pageCursor->isSoleOwner()) {
+			if (!pageCursor->isSoleOwner()) {
 				pageCursor = pageCursor->copy();
 			}
 		}

 		Future<Void> moveToRoot() {
 			// If pageCursor exists follow parent links to the root
-			if(pageCursor) {
-				while(pageCursor->parent) {
+			if (pageCursor) {
+				while (pageCursor->parent) {
 					pageCursor = pageCursor->parent;
 				}
 				return Void();
@@ -4604,64 +4362,63 @@ private:
 			});
 		}

-		ACTOR Future<bool> seekLessThanOrEqual_impl(InternalCursor *self, RedwoodRecordRef query, int prefetchBytes) {
+		ACTOR Future<bool> seekLessThan_impl(InternalCursor* self, RedwoodRecordRef query, int prefetchBytes) {
 			Future<Void> f = self->moveToRoot();
-			// f will almost always be ready
-			if(!f.isReady()) {
+			if (!f.isReady()) {
 				wait(f);
 			}

 			self->ensureUnshared();
-
 			loop {
-				bool success = self->pageCursor->cursor.seekLessThanOrEqual(query);
+				bool isLeaf = self->pageCursor->isLeaf();
+				bool success = self->pageCursor->cursor.seekLessThan(query);

 				// Skip backwards over internal page entries that do not link to child pages
-				if(!self->pageCursor->isLeaf()) {
+				if (!isLeaf) {
 					// While record has no value, move again
-					while(success && !self->pageCursor->cursor.get().value.present()) {
+					while (success && !self->pageCursor->cursor.get().value.present()) {
 						success = self->pageCursor->cursor.movePrev();
 					}
 				}

-				if(success) {
-					// If we found a record <= query at a leaf page then return success
-					if(self->pageCursor->isLeaf()) {
+				if (success) {
+					// If we found a record < query at a leaf page then return success
+					if (isLeaf) {
 						return true;
 					}

 					Reference<PageCursor> child = wait(self->pageCursor->getChild(self->pager, prefetchBytes));
 					self->pageCursor = child;
-				}
-				else {
-					// No records <= query on this page, so move to immediate previous record at leaf level
+				} else {
+					// No records < query on this page, so move to immediate previous record at leaf level
 					bool success = wait(self->move(false));
 					return success;
 				}
 			}
 		}

-		Future<bool> seekLTE(RedwoodRecordRef query, int prefetchBytes) {
-			return seekLessThanOrEqual_impl(this, query, prefetchBytes);
+		Future<bool> seekLessThan(RedwoodRecordRef query, int prefetchBytes) {
+			return seekLessThan_impl(this, query, prefetchBytes);
 		}

-		ACTOR Future<bool> move_impl(InternalCursor *self, bool forward) {
+		ACTOR Future<bool> move_impl(InternalCursor* self, bool forward) {
 			// Try to move pageCursor, if it fails to go parent, repeat until it works or root cursor can't be moved
-			while(1) {
+			while (1) {
 				self->ensureUnshared();
-				bool success = self->pageCursor->cursor.valid() && (forward ? self->pageCursor->cursor.moveNext() : self->pageCursor->cursor.movePrev());
+				bool success = self->pageCursor->cursor.valid() &&
+				               (forward ? self->pageCursor->cursor.moveNext() : self->pageCursor->cursor.movePrev());

 				// Skip over internal page entries that do not link to child pages
-				if(!self->pageCursor->isLeaf()) {
+				if (!self->pageCursor->isLeaf()) {
 					// While record has no value, move again
-					while(success && !self->pageCursor->cursor.get().value.present()) {
+					while (success && !self->pageCursor->cursor.get().value.present()) {
 						success = forward ? self->pageCursor->cursor.moveNext() : self->pageCursor->cursor.movePrev();
 					}
 				}

 				// Stop if successful or there's no parent to move to
-				if(success || !self->pageCursor->parent) {
+				if (success || !self->pageCursor->parent) {
 					break;
 				}
@@ -4670,16 +4427,16 @@ private:
 			}

 			// If pageCursor not valid we've reached an end of the tree
-			if(!self->pageCursor->cursor.valid()) {
+			if (!self->pageCursor->cursor.valid()) {
 				return false;
 			}

 			// While not on a leaf page, move down to get to one.
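`seekLessThan_impl` above follows the standard B-tree descent pattern: position within the current page, stop if it is a leaf, otherwise descend into the linked child and repeat. A language-level sketch of that control flow under stated assumptions; `PageRef` and the `SeekOps` callbacks are hypothetical stand-ins for `PageCursor` and its methods, not names from the patch:

```cpp
#include <functional>

struct PageRef; // opaque stand-in for a page cursor

struct SeekOps {
    std::function<bool(PageRef*)> seekLessThanInPage; // position cursor on greatest record < query
    std::function<bool(PageRef*)> isLeaf;             // is this page a leaf?
    std::function<PageRef*(PageRef*)> child;          // page linked from the current position
};

// Returns true once positioned on a leaf record < query; false means the
// caller must fall back to moving to the previous record at the leaf level.
bool seekLessThan(PageRef* page, const SeekOps& ops) {
    while (true) {
        if (!ops.seekLessThanInPage(page)) {
            return false; // nothing < query on this page
        }
        if (ops.isLeaf(page)) {
            return true; // found a qualifying leaf record
        }
        page = ops.child(page); // descend one level and repeat
    }
}
```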
        // While not on a leaf page, move down to get to one.
-        while(!self->pageCursor->isLeaf()) {
+        while (!self->pageCursor->isLeaf()) {
             // Skip over internal page entries that do not link to child pages
-            while(!self->pageCursor->cursor.get().value.present()) {
+            while (!self->pageCursor->cursor.get().value.present()) {
                 bool success = forward ? self->pageCursor->cursor.moveNext() : self->pageCursor->cursor.movePrev();
-                if(!success) {
+                if (!success) {
                     return false;
                 }
             }
@@ -4692,16 +4449,14 @@ private:
         return true;
     }

-    Future<bool> move(bool forward) {
-        return move_impl(this, forward);
-    }
+    Future<bool> move(bool forward) { return move_impl(this, forward); }

     // Move to the first or last record of the database.
-    ACTOR Future<bool> move_end(InternalCursor *self, bool begin) {
+    ACTOR Future<bool> move_end(InternalCursor* self, bool begin) {
         Future<Void> f = self->moveToRoot();

         // f will almost always be ready
-        if(!f.isReady()) {
+        if (!f.isReady()) {
             wait(f);
         }

@@ -4712,47 +4467,37 @@ private:
             bool success = begin ? self->pageCursor->cursor.moveFirst() : self->pageCursor->cursor.moveLast();

             // Skip over internal page entries that do not link to child pages
-            if(!self->pageCursor->isLeaf()) {
+            if (!self->pageCursor->isLeaf()) {
                 // While record has no value, move past it
-                while(success && !self->pageCursor->cursor.get().value.present()) {
+                while (success && !self->pageCursor->cursor.get().value.present()) {
                     success = begin ? self->pageCursor->cursor.moveNext() : self->pageCursor->cursor.movePrev();
                 }
             }

             // If it worked, return true if we've reached a leaf page otherwise go to the next child
-            if(success) {
-                if(self->pageCursor->isLeaf()) {
+            if (success) {
+                if (self->pageCursor->isLeaf()) {
                     return true;
                 }

                 Reference<PageCursor> child = wait(self->pageCursor->getChild(self->pager));
                 self->pageCursor = child;
-            }
-            else {
+            } else {
                 return false;
             }
         }
     }

-    Future<bool> moveFirst() {
-        return move_end(this, true);
-    }
-    Future<bool> moveLast() {
-        return move_end(this, false);
-    }
-
+    Future<bool> moveFirst() { return move_end(this, true); }
+    Future<bool> moveLast() { return move_end(this, false); }
 };

 // Cursor is for reading and iterating over user-visible KV pairs at a specific version
 // KeyValueRefs returned become invalid once the cursor is moved
-
     class Cursor : public IStoreCursor, public ReferenceCounted<Cursor>, public FastAllocated<Cursor>, NonCopyable {
     public:
         Cursor(Reference<IPagerSnapshot> pageSource, BTreePageID root, Version internalRecordVersion)
-            : m_version(internalRecordVersion),
-            m_cur1(pageSource, root),
-            m_cur2(m_cur1)
-        {
-        }
+            : m_version(internalRecordVersion), m_cur1(pageSource, root), m_cur2(m_cur1) {}

         void addref() { ReferenceCounted<Cursor>::addref(); }
         void delref() { ReferenceCounted<Cursor>::delref(); }

@@ -4773,9 +4518,7 @@ private:
         Optional<KeyValueRef> m_kv;

     public:
-        Future<Void> findEqual(KeyRef key) override {
-            return find_impl(this, key, 0);
-        }
+        Future<Void> findEqual(KeyRef key) override { return find_impl(this, key, 0); }
         Future<Void> findFirstEqualOrGreater(KeyRef key, int prefetchBytes) override {
             return find_impl(this, key, 1, prefetchBytes);
         }
@@ -4783,43 +4526,32 @@ private:
             return find_impl(this, key, -1, prefetchBytes);
         }

-        Future<Void> next() override {
-            return move(this, true);
-        }
-        Future<Void> prev() override {
-            return move(this, false);
-        }
+        Future<Void> next() override { return move(this, true); }
+        Future<Void> prev() override { return move(this, false); }

-        bool isValid() override {
-            return m_kv.present();
-        }
+        bool isValid() override { return m_kv.present(); }

-        KeyRef getKey() override {
-            return m_kv.get().key;
-        }
+        KeyRef getKey() override { return m_kv.get().key; }
-        ValueRef getValue() override {
-            return m_kv.get().value;
-        }
+        ValueRef getValue() override { return m_kv.get().value; }

         std::string toString(bool includePaths = false) const {
             std::string r;
             r += format("Cursor(%p) ver: %" PRId64 " ", this, m_version);
-            if(m_kv.present()) {
-                r += format(" KV: '%s' -> '%s'", m_kv.get().key.printable().c_str(), m_kv.get().value.printable().c_str());
-            }
-            else {
+            if (m_kv.present()) {
+                r += format(" KV: '%s' -> '%s'", m_kv.get().key.printable().c_str(),
+                            m_kv.get().value.printable().c_str());
+            } else {
                 r += " KV: <np>";
             }
-            if(includePaths) {
+            if (includePaths) {
                 r += format("\n Cur1: %s", m_cur1.toString().c_str());
                 r += format("\n Cur2: %s", m_cur2.toString().c_str());
-            }
-            else {
-                if(m_cur1.valid()) {
+            } else {
+                if (m_cur1.valid()) {
                     r += format("\n Cur1: %s", m_cur1.get().toString().c_str());
                 }
-                if(m_cur2.valid()) {
+                if (m_cur2.valid()) {
                     r += format("\n Cur2: %s", m_cur2.get().toString().c_str());
                 }
             }
@@ -4832,48 +4564,48 @@ private:
         // for less than or equal use cmp < 0
        // for greater than or equal use cmp > 0
        // for equal use cmp == 0
-        ACTOR static Future<Void> find_impl(Cursor *self, KeyRef key, int cmp, int prefetchBytes = 0) {
-            // Search for the last key at or before (key, version, \xff)
-            state RedwoodRecordRef query(key, self->m_version, {}, 0, std::numeric_limits::max());
+        ACTOR static Future<Void> find_impl(Cursor* self, KeyRef key, int cmp, int prefetchBytes = 0) {
+            state RedwoodRecordRef query(key, self->m_version + 1);
             self->m_kv.reset();

-            wait(success(self->m_cur1.seekLTE(query, prefetchBytes)));
-            debug_printf("find%sE(%s): %s\n", cmp > 0 ? "GT" : (cmp == 0 ? "" : "LT"), query.toString().c_str(), self->toString().c_str());
+            wait(success(self->m_cur1.seekLessThan(query, prefetchBytes)));
+            debug_printf("find%sE(%s): %s\n", cmp > 0 ? "GT" : (cmp == 0 ? "" : "LT"), query.toString().c_str(),
+                         self->toString().c_str());

             // If we found the target key with a present value then return it as it is valid for any cmp type
-            if(self->m_cur1.present() && self->m_cur1.get().key == key) {
-                debug_printf("Target key found, reading full KV pair. Cursor: %s\n", self->toString().c_str());
-                wait(self->readFullKVPair(self));
+            if (self->m_cur1.present() && self->m_cur1.get().key == key) {
+                debug_printf("Target key found. Cursor: %s\n", self->toString().c_str());
+                self->m_kv = self->m_cur1.get().toKeyValueRef();
                 return Void();
             }

-            // Mode is ==, so if we're still here we didn't find it.
-            if(cmp == 0) {
+            // If cmp type is Equal and we reached here, we didn't find it
+            if (cmp == 0) {
                 return Void();
             }

-            // Mode is >=, so if we're here we have to go to the next present record at the target version
-            // because the seek done above was <= query
-            if(cmp > 0) {
-                // icur is at a record < query or invalid.
-
-                // If cursor is invalid, try to go to start of tree
-                if(!self->m_cur1.valid()) {
+            // cmp mode is GreaterThanOrEqual, so if we've reached here an equal key was not found and cur1 either
+            // points to a lesser key or is invalid.
+            if (cmp > 0) {
+                // If cursor is invalid, query was less than the first key in database so go to the first record
+                if (!self->m_cur1.valid()) {
                     bool valid = wait(self->m_cur1.moveFirst());
-                    if(!valid) {
+                    if (!valid) {
                         self->m_kv.reset();
                         return Void();
                     }
-                }
-                else {
+                } else {
+                    // Otherwise, move forward until we find a key greater than the target key.
+                    // If multiversion data is present, the next record could have the same key as the initial
+                    // record found but be at a newer version.
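As an aside, the overall shape of this GreaterThanOrEqual branch, positioning with a strictly-less-than seek and then stepping forward, can be shown without the multiversion details. A sketch over a hypothetical single-version cursor backed by a sorted vector (`VecCursor` and `firstEqualOrGreater` are illustrative names, not FDB API):

```cpp
#include <algorithm>
#include <string>
#include <vector>

// Hypothetical cursor over a sorted vector of keys, for illustration only.
struct VecCursor {
    const std::vector<std::string>* keys;
    int pos = -1; // -1 means invalid

    bool valid() const { return pos >= 0 && pos < (int)keys->size(); }
    bool seekLessThan(const std::string& q) {
        // Position at the last key strictly less than q, or become invalid.
        auto it = std::lower_bound(keys->begin(), keys->end(), q);
        pos = (int)(it - keys->begin()) - 1;
        return valid();
    }
    bool moveFirst() { pos = 0; return valid(); }
    bool moveNext() { ++pos; return valid(); }
};

// First key >= q, or "" if none: the shape of the cmp > 0 branch above.
std::string firstEqualOrGreater(VecCursor& c, const std::string& q) {
    if (!c.seekLessThan(q)) {
        if (!c.moveFirst()) return ""; // q sorts before all keys, or container is empty
    } else {
        c.moveNext(); // step past the last key < q
    }
    return c.valid() ? (*c.keys)[c.pos] : "";
}
```

The real code additionally has to skip records that share the target key but carry a different version, which is what the loop below does before handing off to `next()`.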
                    loop {
                        bool valid = wait(self->m_cur1.move(true));
-                        if(!valid) {
+                        if (!valid) {
                            self->m_kv.reset();
                            return Void();
                        }
-                        if(self->m_cur1.get().key > key) {
+                        if (self->m_cur1.get().key > key) {
                            break;
                        }
                    }
@@ -4881,10 +4613,10 @@ private:

                // Get the next present key at the target version. Handles invalid cursor too.
                wait(self->next());
-            }
-            else if(cmp < 0) {
-                // Mode is <=, which is the same as the seekLTE(query)
-                if(!self->m_cur1.valid()) {
+            } else if (cmp < 0) {
+                // cmp mode is LessThanOrEqual. An equal key to the target key was already checked above, and the
+                // search was for LessThan query, so cur1 is already in the right place.
+                if (!self->m_cur1.valid()) {
                    self->m_kv.reset();
                    return Void();
                }
@@ -4896,114 +4628,67 @@ private:
            return Void();
        }

-        ACTOR static Future<Void> move(Cursor *self, bool fwd) {
+        ACTOR static Future<Void> move(Cursor* self, bool fwd) {
            debug_printf("Cursor::move(%d): Start %s\n", fwd, self->toString().c_str());
            ASSERT(self->m_cur1.valid());

            // If kv is present then the key/version at cur1 was already returned so move to a new key
            // Move cur1 until failure or a new key is found, keeping prior record visited in cur2
-            if(self->m_kv.present()) {
+            if (self->m_kv.present()) {
                ASSERT(self->m_cur1.valid());
                loop {
                    self->m_cur2 = self->m_cur1;
                    debug_printf("Cursor::move(%d): Advancing cur1 %s\n", fwd, self->toString().c_str());
                    bool valid = wait(self->m_cur1.move(fwd));
-                    if(!valid || self->m_cur1.get().key != self->m_cur2.get().key) {
+                    if (!valid || self->m_cur1.get().key != self->m_cur2.get().key) {
                        break;
                    }
                }
            }

            // Given two consecutive cursors c1 and c2, c1 represents a returnable record if
-            // c1.presentAtVersion(v) || (!c2.validAtVersion() || c2.get().key != c1.get().key())
+            //    c1 is present at exactly version v
+            //  OR
+            //    c1.presentAtVersion(v) && (!c2.validAtVersion() || c2.get().key != c1.get().key())
            // Note the distinction between 'present' and 'valid'.
Present means the value for the key // exists at the version (but could be the empty string) while valid just means the internal // record is in effect at that version but it could indicate that the key was cleared and // no longer exists from the user's perspective at that version - // - // cur2 must be the record immediately after cur1 - // TODO: This may already be the case, store state to track this condition and avoid the reset here - if(self->m_cur1.valid()) { + if (self->m_cur1.valid()) { self->m_cur2 = self->m_cur1; debug_printf("Cursor::move(%d): Advancing cur2 %s\n", fwd, self->toString().c_str()); wait(success(self->m_cur2.move(true))); } - self->m_kv.reset(); - while(self->m_cur1.valid()) { + while (self->m_cur1.valid()) { - if(self->m_cur1.presentAtExactVersionUnsharded(self->m_version) || - (self->m_cur1.presentAtVersion(self->m_version) && - (!self->m_cur2.validAtVersion(self->m_version) || - self->m_cur2.get().key != self->m_cur1.get().key)) - ) { - wait(readFullKVPair(self)); + if (self->m_cur1.get().version == self->m_version || + (self->m_cur1.presentAtVersion(self->m_version) && + (!self->m_cur2.validAtVersion(self->m_version) || + self->m_cur2.get().key != self->m_cur1.get().key))) { + self->m_kv = self->m_cur1.get().toKeyValueRef(); return Void(); } - if(fwd) { + if (fwd) { // Moving forward, move cur2 forward and keep cur1 pointing to the prior (predecessor) record debug_printf("Cursor::move(%d): Moving forward %s\n", fwd, self->toString().c_str()); self->m_cur1 = self->m_cur2; wait(success(self->m_cur2.move(true))); - } - else { + } else { // Moving backward, move cur1 backward and keep cur2 pointing to the prior (successor) record debug_printf("Cursor::move(%d): Moving backward %s\n", fwd, self->toString().c_str()); self->m_cur2 = self->m_cur1; wait(success(self->m_cur1.move(false))); } - } debug_printf("Cursor::move(%d): Exit, end of db reached. Cursor = %s\n", fwd, self->toString().c_str()); + self->m_kv.reset(); + return Void(); } - - // Read all of the current key-value record starting at cur1 into kv - ACTOR static Future readFullKVPair(Cursor *self) { - self->m_arena = Arena(); - const RedwoodRecordRef &rec = self->m_cur1.get(); - - self->m_kv.reset(); - debug_printf("readFullKVPair: Starting at %s\n", self->toString().c_str()); - - // Unsplit value, cur1 will hold the key and value memory - if(!rec.isMultiPart()) { - self->m_kv = KeyValueRef(rec.key, rec.value.get()); - debug_printf("readFullKVPair: Unsplit, exit. 
%s\n", self->toString().c_str()); - - return Void(); - } - - debug_printf("readFullKVPair: Split, first record %s\n", rec.toString().c_str()); - - // Split value, need to coalesce split value parts into a buffer in arena, - // after which cur1 will point to the first part and kv.key will reference its key - ASSERT(rec.chunk.start + rec.value.get().size() == rec.chunk.total); - - // Allocate space for the entire value in the same arena as the key - state int bytesLeft = rec.chunk.total; - state StringRef dst = makeString(bytesLeft, self->m_arena); - - loop { - const RedwoodRecordRef &rec = self->m_cur1.get(); - int partSize = rec.value.get().size(); - memcpy(mutateString(dst) + rec.chunk.start, rec.value.get().begin(), partSize); - bytesLeft -= partSize; - debug_printf("readFullKVPair: Added chunk %s (%d bytes, %d bytes left afterward)\n", rec.toString().c_str(), partSize, bytesLeft); - if(bytesLeft == 0) { - self->m_kv = KeyValueRef(rec.key, dst); - return Void(); - } - ASSERT(bytesLeft > 0); - // Move backward - bool success = wait(self->m_cur1.move(false)); - ASSERT(success); - } - } }; - }; #include "art_impl.h" @@ -5016,16 +4701,14 @@ class KeyValueStoreRedwoodUnversioned : public IKeyValueStore { public: KeyValueStoreRedwoodUnversioned(std::string filePrefix, UID logID) : m_filePrefix(filePrefix) { // TODO: This constructor should really just take an IVersionedStore - IPager2 *pager = new DWALPager(4096, filePrefix, 0); + IPager2* pager = new DWALPager(4096, filePrefix, 0); m_tree = new VersionedBTree(pager, filePrefix); m_init = catchError(init_impl(this)); } - Future init() { - return m_init; - } + Future init() { return m_init; } - ACTOR Future init_impl(KeyValueStoreRedwoodUnversioned *self) { + ACTOR Future init_impl(KeyValueStoreRedwoodUnversioned* self) { TraceEvent(SevInfo, "RedwoodInit").detail("FilePrefix", self->m_filePrefix); wait(self->m_tree->init()); Version v = self->m_tree->getLatestVersion(); @@ -5034,34 +4717,30 @@ public: return Void(); } - ACTOR void shutdown(KeyValueStoreRedwoodUnversioned *self, bool dispose) { + ACTOR void shutdown(KeyValueStoreRedwoodUnversioned* self, bool dispose) { TraceEvent(SevInfo, "RedwoodShutdown").detail("FilePrefix", self->m_filePrefix).detail("Dispose", dispose); - if(self->m_error.canBeSet()) { - self->m_error.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress + if (self->m_error.canBeSet()) { + self->m_error.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress } self->m_init.cancel(); Future closedFuture = self->m_tree->onClosed(); - if(dispose) + if (dispose) self->m_tree->dispose(); else self->m_tree->close(); wait(closedFuture); self->m_closed.send(Void()); - TraceEvent(SevInfo, "RedwoodShutdownComplete").detail("FilePrefix", self->m_filePrefix).detail("Dispose", dispose); + TraceEvent(SevInfo, "RedwoodShutdownComplete") + .detail("FilePrefix", self->m_filePrefix) + .detail("Dispose", dispose); delete self; } - void close() { - shutdown(this, false); - } + void close() { shutdown(this, false); } - void dispose() { - shutdown(this, true); - } + void dispose() { shutdown(this, true); } - Future< Void > onClosed() { - return m_closed.getFuture(); - } + Future onClosed() { return m_closed.getFuture(); } Future commit(bool sequential = false) { Future c = m_tree->commit(); @@ -5070,40 +4749,35 @@ public: return catchError(c); } - KeyValueStoreType getType() { - return KeyValueStoreType::SSD_REDWOOD_V1; - } + KeyValueStoreType getType() { return KeyValueStoreType::SSD_REDWOOD_V1; } - 
StorageBytes getStorageBytes() {
-        return m_tree->getStorageBytes();
-    }
+    StorageBytes getStorageBytes() { return m_tree->getStorageBytes(); }

-    Future< Void > getError() {
-        return delayed(m_error.getFuture());
-    };
+    Future<Void> getError() { return delayed(m_error.getFuture()); };

     void clear(KeyRangeRef range, const Arena* arena = 0) {
         debug_printf("CLEAR %s\n", printable(range).c_str());
         m_tree->clear(range);
     }

-    void set( KeyValueRef keyValue, const Arena* arena = NULL ) {
+    void set(KeyValueRef keyValue, const Arena* arena = NULL) {
         debug_printf("SET %s\n", printable(keyValue).c_str());
         m_tree->set(keyValue);
     }

-    Future< Standalone< RangeResultRef > > readRange(KeyRangeRef keys, int rowLimit = 1<<30, int byteLimit = 1<<30) {
+    Future<Standalone<RangeResultRef>> readRange(KeyRangeRef keys, int rowLimit = 1 << 30, int byteLimit = 1 << 30) {
         debug_printf("READRANGE %s\n", printable(keys).c_str());
         return catchError(readRange_impl(this, keys, rowLimit, byteLimit));
     }

-    ACTOR static Future< Standalone< RangeResultRef > > readRange_impl(KeyValueStoreRedwoodUnversioned *self, KeyRange keys, int rowLimit, int byteLimit) {
+    ACTOR static Future<Standalone<RangeResultRef>> readRange_impl(KeyValueStoreRedwoodUnversioned* self, KeyRange keys,
+                                                                   int rowLimit, int byteLimit) {
         self->m_tree->counts.getRanges++;
         state Standalone<RangeResultRef> result;
         state int accumulatedBytes = 0;
-        ASSERT( byteLimit > 0 );
+        ASSERT(byteLimit > 0);

-        if(rowLimit == 0) {
+        if (rowLimit == 0) {
             return result;
         }

@@ -5111,27 +4785,26 @@ public:
         // Prefetch is currently only done in the forward direction
         state int prefetchBytes = rowLimit > 1 ? byteLimit : 0;

-        if(rowLimit > 0) {
+        if (rowLimit > 0) {
             wait(cur->findFirstEqualOrGreater(keys.begin, prefetchBytes));
-            while(cur->isValid() && cur->getKey() < keys.end) {
+            while (cur->isValid() && cur->getKey() < keys.end) {
                 KeyValueRef kv(KeyRef(result.arena(), cur->getKey()), ValueRef(result.arena(), cur->getValue()));
                 accumulatedBytes += kv.expectedSize();
                 result.push_back(result.arena(), kv);
-                if(--rowLimit == 0 || accumulatedBytes >= byteLimit) {
+                if (--rowLimit == 0 || accumulatedBytes >= byteLimit) {
                     break;
                 }
                 wait(cur->next());
             }
         } else {
             wait(cur->findLastLessOrEqual(keys.end));
-            if(cur->isValid() && cur->getKey() == keys.end)
-                wait(cur->prev());
+            if (cur->isValid() && cur->getKey() == keys.end) wait(cur->prev());

-            while(cur->isValid() && cur->getKey() >= keys.begin) {
+            while (cur->isValid() && cur->getKey() >= keys.begin) {
                 KeyValueRef kv(KeyRef(result.arena(), cur->getKey()), ValueRef(result.arena(), cur->getValue()));
                 accumulatedBytes += kv.expectedSize();
                 result.push_back(result.arena(), kv);
-                if(++rowLimit == 0 || accumulatedBytes >= byteLimit) {
+                if (++rowLimit == 0 || accumulatedBytes >= byteLimit) {
                     break;
                 }
                 wait(cur->prev());
@@ -5139,34 +4812,36 @@ public:
         }

         result.more = rowLimit == 0 || accumulatedBytes >= byteLimit;
-        if(result.more) {
+        if (result.more) {
             ASSERT(result.size() > 0);
-            result.readThrough = result[result.size()-1].key;
+            result.readThrough = result[result.size() - 1].key;
         }
         return result;
     }

-    ACTOR static Future< Optional<Value> > readValue_impl(KeyValueStoreRedwoodUnversioned *self, Key key, Optional< UID > debugID) {
+    ACTOR static Future<Optional<Value>> readValue_impl(KeyValueStoreRedwoodUnversioned* self, Key key,
+                                                        Optional<UID> debugID) {
         self->m_tree->counts.gets++;
         state Reference<IStoreCursor> cur = self->m_tree->readAtVersion(self->m_tree->getLastCommittedVersion());

         wait(cur->findEqual(key));
-        if(cur->isValid()) {
+        if (cur->isValid()) {
             return cur->getValue();
         }
         return Optional<Value>();
     }

-    Future< Optional< Value > > readValue(KeyRef key, Optional< UID > debugID = Optional<UID>()) {
+    Future<Optional<Value>> readValue(KeyRef key, Optional<UID> debugID = Optional<UID>()) {
         return catchError(readValue_impl(this, key, debugID));
     }

-    ACTOR static Future< Optional<Value> > readValuePrefix_impl(KeyValueStoreRedwoodUnversioned *self, Key key, int maxLength, Optional< UID > debugID) {
+    ACTOR static Future<Optional<Value>> readValuePrefix_impl(KeyValueStoreRedwoodUnversioned* self, Key key,
+                                                              int maxLength, Optional<UID> debugID) {
         self->m_tree->counts.gets++;
         state Reference<IStoreCursor> cur = self->m_tree->readAtVersion(self->m_tree->getLastCommittedVersion());

         wait(cur->findEqual(key));
-        if(cur->isValid()) {
+        if (cur->isValid()) {
             Value v = cur->getValue();
             int len = std::min(v.size(), maxLength);
             return Value(cur->getValue().substr(0, len));
@@ -5174,26 +4849,26 @@ public:
         return Optional<Value>();
     }

-    Future< Optional< Value > > readValuePrefix(KeyRef key, int maxLength, Optional< UID > debugID = Optional<UID>()) {
+    Future<Optional<Value>> readValuePrefix(KeyRef key, int maxLength, Optional<UID> debugID = Optional<UID>()) {
         return catchError(readValuePrefix_impl(this, key, maxLength, debugID));
     }

-    virtual ~KeyValueStoreRedwoodUnversioned() {
-    };
+    virtual ~KeyValueStoreRedwoodUnversioned(){};

 private:
     std::string m_filePrefix;
-    VersionedBTree *m_tree;
+    VersionedBTree* m_tree;
     Future<Void> m_init;
     Promise<Void> m_closed;
     Promise<Void> m_error;

-    template <typename T> inline Future<T> catchError(Future<T> f) {
+    template <typename T>
+    inline Future<T> catchError(Future<T> f) {
         return forwardError(f, m_error);
     }
 };

-IKeyValueStore* keyValueStoreRedwoodV1( std::string const& filename, UID logID) {
+IKeyValueStore* keyValueStoreRedwoodV1(std::string const& filename, UID logID) {
     return new KeyValueStoreRedwoodUnversioned(filename, logID);
 }

@@ -5202,18 +4877,18 @@ int randomSize(int max) {
     return n;
 }

-StringRef randomString(Arena &arena, int len, char firstChar = 'a', char lastChar = 'z') {
+StringRef randomString(Arena& arena, int len, char firstChar = 'a', char lastChar = 'z') {
     ++lastChar;
     StringRef s = makeString(len, arena);
-    for(int i = 0; i < len; ++i) {
-        *(uint8_t *)(s.begin() + i) = (uint8_t)deterministicRandom()->randomInt(firstChar, lastChar);
+    for (int i = 0; i < len; ++i) {
+        *(uint8_t*)(s.begin() + i) = (uint8_t)deterministicRandom()->randomInt(firstChar, lastChar);
     }
     return s;
 }

 Standalone<StringRef> randomString(int len, char firstChar = 'a', char lastChar = 'z') {
     Standalone<StringRef> s;
-    (StringRef &)s = randomString(s.arena(), len, firstChar, lastChar);
+    (StringRef&)s = randomString(s.arena(), len, firstChar, lastChar);
     return s;
 }

@@ -5224,80 +4899,84 @@ KeyValue randomKV(int maxKeySize = 10, int maxValueSize = 5) {
     KeyValue kv;

     kv.key = randomString(kv.arena(), kLen, 'a', 'm');
-    for(int i = 0; i < kLen; ++i)
-        mutateString(kv.key)[i] = (uint8_t)deterministicRandom()->randomInt('a', 'm');
+    for (int i = 0; i < kLen; ++i) mutateString(kv.key)[i] = (uint8_t)deterministicRandom()->randomInt('a', 'm');

-    if(vLen > 0) {
+    if (vLen > 0) {
         kv.value = randomString(kv.arena(), vLen, 'n', 'z');
-        for(int i = 0; i < vLen; ++i)
-            mutateString(kv.value)[i] = (uint8_t)deterministicRandom()->randomInt('o', 'z');
+        for (int i = 0; i < vLen; ++i) mutateString(kv.value)[i] = (uint8_t)deterministicRandom()->randomInt('o', 'z');
     }

     return kv;
 }

-ACTOR Future<int> verifyRange(VersionedBTree *btree, Key start, Key end, Version v, std::map<std::pair<std::string, Version>, Optional<std::string>> *written, int *pErrorCount) {
+ACTOR Future<int> verifyRange(VersionedBTree* btree, Key start, Key end, Version v,
+                              std::map<std::pair<std::string, Version>, Optional<std::string>>* written,
+                              int* pErrorCount) {
     state int errors = 0;
-    if(end <= start)
-        end =
keyAfter(start); - state std::map, Optional>::const_iterator i = written->lower_bound(std::make_pair(start.toString(), 0)); - state std::map, Optional>::const_iterator iEnd = written->upper_bound(std::make_pair(end.toString(), 0)); + state std::map, Optional>::const_iterator i = + written->lower_bound(std::make_pair(start.toString(), 0)); + state std::map, Optional>::const_iterator iEnd = + written->upper_bound(std::make_pair(end.toString(), 0)); state std::map, Optional>::const_iterator iLast; state Reference cur = btree->readAtVersion(v); - debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Start cur=%p\n", v, start.toString().c_str(), end.toString().c_str(), cur.getPtr()); + debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Start cur=%p\n", v, start.toHexString().c_str(), + end.toHexString().c_str(), cur.getPtr()); // Randomly use the cursor for something else first. - if(deterministicRandom()->coinflip()) { + if (deterministicRandom()->coinflip()) { state Key randomKey = randomKV().key; - debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Dummy seek to '%s'\n", v, start.toString().c_str(), end.toString().c_str(), randomKey.toString().c_str()); - wait(deterministicRandom()->coinflip() ? cur->findFirstEqualOrGreater(randomKey) : cur->findLastLessOrEqual(randomKey)); + debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Dummy seek to '%s'\n", v, start.toHexString().c_str(), + end.toHexString().c_str(), randomKey.toString().c_str()); + wait(deterministicRandom()->coinflip() ? cur->findFirstEqualOrGreater(randomKey) + : cur->findLastLessOrEqual(randomKey)); } - debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Actual seek\n", v, start.toString().c_str(), end.toString().c_str()); + debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Actual seek\n", v, start.toHexString().c_str(), + end.toHexString().c_str()); wait(cur->findFirstEqualOrGreater(start)); state std::vector results; - while(cur->isValid() && cur->getKey() < end) { + while (cur->isValid() && cur->getKey() < end) { // Find the next written kv pair that would be present at this version - while(1) { + while (1) { iLast = i; - if(i == iEnd) - break; + if (i == iEnd) break; ++i; - if(iLast->first.second <= v - && iLast->second.present() - && ( - i == iEnd - || i->first.first != iLast->first.first - || i->first.second > v - ) - ) { - debug_printf("VerifyRange(@%" PRId64 ", %s, %s) Found key in written map: %s\n", v, start.toString().c_str(), end.toString().c_str(), iLast->first.first.c_str()); + if (iLast->first.second <= v && iLast->second.present() && + (i == iEnd || i->first.first != iLast->first.first || i->first.second > v)) { + debug_printf("VerifyRange(@%" PRId64 ", %s, %s) Found key in written map: %s\n", v, + start.toHexString().c_str(), end.toHexString().c_str(), iLast->first.first.c_str()); break; } } - if(iLast == iEnd) { + if (iLast == iEnd) { ++errors; ++*pErrorCount; - printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs nothing in written map.\n", v, start.toString().c_str(), end.toString().c_str(), cur->getKey().toString().c_str()); + printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs nothing in written map.\n", v, + start.toHexString().c_str(), end.toHexString().c_str(), cur->getKey().toString().c_str()); break; } - if(cur->getKey() != iLast->first.first) { + if (cur->getKey() != iLast->first.first) { ++errors; ++*pErrorCount; - printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs written '%s'\n", v, start.toString().c_str(), end.toString().c_str(), cur->getKey().toString().c_str(), 
iLast->first.first.c_str()); + printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs written '%s'\n", v, + start.toHexString().c_str(), end.toHexString().c_str(), cur->getKey().toString().c_str(), + iLast->first.first.c_str()); break; } - if(cur->getValue() != iLast->second.get()) { + if (cur->getValue() != iLast->second.get()) { ++errors; ++*pErrorCount; - printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' has tree value '%s' vs written '%s'\n", v, start.toString().c_str(), end.toString().c_str(), cur->getKey().toString().c_str(), cur->getValue().toString().c_str(), iLast->second.get().c_str()); + printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' has tree value '%s' vs written '%s'\n", v, + start.toHexString().c_str(), end.toHexString().c_str(), cur->getKey().toString().c_str(), + cur->getValue().toString().c_str(), iLast->second.get().c_str()); break; } @@ -5308,60 +4987,61 @@ ACTOR Future verifyRange(VersionedBTree *btree, Key start, Key end, Version } // Make sure there are no further written kv pairs that would be present at this version. - while(1) { + while (1) { iLast = i; - if(i == iEnd) - break; + if (i == iEnd) break; ++i; - if(iLast->first.second <= v - && iLast->second.present() - && ( - i == iEnd - || i->first.first != iLast->first.first - || i->first.second > v - ) - ) + if (iLast->first.second <= v && iLast->second.present() && + (i == iEnd || i->first.first != iLast->first.first || i->first.second > v)) break; } - if(iLast != iEnd) { + if (iLast != iEnd) { ++errors; ++*pErrorCount; - printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree range ended but written has @%" PRId64 " '%s'\n", v, start.toString().c_str(), end.toString().c_str(), iLast->first.second, iLast->first.first.c_str()); + printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree range ended but written has @%" PRId64 " '%s'\n", v, + start.toHexString().c_str(), end.toHexString().c_str(), iLast->first.second, iLast->first.first.c_str()); } - debug_printf("VerifyRangeReverse(@%" PRId64 ", %s, %s): start\n", v, start.toString().c_str(), end.toString().c_str()); + debug_printf("VerifyRangeReverse(@%" PRId64 ", %s, %s): start\n", v, start.toHexString().c_str(), + end.toHexString().c_str()); - // Randomly use a new cursor at the same version for the reverse range read, if the version is still available for opening new cursors - if(v >= btree->getOldestVersion() && deterministicRandom()->coinflip()) { + // Randomly use a new cursor at the same version for the reverse range read, if the version is still available for + // opening new cursors + if (v >= btree->getOldestVersion() && deterministicRandom()->coinflip()) { cur = btree->readAtVersion(v); } // Now read the range from the tree in reverse order and compare to the saved results wait(cur->findLastLessOrEqual(end)); - if(cur->isValid() && cur->getKey() == end) - wait(cur->prev()); + if (cur->isValid() && cur->getKey() == end) wait(cur->prev()); state std::vector::const_reverse_iterator r = results.rbegin(); - while(cur->isValid() && cur->getKey() >= start) { - if(r == results.rend()) { + while (cur->isValid() && cur->getKey() >= start) { + if (r == results.rend()) { ++errors; ++*pErrorCount; - printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs nothing in written map.\n", v, start.toString().c_str(), end.toString().c_str(), cur->getKey().toString().c_str()); + printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs nothing in written map.\n", v, + start.toHexString().c_str(), 
end.toHexString().c_str(), cur->getKey().toString().c_str()); break; } - if(cur->getKey() != r->key) { + if (cur->getKey() != r->key) { ++errors; ++*pErrorCount; - printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs written '%s'\n", v, start.toString().c_str(), end.toString().c_str(), cur->getKey().toString().c_str(), r->key.toString().c_str()); + printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs written '%s'\n", v, + start.toHexString().c_str(), end.toHexString().c_str(), cur->getKey().toString().c_str(), + r->key.toString().c_str()); break; } - if(cur->getValue() != r->value) { + if (cur->getValue() != r->value) { ++errors; ++*pErrorCount; - printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' has tree value '%s' vs written '%s'\n", v, start.toString().c_str(), end.toString().c_str(), cur->getKey().toString().c_str(), cur->getValue().toString().c_str(), r->value.toString().c_str()); + printf("VerifyRangeReverse(@%" PRId64 + ", %s, %s) ERROR: Tree key '%s' has tree value '%s' vs written '%s'\n", + v, start.toHexString().c_str(), end.toHexString().c_str(), cur->getKey().toString().c_str(), + cur->getValue().toString().c_str(), r->value.toString().c_str()); break; } @@ -5369,47 +5049,54 @@ ACTOR Future verifyRange(VersionedBTree *btree, Key start, Key end, Version wait(cur->prev()); } - if(r != results.rend()) { + if (r != results.rend()) { ++errors; ++*pErrorCount; - printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree range ended but written has '%s'\n", v, start.toString().c_str(), end.toString().c_str(), r->key.toString().c_str()); + printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree range ended but written has '%s'\n", v, + start.toHexString().c_str(), end.toHexString().c_str(), r->key.toString().c_str()); } return errors; } // Verify the result of point reads for every set or cleared key at the given version -ACTOR Future seekAll(VersionedBTree *btree, Version v, std::map, Optional> *written, int *pErrorCount) { +ACTOR Future seekAll(VersionedBTree* btree, Version v, + std::map, Optional>* written, int* pErrorCount) { state std::map, Optional>::const_iterator i = written->cbegin(); state std::map, Optional>::const_iterator iEnd = written->cend(); state int errors = 0; state Reference cur = btree->readAtVersion(v); - while(i != iEnd) { + while (i != iEnd) { state std::string key = i->first.first; state Version ver = i->first.second; - if(ver == v) { + if (ver == v) { state Optional val = i->second; debug_printf("Verifying @%" PRId64 " '%s'\n", ver, key.c_str()); state Arena arena; wait(cur->findEqual(KeyRef(arena, key))); - if(val.present()) { - if(!(cur->isValid() && cur->getKey() == key && cur->getValue() == val.get())) { + if (val.present()) { + if (!(cur->isValid() && cur->getKey() == key && cur->getValue() == val.get())) { ++errors; ++*pErrorCount; - if(!cur->isValid()) - printf("Verify ERROR: key_not_found: '%s' -> '%s' @%" PRId64 "\n", key.c_str(), val.get().c_str(), ver); - else if(cur->getKey() != key) - printf("Verify ERROR: key_incorrect: found '%s' expected '%s' @%" PRId64 "\n", cur->getKey().toString().c_str(), key.c_str(), ver); - else if(cur->getValue() != val.get()) - printf("Verify ERROR: value_incorrect: for '%s' found '%s' expected '%s' @%" PRId64 "\n", cur->getKey().toString().c_str(), cur->getValue().toString().c_str(), val.get().c_str(), ver); + if (!cur->isValid()) + printf("Verify ERROR: key_not_found: '%s' -> '%s' @%" PRId64 "\n", key.c_str(), + val.get().c_str(), ver); + else if 
(cur->getKey() != key) + printf("Verify ERROR: key_incorrect: found '%s' expected '%s' @%" PRId64 "\n", + cur->getKey().toString().c_str(), key.c_str(), ver); + else if (cur->getValue() != val.get()) + printf("Verify ERROR: value_incorrect: for '%s' found '%s' expected '%s' @%" PRId64 "\n", + cur->getKey().toString().c_str(), cur->getValue().toString().c_str(), val.get().c_str(), + ver); } } else { - if(cur->isValid() && cur->getKey() == key) { + if (cur->isValid() && cur->getKey() == key) { ++errors; ++*pErrorCount; - printf("Verify ERROR: cleared_key_found: '%s' -> '%s' @%" PRId64 "\n", key.c_str(), cur->getValue().toString().c_str(), ver); + printf("Verify ERROR: cleared_key_found: '%s' -> '%s' @%" PRId64 "\n", key.c_str(), + cur->getValue().toString().c_str(), ver); } } } @@ -5418,7 +5105,9 @@ ACTOR Future seekAll(VersionedBTree *btree, Version v, std::map verify(VersionedBTree *btree, FutureStream vStream, std::map, Optional> *written, int *pErrorCount, bool serial) { +ACTOR Future verify(VersionedBTree* btree, FutureStream vStream, + std::map, Optional>* written, int* pErrorCount, + bool serial) { state Future fRangeAll; state Future fRangeRandom; state Future fSeekAll; @@ -5432,33 +5121,37 @@ ACTOR Future verify(VersionedBTree *btree, FutureStream vStream, committedVersions.push_back(v); // Remove expired versions - while(!committedVersions.empty() && committedVersions.front() < btree->getOldestVersion()) { + while (!committedVersions.empty() && committedVersions.front() < btree->getOldestVersion()) { committedVersions.pop_front(); } - // Choose a random committed version, or sometimes the latest (which could be ahead of the latest version from vStream) - v = (committedVersions.empty() || deterministicRandom()->random01() < 0.25) ? btree->getLastCommittedVersion() : committedVersions[deterministicRandom()->randomInt(0, committedVersions.size())]; + // Choose a random committed version, or sometimes the latest (which could be ahead of the latest version + // from vStream) + v = (committedVersions.empty() || deterministicRandom()->random01() < 0.25) + ? btree->getLastCommittedVersion() + : committedVersions[deterministicRandom()->randomInt(0, committedVersions.size())]; debug_printf("Using committed version %" PRId64 "\n", v); // Get a cursor at v so that v doesn't get expired between the possibly serial steps below. 
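One detail worth calling out about the line below: `readAtVersion()` returns a cursor whose existence keeps version v readable, so holding the Reference prevents v from being expired while the (possibly serial) verification futures run. A sketch of that usage pattern, under the assumption that a live cursor is what pins the version (`checkAtVersion` is a hypothetical helper, not part of this test file):

```cpp
// Hypothetical helper: pin version v with a cursor, then scan every key at v.
// Keeping 'pin' alive holds the snapshot for v open for the whole actor, even
// if other verification steps at the same version run serially after this one.
ACTOR Future<Void> checkAtVersion(VersionedBTree* btree, Version v) {
    state Reference<IStoreCursor> pin = btree->readAtVersion(v);
    wait(pin->findFirstEqualOrGreater(LiteralStringRef("")));
    while (pin->isValid()) {
        // ... compare pin->getKey() / pin->getValue() against expected state ...
        wait(pin->next());
    }
    return Void();
}
```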
state Reference cur = btree->readAtVersion(v); debug_printf("Verifying entire key range at version %" PRId64 "\n", v); fRangeAll = verifyRange(btree, LiteralStringRef(""), LiteralStringRef("\xff\xff"), v, written, pErrorCount); - if(serial) { + if (serial) { wait(success(fRangeAll)); } Key begin = randomKV().key; Key end = randomKV().key; - debug_printf("Verifying range (%s, %s) at version %" PRId64 "\n", toString(begin).c_str(), toString(end).c_str(), v); + debug_printf("Verifying range (%s, %s) at version %" PRId64 "\n", toString(begin).c_str(), + toString(end).c_str(), v); fRangeRandom = verifyRange(btree, begin, end, v, written, pErrorCount); - if(serial) { + if (serial) { wait(success(fRangeRandom)); } debug_printf("Verifying seeks to each changed key at version %" PRId64 "\n", v); fSeekAll = seekAll(btree, v, written, pErrorCount); - if(serial) { + if (serial) { wait(success(fSeekAll)); } @@ -5466,11 +5159,10 @@ ACTOR Future verify(VersionedBTree *btree, FutureStream vStream, printf("Verified version %" PRId64 ", %d errors\n", v, *pErrorCount); - if(*pErrorCount != 0) - break; + if (*pErrorCount != 0) break; } - } catch(Error &e) { - if(e.code() != error_code_end_of_stream && e.code() != error_code_transaction_too_old) { + } catch (Error& e) { + if (e.code() != error_code_end_of_stream && e.code() != error_code_transaction_too_old) { throw; } } @@ -5478,12 +5170,12 @@ ACTOR Future verify(VersionedBTree *btree, FutureStream vStream, } // Does a random range read, doesn't trap/report errors -ACTOR Future randomReader(VersionedBTree *btree) { +ACTOR Future randomReader(VersionedBTree* btree) { try { state Reference cur; loop { wait(yield()); - if(!cur || deterministicRandom()->random01() > .01) { + if (!cur || deterministicRandom()->random01() > .01) { Version v = btree->getLastCommittedVersion(); cur = btree->readAtVersion(v); } @@ -5491,14 +5183,13 @@ ACTOR Future randomReader(VersionedBTree *btree) { state KeyValue kv = randomKV(10, 0); wait(cur->findFirstEqualOrGreater(kv.key)); state int c = deterministicRandom()->randomInt(0, 100); - while(cur->isValid() && c-- > 0) { + while (cur->isValid() && c-- > 0) { wait(success(cur->next())); wait(yield()); } } - } - catch(Error &e) { - if(e.code() != error_code_transaction_too_old) { + } catch (Error& e) { + if (e.code() != error_code_transaction_too_old) { throw e; } } @@ -5510,9 +5201,7 @@ struct IntIntPair { IntIntPair() {} IntIntPair(int k, int v) : k(k), v(v) {} - IntIntPair(Arena &arena, const IntIntPair &toCopy) { - *this = toCopy; - } + IntIntPair(Arena& arena, const IntIntPair& toCopy) { *this = toCopy; } struct Delta { bool prefixSource; @@ -5520,39 +5209,28 @@ struct IntIntPair { int dk; int dv; - IntIntPair apply(const IntIntPair &base, Arena &arena) { - return {base.k + dk, base.v + dv}; - } + IntIntPair apply(const IntIntPair& base, Arena& arena) { return { base.k + dk, base.v + dv }; } - void setPrefixSource(bool val) { - prefixSource = val; - } + void setPrefixSource(bool val) { prefixSource = val; } - bool getPrefixSource() const { - return prefixSource; - } + bool getPrefixSource() const { return prefixSource; } - void setDeleted(bool val) { - deleted = val; - } + void setDeleted(bool val) { deleted = val; } - bool getDeleted() const { - return deleted; - } + bool getDeleted() const { return deleted; } - int size() const { - return sizeof(Delta); - } + int size() const { return sizeof(Delta); } std::string toString() const { - return format("DELTA{prefixSource=%d deleted=%d dk=%d(0x%x) dv=%d(0x%x)}", prefixSource, 
deleted, dk, dk, dv, dv); + return format("DELTA{prefixSource=%d deleted=%d dk=%d(0x%x) dv=%d(0x%x)}", prefixSource, deleted, dk, dk, + dv, dv); } }; // For IntIntPair, skipLen will be in units of fields, not bytes - int getCommonPrefixLen(const IntIntPair &other, int skip = 0) const { - if(k == other.k) { - if(v == other.v) { + int getCommonPrefixLen(const IntIntPair& other, int skip = 0) const { + if (k == other.k) { + if (v == other.v) { return 2; } return 1; @@ -5560,31 +5238,25 @@ struct IntIntPair { return 0; } - int compare(const IntIntPair &rhs, int skip = 0) const { - if(skip == 2) { + int compare(const IntIntPair& rhs, int skip = 0) const { + if (skip == 2) { return 0; } int cmp = (skip > 0) ? 0 : (k - rhs.k); - if(cmp == 0) { + if (cmp == 0) { cmp = v - rhs.v; } return cmp; } - bool operator==(const IntIntPair &rhs) const { - return compare(rhs) == 0; - } + bool operator==(const IntIntPair& rhs) const { return compare(rhs) == 0; } - bool operator<(const IntIntPair &rhs) const { - return compare(rhs) < 0; - } + bool operator<(const IntIntPair& rhs) const { return compare(rhs) < 0; } - int deltaSize(const IntIntPair &base, bool worstcase = false, int skipLen = 0) const { - return sizeof(Delta); - } + int deltaSize(const IntIntPair& base, int skipLen, bool worstcase) const { return sizeof(Delta); } - int writeDelta(Delta &d, const IntIntPair &base, int commonPrefix = -1) const { + int writeDelta(Delta& d, const IntIntPair& base, int commonPrefix = -1) const { d.prefixSource = false; d.deleted = false; d.dk = k - base.k; @@ -5595,76 +5267,65 @@ struct IntIntPair { int k; int v; - std::string toString() const { - return format("{k=%d(0x%x) v=%d(0x%x)}", k, k, v, v); - } + std::string toString() const { return format("{k=%d(0x%x) v=%d(0x%x)}", k, k, v, v); } }; -int getCommonIntFieldPrefix2(const RedwoodRecordRef &a, const RedwoodRecordRef &b) { - RedwoodRecordRef::byte aFields[RedwoodRecordRef::intFieldArraySize]; - RedwoodRecordRef::byte bFields[RedwoodRecordRef::intFieldArraySize]; - - a.serializeIntFields(aFields); - b.serializeIntFields(bFields); - - //printf("a: %s\n", StringRef(aFields, RedwoodRecordRef::intFieldArraySize).toHexString().c_str()); - //printf("b: %s\n", StringRef(bFields, RedwoodRecordRef::intFieldArraySize).toHexString().c_str()); - - int i = 0; - while(i < RedwoodRecordRef::intFieldArraySize && aFields[i] == bFields[i]) { - ++i; - } - - //printf("%d\n", i); - return i; -} - -void deltaTest(RedwoodRecordRef rec, RedwoodRecordRef base) { - char buf[500]; - RedwoodRecordRef::Delta &d = *(RedwoodRecordRef::Delta *)buf; +int deltaTest(RedwoodRecordRef rec, RedwoodRecordRef base) { + std::vector buf(rec.key.size() + rec.value.orDefault(StringRef()).size() + 20); + RedwoodRecordRef::Delta& d = *(RedwoodRecordRef::Delta*)&buf.front(); Arena mem; - int expectedSize = rec.deltaSize(base, false); + int expectedSize = rec.deltaSize(base, 0, false); int deltaSize = rec.writeDelta(d, base); RedwoodRecordRef decoded = d.apply(base, mem); - if(decoded != rec || expectedSize != deltaSize) { + if (decoded != rec || expectedSize != deltaSize || d.size() != deltaSize) { printf("\n"); - printf("Base: %s\n", base.toString().c_str()); - printf("ExpectedSize: %d\n", expectedSize); - printf("DeltaSize: %d\n", deltaSize); - printf("Delta: %s\n", d.toString().c_str()); - printf("Record: %s\n", rec.toString().c_str()); - printf("Decoded: %s\n", decoded.toString().c_str()); + printf("Base: %s\n", base.toString().c_str()); + printf("Record: %s\n", rec.toString().c_str()); + printf("Decoded: 
%s\n", decoded.toString().c_str()); + printf("deltaSize(): %d\n", expectedSize); + printf("writeDelta(): %d\n", deltaSize); + printf("d.size(): %d\n", d.size()); + printf("DeltaToString: %s\n", d.toString().c_str()); printf("RedwoodRecordRef::Delta test failure!\n"); ASSERT(false); } + + return deltaSize; } -Standalone randomRedwoodRecordRef(int maxKeySize = 3, int maxValueSize = 255) { +RedwoodRecordRef randomRedwoodRecordRef(const std::string& keyBuffer, const std::string& valueBuffer) { RedwoodRecordRef rec; - KeyValue kv = randomKV(3, 10); - rec.key = kv.key; - - if(deterministicRandom()->random01() < .9) { - rec.value = kv.value; + rec.key = StringRef((uint8_t*)keyBuffer.data(), deterministicRandom()->randomInt(0, keyBuffer.size())); + if (deterministicRandom()->coinflip()) { + rec.value = StringRef((uint8_t*)valueBuffer.data(), deterministicRandom()->randomInt(0, valueBuffer.size())); } - rec.version = deterministicRandom()->coinflip() ? 0 : deterministicRandom()->randomInt64(0, std::numeric_limits::max()); - - if(deterministicRandom()->coinflip()) { - rec.chunk.total = deterministicRandom()->randomInt(1, 100000); - rec.chunk.start = deterministicRandom()->randomInt(0, rec.chunk.total); + int versionIntSize = deterministicRandom()->randomInt(0, 8) * 8; + if (versionIntSize > 0) { + --versionIntSize; + int64_t max = ((int64_t)1 << versionIntSize) - 1; + rec.version = deterministicRandom()->randomInt64(0, max); } - return Standalone(rec, kv.arena()); + return rec; } TEST_CASE("!/redwood/correctness/unit/RedwoodRecordRef") { + ASSERT(RedwoodRecordRef::Delta::LengthFormatSizes[0] == 3); + ASSERT(RedwoodRecordRef::Delta::LengthFormatSizes[1] == 4); + ASSERT(RedwoodRecordRef::Delta::LengthFormatSizes[2] == 6); + ASSERT(RedwoodRecordRef::Delta::LengthFormatSizes[3] == 8); + + ASSERT(RedwoodRecordRef::Delta::VersionDeltaSizes[0] == 0); + ASSERT(RedwoodRecordRef::Delta::VersionDeltaSizes[1] == 4); + ASSERT(RedwoodRecordRef::Delta::VersionDeltaSizes[2] == 6); + ASSERT(RedwoodRecordRef::Delta::VersionDeltaSizes[3] == 8); // Test pageID stuff. 
{ - LogicalPageID ids[] = {1, 5}; + LogicalPageID ids[] = { 1, 5 }; BTreePageID id(ids, 2); RedwoodRecordRef r; r.setChildPage(id); @@ -5676,128 +5337,81 @@ TEST_CASE("!/redwood/correctness/unit/RedwoodRecordRef") { ASSERT(r2.getChildPage().begin() != id.begin()); } - // Testing common prefix calculation for integer fields using the member function that calculates this directly - // and by serializing the integer fields to arrays and finding the common prefix length of the two arrays + deltaTest(RedwoodRecordRef(LiteralStringRef(""), 0, LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef(""), 0, LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef(""), 0, LiteralStringRef(""), 0, 0), - RedwoodRecordRef(LiteralStringRef(""), 0, LiteralStringRef(""), 0, 0) - ); + deltaTest(RedwoodRecordRef(LiteralStringRef("abc"), 0, LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef("abc"), 0, LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef("abc"), 0, LiteralStringRef(""), 0, 0), - RedwoodRecordRef(LiteralStringRef("abc"), 0, LiteralStringRef(""), 0, 0) - ); + deltaTest(RedwoodRecordRef(LiteralStringRef("abc"), 0, LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef("abcd"), 0, LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef("abc"), 0, LiteralStringRef(""), 0, 0), - RedwoodRecordRef(LiteralStringRef("abcd"), 0, LiteralStringRef(""), 0, 0) - ); + deltaTest(RedwoodRecordRef(LiteralStringRef("abcd"), 2, LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef("abc"), 2, LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef("abc"), 2, LiteralStringRef(""), 0, 0), - RedwoodRecordRef(LiteralStringRef("abc"), 2, LiteralStringRef(""), 0, 0) - ); + deltaTest(RedwoodRecordRef(std::string(300, 'k'), 2, std::string(1e6, 'v')), + RedwoodRecordRef(std::string(300, 'k'), 2, LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef("abc"), 2, LiteralStringRef(""), 0, 0), - RedwoodRecordRef(LiteralStringRef("ab"), 2, LiteralStringRef(""), 1, 3) - ); + deltaTest(RedwoodRecordRef(LiteralStringRef(""), 2, LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef(""))); - deltaTest(RedwoodRecordRef(LiteralStringRef("abc"), 2, LiteralStringRef(""), 5, 0), - RedwoodRecordRef(LiteralStringRef("abc"), 2, LiteralStringRef(""), 5, 1) - ); + deltaTest(RedwoodRecordRef(LiteralStringRef(""), 0xffff, LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef(""))); - RedwoodRecordRef::byte varInts[100]; - RedwoodRecordRef::Writer w(varInts); - RedwoodRecordRef::Reader r(varInts); - w.writeVarInt(1); - w.writeVarInt(128); - w.writeVarInt(32000); - ASSERT(r.readVarInt() == 1); - ASSERT(r.readVarInt() == 128); - ASSERT(r.readVarInt() == 32000); + deltaTest(RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef(""), 0xffff, LiteralStringRef(""))); - RedwoodRecordRef rec1; - RedwoodRecordRef rec2; + deltaTest(RedwoodRecordRef(LiteralStringRef(""), 0xffffff, LiteralStringRef("")), + RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef(""))); - rec1.version = 0x12345678; - rec2.version = 0x12995678; - ASSERT(rec1.getCommonIntFieldPrefix(rec2) == 5); - ASSERT(rec1.getCommonIntFieldPrefix(rec2) == getCommonIntFieldPrefix2(rec1, rec2)); - - rec1.version = 0x12345678; - rec2.version = 0x12345678; - ASSERT(rec1.getCommonIntFieldPrefix(rec2) == 14); - ASSERT(rec1.getCommonIntFieldPrefix(rec2) == getCommonIntFieldPrefix2(rec1, rec2)); - - 
rec1.version = invalidVersion;
-    rec2.version = 0;
-    ASSERT(rec1.getCommonIntFieldPrefix(rec2) == 0);
-    ASSERT(rec1.getCommonIntFieldPrefix(rec2) == getCommonIntFieldPrefix2(rec1, rec2));
-
-    rec1.version = 0x12345678;
-    rec2.version = 0x12345678;
-    rec1.chunk.total = 4;
-    rec2.chunk.total = 4;
-    ASSERT(rec1.getCommonIntFieldPrefix(rec2) == 14);
-    ASSERT(rec1.getCommonIntFieldPrefix(rec2) == getCommonIntFieldPrefix2(rec1, rec2));
-
-    rec1.version = 0x12345678;
-    rec2.version = 0x12345678;
-    rec1.chunk.start = 4;
-    rec2.chunk.start = 4;
-    ASSERT(rec1.getCommonIntFieldPrefix(rec2) == 14);
-    ASSERT(rec1.getCommonIntFieldPrefix(rec2) == getCommonIntFieldPrefix2(rec1, rec2));
-
-    rec1.version = 0x12345678;
-    rec2.version = 0x12345678;
-    rec1.chunk.start = 4;
-    rec2.chunk.start = 5;
-    ASSERT(rec1.getCommonIntFieldPrefix(rec2) == 13);
-    ASSERT(rec1.getCommonIntFieldPrefix(rec2) == getCommonIntFieldPrefix2(rec1, rec2));
-
-    rec1.version = 0x12345678;
-    rec2.version = 0x12345678;
-    rec1.chunk.total = 256;
-    rec2.chunk.total = 512;
-    ASSERT(rec1.getCommonIntFieldPrefix(rec2) == 9);
-    ASSERT(rec1.getCommonIntFieldPrefix(rec2) == getCommonIntFieldPrefix2(rec1, rec2));
+    deltaTest(RedwoodRecordRef(LiteralStringRef(""), 1, LiteralStringRef("")),
+              RedwoodRecordRef(LiteralStringRef(""), 0xffffff, LiteralStringRef("")));

     Arena mem;
     double start;
     uint64_t total;
     uint64_t count;
     uint64_t i;
+    int64_t bytes;
+    std::string keyBuffer(30000, 'k');
+    std::string valueBuffer(70000, 'v');

     start = timer();
-    total = 0;
-    count = 1e9;
-    for(i = 0; i < count; ++i) {
-        rec1.chunk.total = i & 0xffffff;
-        rec2.chunk.total = i & 0xffffff;
-        rec1.chunk.start = i & 0xffffff;
-        rec2.chunk.start = (i + 1) & 0xffffff;
-        total += rec1.getCommonIntFieldPrefix(rec2);
+    count = 1000;
+    bytes = 0;
+    for (i = 0; i < count; ++i) {
+        RedwoodRecordRef a = randomRedwoodRecordRef(keyBuffer, valueBuffer);
+        RedwoodRecordRef b = randomRedwoodRecordRef(keyBuffer, valueBuffer);
+        bytes += deltaTest(a, b);
     }
-    printf("%" PRId64 " getCommonIntFieldPrefix() %g M/s\n", total, count / (timer() - start) / 1e6);
-
-    rec1.key = LiteralStringRef("alksdfjaklsdfjlkasdjflkasdjfklajsdflk;ajsdflkajdsflkjadsf");
-    rec2.key = LiteralStringRef("alksdfjaklsdfjlkasdjflkasdjfklajsdflk;ajsdflkajdsflkjadsf");
+    double elapsed = timer() - start;
+    printf("DeltaTest() on random large records %g M/s %g MB/s\n", count / elapsed / 1e6, bytes / elapsed / 1e6);

+    keyBuffer.resize(30);
+    valueBuffer.resize(100);
     start = timer();
-    total = 0;
-    count = 1e9;
-    for(i = 0; i < count; ++i) {
-        RedwoodRecordRef::byte fields[RedwoodRecordRef::intFieldArraySize];
-        rec1.chunk.start = i & 0xffffff;
-        rec2.chunk.start = (i + 1) & 0xffffff;
-        rec1.serializeIntFields(fields);
-        total += fields[RedwoodRecordRef::intFieldArraySize - 1];
+    count = 1e6;
+    bytes = 0;
+    for (i = 0; i < count; ++i) {
+        RedwoodRecordRef a = randomRedwoodRecordRef(keyBuffer, valueBuffer);
+        RedwoodRecordRef b = randomRedwoodRecordRef(keyBuffer, valueBuffer);
+        bytes += deltaTest(a, b);
     }
-    printf("%" PRId64 " serializeIntFields() %g M/s\n", total, count / (timer() - start) / 1e6);
+    elapsed = timer() - start;
+    printf("DeltaTest() on random small records %g M/s %g MB/s\n", count / elapsed / 1e6, bytes / elapsed / 1e6);
+
+    RedwoodRecordRef rec1;
+    RedwoodRecordRef rec2;
+
+    rec1.key = LiteralStringRef("alksdfjaklsdfjlkasdjflkasdjfklajsdflk;ajsdflkajdsflkjadsf1");
+    rec2.key = LiteralStringRef("alksdfjaklsdfjlkasdjflkasdjfklajsdflk;ajsdflkajdsflkjadsf234");
+
+    rec1.version = deterministicRandom()->randomInt64(0,
std::numeric_limits::max()); + rec2.version = deterministicRandom()->randomInt64(0, std::numeric_limits::max()); start = timer(); total = 0; count = 100e6; - for(i = 0; i < count; ++i) { - rec1.chunk.start = i & 0xffffff; - rec2.chunk.start = (i + 1) & 0xffffff; + for (i = 0; i < count; ++i) { total += rec1.getCommonPrefixLen(rec2, 50); } printf("%" PRId64 " getCommonPrefixLen(skip=50) %g M/s\n", total, count / (timer() - start) / 1e6); @@ -5805,24 +5419,20 @@ TEST_CASE("!/redwood/correctness/unit/RedwoodRecordRef") { start = timer(); total = 0; count = 100e6; - for(i = 0; i < count; ++i) { - rec1.chunk.start = i & 0xffffff; - rec2.chunk.start = (i + 1) & 0xffffff; + for (i = 0; i < count; ++i) { total += rec1.getCommonPrefixLen(rec2, 0); } printf("%" PRId64 " getCommonPrefixLen(skip=0) %g M/s\n", total, count / (timer() - start) / 1e6); char buf[1000]; - RedwoodRecordRef::Delta &d = *(RedwoodRecordRef::Delta *)buf; + RedwoodRecordRef::Delta& d = *(RedwoodRecordRef::Delta*)buf; start = timer(); total = 0; count = 100e6; int commonPrefix = rec1.getCommonPrefixLen(rec2, 0); - for(i = 0; i < count; ++i) { - rec1.chunk.start = i & 0xffffff; - rec2.chunk.start = (i + 1) & 0xffffff; + for (i = 0; i < count; ++i) { total += rec1.writeDelta(d, rec2, commonPrefix); } printf("%" PRId64 " writeDelta(commonPrefix=%d) %g M/s\n", total, commonPrefix, count / (timer() - start) / 1e6); @@ -5830,28 +5440,20 @@ TEST_CASE("!/redwood/correctness/unit/RedwoodRecordRef") { start = timer(); total = 0; count = 10e6; - for(i = 0; i < count; ++i) { - rec1.chunk.start = i & 0xffffff; - rec2.chunk.start = (i + 1) & 0xffffff; + for (i = 0; i < count; ++i) { total += rec1.writeDelta(d, rec2); } printf("%" PRId64 " writeDelta() %g M/s\n", total, count / (timer() - start) / 1e6); - start = timer(); - total = 0; - count = 1e6; - for(i = 0; i < count; ++i) { - Standalone a = randomRedwoodRecordRef(); - Standalone b = randomRedwoodRecordRef(); - deltaTest(a, b); - } - printf("Random deltaTest() %g M/s\n", count / (timer() - start) / 1e6); - return Void(); } TEST_CASE("!/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { - const int N = 200; + // Sanity check on delta tree node format + ASSERT(DeltaTree::Node::headerSize(false) == 4); + ASSERT(DeltaTree::Node::headerSize(true) == 8); + + const int N = deterministicRandom()->randomInt(200, 1000); RedwoodRecordRef prev; RedwoodRecordRef next(LiteralStringRef("\xff\xff\xff\xff")); @@ -5860,39 +5462,39 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { std::set uniqueItems; // Add random items to uniqueItems until its size is N - while(uniqueItems.size() < N) { + while (uniqueItems.size() < N) { std::string k = deterministicRandom()->randomAlphaNumeric(30); std::string v = deterministicRandom()->randomAlphaNumeric(30); RedwoodRecordRef rec; rec.key = StringRef(arena, k); - rec.version = deterministicRandom()->coinflip() ? deterministicRandom()->randomInt64(0, std::numeric_limits::max()) : invalidVersion; - if(deterministicRandom()->coinflip()) { + rec.version = deterministicRandom()->coinflip() + ? 
deterministicRandom()->randomInt64(0, std::numeric_limits::max()) + : invalidVersion; + if (deterministicRandom()->coinflip()) { rec.value = StringRef(arena, v); - if(deterministicRandom()->coinflip()) { - rec.chunk.start = deterministicRandom()->randomInt(0, 100000); - rec.chunk.total = rec.chunk.start + v.size() + deterministicRandom()->randomInt(0, 100000); - } } - if(uniqueItems.count(rec) == 0) { + if (uniqueItems.count(rec) == 0) { uniqueItems.insert(rec); } } std::vector items(uniqueItems.begin(), uniqueItems.end()); int bufferSize = N * 100; - DeltaTree *tree = (DeltaTree *) new uint8_t[bufferSize]; + bool largeTree = bufferSize > DeltaTree::SmallSizeLimit; + DeltaTree* tree = (DeltaTree*)new uint8_t[bufferSize]; tree->build(bufferSize, &items[0], &items[items.size()], &prev, &next); - printf("Count=%d Size=%d InitialHeight=%d\n", (int)items.size(), (int)tree->size(), (int)tree->initialHeight); - debug_printf("Data(%p): %s\n", tree, StringRef((uint8_t *)tree, tree->size()).toHexString().c_str()); + printf("Count=%d Size=%d InitialHeight=%d largeTree=%d\n", (int)items.size(), (int)tree->size(), + (int)tree->initialHeight, largeTree); + debug_printf("Data(%p): %s\n", tree, StringRef((uint8_t*)tree, tree->size()).toHexString().c_str()); DeltaTree::Mirror r(tree, &prev, &next); // Test delete/insert behavior for each item, making no net changes printf("Testing seek/delete/insert for existing keys with random values\n"); ASSERT(tree->numItems == items.size()); - for(auto rec : items) { + for (auto rec : items) { // Insert existing should fail ASSERT(!r.insert(rec)); ASSERT(tree->numItems == items.size()); @@ -5920,28 +5522,33 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { DeltaTree::Mirror rValuesOnly(tree, &prev, &next); DeltaTree::Cursor fwdValueOnly = rValuesOnly.getCursor(); + printf("Verifying tree contents using forward, reverse, and value-only iterators\n"); ASSERT(fwd.moveFirst()); ASSERT(fwdValueOnly.moveFirst()); ASSERT(rev.moveLast()); + int i = 0; - while(1) { - if(fwd.get() != items[i]) { - printf("forward iterator i=%d\n %s found\n %s expected\n", i, fwd.get().toString().c_str(), items[i].toString().c_str()); - printf("Delta: %s\n", fwd.node->raw->delta().toString().c_str()); + while (1) { + if (fwd.get() != items[i]) { + printf("forward iterator i=%d\n %s found\n %s expected\n", i, fwd.get().toString().c_str(), + items[i].toString().c_str()); + printf("Delta: %s\n", fwd.node->raw->delta(largeTree).toString().c_str()); ASSERT(false); } - if(rev.get() != items[items.size() - 1 - i]) { - printf("reverse iterator i=%d\n %s found\n %s expected\n", i, rev.get().toString().c_str(), items[items.size() - 1 - i].toString().c_str()); - printf("Delta: %s\n", rev.node->raw->delta().toString().c_str()); + if (rev.get() != items[items.size() - 1 - i]) { + printf("reverse iterator i=%d\n %s found\n %s expected\n", i, rev.get().toString().c_str(), + items[items.size() - 1 - i].toString().c_str()); + printf("Delta: %s\n", rev.node->raw->delta(largeTree).toString().c_str()); ASSERT(false); } - if(fwdValueOnly.get().value != items[i].value) { - printf("forward values-only iterator i=%d\n %s found\n %s expected\n", i, fwdValueOnly.get().toString().c_str(), items[i].toString().c_str()); - printf("Delta: %s\n", fwdValueOnly.node->raw->delta().toString().c_str()); + if (fwdValueOnly.get().value != items[i].value) { + printf("forward values-only iterator i=%d\n %s found\n %s expected\n", i, + fwdValueOnly.get().toString().c_str(), items[i].toString().c_str()); + 
printf("Delta: %s\n", fwdValueOnly.node->raw->delta(largeTree).toString().c_str()); ASSERT(false); } ++i; - + bool more = fwd.moveNext(); ASSERT(fwdValueOnly.moveNext() == more); ASSERT(rev.movePrev() == more); @@ -5950,46 +5557,80 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { ASSERT(fwdValueOnly.valid() == more); ASSERT(rev.valid() == more); - if(!fwd.valid()) { + if (!fwd.valid()) { break; } } ASSERT(i == items.size()); - double start = timer(); - DeltaTree::Cursor c = r.getCursor(); + { + DeltaTree::Mirror mirror(tree, &prev, &next); + DeltaTree::Cursor c = mirror.getCursor(); - for(int i = 0; i < 20000000; ++i) { - const RedwoodRecordRef &query = items[deterministicRandom()->randomInt(0, items.size())]; - if(!c.seekLessThanOrEqual(query)) { - printf("Not found! query=%s\n", query.toString().c_str()); - ASSERT(false); - } - if(c.get() != query) { - printf("Found incorrect node! query=%s found=%s\n", query.toString().c_str(), c.get().toString().c_str()); - ASSERT(false); + printf("Doing 20M random seeks using the same cursor from the same mirror.\n"); + double start = timer(); + + for (int i = 0; i < 20000000; ++i) { + const RedwoodRecordRef& query = items[deterministicRandom()->randomInt(0, items.size())]; + if (!c.seekLessThanOrEqual(query)) { + printf("Not found! query=%s\n", query.toString().c_str()); + ASSERT(false); + } + if (c.get() != query) { + printf("Found incorrect node! query=%s found=%s\n", query.toString().c_str(), + c.get().toString().c_str()); + ASSERT(false); + } } + double elapsed = timer() - start; + printf("Elapsed %f\n", elapsed); + } + + { + printf("Doing 5M random seeks using 10k random cursors, each from a different mirror.\n"); + double start = timer(); + std::vector::Mirror*> mirrors; + std::vector::Cursor> cursors; + for (int i = 0; i < 10000; ++i) { + mirrors.push_back(new DeltaTree::Mirror(tree, &prev, &next)); + cursors.push_back(mirrors.back()->getCursor()); + } + + for (int i = 0; i < 5000000; ++i) { + const RedwoodRecordRef& query = items[deterministicRandom()->randomInt(0, items.size())]; + DeltaTree::Cursor& c = cursors[deterministicRandom()->randomInt(0, cursors.size())]; + if (!c.seekLessThanOrEqual(query)) { + printf("Not found! query=%s\n", query.toString().c_str()); + ASSERT(false); + } + if (c.get() != query) { + printf("Found incorrect node! 
query=%s found=%s\n", query.toString().c_str(), + c.get().toString().c_str()); + ASSERT(false); + } + } + double elapsed = timer() - start; + printf("Elapsed %f\n", elapsed); } - double elapsed = timer() - start; - printf("Elapsed %f\n", elapsed); return Void(); } TEST_CASE("!/redwood/correctness/unit/deltaTree/IntIntPair") { const int N = 200; - IntIntPair prev = {1, 0}; - IntIntPair next = {10000, 10000}; + IntIntPair prev = { 1, 0 }; + IntIntPair next = { 10000, 10000 }; state std::function randomPair = [&]() { - return IntIntPair({deterministicRandom()->randomInt(prev.k, next.k), deterministicRandom()->randomInt(prev.v, next.v)}); + return IntIntPair( + { deterministicRandom()->randomInt(prev.k, next.k), deterministicRandom()->randomInt(prev.v, next.v) }); }; // Build a set of N unique items std::set uniqueItems; - while(uniqueItems.size() < N) { + while (uniqueItems.size() < N) { IntIntPair p = randomPair(); - if(uniqueItems.count(p) == 0) { + if (uniqueItems.count(p) == 0) { uniqueItems.insert(p); } } @@ -5997,7 +5638,7 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/IntIntPair") { // Build tree of items std::vector items(uniqueItems.begin(), uniqueItems.end()); int bufferSize = N * 2 * 20; - DeltaTree *tree = (DeltaTree *) new uint8_t[bufferSize]; + DeltaTree* tree = (DeltaTree*)new uint8_t[bufferSize]; int builtSize = tree->build(bufferSize, &items[0], &items[items.size()], &prev, &next); ASSERT(builtSize <= bufferSize); @@ -6005,17 +5646,17 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/IntIntPair") { // Grow uniqueItems until tree is full, adding half of new items to toDelete std::vector toDelete; - while(1) { + while (1) { IntIntPair p = randomPair(); - if(uniqueItems.count(p) == 0) { - if(!r.insert(p)) { + if (uniqueItems.count(p) == 0) { + if (!r.insert(p)) { break; }; uniqueItems.insert(p); - if(deterministicRandom()->coinflip()) { + if (deterministicRandom()->coinflip()) { toDelete.push_back(p); } - //printf("Inserted %s size=%d\n", items.back().toString().c_str(), tree->size()); + // printf("Inserted %s size=%d\n", items.back().toString().c_str(), tree->size()); } } @@ -6026,13 +5667,14 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/IntIntPair") { items = std::vector(uniqueItems.begin(), uniqueItems.end()); auto printItems = [&] { - for(int k = 0; k < items.size(); ++k) { + for (int k = 0; k < items.size(); ++k) { printf("%d %s\n", k, items[k].toString().c_str()); } }; - printf("Count=%d Size=%d InitialHeight=%d MaxHeight=%d\n", (int)items.size(), (int)tree->size(), (int)tree->initialHeight, (int)tree->maxHeight); - debug_printf("Data(%p): %s\n", tree, StringRef((uint8_t *)tree, tree->size()).toHexString().c_str()); + printf("Count=%d Size=%d InitialHeight=%d MaxHeight=%d\n", (int)items.size(), (int)tree->size(), + (int)tree->initialHeight, (int)tree->maxHeight); + debug_printf("Data(%p): %s\n", tree, StringRef((uint8_t*)tree, tree->size()).toHexString().c_str()); // Iterate through items and tree forward and backward, verifying tree contents. 
auto scanAndVerify = [&]() { @@ -6043,15 +5685,17 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/IntIntPair") { ASSERT(fwd.moveFirst()); ASSERT(rev.moveLast()); - for(int i = 0; i < items.size(); ++i) { - if(fwd.get() != items[i]) { + for (int i = 0; i < items.size(); ++i) { + if (fwd.get() != items[i]) { printItems(); - printf("forward iterator i=%d\n %s found\n %s expected\n", i, fwd.get().toString().c_str(), items[i].toString().c_str()); + printf("forward iterator i=%d\n %s found\n %s expected\n", i, fwd.get().toString().c_str(), + items[i].toString().c_str()); ASSERT(false); } - if(rev.get() != items[items.size() - 1 - i]) { + if (rev.get() != items[items.size() - 1 - i]) { printItems(); - printf("reverse iterator i=%d\n %s found\n %s expected\n", i, rev.get().toString().c_str(), items[items.size() - 1 - i].toString().c_str()); + printf("reverse iterator i=%d\n %s found\n %s expected\n", i, rev.get().toString().c_str(), + items[items.size() - 1 - i].toString().c_str()); ASSERT(false); } @@ -6064,7 +5708,7 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/IntIntPair") { ASSERT(fwd.valid() == !end); ASSERT(rev.valid() == !end); - if(end) { + if (end) { break; } } @@ -6079,7 +5723,7 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/IntIntPair") { // For each randomly selected new item to be deleted, delete it from the DeltaTree and from uniqueItems printf("Deleting some items\n"); - for(auto p : toDelete) { + for (auto p : toDelete) { uniqueItems.erase(p); DeltaTree::Cursor c = r.getCursor(); ASSERT(c.seekLessThanOrEqual(p)); @@ -6093,7 +5737,7 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/IntIntPair") { printf("Verifying insert/erase behavior for existing items\n"); // Test delete/insert behavior for each item, making no net changes - for(auto p : items) { + for (auto p : items) { // Insert existing should fail ASSERT(!r.insert(p)); @@ -6117,79 +5761,85 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/IntIntPair") { DeltaTree::Cursor s = r.getCursor(); // SeekLTE to each element - for(int i = 0; i < items.size(); ++i) { + for (int i = 0; i < items.size(); ++i) { IntIntPair p = items[i]; IntIntPair q = p; ASSERT(s.seekLessThanOrEqual(q)); - if(s.get() != p) { + if (s.get() != p) { printItems(); - printf("seekLessThanOrEqual(%s) found %s expected %s\n", q.toString().c_str(), s.get().toString().c_str(), p.toString().c_str()); + printf("seekLessThanOrEqual(%s) found %s expected %s\n", q.toString().c_str(), s.get().toString().c_str(), + p.toString().c_str()); ASSERT(false); } } // SeekGTE to each element - for(int i = 0; i < items.size(); ++i) { + for (int i = 0; i < items.size(); ++i) { IntIntPair p = items[i]; IntIntPair q = p; ASSERT(s.seekGreaterThanOrEqual(q)); - if(s.get() != p) { + if (s.get() != p) { printItems(); - printf("seekGreaterThanOrEqual(%s) found %s expected %s\n", q.toString().c_str(), s.get().toString().c_str(), p.toString().c_str()); + printf("seekGreaterThanOrEqual(%s) found %s expected %s\n", q.toString().c_str(), + s.get().toString().c_str(), p.toString().c_str()); ASSERT(false); } } // SeekLTE to the next possible int pair value after each element to make sure the base element is found - for(int i = 0; i < items.size(); ++i) { + for (int i = 0; i < items.size(); ++i) { IntIntPair p = items[i]; IntIntPair q = p; q.v++; ASSERT(s.seekLessThanOrEqual(q)); - if(s.get() != p) { + if (s.get() != p) { printItems(); - printf("seekLessThanOrEqual(%s) found %s expected %s\n", q.toString().c_str(), s.get().toString().c_str(), p.toString().c_str()); + 
printf("seekLessThanOrEqual(%s) found %s expected %s\n", q.toString().c_str(), s.get().toString().c_str(), + p.toString().c_str()); ASSERT(false); } } // SeekGTE to the previous possible int pair value after each element to make sure the base element is found - for(int i = 0; i < items.size(); ++i) { + for (int i = 0; i < items.size(); ++i) { IntIntPair p = items[i]; IntIntPair q = p; q.v--; ASSERT(s.seekGreaterThanOrEqual(q)); - if(s.get() != p) { + if (s.get() != p) { printItems(); - printf("seekGreaterThanOrEqual(%s) found %s expected %s\n", q.toString().c_str(), s.get().toString().c_str(), p.toString().c_str()); + printf("seekGreaterThanOrEqual(%s) found %s expected %s\n", q.toString().c_str(), + s.get().toString().c_str(), p.toString().c_str()); ASSERT(false); } } // SeekLTE to each element N times, using every element as a hint - for(int i = 0; i < items.size(); ++i) { + for (int i = 0; i < items.size(); ++i) { IntIntPair p = items[i]; IntIntPair q = p; - for(int j = 0; j < items.size(); ++j) { + for (int j = 0; j < items.size(); ++j) { ASSERT(s.seekLessThanOrEqual(items[j])); ASSERT(s.seekLessThanOrEqual(q, 0, &s)); - if(s.get() != p) { + if (s.get() != p) { printItems(); printf("i=%d j=%d\n", i, j); + printf("seekLessThanOrEqual(%s) found %s expected %s\n", q.toString().c_str(), + s.get().toString().c_str(), p.toString().c_str()); ASSERT(false); } } } // SeekLTE to each element's next possible value, using each element as a hint - for(int i = 0; i < items.size(); ++i) { + for (int i = 0; i < items.size(); ++i) { IntIntPair p = items[i]; IntIntPair q = p; q.v++; - for(int j = 0; j < items.size(); ++j) { + for (int j = 0; j < items.size(); ++j) { ASSERT(s.seekLessThanOrEqual(items[j])); ASSERT(s.seekLessThanOrEqual(q, 0, &s)); - if(s.get() != p) { + if (s.get() != p) { printItems(); printf("i=%d j=%d\n", i, j); ASSERT(false); @@ -6197,56 +5847,67 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/IntIntPair") { } } - auto skipSeekPerformance = [&](int jumpMax, bool useHint, int count) { + auto skipSeekPerformance = [&](int jumpMax, bool old, bool useHint, int count) { // Skip to a series of increasing items, jump by up to jumpMax units forward in the // items, wrapping around to 0. double start = timer(); s.moveFirst(); auto first = s; int pos = 0; - for(int c = 0; c < count; ++c) { + for (int c = 0; c < count; ++c) { int jump = deterministicRandom()->randomInt(0, jumpMax); int newPos = pos + jump; - if(newPos >= items.size()) { + if (newPos >= items.size()) { pos = 0; newPos = jump; s = first; } IntIntPair q = items[newPos]; ++q.v; - if(useHint) { - s.seekLessThanOrEqual(q, 0, &s, newPos - pos); - } - else { - s.seekLessThanOrEqual(q); + if (old) { + if (useHint) { + s.seekLessThanOrEqualOld(q, 0, &s, newPos - pos); + } else { + s.seekLessThanOrEqualOld(q, 0, nullptr, 0); + } + } else { + if (useHint) { + s.seekLessThanOrEqual(q, 0, &s, newPos - pos); + } else { + s.seekLessThanOrEqual(q); + } } pos = newPos; } double elapsed = timer() - start; - printf("Seek/skip test, jumpMax=%d, items=%d, useHint=%d: Elapsed %f s\n", jumpMax, items.size(), useHint, elapsed); + printf("Seek/skip test, jumpMax=%d, items=%d, oldSeek=%d useHint=%d: Elapsed %f s\n", jumpMax, items.size(), + old, useHint, elapsed); }; - skipSeekPerformance(10, false, 20e6); - skipSeekPerformance(10, true, 20e6); + // Compare seeking to nearby elements with and without hints, using the old and new SeekLessThanOrEqual methods. 
+ // TODO: Once seekLessThanOrEqual() with a hint is as fast as seekLessThanOrEqualOld, remove it. + skipSeekPerformance(8, true, false, 80e6); + skipSeekPerformance(8, true, true, 80e6); + skipSeekPerformance(8, false, false, 80e6); + skipSeekPerformance(8, false, true, 80e6); // Repeatedly seek for one of a set of pregenerated random pairs and time it. std::vector<IntIntPair> randomPairs; - for(int i = 0; i < 10 * N; ++i) { + for (int i = 0; i < 10 * N; ++i) { randomPairs.push_back(randomPair()); } - // Random seeeks + // Random seeks double start = timer(); - for(int i = 0; i < 20000000; ++i) { + for (int i = 0; i < 20000000; ++i) { IntIntPair p = randomPairs[i % randomPairs.size()]; // Verify the result is less than or equal, and if seek fails then p must be lower than lowest (first) item - if(!s.seekLessThanOrEqual(p)) { - if(p >= items.front()) { + if (!s.seekLessThanOrEqual(p)) { + if (p >= items.front()) { printf("Seek failed! query=%s front=%s\n", p.toString().c_str(), items.front().toString().c_str()); ASSERT(false); } - } - else if(s.get() > p) { + } else if (s.get() > p) { printf("Found incorrect node! query=%s found=%s\n", p.toString().c_str(), s.get().toString().c_str()); ASSERT(false); } @@ -6284,14 +5945,14 @@ TEST_CASE("!/redwood/performance/mutationBuffer") { printf("Generating %d strings...\n", count); Arena arena; std::vector<std::string> strings; - while(strings.size() < count) { + while (strings.size() < count) { strings.push_back(randomString(arena, 5)); } printf("Inserting and then finding each of %d strings...\n", count); double start = timer(); VersionedBTree::MutationBuffer m; - for(int i = 0; i < count; ++i) { + for (int i = 0; i < count; ++i) { KeyRef key = strings[i]; auto a = m.insert(key); auto b = m.lower_bound(key); @@ -6307,17 +5968,18 @@ TEST_CASE("!/redwood/correctness/btree") { state std::string pagerFile = "unittest_pageFile.redwood"; - IPager2 *pager; + IPager2* pager; state bool serialTest = deterministicRandom()->coinflip(); state bool shortTest = deterministicRandom()->coinflip(); - state int pageSize = shortTest ? 200 : (deterministicRandom()->coinflip() ? 4096 : deterministicRandom()->randomInt(200, 400)); + state int pageSize = + shortTest ? 200 : (deterministicRandom()->coinflip() ? 4096 : deterministicRandom()->randomInt(200, 400)); // We must be able to fit at least any two keys plus overhead in a page to prevent // a situation where the tree cannot be grown upward with decreasing level size. - state int maxKeySize = deterministicRandom()->randomInt(4, pageSize * 2); - state int maxValueSize = deterministicRandom()->randomInt(0, pageSize * 4); + state int maxKeySize = deterministicRandom()->randomInt(1, pageSize * 2); + state int maxValueSize = randomSize(pageSize * 25); state int maxCommitSize = shortTest ? 1000 : randomSize(std::min<int64_t>((maxKeySize + maxValueSize) * 20000, 10e6)); state int mutationBytesTarget = shortTest ? 
5000 : randomSize(std::min(maxCommitSize * 100, 100e6)); state double clearProbability = deterministicRandom()->random01() * .1; @@ -6348,7 +6010,7 @@ TEST_CASE("!/redwood/correctness/btree") { printf("Initializing...\n"); state double startTime = now(); pager = new DWALPager(pageSize, pagerFile, 0); - state VersionedBTree *btree = new VersionedBTree(pager, pagerFile); + state VersionedBTree* btree = new VersionedBTree(pager, pagerFile); wait(btree->init()); state std::map, Optional> written; @@ -6376,66 +6038,66 @@ TEST_CASE("!/redwood/correctness/btree") { state Future commit = Void(); - while(mutationBytes.get() < mutationBytesTarget && (now() - startTime) < maxDuration) { - if(now() - startTime > 600) { + while (mutationBytes.get() < mutationBytesTarget && (now() - startTime) < maxDuration) { + if (now() - startTime > 600) { mutationBytesTarget = mutationBytes.get(); } // Sometimes advance the version - if(deterministicRandom()->random01() < 0.10) { + if (deterministicRandom()->random01() < 0.10) { ++version; btree->setWriteVersion(version); } // Sometimes do a clear range - if(deterministicRandom()->random01() < clearProbability) { + if (deterministicRandom()->random01() < clearProbability) { Key start = randomKV(maxKeySize, 1).key; Key end = (deterministicRandom()->random01() < .01) ? keyAfter(start) : randomKV(maxKeySize, 1).key; // Sometimes replace start and/or end with a close actual (previously used) value - if(deterministicRandom()->random01() < .10) { + if (deterministicRandom()->random01() < .10) { auto i = keys.upper_bound(start); - if(i != keys.end()) - start = *i; + if (i != keys.end()) start = *i; } - if(deterministicRandom()->random01() < .10) { + if (deterministicRandom()->random01() < .10) { auto i = keys.upper_bound(end); - if(i != keys.end()) - end = *i; + if (i != keys.end()) end = *i; } - // Do a single key clear based on probability or end being randomly chosen to be the same as begin (unlikely) - if(deterministicRandom()->random01() < clearSingleKeyProbability || end == start) { + // Do a single key clear based on probability or end being randomly chosen to be the same as begin + // (unlikely) + if (deterministicRandom()->random01() < clearSingleKeyProbability || end == start) { end = keyAfter(start); - } - else if(end < start) { + } else if (end < start) { std::swap(end, start); } // Apply clear range to verification map ++rangeClears; KeyRangeRef range(start, end); - debug_printf(" Mutation: Clear '%s' to '%s' @%" PRId64 "\n", start.toString().c_str(), end.toString().c_str(), version); + debug_printf(" Mutation: Clear '%s' to '%s' @%" PRId64 "\n", start.toString().c_str(), + end.toString().c_str(), version); auto e = written.lower_bound(std::make_pair(start.toString(), 0)); - if(e != written.end()) { + if (e != written.end()) { auto last = e; auto eEnd = written.lower_bound(std::make_pair(end.toString(), 0)); - while(e != eEnd) { + while (e != eEnd) { auto w = *e; ++e; // If e key is different from last and last was present then insert clear for last's key at version - if(last != eEnd && ((e == eEnd || e->first.first != last->first.first) && last->second.present())) { - debug_printf(" Mutation: Clearing key '%s' @%" PRId64 "\n", last->first.first.c_str(), version); + if (last != eEnd && + ((e == eEnd || e->first.first != last->first.first) && last->second.present())) { + debug_printf(" Mutation: Clearing key '%s' @%" PRId64 "\n", last->first.first.c_str(), + version); keyBytesCleared += last->first.first.size(); mutationBytes += last->first.first.size(); 
mutationBytesThisCommit += last->first.first.size(); // If the last set was at version then just make it not present - if(last->first.second == version) { + if (last->first.second == version) { last->second.reset(); - } - else { + } else { written[std::make_pair(last->first.first, version)].reset(); } } @@ -6446,24 +6108,23 @@ TEST_CASE("!/redwood/correctness/btree") { btree->clear(range); // Sometimes set the range start after the clear - if(deterministicRandom()->random01() < clearPostSetProbability) { + if (deterministicRandom()->random01() < clearPostSetProbability) { KeyValue kv = randomKV(0, maxValueSize); kv.key = range.begin; btree->set(kv); written[std::make_pair(kv.key.toString(), version)] = kv.value.toString(); } - } - else { + } else { // Set a key KeyValue kv = randomKV(maxKeySize, maxValueSize); // Sometimes change key to a close previously used key - if(deterministicRandom()->random01() < .01) { + if (deterministicRandom()->random01() < .01) { auto i = keys.upper_bound(kv.key); - if(i != keys.end()) - kv.key = StringRef(kv.arena(), *i); + if (i != keys.end()) kv.key = StringRef(kv.arena(), *i); } - debug_printf(" Mutation: Set '%s' -> '%s' @%" PRId64 "\n", kv.key.toString().c_str(), kv.value.toString().c_str(), version); + debug_printf(" Mutation: Set '%s' -> '%s' @%" PRId64 "\n", kv.key.toString().c_str(), + kv.value.toString().c_str(), version); ++sets; keyBytesInserted += kv.key.size(); @@ -6477,24 +6138,24 @@ TEST_CASE("!/redwood/correctness/btree") { } // Commit at end or after this commit's mutation bytes are reached - if(mutationBytes.get() >= mutationBytesTarget || mutationBytesThisCommit >= mutationBytesTargetThisCommit) { + if (mutationBytes.get() >= mutationBytesTarget || mutationBytesThisCommit >= mutationBytesTargetThisCommit) { // Wait for previous commit to finish wait(commit); - printf("Committed. Next commit %d bytes, %" PRId64 "/%d (%.2f%%) Stats: Insert %.2f MB/s ClearedKeys %.2f MB/s Total %.2f\n", - mutationBytesThisCommit, - mutationBytes.get(), - mutationBytesTarget, - (double)mutationBytes.get() / mutationBytesTarget * 100, - (keyBytesInserted.rate() + valueBytesInserted.rate()) / 1e6, - keyBytesCleared.rate() / 1e6, - mutationBytes.rate() / 1e6 - ); + printf("Committed. Next commit %d bytes, %" PRId64 + "/%d (%.2f%%) Stats: Insert %.2f MB/s ClearedKeys %.2f MB/s Total %.2f\n", + mutationBytesThisCommit, mutationBytes.get(), mutationBytesTarget, + (double)mutationBytes.get() / mutationBytesTarget * 100, + (keyBytesInserted.rate() + valueBytesInserted.rate()) / 1e6, keyBytesCleared.rate() / 1e6, + mutationBytes.rate() / 1e6); - Version v = version; // Avoid capture of version as a member of *this + Version v = version; // Avoid capture of version as a member of *this - // Sometimes advance the oldest version to close the gap between the oldest and latest versions by a random amount. - if(deterministicRandom()->random01() < advanceOldVersionProbability) { - btree->setOldestVersion(btree->getLastCommittedVersion() - deterministicRandom()->randomInt(0, btree->getLastCommittedVersion() - btree->getOldestVersion() + 1)); + // Sometimes advance the oldest version to close the gap between the oldest and latest versions by a random + // amount. 
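// [Editorial note] Concretely, with gap = getLastCommittedVersion() - getOldestVersion(), the call
// below sets the new oldest version to lastCommitted - randomInt(0, gap + 1), i.e. a value drawn
// uniformly from [oldest, lastCommitted].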
+ if (deterministicRandom()->random01() < advanceOldVersionProbability) { + btree->setOldestVersion(btree->getLastCommittedVersion() - + deterministicRandom()->randomInt(0, btree->getLastCommittedVersion() - + btree->getOldestVersion() + 1)); } commit = map(btree->commit(), [=](Void) { @@ -6504,7 +6165,7 @@ TEST_CASE("!/redwood/correctness/btree") { return Void(); }); - if(serialTest) { + if (serialTest) { // Wait for commit, wait for verification, then start new verification wait(commit); committedVersions.sendError(end_of_stream()); @@ -6518,7 +6179,7 @@ TEST_CASE("!/redwood/correctness/btree") { mutationBytesTargetThisCommit = randomSize(maxCommitSize); // Recover from disk at random - if(!serialTest && deterministicRandom()->random01() < coldStartProbability) { + if (!serialTest && deterministicRandom()->random01() < coldStartProbability) { printf("Recovering from disk after next commit.\n"); // Wait for outstanding commit @@ -6536,7 +6197,7 @@ TEST_CASE("!/redwood/correctness/btree") { wait(closedFuture); printf("Reopening btree from disk.\n"); - IPager2 *pager = new DWALPager(pageSize, pagerFile, 0); + IPager2* pager = new DWALPager(pageSize, pagerFile, 0); btree = new VersionedBTree(pager, pagerFile); wait(btree->init()); @@ -6555,8 +6216,7 @@ TEST_CASE("!/redwood/correctness/btree") { } // Check for errors - if(errorCount != 0) - throw internal_error(); + if (errorCount != 0) throw internal_error(); } debug_printf("Waiting for outstanding commit\n"); @@ -6567,8 +6227,7 @@ TEST_CASE("!/redwood/correctness/btree") { wait(verifyTask); // Check for errors - if(errorCount != 0) - throw internal_error(); + if (errorCount != 0) throw internal_error(); wait(btree->destroyAndCheckSanity()); @@ -6580,13 +6239,13 @@ TEST_CASE("!/redwood/correctness/btree") { return Void(); } -ACTOR Future randomSeeks(VersionedBTree *btree, int count, char firstChar, char lastChar) { +ACTOR Future randomSeeks(VersionedBTree* btree, int count, char firstChar, char lastChar) { state Version readVer = btree->getLatestVersion(); state int c = 0; state double readStart = timer(); printf("Executing %d random seeks\n", count); state Reference cur = btree->readAtVersion(readVer); - while(c < count) { + while (c < count) { state Key k = randomString(20, firstChar, lastChar); wait(success(cur->findFirstEqualOrGreater(k))); ++c; @@ -6596,7 +6255,8 @@ ACTOR Future randomSeeks(VersionedBTree *btree, int count, char firstChar, return Void(); } -ACTOR Future randomScans(VersionedBTree *btree, int count, int width, int readAhead, char firstChar, char lastChar) { +ACTOR Future randomScans(VersionedBTree* btree, int count, int width, int readAhead, char firstChar, + char lastChar) { state Version readVer = btree->getLatestVersion(); state int c = 0; state double readStart = timer(); @@ -6604,14 +6264,14 @@ ACTOR Future randomScans(VersionedBTree *btree, int count, int width, int state Reference cur = btree->readAtVersion(readVer); state bool adaptive = readAhead < 0; state int totalScanBytes = 0; - while(c++ < count) { + while (c++ < count) { state Key k = randomString(20, firstChar, lastChar); wait(success(cur->findFirstEqualOrGreater(k, readAhead))); - if(adaptive) { + if (adaptive) { readAhead = totalScanBytes / c; } state int w = width; - while(w > 0 && cur->isValid()) { + while (w > 0 && cur->isValid()) { totalScanBytes += cur->getKey().size(); totalScanBytes += cur->getValue().size(); wait(cur->next()); @@ -6619,7 +6279,8 @@ ACTOR Future randomScans(VersionedBTree *btree, int count, int width, int } } double elapsed = 
timer() - readStart; - printf("Completed %d scans: readAhead=%d width=%d bytesRead=%d scansRate=%d/s\n", count, readAhead, width, totalScanBytes, int(count / elapsed)); + printf("Completed %d scans: readAhead=%d width=%d bytesRead=%d scansRate=%d/s\n", count, readAhead, width, + totalScanBytes, int(count / elapsed)); return Void(); } @@ -6629,7 +6290,7 @@ TEST_CASE("!/redwood/correctness/pager/cow") { deleteFile(pagerFile); int pageSize = 4096; - state IPager2 *pager = new DWALPager(pageSize, pagerFile, 0); + state IPager2* pager = new DWALPager(pageSize, pagerFile, 0); wait(success(pager->init())); state LogicalPageID id = wait(pager->newPageID()); @@ -6658,15 +6319,15 @@ TEST_CASE("!/redwood/performance/set") { state bool reload = getenv("TESTFILE") == nullptr; state std::string pagerFile = reload ? "unittest.redwood" : getenv("TESTFILE"); - if(reload) { + if (reload) { printf("Deleting old test data\n"); deleteFile(pagerFile); } state int pageSize = 4096; state int64_t pageCacheBytes = FLOW_KNOBS->PAGE_CACHE_4K; - DWALPager *pager = new DWALPager(pageSize, pagerFile, pageCacheBytes); - state VersionedBTree *btree = new VersionedBTree(pager, pagerFile); + DWALPager* pager = new DWALPager(pageSize, pagerFile, pageCacheBytes); + state VersionedBTree* btree = new VersionedBTree(pager, pagerFile); wait(btree->init()); state int nodeCount = 1e9; @@ -6706,8 +6367,8 @@ TEST_CASE("!/redwood/performance/set") { state double intervalStart = timer(); state double start = intervalStart; - if(reload) { - while(kvBytesTotal < kvBytesTarget) { + if (reload) { + while (kvBytesTotal < kvBytesTarget) { wait(yield()); Version lastVer = btree->getLatestVersion(); @@ -6715,15 +6376,19 @@ TEST_CASE("!/redwood/performance/set") { btree->setWriteVersion(version); int changes = deterministicRandom()->randomInt(0, maxChangesPerVersion); - while(changes > 0 && kvBytes < commitTarget) { + while (changes > 0 && kvBytes < commitTarget) { KeyValue kv; - kv.key = randomString(kv.arena(), deterministicRandom()->randomInt(minKeyPrefixBytes + sizeof(uint32_t), maxKeyPrefixBytes + sizeof(uint32_t) + 1), firstKeyChar, lastKeyChar); + kv.key = randomString(kv.arena(), + deterministicRandom()->randomInt(minKeyPrefixBytes + sizeof(uint32_t), + maxKeyPrefixBytes + sizeof(uint32_t) + 1), + firstKeyChar, lastKeyChar); int32_t index = deterministicRandom()->randomInt(0, nodeCount); int runLength = deterministicRandom()->randomInt(minConsecutiveRun, maxConsecutiveRun + 1); - while(runLength > 0 && changes > 0) { - *(uint32_t *)(kv.key.end() - sizeof(uint32_t)) = bigEndian32(index++); - kv.value = StringRef((uint8_t *)value.data(), deterministicRandom()->randomInt(minValueSize, maxValueSize + 1)); + while (runLength > 0 && changes > 0) { + *(uint32_t*)(kv.key.end() - sizeof(uint32_t)) = bigEndian32(index++); + kv.value = StringRef((uint8_t*)value.data(), + deterministicRandom()->randomInt(minValueSize, maxValueSize + 1)); btree->set(kv); @@ -6734,22 +6399,25 @@ TEST_CASE("!/redwood/performance/set") { } } - if(kvBytes >= commitTarget) { + if (kvBytes >= commitTarget) { btree->setOldestVersion(btree->getLastCommittedVersion()); wait(commit); - printf("Cumulative %.2f MB keyValue bytes written at %.2f MB/s\n", kvBytesTotal / 1e6, kvBytesTotal / (timer() - start) / 1e6); + printf("Cumulative %.2f MB keyValue bytes written at %.2f MB/s\n", kvBytesTotal / 1e6, + kvBytesTotal / (timer() - start) / 1e6); // Avoid capturing via this to freeze counter values int recs = records; int kvb = kvBytes; - // Capturing invervalStart via 
this->intervalStart makes IDE's unhappy as they do not know about the actor state object - double *pIntervalStart = &intervalStart; + // Capturing invervalStart via this->intervalStart makes IDE's unhappy as they do not know about the + // actor state object + double* pIntervalStart = &intervalStart; commit = map(btree->commit(), [=](Void result) { printf("Committed: %s\n", VersionedBTree::counts.toString(true).c_str()); double elapsed = timer() - *pIntervalStart; - printf("Committed %d kvBytes in %d records in %f seconds, %.2f MB/s\n", kvb, recs, elapsed, kvb / elapsed / 1e6); + printf("Committed %d kvBytes in %d records in %f seconds, %.2f MB/s\n", kvb, recs, elapsed, + kvb / elapsed / 1e6); *pIntervalStart = timer(); return Void(); }); @@ -6761,14 +6429,15 @@ TEST_CASE("!/redwood/performance/set") { } wait(commit); - printf("Cumulative %.2f MB keyValue bytes written at %.2f MB/s\n", kvBytesTotal / 1e6, kvBytesTotal / (timer() - start) / 1e6); + printf("Cumulative %.2f MB keyValue bytes written at %.2f MB/s\n", kvBytesTotal / 1e6, + kvBytesTotal / (timer() - start) / 1e6); } int seeks = 1e6; printf("Warming cache with seeks\n"); - actors.add(randomSeeks(btree, seeks/3, firstKeyChar, lastKeyChar)); - actors.add(randomSeeks(btree, seeks/3, firstKeyChar, lastKeyChar)); - actors.add(randomSeeks(btree, seeks/3, firstKeyChar, lastKeyChar)); + actors.add(randomSeeks(btree, seeks / 3, firstKeyChar, lastKeyChar)); + actors.add(randomSeeks(btree, seeks / 3, firstKeyChar, lastKeyChar)); + actors.add(randomSeeks(btree, seeks / 3, firstKeyChar, lastKeyChar)); wait(actors.signalAndReset()); printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); @@ -6822,9 +6491,7 @@ struct PrefixSegment { int length; int cardinality; - std::string toString() const { - return format("{%d bytes, %d choices}", length, cardinality); - } + std::string toString() const { return format("{%d bytes, %d choices}", length, cardinality); } }; // Utility class for generating kv pairs under a prefix pattern @@ -6838,42 +6505,42 @@ struct KVSource { std::vector desc; std::vector> segments; std::vector prefixes; - std::vector prefixesSorted; + std::vector prefixesSorted; std::string valueData; int prefixLen; int lastIndex; - KVSource(const std::vector &desc, int numPrefixes = 0) : desc(desc) { - if(numPrefixes == 0) { + KVSource(const std::vector& desc, int numPrefixes = 0) : desc(desc) { + if (numPrefixes == 0) { numPrefixes = 1; - for(auto &p : desc) { + for (auto& p : desc) { numPrefixes *= p.cardinality; } } prefixLen = 0; - for(auto &s : desc) { + for (auto& s : desc) { prefixLen += s.length; std::vector parts; - while(parts.size() < s.cardinality) { + while (parts.size() < s.cardinality) { parts.push_back(deterministicRandom()->randomAlphaNumeric(s.length)); } segments.push_back(std::move(parts)); } - while(prefixes.size() < numPrefixes) { + while (prefixes.size() < numPrefixes) { std::string p; - for(auto &s : segments) { + for (auto& s : segments) { p.append(s[deterministicRandom()->randomInt(0, s.size())]); } - prefixes.push_back(PrefixRef((uint8_t *)p.data(), p.size())); + prefixes.push_back(PrefixRef((uint8_t*)p.data(), p.size())); } - for(auto &p : prefixes) { + for (auto& p : prefixes) { prefixesSorted.push_back(&p); } - std::sort(prefixesSorted.begin(), prefixesSorted.end(), [](const Prefix *a, const Prefix *b) { - return KeyRef((uint8_t *)a->begin(), a->size()) < KeyRef((uint8_t *)b->begin(), b->size()); + std::sort(prefixesSorted.begin(), prefixesSorted.end(), [](const Prefix* a, const Prefix* b) { + 
return KeyRef((uint8_t*)a->begin(), a->size()) < KeyRef((uint8_t*)b->begin(), b->size()); }); valueData = deterministicRandom()->randomAlphaNumeric(100000); @@ -6882,13 +6549,11 @@ struct KVSource { // Expands the chosen prefix in the prefix list to hold suffix, // fills suffix with random bytes, and returns a reference to the string - KeyRef getKeyRef(int suffixLen) { - return makeKey(randomPrefix(), suffixLen); - } + KeyRef getKeyRef(int suffixLen) { return makeKey(randomPrefix(), suffixLen); } // Like getKeyRef but uses the same prefix as the last randomly chosen prefix KeyRef getAnotherKeyRef(int suffixLen, bool sorted = false) { - Prefix &p = sorted ? *prefixesSorted[lastIndex] : prefixes[lastIndex]; + Prefix& p = sorted ? *prefixesSorted[lastIndex] : prefixes[lastIndex]; return makeKey(p, suffixLen); } @@ -6896,51 +6561,48 @@ struct KVSource { KeyRangeRef getRangeRef(int prefixesCovered, int suffixLen) { prefixesCovered = std::min(prefixesCovered, prefixes.size()); int i = deterministicRandom()->randomInt(0, prefixesSorted.size() - prefixesCovered); - Prefix *begin = prefixesSorted[i]; - Prefix *end = prefixesSorted[i + prefixesCovered]; + Prefix* begin = prefixesSorted[i]; + Prefix* end = prefixesSorted[i + prefixesCovered]; return KeyRangeRef(makeKey(*begin, suffixLen), makeKey(*end, suffixLen)); } - KeyRef getValue(int len) { - return KeyRef(valueData).substr(0, len); - } + KeyRef getValue(int len) { return KeyRef(valueData).substr(0, len); } // Move lastIndex to the next position, wrapping around to 0 void nextPrefix() { ++lastIndex; - if(lastIndex == prefixes.size()) { + if (lastIndex == prefixes.size()) { lastIndex = 0; } } - Prefix & randomPrefix() { + Prefix& randomPrefix() { lastIndex = deterministicRandom()->randomInt(0, prefixes.size()); return prefixes[lastIndex]; } - static KeyRef makeKey(Prefix &p, int suffixLen) { + static KeyRef makeKey(Prefix& p, int suffixLen) { p.reserve(p.arena(), p.size() + suffixLen); - uint8_t *wptr = p.end(); - for(int i = 0; i < suffixLen; ++i) { + uint8_t* wptr = p.end(); + for (int i = 0; i < suffixLen; ++i) { *wptr++ = (uint8_t)deterministicRandom()->randomAlphaNumeric(); } return KeyRef(p.begin(), p.size() + suffixLen); } - int numPrefixes() const { - return prefixes.size(); - }; + int numPrefixes() const { return prefixes.size(); }; std::string toString() const { return format("{prefixLen=%d prefixes=%d format=%s}", prefixLen, numPrefixes(), ::toString(desc).c_str()); } }; -std::string toString(const StorageBytes &sb) { - return format("{%.2f MB total, %.2f MB free, %.2f MB available, %.2f MB used}", sb.total / 1e6, sb.free / 1e6, sb.available / 1e6, sb.used / 1e6); +std::string toString(const StorageBytes& sb) { + return format("{%.2f MB total, %.2f MB free, %.2f MB available, %.2f MB used}", sb.total / 1e6, sb.free / 1e6, + sb.available / 1e6, sb.used / 1e6); } -ACTOR Future getStableStorageBytes(IKeyValueStore *kvs) { +ACTOR Future getStableStorageBytes(IKeyValueStore* kvs) { state StorageBytes sb = kvs->getStorageBytes(); // Wait for StorageBytes used metric to stabilize @@ -6949,7 +6611,7 @@ ACTOR Future getStableStorageBytes(IKeyValueStore *kvs) { StorageBytes sb2 = kvs->getStorageBytes(); bool stable = sb2.used == sb.used; sb = sb2; - if(stable) { + if (stable) { break; } } @@ -6957,7 +6619,8 @@ ACTOR Future getStableStorageBytes(IKeyValueStore *kvs) { return sb; } -ACTOR Future prefixClusteredInsert(IKeyValueStore *kvs, int suffixSize, int valueSize, KVSource source, int recordCountTarget, bool usePrefixesInOrder) { +ACTOR Future 
prefixClusteredInsert(IKeyValueStore* kvs, int suffixSize, int valueSize, KVSource source, + int recordCountTarget, bool usePrefixesInOrder) { state int commitTarget = 5e6; state int recordSize = source.prefixLen + suffixSize + valueSize; @@ -6988,26 +6651,27 @@ ACTOR Future prefixClusteredInsert(IKeyValueStore *kvs, int suffixSize, in state std::function stats = [&]() { double elapsed = timer() - start; - printf("Cumulative stats: %.2f seconds %.2f MB keyValue bytes %d records %.2f MB/s %.2f rec/s\r", elapsed, kvBytesTotal / 1e6, records, kvBytesTotal / elapsed / 1e6, records / elapsed); + printf("Cumulative stats: %.2f seconds %.2f MB keyValue bytes %d records %.2f MB/s %.2f rec/s\r", elapsed, + kvBytesTotal / 1e6, records, kvBytesTotal / elapsed / 1e6, records / elapsed); fflush(stdout); }; - while(kvBytesTotal < kvBytesTarget) { + while (kvBytesTotal < kvBytesTarget) { wait(yield()); state int i; - for(i = 0; i < recordsPerPrefix; ++i) { + for (i = 0; i < recordsPerPrefix; ++i) { KeyValueRef kv(source.getAnotherKeyRef(4, usePrefixesInOrder), source.getValue(valueSize)); kvs->set(kv); kvBytes += kv.expectedSize(); ++records; - if(kvBytes >= commitTarget) { + if (kvBytes >= commitTarget) { wait(commit); stats(); commit = kvs->commit(); kvBytesTotal += kvBytes; - if(kvBytesTotal >= kvBytesTarget) { + if (kvBytesTotal >= kvBytesTarget) { break; } kvBytes = 0; @@ -7030,15 +6694,16 @@ ACTOR Future prefixClusteredInsert(IKeyValueStore *kvs, int suffixSize, in intervalStart = timer(); kvs->clear(KeyRangeRef(LiteralStringRef(""), LiteralStringRef("\xff"))); state StorageBytes sbClear = wait(getStableStorageBytes(kvs)); - printf("Cleared all keys in %.2f seconds, final storageByte: %s\n", timer() - intervalStart, toString(sbClear).c_str()); + printf("Cleared all keys in %.2f seconds, final storageByte: %s\n", timer() - intervalStart, + toString(sbClear).c_str()); return Void(); } -ACTOR Future sequentialInsert(IKeyValueStore *kvs, int prefixLen, int valueSize, int recordCountTarget) { +ACTOR Future sequentialInsert(IKeyValueStore* kvs, int prefixLen, int valueSize, int recordCountTarget) { state int commitTarget = 5e6; - state KVSource source({{prefixLen, 1}}); + state KVSource source({ { prefixLen, 1 } }); state int recordSize = source.prefixLen + sizeof(uint64_t) + valueSize; state int64_t kvBytesTarget = (int64_t)recordCountTarget * recordSize; @@ -7062,27 +6727,28 @@ ACTOR Future sequentialInsert(IKeyValueStore *kvs, int prefixLen, int valu state std::function stats = [&]() { double elapsed = timer() - start; - printf("Cumulative stats: %.2f seconds %.2f MB keyValue bytes %d records %.2f MB/s %.2f rec/s\r", elapsed, kvBytesTotal / 1e6, records, kvBytesTotal / elapsed / 1e6, records / elapsed); + printf("Cumulative stats: %.2f seconds %.2f MB keyValue bytes %d records %.2f MB/s %.2f rec/s\r", elapsed, + kvBytesTotal / 1e6, records, kvBytesTotal / elapsed / 1e6, records / elapsed); fflush(stdout); }; state uint64_t c = 0; state Key key = source.getKeyRef(sizeof(uint64_t)); - while(kvBytesTotal < kvBytesTarget) { + while (kvBytesTotal < kvBytesTarget) { wait(yield()); - *(uint64_t *)(key.end() - sizeof(uint64_t)) = bigEndian64(c); + *(uint64_t*)(key.end() - sizeof(uint64_t)) = bigEndian64(c); KeyValueRef kv(key, source.getValue(valueSize)); kvs->set(kv); kvBytes += kv.expectedSize(); ++records; - if(kvBytes >= commitTarget) { + if (kvBytes >= commitTarget) { wait(commit); stats(); commit = kvs->commit(); kvBytesTotal += kvBytes; - if(kvBytesTotal >= kvBytesTarget) { + if (kvBytesTotal >= 
kvBytesTarget) { break; } kvBytes = 0; @@ -7097,18 +6763,19 @@ ACTOR Future sequentialInsert(IKeyValueStore *kvs, int prefixLen, int valu return Void(); } -Future closeKVS(IKeyValueStore *kvs) { +Future closeKVS(IKeyValueStore* kvs) { Future closed = kvs->onClosed(); kvs->close(); return closed; } -ACTOR Future doPrefixInsertComparison(int suffixSize, int valueSize, int recordCountTarget, bool usePrefixesInOrder, KVSource source) { +ACTOR Future doPrefixInsertComparison(int suffixSize, int valueSize, int recordCountTarget, + bool usePrefixesInOrder, KVSource source) { VersionedBTree::counts.clear(); deleteFile("test.redwood"); wait(delay(5)); - state IKeyValueStore *redwood = openKVStore(KeyValueStoreType::SSD_REDWOOD_V1, "test.redwood", UID(), 0); + state IKeyValueStore* redwood = openKVStore(KeyValueStoreType::SSD_REDWOOD_V1, "test.redwood", UID(), 0); wait(prefixClusteredInsert(redwood, suffixSize, valueSize, source, recordCountTarget, usePrefixesInOrder)); wait(closeKVS(redwood)); printf("\n"); @@ -7116,7 +6783,7 @@ ACTOR Future doPrefixInsertComparison(int suffixSize, int valueSize, int r deleteFile("test.sqlite"); deleteFile("test.sqlite-wal"); wait(delay(5)); - state IKeyValueStore *sqlite = openKVStore(KeyValueStoreType::SSD_BTREE_V2, "test.sqlite", UID(), 0); + state IKeyValueStore* sqlite = openKVStore(KeyValueStoreType::SSD_BTREE_V2, "test.sqlite", UID(), 0); wait(prefixClusteredInsert(sqlite, suffixSize, valueSize, source, recordCountTarget, usePrefixesInOrder)); wait(closeKVS(sqlite)); printf("\n"); @@ -7130,10 +6797,14 @@ TEST_CASE("!/redwood/performance/prefixSizeComparison") { state int recordCountTarget = 100e6; state int usePrefixesInOrder = false; - wait(doPrefixInsertComparison(suffixSize, valueSize, recordCountTarget, usePrefixesInOrder, KVSource({{10, 100000}}))); - wait(doPrefixInsertComparison(suffixSize, valueSize, recordCountTarget, usePrefixesInOrder, KVSource({{16, 100000}}))); - wait(doPrefixInsertComparison(suffixSize, valueSize, recordCountTarget, usePrefixesInOrder, KVSource({{32, 100000}}))); - wait(doPrefixInsertComparison(suffixSize, valueSize, recordCountTarget, usePrefixesInOrder, KVSource({{4, 5}, {12, 1000}, {8, 5}, {8, 4}}))); + wait(doPrefixInsertComparison(suffixSize, valueSize, recordCountTarget, usePrefixesInOrder, + KVSource({ { 10, 100000 } }))); + wait(doPrefixInsertComparison(suffixSize, valueSize, recordCountTarget, usePrefixesInOrder, + KVSource({ { 16, 100000 } }))); + wait(doPrefixInsertComparison(suffixSize, valueSize, recordCountTarget, usePrefixesInOrder, + KVSource({ { 32, 100000 } }))); + wait(doPrefixInsertComparison(suffixSize, valueSize, recordCountTarget, usePrefixesInOrder, + KVSource({ { 4, 5 }, { 12, 1000 }, { 8, 5 }, { 8, 4 } }))); return Void(); } @@ -7145,11 +6816,10 @@ TEST_CASE("!/redwood/performance/sequentialInsert") { deleteFile("test.redwood"); wait(delay(5)); - state IKeyValueStore *redwood = openKVStore(KeyValueStoreType::SSD_REDWOOD_V1, "test.redwood", UID(), 0); + state IKeyValueStore* redwood = openKVStore(KeyValueStoreType::SSD_REDWOOD_V1, "test.redwood", UID(), 0); wait(sequentialInsert(redwood, prefixLen, valueSize, recordCountTarget)); wait(closeKVS(redwood)); printf("\n"); return Void(); } - diff --git a/fdbserver/WorkerInterface.actor.h b/fdbserver/WorkerInterface.actor.h index c8885cb4a0..73bf110f55 100644 --- a/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/WorkerInterface.actor.h @@ -37,6 +37,7 @@ #include "fdbserver/LogSystemConfig.h" #include "fdbrpc/MultiInterface.h" #include 
"fdbclient/ClientWorkerInterface.h" +#include "fdbserver/RecoveryState.h" #include "flow/actorcompiler.h" struct WorkerInterface { @@ -60,14 +61,17 @@ struct WorkerInterface { RequestStream< struct EventLogRequest > eventLogRequest; RequestStream< struct TraceBatchDumpRequest > traceBatchDumpRequest; RequestStream< struct DiskStoreRequest > diskStoreRequest; - RequestStream execReq; - RequestStream workerSnapReq; + RequestStream< struct ExecuteRequest> execReq; + RequestStream< struct WorkerSnapRequest> workerSnapReq; + RequestStream< struct UpdateServerDBInfoRequest > updateServerDBInfo; TesterInterface testerInterface; UID id() const { return tLog.getEndpoint().token; } NetworkAddress address() const { return tLog.getEndpoint().getPrimaryAddress(); } + NetworkAddress stableAddress() const { return tLog.getEndpoint().getStableAddress(); } Optional secondaryAddress() const { return tLog.getEndpoint().addresses.secondaryAddress; } + NetworkAddressList addresses() const { return tLog.getEndpoint().addresses; } WorkerInterface() {} WorkerInterface( const LocalityData& locality ) : locality( locality ) {} @@ -81,11 +85,13 @@ struct WorkerInterface { logRouter.getEndpoint( TaskPriority::Worker ); debugPing.getEndpoint( TaskPriority::Worker ); coordinationPing.getEndpoint( TaskPriority::Worker ); + updateServerDBInfo.getEndpoint( TaskPriority::Worker ); + eventLogRequest.getEndpoint( TaskPriority::Worker ); } template void serialize(Ar& ar) { - serializer(ar, clientInterface, locality, tLog, master, masterProxy, dataDistributor, ratekeeper, resolver, storage, logRouter, debugPing, coordinationPing, waitFailure, setMetricsRate, eventLogRequest, traceBatchDumpRequest, testerInterface, diskStoreRequest, execReq, workerSnapReq, backup); + serializer(ar, clientInterface, locality, tLog, master, masterProxy, dataDistributor, ratekeeper, resolver, storage, logRouter, debugPing, coordinationPing, waitFailure, setMetricsRate, eventLogRequest, traceBatchDumpRequest, testerInterface, diskStoreRequest, execReq, workerSnapReq, backup, updateServerDBInfo); } }; @@ -104,6 +110,230 @@ struct WorkerDetails { } }; +// This interface and its serialization depend on slicing, since the client will deserialize only the first part of this structure +struct ClusterControllerFullInterface { + constexpr static FileIdentifier file_identifier = + ClusterControllerClientInterface::file_identifier; + ClusterInterface clientInterface; + RequestStream< struct RecruitFromConfigurationRequest > recruitFromConfiguration; + RequestStream< struct RecruitRemoteFromConfigurationRequest > recruitRemoteFromConfiguration; + RequestStream< struct RecruitStorageRequest > recruitStorage; + RequestStream< struct RegisterWorkerRequest > registerWorker; + RequestStream< struct GetWorkersRequest > getWorkers; + RequestStream< struct RegisterMasterRequest > registerMaster; + RequestStream< struct GetServerDBInfoRequest > getServerDBInfo; //only used by testers; the cluster controller will send the serverDBInfo to workers + + UID id() const { return clientInterface.id(); } + bool operator == (ClusterControllerFullInterface const& r) const { return id() == r.id(); } + bool operator != (ClusterControllerFullInterface const& r) const { return id() != r.id(); } + + bool hasMessage() { + return clientInterface.hasMessage() || + recruitFromConfiguration.getFuture().isReady() || + recruitRemoteFromConfiguration.getFuture().isReady() || + recruitStorage.getFuture().isReady() || + registerWorker.getFuture().isReady() || + getWorkers.getFuture().isReady() 
|| + registerMaster.getFuture().isReady() || + getServerDBInfo.getFuture().isReady(); + } + + void initEndpoints() { + clientInterface.initEndpoints(); + recruitFromConfiguration.getEndpoint( TaskPriority::ClusterControllerRecruit ); + recruitRemoteFromConfiguration.getEndpoint( TaskPriority::ClusterControllerRecruit ); + recruitStorage.getEndpoint( TaskPriority::ClusterController ); + registerWorker.getEndpoint( TaskPriority::ClusterControllerWorker ); + getWorkers.getEndpoint( TaskPriority::ClusterController ); + registerMaster.getEndpoint( TaskPriority::ClusterControllerRegister ); + getServerDBInfo.getEndpoint( TaskPriority::ClusterController ); + } + + template + void serialize(Ar& ar) { + if constexpr (!is_fb_function) { + ASSERT(ar.protocolVersion().isValid()); + } + serializer(ar, clientInterface, recruitFromConfiguration, recruitRemoteFromConfiguration, recruitStorage, + registerWorker, getWorkers, registerMaster, getServerDBInfo); + } +}; + +struct RegisterWorkerReply { + constexpr static FileIdentifier file_identifier = 16475696; + ProcessClass processClass; + ClusterControllerPriorityInfo priorityInfo; + Optional storageCache; + + RegisterWorkerReply() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown) {} + RegisterWorkerReply(ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Optional storageCache) : processClass(processClass), priorityInfo(priorityInfo), storageCache(storageCache) {} + + template + void serialize( Ar& ar ) { + serializer(ar, processClass, priorityInfo, storageCache); + } +}; + +struct RegisterMasterRequest { + constexpr static FileIdentifier file_identifier = 10773445; + UID id; + LocalityData mi; + LogSystemConfig logSystemConfig; + std::vector proxies; + std::vector resolvers; + DBRecoveryCount recoveryCount; + int64_t registrationCount; + Optional configuration; + std::vector priorCommittedLogServers; + RecoveryState recoveryState; + bool recoveryStalled; + + ReplyPromise reply; + + RegisterMasterRequest() : logSystemConfig(0) {} + + template + void serialize(Ar& ar) { + if constexpr (!is_fb_function) { + ASSERT(ar.protocolVersion().isValid()); + } + serializer(ar, id, mi, logSystemConfig, proxies, resolvers, recoveryCount, registrationCount, configuration, + priorCommittedLogServers, recoveryState, recoveryStalled, reply); + } +}; + +struct RecruitFromConfigurationReply { + constexpr static FileIdentifier file_identifier = 2224085; + std::vector backupWorkers; + std::vector tLogs; + std::vector satelliteTLogs; + std::vector proxies; + std::vector resolvers; + std::vector storageServers; + std::vector oldLogRouters; + Optional dcId; + bool satelliteFallback; + + RecruitFromConfigurationReply() : satelliteFallback(false) {} + + template + void serialize(Ar& ar) { + serializer(ar, tLogs, satelliteTLogs, proxies, resolvers, storageServers, oldLogRouters, dcId, + satelliteFallback, backupWorkers); + } +}; + +struct RecruitFromConfigurationRequest { + constexpr static FileIdentifier file_identifier = 2023046; + DatabaseConfiguration configuration; + bool recruitSeedServers; + int maxOldLogRouters; + ReplyPromise< RecruitFromConfigurationReply > reply; + + RecruitFromConfigurationRequest() {} + explicit RecruitFromConfigurationRequest(DatabaseConfiguration const& configuration, bool recruitSeedServers, int maxOldLogRouters) + : configuration(configuration), recruitSeedServers(recruitSeedServers), maxOldLogRouters(maxOldLogRouters) {} + + template + void serialize( Ar& ar ) { + serializer(ar, 
configuration, recruitSeedServers, maxOldLogRouters, reply); + } +}; + +struct RecruitRemoteFromConfigurationReply { + constexpr static FileIdentifier file_identifier = 9091392; + std::vector remoteTLogs; + std::vector logRouters; + + template + void serialize( Ar& ar ) { + serializer(ar, remoteTLogs, logRouters); + } +}; + +struct RecruitRemoteFromConfigurationRequest { + constexpr static FileIdentifier file_identifier = 3235995; + DatabaseConfiguration configuration; + Optional dcId; + int logRouterCount; + std::vector exclusionWorkerIds; + ReplyPromise< RecruitRemoteFromConfigurationReply > reply; + + RecruitRemoteFromConfigurationRequest() {} + RecruitRemoteFromConfigurationRequest(DatabaseConfiguration const& configuration, Optional const& dcId, int logRouterCount, const std::vector &exclusionWorkerIds) : configuration(configuration), dcId(dcId), logRouterCount(logRouterCount), exclusionWorkerIds(exclusionWorkerIds){} + + template + void serialize( Ar& ar ) { + serializer(ar, configuration, dcId, logRouterCount, exclusionWorkerIds, reply); + } +}; + +struct RecruitStorageReply { + constexpr static FileIdentifier file_identifier = 15877089; + WorkerInterface worker; + ProcessClass processClass; + + template + void serialize( Ar& ar ) { + serializer(ar, worker, processClass); + } +}; + +struct RecruitStorageRequest { + constexpr static FileIdentifier file_identifier = 905920; + std::vector>> excludeMachines; //< Don't recruit any of these machines + std::vector excludeAddresses; //< Don't recruit any of these addresses + std::vector>> includeDCs; + bool criticalRecruitment; //< True if machine classes are to be ignored + ReplyPromise< RecruitStorageReply > reply; + + template + void serialize( Ar& ar ) { + serializer(ar, excludeMachines, excludeAddresses, includeDCs, criticalRecruitment, reply); + } +}; + +struct RegisterWorkerRequest { + constexpr static FileIdentifier file_identifier = 14332605; + WorkerInterface wi; + ProcessClass initialClass; + ProcessClass processClass; + ClusterControllerPriorityInfo priorityInfo; + Generation generation; + Optional distributorInterf; + Optional ratekeeperInterf; + Optional> storageCacheInterf; + Standalone> issues; + std::vector incompatiblePeers; + ReplyPromise reply; + bool degraded; + + RegisterWorkerRequest() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown), degraded(false) {} + RegisterWorkerRequest(WorkerInterface wi, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Generation generation, Optional ddInterf, Optional rkInterf, Optional> storageCacheInterf, bool degraded) : + wi(wi), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo), generation(generation), distributorInterf(ddInterf), ratekeeperInterf(rkInterf), storageCacheInterf(storageCacheInterf), degraded(degraded) {} + + template + void serialize( Ar& ar ) { + serializer(ar, wi, initialClass, processClass, priorityInfo, generation, distributorInterf, ratekeeperInterf, storageCacheInterf, issues, incompatiblePeers, reply, degraded); + } +}; + +struct GetWorkersRequest { + constexpr static FileIdentifier file_identifier = 1254174; + enum { TESTER_CLASS_ONLY = 0x1, NON_EXCLUDED_PROCESSES_ONLY = 0x2 }; + + int flags; + ReplyPromise> reply; + + GetWorkersRequest() : flags(0) {} + explicit GetWorkersRequest(int fl) : flags(fl) {} + + template + void serialize(Ar& ar) { + serializer(ar, flags, reply); + } +}; + struct InitializeTLogRequest { constexpr static 
FileIdentifier file_identifier = 15604392; UID recruitmentID; @@ -462,7 +692,6 @@ void endRole(const Role &role, UID id, std::string reason, bool ok = true, Error struct ServerDBInfo; class Database openDBOnServer( Reference> const& db, TaskPriority taskID = TaskPriority::DefaultEndpoint, bool enableLocalityLoadBalance = true, bool lockAware = false ); -class Database openDBOnServer( Reference>> const& db, TaskPriority taskID = TaskPriority::DefaultEndpoint, bool enableLocalityLoadBalance = true, bool lockAware = false ); ACTOR Future extractClusterInterface(Reference>> a, Reference>> b); @@ -496,12 +725,6 @@ ACTOR Future tLog(IKeyValueStore* persistentData, IDiskQueue* persistentQu PromiseStream tlogRequests, UID tlogId, UID workerID, bool restoreFromDisk, Promise oldLog, Promise recovered, std::string folder, Reference> degraded, Reference> activeSharedTLog); - -ACTOR Future monitorServerDBInfo(Reference>> ccInterface, - Reference ccf, LocalityData locality, - Reference> dbInfo, - Optional>>> issues = - Optional>>>()); ACTOR Future resolver(ResolverInterface proxy, InitializeResolverRequest initReq, Reference> db); ACTOR Future logRouter(TLogInterface interf, InitializeLogRouterRequest req, @@ -535,5 +758,6 @@ ACTOR Future tLog(IKeyValueStore* persistentData, IDiskQueue* persistentQu typedef decltype(&tLog) TLogFn; +#include "fdbserver/ServerDBInfo.h" #include "flow/unactorcompiler.h" #endif diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index 266af46b74..78a6ad7211 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -34,7 +34,6 @@ #include "fdbserver/CoordinationInterface.h" #include "fdbserver/WorkerInterface.actor.h" #include "fdbclient/RestoreWorkerInterface.actor.h" -#include "fdbserver/ClusterRecruitmentInterface.h" #include "fdbserver/ServerDBInfo.h" #include "fdbserver/MoveKeys.actor.h" #include "fdbserver/ConflictSet.h" @@ -1629,6 +1628,7 @@ int main(int argc, char* argv[]) { openTraceFile(NetworkAddress(), opts.rollsize, opts.maxLogsSize, opts.logFolder, "trace", opts.logGroup); } else { g_network = newNet2(opts.tlsConfig, opts.useThreadPool, true); + g_network->addStopCallback( Net2FileSystem::stop ); FlowTransport::createInstance(false, 1); const bool expectsPublicAddress = (role == FDBD || role == NetworkTestServer || role == Restore); diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index d6cb465f12..c20d8ffb71 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -33,7 +33,6 @@ #include "fdbserver/MasterInterface.h" #include "fdbserver/WaitFailure.h" #include "fdbserver/WorkerInterface.actor.h" -#include "fdbserver/ClusterRecruitmentInterface.h" #include "fdbserver/ServerDBInfo.h" #include "fdbserver/CoordinatedState.h" #include "fdbserver/CoordinationInterface.h" // copy constructors for ServerCoordinators class @@ -740,22 +739,27 @@ ACTOR Future sendInitialCommitToResolvers( Reference self ) { ASSERT(self->recoveryTransactionVersion); state Standalone data = self->txnStateStore->readRange(txnKeys, BUGGIFY ? 
diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp
index d6cb465f12..c20d8ffb71 100644
--- a/fdbserver/masterserver.actor.cpp
+++ b/fdbserver/masterserver.actor.cpp
@@ -33,7 +33,6 @@
 #include "fdbserver/MasterInterface.h"
 #include "fdbserver/WaitFailure.h"
 #include "fdbserver/WorkerInterface.actor.h"
-#include "fdbserver/ClusterRecruitmentInterface.h"
 #include "fdbserver/ServerDBInfo.h"
 #include "fdbserver/CoordinatedState.h"
 #include "fdbserver/CoordinationInterface.h"  // copy constructors for ServerCoordinators class
@@ -740,22 +739,27 @@ ACTOR Future<Void> sendInitialCommitToResolvers( Reference<MasterData> self ) {
 	ASSERT(self->recoveryTransactionVersion);
 
 	state Standalone<RangeResultRef> data = self->txnStateStore->readRange(txnKeys, BUGGIFY ? 3 : SERVER_KNOBS->DESIRED_TOTAL_BYTES, SERVER_KNOBS->DESIRED_TOTAL_BYTES).get();
-	state vector<Future<Void>> txnReplies;
+	state std::vector<Future<Void>> txnReplies;
 	state int64_t dataOutstanding = 0;
+
+	state std::vector<Endpoint> endpoints;
+	for(auto& it : self->proxies) {
+		endpoints.push_back(it.txnState.getEndpoint());
+	}
+
 	loop {
 		if(!data.size()) break;
 		((KeyRangeRef&)txnKeys) = KeyRangeRef( keyAfter(data.back().key, txnKeys.arena()), txnKeys.end );
 		Standalone<RangeResultRef> nextData = self->txnStateStore->readRange(txnKeys, BUGGIFY ? 3 : SERVER_KNOBS->DESIRED_TOTAL_BYTES, SERVER_KNOBS->DESIRED_TOTAL_BYTES).get();
 
-		for(auto& r : self->proxies) {
-			TxnStateRequest req;
-			req.arena = data.arena();
-			req.data = data;
-			req.sequence = txnSequence;
-			req.last = !nextData.size();
-			txnReplies.push_back( brokenPromiseToNever( r.txnState.getReply( req ) ) );
-			dataOutstanding += data.arena().getSize();
-		}
+		TxnStateRequest req;
+		req.arena = data.arena();
+		req.data = data;
+		req.sequence = txnSequence;
+		req.last = !nextData.size();
+		req.broadcastInfo = endpoints;
+		txnReplies.push_back(broadcastTxnRequest(req, SERVER_KNOBS->TXN_STATE_SEND_AMOUNT, false));
+		dataOutstanding += SERVER_KNOBS->TXN_STATE_SEND_AMOUNT*data.arena().getSize();
 		data = nextData;
 		txnSequence++;
diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp
index e8a9bbca76..0c8974cb53 100644
--- a/fdbserver/storageserver.actor.cpp
+++ b/fdbserver/storageserver.actor.cpp
@@ -3484,7 +3484,7 @@ ACTOR Future<Void> metricsCore( StorageServer* self, StorageServerInterface ssi ) {
 			}
 			when (GetStorageMetricsRequest req = waitNext(ssi.getStorageMetrics.getFuture())) {
 				StorageBytes sb = self->storage.getStorageBytes();
-				self->metrics.getStorageMetrics( req, sb, self->counters.bytesInput.getRate() );
+				self->metrics.getStorageMetrics( req, sb, self->counters.bytesInput.getRate(), self->versionLag, self->lastUpdate );
 			}
 			when (wait(doPollMetrics) ) {
 				self->metrics.poll();
diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp
index bac8f6c22a..f675acebc9 100644
--- a/fdbserver/tester.actor.cpp
+++ b/fdbserver/tester.actor.cpp
@@ -28,13 +28,13 @@
 #include "fdbclient/SystemData.h"
 #include "fdbserver/TesterInterface.actor.h"
 #include "fdbserver/WorkerInterface.actor.h"
-#include "fdbserver/ClusterRecruitmentInterface.h"
 #include "fdbserver/workloads/workloads.actor.h"
 #include "fdbserver/Status.h"
 #include "fdbserver/QuietDatabase.h"
 #include "fdbclient/MonitorLeader.h"
 #include "fdbserver/CoordinationInterface.h"
 #include "fdbclient/ManagementAPI.actor.h"
+#include "fdbserver/WorkerInterface.actor.h"
 #include "flow/actorcompiler.h"  // This must be the last #include.
 
 using namespace std;
@@ -1017,11 +1017,40 @@ vector<TestSpec> readTests( ifstream& ifs ) {
 	return result;
 }
 
+ACTOR Future<Void> monitorServerDBInfo(Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> ccInterface,
+                                       LocalityData locality,
+                                       Reference<AsyncVar<ServerDBInfo>> dbInfo) {
+	// Initially most of the serverDBInfo is not known, but we know our locality right away
+	ServerDBInfo localInfo;
+	localInfo.myLocality = locality;
+	dbInfo->set(localInfo);
+
+	loop {
+		GetServerDBInfoRequest req;
+		req.knownServerInfoID = dbInfo->get().id;
+
+		choose {
+			when( ServerDBInfo _localInfo = wait( ccInterface->get().present() ? brokenPromiseToNever( ccInterface->get().get().getServerDBInfo.getReply( req ) ) : Never() ) ) {
+				ServerDBInfo localInfo = _localInfo;
+				TraceEvent("GotServerDBInfoChange").detail("ChangeID", localInfo.id).detail("MasterID", localInfo.master.id())
+				.detail("RatekeeperID", localInfo.ratekeeper.present() ? localInfo.ratekeeper.get().id() : UID())
+				.detail("DataDistributorID", localInfo.distributor.present() ? localInfo.distributor.get().id() : UID());
+
+				localInfo.myLocality = locality;
+				dbInfo->set(localInfo);
+			}
+			when( wait( ccInterface->onChange() ) ) {
+				if(ccInterface->get().present())
+					TraceEvent("GotCCInterfaceChange").detail("CCID", ccInterface->get().get().id()).detail("CCMachine", ccInterface->get().get().getWorkers.getEndpoint().getPrimaryAddress());
+			}
+		}
+	}
+}
+
 ACTOR Future<Void> runTests( Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> cc,
 	Reference<AsyncVar<Optional<ClusterInterface>>> ci, vector< TesterInterface > testers,
 	vector<TestSpec> tests, StringRef startingConfiguration, LocalityData locality ) {
 	state Database cx;
 	state Reference<AsyncVar<ServerDBInfo>> dbInfo( new AsyncVar<ServerDBInfo> );
-	state Future<Void> ccMonitor =
-	    monitorServerDBInfo(cc, Reference<ClusterConnectionFile>(), LocalityData(), dbInfo); // FIXME: locality
+	state Future<Void> ccMonitor = monitorServerDBInfo(cc, LocalityData(), dbInfo); // FIXME: locality
 
 	state bool useDB = false;
 	state bool waitForQuiescenceBegin = false;
@@ -1192,7 +1221,7 @@ ACTOR Future<Void> runTests( Reference<ClusterConnectionFile> connFile, test_type_t whatToRun,
 	if (at == TEST_HERE) {
 		Reference<AsyncVar<ServerDBInfo>> db( new AsyncVar<ServerDBInfo> );
 		vector<TesterInterface> iTesters(1);
-		actors.push_back( reportErrors(monitorServerDBInfo( cc, Reference<ClusterConnectionFile>(), LocalityData(), db ), "MonitorServerDBInfo") ); // FIXME: Locality
+		actors.push_back( reportErrors(monitorServerDBInfo( cc, LocalityData(), db ), "MonitorServerDBInfo") ); // FIXME: Locality
 		actors.push_back( reportErrors(testerServerCore( iTesters[0], connFile, db, locality ), "TesterServerCore") );
 		tests = runTests( cc, ci, iTesters, testSpecs, startingConfiguration, locality );
 	} else {
diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp
index b22015e296..9beffc5ece 100644
--- a/fdbserver/worker.actor.cpp
+++ b/fdbserver/worker.actor.cpp
@@ -33,7 +33,6 @@
 #include "fdbserver/TesterInterface.actor.h"  // for poisson()
 #include "fdbserver/IDiskQueue.h"
 #include "fdbclient/DatabaseContext.h"
-#include "fdbserver/ClusterRecruitmentInterface.h"
 #include "fdbserver/DataDistributorInterface.h"
 #include "fdbserver/ServerDBInfo.h"
 #include "fdbserver/FDBExecHelper.actor.h"
@@ -68,6 +67,44 @@ extern IKeyValueStore* keyValueStoreCompressTestData(IKeyValueStore* store);
 # define KV_STORE(filename,uid) keyValueStoreMemory(filename,uid)
 #endif
 
+ACTOR Future<std::vector<Endpoint>> tryDBInfoBroadcast(RequestStream<UpdateServerDBInfoRequest> stream, UpdateServerDBInfoRequest req) {
+	ErrorOr<std::vector<Endpoint>> rep = wait( stream.getReplyUnlessFailedFor(req, SERVER_KNOBS->DBINFO_FAILED_DELAY, 0) );
+	if(rep.present()) {
+		return rep.get();
+	}
+	req.broadcastInfo.push_back(stream.getEndpoint());
+	return req.broadcastInfo;
+}
+
+ACTOR Future<std::vector<Endpoint>> broadcastDBInfoRequest(UpdateServerDBInfoRequest req, int sendAmount, Optional<Endpoint> sender, bool sendReply) {
+	state std::vector<Future<std::vector<Endpoint>>> replies;
+	state ReplyPromise<std::vector<Endpoint>> reply = req.reply;
+	resetReply( req );
+	int currentStream = 0;
+	std::vector<Endpoint> broadcastEndpoints = req.broadcastInfo;
+	for(int i = 0; i < sendAmount && currentStream < broadcastEndpoints.size(); i++) {
+		std::vector<Endpoint> endpoints;
+		RequestStream<UpdateServerDBInfoRequest> cur(broadcastEndpoints[currentStream++]);
+		while(currentStream < broadcastEndpoints.size()*(i+1)/sendAmount) {
+			endpoints.push_back(broadcastEndpoints[currentStream++]);
+		}
+		req.broadcastInfo = endpoints;
+		replies.push_back( tryDBInfoBroadcast( cur, req ) );
+		resetReply( req );
+	}
+	wait( waitForAll(replies) );
+	std::vector<Endpoint> notUpdated;
+	if(sender.present()) {
+		notUpdated.push_back(sender.get());
+	}
+	for(auto& it : replies) {
+		notUpdated.insert(notUpdated.end(), it.get().begin(), it.get().end());
+	}
+	if(sendReply) {
+		reply.send(notUpdated);
+	}
+	return notUpdated;
+}
+
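The pair of actors above implements the new tree-shaped dbinfo broadcast: `broadcastDBInfoRequest` contacts at most `sendAmount` children directly and hands each child an even slice of the remaining endpoints to forward, while `tryDBInfoBroadcast` folds an unreachable child (plus its slice) back into the "not updated" list for the caller to retry. A minimal standalone sketch of just the slicing arithmetic — a plain string stands in for fdbrpc's `Endpoint`, purely for illustration:

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Toy stand-in for fdbrpc's Endpoint; illustration only.
using Endpoint = std::string;

// Partition `targets` the way broadcastDBInfoRequest does: up to `sendAmount`
// direct children, where child i also receives the endpoints between
// positions targets.size()*i/sendAmount and targets.size()*(i+1)/sendAmount
// (excluding the child itself) to forward on the next level of the tree.
void planBroadcast(const std::vector<Endpoint>& targets, int sendAmount) {
	size_t currentStream = 0;
	for (int i = 0; i < sendAmount && currentStream < targets.size(); i++) {
		Endpoint child = targets[currentStream++];
		std::vector<Endpoint> forwarded;
		while (currentStream < targets.size() * (i + 1) / sendAmount) {
			forwarded.push_back(targets[currentStream++]);
		}
		printf("send to %s, who forwards %zu endpoint(s)\n", child.c_str(), forwarded.size());
	}
}

int main() {
	// 8 endpoints with a fan-out of 2: each child forwards a subtree of 3.
	planBroadcast({"e1","e2","e3","e4","e5","e6","e7","e8"}, 2);
}
```

Because each level of the tree forwards its own slice recursively, broadcast work on the cluster controller grows with the fan-out knob rather than linearly with cluster size.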
 ACTOR static Future<Void> extractClientInfo( Reference<AsyncVar<ServerDBInfo>> db, Reference<AsyncVar<ClientDBInfo>> info ) {
 	state std::vector<UID> lastProxyUIDs;
@@ -80,27 +117,11 @@ ACTOR static Future<Void> extractClientInfo( Reference<AsyncVar<ServerDBInfo>> db, Reference<AsyncVar<ClientDBInfo>> info ) {
 	}
 }
 
-ACTOR static Future<Void> extractClientInfo( Reference<AsyncVar<CachedSerialization<ServerDBInfo>>> db, Reference<AsyncVar<ClientDBInfo>> info ) {
-	state std::vector<UID> lastProxyUIDs;
-	state std::vector<MasterProxyInterface> lastProxies;
-	loop {
-		ClientDBInfo ni = db->get().read().client;
-		shrinkProxyList(ni, lastProxyUIDs, lastProxies);
-		info->set( ni );
-		wait( db->onChange() );
-	}
-}
-
 Database openDBOnServer( Reference<AsyncVar<ServerDBInfo>> const& db, TaskPriority taskID, bool enableLocalityLoadBalance, bool lockAware ) {
 	Reference<AsyncVar<ClientDBInfo>> info( new AsyncVar<ClientDBInfo> );
 	return DatabaseContext::create( info, extractClientInfo(db, info), enableLocalityLoadBalance ? db->get().myLocality : LocalityData(), enableLocalityLoadBalance, taskID, lockAware );
 }
 
-Database openDBOnServer( Reference<AsyncVar<CachedSerialization<ServerDBInfo>>> const& db, TaskPriority taskID, bool enableLocalityLoadBalance, bool lockAware ) {
-	Reference<AsyncVar<ClientDBInfo>> info( new AsyncVar<ClientDBInfo> );
-	return DatabaseContext::create( info, extractClientInfo(db, info), enableLocalityLoadBalance ? db->get().read().myLocality : LocalityData(), enableLocalityLoadBalance, taskID, lockAware );
-}
-
 struct ErrorInfo {
 	Error error;
 	const Role &role;
@@ -413,7 +434,9 @@ ACTOR Future<Void> registrationClient(
 		Reference<AsyncVar<bool>> degraded,
 		PromiseStream< ErrorInfo > errors,
 		LocalityData locality,
-		Reference<AsyncVar<ServerDBInfo>> dbInfo) {
+		Reference<AsyncVar<ServerDBInfo>> dbInfo,
+		Reference<ClusterConnectionFile> connFile,
+		Reference<AsyncVar<std::set<std::string>>> issues) {
 	// Keeps the cluster controller (as it may be re-elected) informed that this worker exists
 	// The cluster controller uses waitFailureClient to find out if we die, and returns from registrationReply (requiring us to re-register)
 	// The registration request piggybacks optional distributor interface if it exists.
@@ -422,8 +445,41 @@ ACTOR Future<Void> registrationClient(
 	state Reference<AsyncVar<Optional<std::pair<uint16_t,StorageServerInterface>>>> scInterf( new AsyncVar<Optional<std::pair<uint16_t,StorageServerInterface>>>() );
 	state Future<Void> cacheProcessFuture;
 	state Future<Void> cacheErrorsFuture;
+	state Optional<double> incorrectTime;
 	loop {
 		RegisterWorkerRequest request(interf, initialClass, processClass, asyncPriorityInfo->get(), requestGeneration++, ddInterf->get(), rkInterf->get(), scInterf->get(), degraded->get());
+		for (auto const& i : issues->get()) {
+			request.issues.push_back_deep(request.issues.arena(), i);
+		}
+		ClusterConnectionString fileConnectionString;
+		if (connFile && !connFile->fileContentsUpToDate(fileConnectionString)) {
+			request.issues.push_back_deep(request.issues.arena(), LiteralStringRef("incorrect_cluster_file_contents"));
+			std::string connectionString = connFile->getConnectionString().toString();
+			if(!incorrectTime.present()) {
+				incorrectTime = now();
+			}
+			if(connFile->canGetFilename()) {
+				// Don't log a SevWarnAlways initially to account for transient issues (e.g. someone else changing the file right before us)
+				TraceEvent(now() - incorrectTime.get() > 300 ? SevWarnAlways : SevWarn, "IncorrectClusterFileContents")
+				    .detail("Filename", connFile->getFilename())
+				    .detail("ConnectionStringFromFile", fileConnectionString.toString())
+				    .detail("CurrentConnectionString", connectionString);
+			}
+		}
+		else {
+			incorrectTime = Optional<double>();
+		}
+
+		auto peers = FlowTransport::transport().getIncompatiblePeers();
+		for(auto it = peers->begin(); it != peers->end();) {
+			if( now() - it->second.second > FLOW_KNOBS->INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING ) {
+				request.incompatiblePeers.push_back(it->first);
+				it = peers->erase(it);
+			} else {
+				it++;
+			}
+		}
+
 		Future<RegisterWorkerReply> registrationReply = ccInterface->get().present() ? brokenPromiseToNever( ccInterface->get().get().registerWorker.getReply(request) ) : Never();
 		choose {
 			when ( RegisterWorkerReply reply = wait( registrationReply )) {
@@ -464,6 +520,8 @@ ACTOR Future<Void> registrationClient(
 			when ( wait( rkInterf->onChange() ) ) {}
 			when ( wait( scInterf->onChange() ) ) {}
 			when ( wait( degraded->onChange() ) ) {}
+			when ( wait( FlowTransport::transport().onIncompatibleChanged() ) ) {}
+			when ( wait( issues->onChange() ) ) {}
 		}
 	}
 }
@@ -749,7 +807,10 @@ ACTOR Future<Void> workerSnapCreate(WorkerSnapRequest snapReq, StringRef snapFolder) {
 	return Void();
 }
 
-ACTOR Future<Void> monitorTraceLogIssues(Optional<Reference<AsyncVar<std::set<std::string>>>> issues) {
+// TODO: `issues` is right now only updated by `monitorTraceLogIssues` and thus is being `set` on every update.
+// It could be changed to `insert` and `trigger` later if we want to use it as a generic way for the caller of this
+// function to report issues to cluster controller.
+ACTOR Future<Void> monitorTraceLogIssues(Reference<AsyncVar<std::set<std::string>>> issues) {
 	state bool pingTimeout = false;
 	loop {
 		wait(delay(SERVER_KNOBS->TRACE_LOG_FLUSH_FAILURE_CHECK_INTERVAL_SECONDS));
@@ -764,87 +825,14 @@ ACTOR Future<Void> monitorTraceLogIssues(Optional<Reference<AsyncVar<std::set<std::string>>>> issues) {
-			std::set<std::string> _issues;
-			retriveTraceLogIssues(_issues);
-			if (pingTimeout) {
-				// Ping trace log writer thread timeout.
-				_issues.insert("trace_log_writer_thread_unresponsive");
-				pingTimeout = false;
-			}
-			issues.get()->set(_issues);
-		}
-	}
-}
-
-// TODO: `issues` is right now only updated by `monitorTraceLogIssues` and thus is being `set` on every update.
-// It could be changed to `insert` and `trigger` later if we want to use it as a generic way for the caller of this
-// function to report issues to cluster controller.
-ACTOR Future<Void> monitorServerDBInfo(Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> ccInterface,
-                                       Reference<ClusterConnectionFile> connFile, LocalityData locality,
-                                       Reference<AsyncVar<ServerDBInfo>> dbInfo,
-                                       Optional<Reference<AsyncVar<std::set<std::string>>>> issues) {
-	// Initially most of the serverDBInfo is not known, but we know our locality right away
-	ServerDBInfo localInfo;
-	localInfo.myLocality = locality;
-	dbInfo->set(localInfo);
-
-	state Optional<double> incorrectTime;
-	loop {
-		GetServerDBInfoRequest req;
-		req.knownServerInfoID = dbInfo->get().id;
-
-		if (issues.present()) {
-			for (auto const& i : issues.get()->get()) {
-				req.issues.push_back_deep(req.issues.arena(), i);
-			}
-		}
-
-		ClusterConnectionString fileConnectionString;
-		if (connFile && !connFile->fileContentsUpToDate(fileConnectionString)) {
-			req.issues.push_back_deep(req.issues.arena(), LiteralStringRef("incorrect_cluster_file_contents"));
-			std::string connectionString = connFile->getConnectionString().toString();
-			if(!incorrectTime.present()) {
-				incorrectTime = now();
-			}
-			if(connFile->canGetFilename()) {
-				// Don't log a SevWarnAlways initially to account for transient issues (e.g. someone else changing the file right before us)
-				TraceEvent(now() - incorrectTime.get() > 300 ? SevWarnAlways : SevWarn, "IncorrectClusterFileContents")
-				    .detail("Filename", connFile->getFilename())
-				    .detail("ConnectionStringFromFile", fileConnectionString.toString())
-				    .detail("CurrentConnectionString", connectionString);
-			}
-		}
-		else {
-			incorrectTime = Optional<double>();
-		}
-
-		auto peers = FlowTransport::transport().getIncompatiblePeers();
-		for(auto it = peers->begin(); it != peers->end();) {
-			if( now() - it->second.second > SERVER_KNOBS->INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING ) {
-				req.incompatiblePeers.push_back(it->first);
-				it = peers->erase(it);
-			} else {
-				it++;
-			}
-		}
-
-		choose {
-			when( CachedSerialization<ServerDBInfo> ni = wait( ccInterface->get().present() ? brokenPromiseToNever( ccInterface->get().get().getServerDBInfo.getReply( req ) ) : Never() ) ) {
-				ServerDBInfo localInfo = ni.read();
-				TraceEvent("GotServerDBInfoChange").detail("ChangeID", localInfo.id).detail("MasterID", localInfo.master.id())
-				.detail("RatekeeperID", localInfo.ratekeeper.present() ? localInfo.ratekeeper.get().id() : UID())
-				.detail("DataDistributorID", localInfo.distributor.present() ? localInfo.distributor.get().id() : UID());
-
-				localInfo.myLocality = locality;
-				dbInfo->set(localInfo);
-			}
-			when( wait( ccInterface->onChange() ) ) {
-				if(ccInterface->get().present())
-					TraceEvent("GotCCInterfaceChange").detail("CCID", ccInterface->get().get().id()).detail("CCMachine", ccInterface->get().get().getWorkers.getEndpoint().getPrimaryAddress());
-			}
-			when(wait(issues.present() ? issues.get()->onChange() : Never())) {}
+			std::set<std::string> _issues;
+			retriveTraceLogIssues(_issues);
+			if (pingTimeout) {
+				// Ping trace log writer thread timeout.
+				_issues.insert("trace_log_writer_thread_unresponsive");
+				pingTimeout = false;
 			}
+			issues->set(_issues);
 		}
 	}
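As the TODO above notes, `issues` is a whole-value publish: `monitorTraceLogIssues` replaces the entire set, and `registrationClient` wakes on `onChange()` to re-register with the new snapshot. A rough standalone analogue of that `AsyncVar`-style set/onChange contract, using a mutex and condition variable in place of flow's single-threaded actor machinery (illustration only):

```cpp
#include <condition_variable>
#include <cstdint>
#include <cstdio>
#include <mutex>
#include <set>
#include <string>
#include <thread>

// Minimal stand-in for flow's AsyncVar<std::set<std::string>>: the producer
// replaces the whole snapshot ("set"), consumers wait for a version bump
// ("onChange") and then read the latest snapshot.
class IssuesVar {
	std::mutex m;
	std::condition_variable cv;
	std::set<std::string> value;
	uint64_t version = 0;
public:
	void set(std::set<std::string> v) {
		{ std::lock_guard<std::mutex> g(m); value = std::move(v); ++version; }
		cv.notify_all();
	}
	// Blocks until the version moves past `lastSeen`; returns the new snapshot.
	std::set<std::string> waitForChange(uint64_t& lastSeen) {
		std::unique_lock<std::mutex> g(m);
		cv.wait(g, [&]{ return version != lastSeen; });
		lastSeen = version;
		return value;
	}
};

int main() {
	IssuesVar issues;
	uint64_t seen = 0;
	std::thread producer([&]{ issues.set({"trace_log_writer_thread_unresponsive"}); });
	auto snapshot = issues.waitForChange(seen);   // consumer picks up the snapshot
	for (auto& s : snapshot) printf("issue: %s\n", s.c_str());
	producer.join();
}
```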
@@ -934,8 +922,7 @@ ACTOR Future<Void> workerServer(
 		errorForwarders.add( resetAfter(degraded, SERVER_KNOBS->DEGRADED_RESET_INTERVAL, false, SERVER_KNOBS->DEGRADED_WARNING_LIMIT, SERVER_KNOBS->DEGRADED_WARNING_RESET_DELAY, "DegradedReset"));
 		errorForwarders.add( loadedPonger( interf.debugPing.getFuture() ) );
 		errorForwarders.add( waitFailureServer( interf.waitFailure.getFuture() ) );
-		errorForwarders.add(monitorTraceLogIssues(issues));
-		errorForwarders.add(monitorServerDBInfo(ccInterface, connFile, locality, dbInfo, issues));
+		errorForwarders.add( monitorTraceLogIssues(issues) );
 		errorForwarders.add( testerServerCore( interf.testerInterface, connFile, dbInfo, locality ) );
 		errorForwarders.add(monitorHighMemory(memoryProfileThreshold));
 
@@ -958,6 +945,7 @@ ACTOR Future<Void> workerServer(
 			DUMPTOKEN(recruited.setMetricsRate);
 			DUMPTOKEN(recruited.eventLogRequest);
 			DUMPTOKEN(recruited.traceBatchDumpRequest);
+			DUMPTOKEN(recruited.updateServerDBInfo);
 		}
 
 		state std::vector<Future<Void>> recoveries;
@@ -1051,12 +1039,34 @@ ACTOR Future<Void> workerServer(
 		wait(waitForAll(recoveries));
 		recoveredDiskFiles.send(Void());
 
-		errorForwarders.add( registrationClient( ccInterface, interf, asyncPriorityInfo, initialClass, ddInterf, rkInterf, degraded, errors, locality, dbInfo ) );
+		errorForwarders.add( registrationClient( ccInterface, interf, asyncPriorityInfo, initialClass, ddInterf, rkInterf, degraded, errors, locality, dbInfo, connFile, issues) );
 
 		TraceEvent("RecoveriesComplete", interf.id());
 
 		loop choose {
+			when( UpdateServerDBInfoRequest req = waitNext( interf.updateServerDBInfo.getFuture() ) ) {
+				ServerDBInfo localInfo = BinaryReader::fromStringRef<ServerDBInfo>(req.serializedDbInfo, AssumeVersion(currentProtocolVersion));
+				localInfo.myLocality = locality;
+				if(localInfo.infoGeneration < dbInfo->get().infoGeneration && localInfo.clusterInterface == dbInfo->get().clusterInterface) {
+					std::vector<Endpoint> rep = req.broadcastInfo;
+					rep.push_back(interf.updateServerDBInfo.getEndpoint());
+					req.reply.send(rep);
+				} else {
+					Optional<Endpoint> notUpdated;
+					if(!ccInterface->get().present() || localInfo.clusterInterface != ccInterface->get().get()) {
+						notUpdated = interf.updateServerDBInfo.getEndpoint();
+					}
+					else if(localInfo.infoGeneration > dbInfo->get().infoGeneration || dbInfo->get().clusterInterface != ccInterface->get().get()) {
+						TraceEvent("GotServerDBInfoChange").detail("ChangeID", localInfo.id).detail("MasterID", localInfo.master.id())
+						.detail("RatekeeperID", localInfo.ratekeeper.present() ? localInfo.ratekeeper.get().id() : UID())
+						.detail("DataDistributorID", localInfo.distributor.present() ? localInfo.distributor.get().id() : UID());
+						dbInfo->set(localInfo);
+					}
+					errorForwarders.add(success(broadcastDBInfoRequest(req, SERVER_KNOBS->DBINFO_SEND_AMOUNT, notUpdated, true)));
+				}
+			}
 			when( RebootRequest req = waitNext( interf.clientInterface.reboot.getFuture() ) ) {
 				state RebootRequest rebootReq = req;
 				// If suspendDuration is INT_MAX, the trace will not be logged if it was inside the next block
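The generation check in the new `when` clause is the heart of the broadcast's idempotence: a worker treats incoming `ServerDBInfo` as stale only when it is older *and* from the same cluster controller it already follows; anything else is either applied (if strictly newer) or merely forwarded down the tree. A compressed sketch of just that predicate, with simplified field types (illustration only):

```cpp
#include <cstdint>
#include <cstdio>

// Simplified stand-ins for the fields compared in the updateServerDBInfo
// handler above: accept only strictly newer info from the controller we follow.
struct Info { uint64_t infoGeneration; int clusterInterface; };

// Stale requests are bounced straight back to the broadcaster as "not updated".
bool isStale(const Info& incoming, const Info& current) {
	return incoming.infoGeneration < current.infoGeneration &&
	       incoming.clusterInterface == current.clusterInterface;
}

int main() {
	Info current{5, 1};
	printf("%d\n", isStale({4, 1}, current)); // older, same CC -> stale (1)
	printf("%d\n", isStale({4, 2}, current)); // older but different CC -> not stale (0)
	printf("%d\n", isStale({6, 1}, current)); // newer -> not stale (0)
}
```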
diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp
index 434f24a39f..3916a2f9c8 100644
--- a/fdbserver/workloads/ConsistencyCheck.actor.cpp
+++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp
@@ -1141,12 +1141,12 @@ struct ConsistencyCheckWorkload : TestWorkload
 			std::set<Optional<Key>> missingStorage;
 
 			for( int i = 0; i < workers.size(); i++ ) {
-				NetworkAddress addr = workers[i].interf.tLog.getEndpoint().addresses.getTLSAddress();
-				if( !configuration.isExcludedServer(addr) &&
+				NetworkAddress addr = workers[i].interf.stableAddress();
+				if( !configuration.isExcludedServer(workers[i].interf.addresses()) &&
 				    ( workers[i].processClass == ProcessClass::StorageClass || workers[i].processClass == ProcessClass::UnsetClass ) ) {
 					bool found = false;
 					for( int j = 0; j < storageServers.size(); j++ ) {
-						if( storageServers[j].getValue.getEndpoint().addresses.getTLSAddress() == addr ) {
+						if( storageServers[j].stableAddress() == addr ) {
 							found = true;
 							break;
 						}
diff --git a/fdbserver/workloads/FuzzApiCorrectness.actor.cpp b/fdbserver/workloads/FuzzApiCorrectness.actor.cpp
index 7a979352a0..0f9b67d8ca 100644
--- a/fdbserver/workloads/FuzzApiCorrectness.actor.cpp
+++ b/fdbserver/workloads/FuzzApiCorrectness.actor.cpp
@@ -595,7 +595,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
 		TestGet(unsigned int id, FuzzApiCorrectnessWorkload *workload) : BaseTest(id, workload, "TestGet") {
 			key = makeKey();
 			contract = {
-				std::make_pair( error_code_key_outside_legal_range, ExceptionContract::requiredIf((key >= (workload->useSystemKeys ? systemKeys.end : normalKeys.end))) ),
+				std::make_pair( error_code_key_outside_legal_range, ExceptionContract::requiredIf((key >= (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) && !specialKeys.contains(key)) ),
 				std::make_pair( error_code_client_invalid_operation, ExceptionContract::Possible ),
 				std::make_pair( error_code_accessed_unreadable, ExceptionContract::Possible )
 			};
@@ -652,12 +652,15 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
 				limit = deterministicRandom()->randomInt(0, INT_MAX)+1;
 			}
 
+			bool isSpecialKeyRange = specialKeys.contains(keysel1.getKey()) && specialKeys.contains(keysel2.getKey());
+
 			contract = {
 				std::make_pair( error_code_range_limits_invalid, ExceptionContract::possibleButRequiredIf(limit < 0) ),
 				std::make_pair( error_code_client_invalid_operation, ExceptionContract::Possible ),
 				std::make_pair( error_code_key_outside_legal_range, ExceptionContract::requiredIf(
-						(keysel1.getKey() > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) ||
-						(keysel2.getKey() > (workload->useSystemKeys ? systemKeys.end : normalKeys.end))) ),
+						((keysel1.getKey() > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) ||
+						(keysel2.getKey() > (workload->useSystemKeys ? systemKeys.end : normalKeys.end))) &&
+						!isSpecialKeyRange) ),
 				std::make_pair( error_code_accessed_unreadable, ExceptionContract::Possible )
 			};
 		}
@@ -681,12 +684,16 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
 			keysel1 = makeKeySel();
 			keysel2 = makeKeySel();
 			limits = makeRangeLimits();
+
+			bool isSpecialKeyRange = specialKeys.contains(keysel1.getKey()) && specialKeys.contains(keysel2.getKey());
+
 			contract = {
 				std::make_pair( error_code_range_limits_invalid, ExceptionContract::possibleButRequiredIf( !limits.isReached() && !limits.isValid()) ),
 				std::make_pair( error_code_client_invalid_operation, ExceptionContract::Possible ),
 				std::make_pair( error_code_key_outside_legal_range, ExceptionContract::requiredIf(
-						(keysel1.getKey() > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) ||
-						(keysel2.getKey() > (workload->useSystemKeys ? systemKeys.end : normalKeys.end))) ),
+						((keysel1.getKey() > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) ||
+						(keysel2.getKey() > (workload->useSystemKeys ? systemKeys.end : normalKeys.end))) &&
+						!isSpecialKeyRange) ),
 				std::make_pair( error_code_accessed_unreadable, ExceptionContract::Possible )
 			};
 		}
@@ -721,13 +728,17 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
 				else
 					limit = deterministicRandom()->randomInt(0, INT_MAX)+1;
 			}
+
+			bool isSpecialKeyRange = specialKeys.contains(key1) && specialKeys.contains(key2);
+
 			contract = {
 				std::make_pair( error_code_inverted_range, ExceptionContract::requiredIf(key1 > key2) ),
 				std::make_pair( error_code_range_limits_invalid, ExceptionContract::possibleButRequiredIf(limit < 0) ),
 				std::make_pair( error_code_client_invalid_operation, ExceptionContract::Possible ),
 				std::make_pair( error_code_key_outside_legal_range, ExceptionContract::requiredIf(
-						(key1 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) ||
-						(key2 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end))) ),
+						((key1 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) ||
+						(key2 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)))
+						&& !isSpecialKeyRange) ),
 				std::make_pair( error_code_accessed_unreadable, ExceptionContract::Possible )
 			};
 		}
@@ -752,13 +763,17 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
 			key1 = makeKey();
 			key2 = makeKey();
 			limits = makeRangeLimits();
+
+			bool isSpecialKeyRange = specialKeys.contains(key1) && specialKeys.contains(key2);
+
 			contract = {
 				std::make_pair( error_code_inverted_range, ExceptionContract::requiredIf(key1 > key2) ),
 				std::make_pair( error_code_range_limits_invalid, ExceptionContract::possibleButRequiredIf( !limits.isReached() && !limits.isValid()) ),
 				std::make_pair( error_code_client_invalid_operation, ExceptionContract::Possible ),
 				std::make_pair( error_code_key_outside_legal_range, ExceptionContract::requiredIf(
-						(key1 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) ||
-						(key2 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end))) ),
+						((key1 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) ||
+						(key2 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end))) &&
+						!isSpecialKeyRange) ),
 				std::make_pair( error_code_accessed_unreadable, ExceptionContract::Possible )
 			};
 		}
diff --git a/fdbserver/workloads/KillRegion.actor.cpp b/fdbserver/workloads/KillRegion.actor.cpp
index 8d864a8957..50d01e2fa4 100644
--- a/fdbserver/workloads/KillRegion.actor.cpp
+++ b/fdbserver/workloads/KillRegion.actor.cpp
@@ -66,6 +66,13 @@ struct KillRegionWorkload : TestWorkload {
 		return Void();
 	}
 
+	ACTOR static Future<Void> waitForStorageRecovered( KillRegionWorkload *self ) {
+		while( self->dbInfo->get().recoveryState < RecoveryState::STORAGE_RECOVERED ) {
+			wait( self->dbInfo->onChange() );
+		}
+		return Void();
+	}
+
 	ACTOR static Future<Void> killRegion( KillRegionWorkload *self, Database cx ) {
 		ASSERT( g_network->isSimulated() );
 		if(deterministicRandom()->random01() < 0.5) {
@@ -94,10 +101,13 @@ struct KillRegionWorkload : TestWorkload {
 		TraceEvent("ForceRecovery_GotConfig").detail("Conf", conf.toString());
 
 		if(conf.usableRegions>1) {
-			//only needed if force recovery was unnecessary and we killed the secondary
-			wait( success( changeConfig( cx, g_simulator.disablePrimary + " repopulate_anti_quorum=1", true ) ) );
-			while( self->dbInfo->get().recoveryState < RecoveryState::STORAGE_RECOVERED ) {
-				wait( self->dbInfo->onChange() );
+			loop {
+				//only needed if force recovery was unnecessary and we killed the secondary
+				wait( success( changeConfig( cx, g_simulator.disablePrimary + " repopulate_anti_quorum=1", true ) ) );
+				choose {
+					when( wait( waitForStorageRecovered(self) ) ) { break; }
+					when( wait( delay(300.0) ) ) { }
+				}
 			}
 			wait( success( changeConfig( cx, "usable_regions=1", true ) ) );
 		}
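The new loop guards against a recovery that never arrives: re-issue `changeConfig` whenever `waitForStorageRecovered` hasn't fired within 300 seconds. The same wait-or-timeout-and-retry shape in portable C++, with a condition variable standing in for `dbInfo->onChange()` (illustration only):

```cpp
#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>

// The shape of the KillRegion loop: issue an action, then wait for the
// "recovered" signal, but give up and re-issue the action after each timeout.
int main() {
	std::mutex m;
	std::condition_variable cv;
	bool recovered = false;

	// Simulated recovery that completes after a while.
	std::thread recovery([&]{
		std::this_thread::sleep_for(std::chrono::milliseconds(250));
		{ std::lock_guard<std::mutex> g(m); recovered = true; }
		cv.notify_all();
	});

	int attempts = 0;
	std::unique_lock<std::mutex> g(m);
	while (!recovered) {
		printf("issuing recovery request (attempt %d)\n", ++attempts);
		// wait_for returns when signalled or when the timeout elapses --
		// the analogue of choose { waitForStorageRecovered | delay(300.0) }.
		cv.wait_for(g, std::chrono::milliseconds(100), [&]{ return recovered; });
	}
	recovery.join();
	printf("storage recovered after %d attempt(s)\n", attempts);
}
```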
diff --git a/fdbserver/workloads/MachineAttrition.actor.cpp b/fdbserver/workloads/MachineAttrition.actor.cpp
index 4ad7b4de84..5a666bf887 100644
--- a/fdbserver/workloads/MachineAttrition.actor.cpp
+++ b/fdbserver/workloads/MachineAttrition.actor.cpp
@@ -20,7 +20,6 @@
 #include "fdbclient/NativeAPI.actor.h"
 #include "fdbclient/CoordinationInterface.h"
-#include "fdbserver/ClusterRecruitmentInterface.h"
 #include "fdbserver/TesterInterface.actor.h"
 #include "fdbserver/WorkerInterface.actor.h"
 #include "fdbserver/workloads/workloads.actor.h"
diff --git a/fdbserver/workloads/Performance.actor.cpp b/fdbserver/workloads/Performance.actor.cpp
index 16c6b00528..ffb4f90e07 100644
--- a/fdbserver/workloads/Performance.actor.cpp
+++ b/fdbserver/workloads/Performance.actor.cpp
@@ -22,7 +22,6 @@
 #include "fdbserver/TesterInterface.actor.h"
 #include "fdbserver/workloads/workloads.actor.h"
 #include "fdbserver/QuietDatabase.h"
-#include "fdbserver/ClusterRecruitmentInterface.h"
 #include "flow/actorcompiler.h"  // This must be the last #include.
 
 struct PerformanceWorkload : TestWorkload {
diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp
index cd7cc918c3..24cbc203e5 100644
--- a/fdbserver/workloads/ReadWrite.actor.cpp
+++ b/fdbserver/workloads/ReadWrite.actor.cpp
@@ -28,7 +28,6 @@
 #include "fdbserver/WorkerInterface.actor.h"
 #include "fdbserver/workloads/workloads.actor.h"
 #include "fdbserver/workloads/BulkSetup.actor.h"
-#include "fdbserver/ClusterRecruitmentInterface.h"
 #include "fdbclient/ReadYourWrites.h"
 #include "flow/TDMetric.actor.h"
 #include "flow/actorcompiler.h"  // This must be the last #include.
diff --git a/fdbserver/workloads/SnapTest.actor.cpp b/fdbserver/workloads/SnapTest.actor.cpp
index 78cd7580ae..85c5fbbd09 100644
--- a/fdbserver/workloads/SnapTest.actor.cpp
+++ b/fdbserver/workloads/SnapTest.actor.cpp
@@ -4,7 +4,6 @@
 #include "fdbclient/ReadYourWrites.h"
 #include "fdbrpc/ContinuousSample.h"
 #include "fdbmonitor/SimpleIni.h"
-#include "fdbserver/ClusterRecruitmentInterface.h"
 #include "fdbserver/Status.h"
 #include "fdbserver/TesterInterface.actor.h"
 #include "fdbserver/WorkerInterface.actor.h"
diff --git a/flow/Arena.h b/flow/Arena.h
index 74a29c8b82..cfc756506d 100644
--- a/flow/Arena.h
+++ b/flow/Arena.h
@@ -576,6 +576,12 @@ public:
 		return eatAny(StringRef((const uint8_t *)sep, strlen(sep)), foundSeparator);
 	}
 
+	// Copies string contents to dst and returns a pointer to the next byte after
+	uint8_t * copyTo(uint8_t *dst) const {
+		memcpy(dst, data, length);
+		return dst + length;
+	}
+
 private:
 	// Unimplemented; blocks conversion through std::string
 	StringRef( char* );
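`StringRef::copyTo` returns the advanced write cursor, so consecutive calls pack several strings into one pre-sized buffer without manual offset bookkeeping. The same idiom sketched with `std::string_view` in place of `StringRef` (illustration only):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <string_view>

// std::string_view stand-in for StringRef::copyTo: copy the bytes and return
// the position just past them, so calls can be chained to pack a buffer.
uint8_t* copyTo(std::string_view s, uint8_t* dst) {
	memcpy(dst, s.data(), s.size());
	return dst + s.size();
}

int main() {
	uint8_t buf[16];
	uint8_t* p = buf;
	p = copyTo("hello ", p);
	p = copyTo("world", p);  // p now points just past "hello world"
	printf("%.*s\n", int(p - buf), reinterpret_cast<const char*>(buf));
}
```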
diff --git a/flow/CompressedInt.h b/flow/CompressedInt.h
index e473f0bf80..096ef16d74 100644
--- a/flow/CompressedInt.h
+++ b/flow/CompressedInt.h
@@ -19,6 +19,7 @@
  */
 
 #pragma once
+#include
 
 // A signed compressed integer format that retains ordering in compressed form.
 // Format is: [~sign_bit] [unary_len] [value_bits]
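To make the ordering claim in that comment concrete: an order-preserving encoding is one where plain bytewise comparison of the encoded forms agrees with numeric comparison of the values. The sketch below demonstrates the property with a deliberately simplified scheme in the same spirit — an explicit length byte playing the role of `unary_len`; it is *not* FoundationDB's actual CompressedInt wire format, and handles non-negative values only:

```cpp
#include <cstdint>
#include <cstdio>
#include <string>

// Simplified order-preserving encoding: one length byte (so longer encodings
// of bigger numbers sort after shorter ones), then the value's big-endian
// bytes with leading zeros stripped.
std::string encodeOrdered(uint64_t v) {
	uint8_t bytes[8];
	int len = 0;
	for (int i = 7; i >= 0; i--) {
		uint8_t b = (v >> (8 * i)) & 0xff;
		if (len || b || i == 0) bytes[len++] = b;
	}
	std::string out(1, char(len));  // length first, like the unary_len field
	out.append((const char*)bytes, len);
	return out;
}

int main() {
	// Bytewise comparison of encodings matches numeric comparison.
	printf("%d\n", encodeOrdered(300) < encodeOrdered(70000)); // 1
	printf("%d\n", encodeOrdered(9)   < encodeOrdered(10));    // 1
	printf("%d\n", encodeOrdered(255) < encodeOrdered(256));   // 1
}
```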
diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp
index fcd88f8c6d..27ab62fb1c 100644
--- a/flow/Knobs.cpp
+++ b/flow/Knobs.cpp
@@ -80,6 +80,7 @@ void FlowKnobs::initialize(bool randomize, bool isSimulated) {
 	init( TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY,    5.0 );
 	init( TOO_MANY_CONNECTIONS_CLOSED_TIMEOUT,       20.0 );
 	init( PEER_UNAVAILABLE_FOR_LONG_TIME_TIMEOUT,  3600.0 );
+	init( INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING,     5.0 );
 
 	init( TLS_CERT_REFRESH_DELAY_SECONDS,        12*60*60 );
 	init( TLS_SERVER_CONNECTION_THROTTLE_TIMEOUT,     9.0 );
@@ -206,6 +207,12 @@ void FlowKnobs::initialize(bool randomize, bool isSimulated) {
 	init( FUTURE_VERSION_BACKOFF_GROWTH,              2.0 );
 	init( LOAD_BALANCE_MAX_BAD_OPTIONS,                 1 ); //should be the same as MAX_MACHINES_FALLING_BEHIND
 	init( LOAD_BALANCE_PENALTY_IS_BAD,               true );
+
+	// Health Monitor
+	init( FAILURE_DETECTION_DELAY,                    4.0 ); if( randomize && BUGGIFY ) FAILURE_DETECTION_DELAY = 1.0;
+	init( HEALTH_MONITOR_MARK_FAILED_UNSTABLE_CONNECTIONS, true );
+	init( HEALTH_MONITOR_CLIENT_REQUEST_INTERVAL_SECS,  30 );
+	init( HEALTH_MONITOR_CONNECTION_MAX_CLOSED,          5 );
 }
 // clang-format on
diff --git a/flow/Knobs.h b/flow/Knobs.h
index 7e80e0233c..2482dad7a4 100644
--- a/flow/Knobs.h
+++ b/flow/Knobs.h
@@ -94,6 +94,7 @@ public:
 	double RECONNECTION_TIME_GROWTH_RATE;
 	double RECONNECTION_RESET_TIME;
 	int ACCEPT_BATCH_SIZE;
+	double INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING;
 
 	int TLS_CERT_REFRESH_DELAY_SECONDS;
 	double TLS_SERVER_CONNECTION_THROTTLE_TIMEOUT;
@@ -226,6 +227,12 @@ public:
 	int LOAD_BALANCE_MAX_BAD_OPTIONS;
 	bool LOAD_BALANCE_PENALTY_IS_BAD;
 
+	// Health Monitor
+	int FAILURE_DETECTION_DELAY;
+	bool HEALTH_MONITOR_MARK_FAILED_UNSTABLE_CONNECTIONS;
+	int HEALTH_MONITOR_CLIENT_REQUEST_INTERVAL_SECS;
+	int HEALTH_MONITOR_CONNECTION_MAX_CLOSED;
+
 	FlowKnobs();
 	void initialize(bool randomize = false, bool isSimulated = false);
 };
diff --git a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp
index 2570b2c434..a793eb6ea6 100644
--- a/flow/Net2.actor.cpp
+++ b/flow/Net2.actor.cpp
@@ -23,12 +23,13 @@
 #define BOOST_SYSTEM_NO_LIB
 #define BOOST_DATE_TIME_NO_LIB
 #define BOOST_REGEX_NO_LIB
-#include "boost/asio.hpp"
-#include "boost/bind.hpp"
-#include "boost/date_time/posix_time/posix_time_types.hpp"
+#include <boost/asio.hpp>
+#include <boost/bind.hpp>
+#include <boost/date_time/posix_time/posix_time_types.hpp>
+#include <boost/range.hpp>
+#include <boost/algorithm/string/join.hpp>
 #include "flow/network.h"
 #include "flow/IThreadPool.h"
-#include "boost/range.hpp"
 
 #include "flow/ActorCollection.h"
 #include "flow/ThreadSafeQueue.h"
@@ -142,9 +143,14 @@ public:
 		if ( thread_network == this )
 			stopImmediately();
 		else
-			// SOMEDAY: NULL for deferred error, no analysis of correctness (itp)
 			onMainThreadVoid( [this] { this->stopImmediately(); }, NULL );
 	}
+	virtual void addStopCallback( std::function<void()> fn ) {
+		if ( thread_network == this )
+			stopCallbacks.emplace_back(std::move(fn));
+		else
+			onMainThreadVoid( [this, fn] { this->stopCallbacks.emplace_back(std::move(fn)); }, nullptr );
+	}
 
 	virtual bool isSimulated() const { return false; }
 	virtual THREAD_HANDLE startThread( THREAD_FUNC_RETURN (*func) (void*), void *arg);
@@ -232,6 +238,7 @@ public:
 	EventMetricHandle<SlowTask> slowTaskMetric;
 
 	std::vector<std::string> blobCredentialFiles;
+	std::vector<std::function<void()>> stopCallbacks;
 };
 
 static boost::asio::ip::address tcpAddress(IPAddress const& n) {
@@ -261,11 +268,19 @@ public:
 		try {
 			if (error) {
 				// Log the error...
-				TraceEvent(SevWarn, errContext, errID).suppressFor(1.0).detail("ErrorCode", error.value()).detail("Message", error.message())
+				{
+					TraceEvent evt(SevWarn, errContext, errID);
+					evt.suppressFor(1.0).detail("ErrorCode", error.value()).detail("Message", error.message());
 #ifndef TLS_DISABLED
-					.detail("WhichMeans", TLSPolicy::ErrorString(error))
+					// There is no function in OpenSSL to use to check if an error code is from OpenSSL,
+					// but all OpenSSL errors have a non-zero "library" code set in bits 24-32, and linux
+					// error codes should never go that high.
+					if (error.value() >= (1 << 24L)) {
+						evt.detail("WhichMeans", TLSPolicy::ErrorString(error));
+					}
 #endif
-					;
+				}
+
 				p.sendError( connection_failed() );
 			} else
 				p.send( Void() );
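The bits-24-32 test above works because OpenSSL packs a non-zero library identifier into the top byte of its packed error codes, while errno values stay small. The predicate in isolation (the sample packed code is a typical OpenSSL-style value, shown for illustration):

```cpp
#include <cstdio>

// Same predicate as the Net2 change: errno values are small, while OpenSSL
// error codes carry a non-zero "library" field in the top byte (bits 24-32).
bool looksLikeOpenSSLError(long code) {
	return code >= (1L << 24);
}

int main() {
	printf("%d\n", looksLikeOpenSSLError(111));         // ECONNREFUSED on Linux -> 0
	printf("%d\n", looksLikeOpenSSLError(0x14090086L)); // OpenSSL-style packed code -> 1
}
```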
TraceEvent(SevWarnAlways, "TLSCertificateRefreshStatError").detail("File", filename); + statError = true; } else { throw; } } + wait(delay(FLOW_KNOBS->TLS_CERT_REFRESH_DELAY_SECONDS)); } } @@ -964,16 +987,22 @@ void Net2::initTLS() { return; } #ifndef TLS_DISABLED + auto onPolicyFailure = [this]() { this->countTLSPolicyFailures++; }; try { boost::asio::ssl::context newContext(boost::asio::ssl::context::tls); - auto onPolicyFailure = [this]() { this->countTLSPolicyFailures++; }; + const LoadedTLSConfig& loaded = tlsConfig.loadSync(); + TraceEvent("Net2TLSConfig") + .detail("CAPath", tlsConfig.getCAPathSync()) + .detail("CertificatePath", tlsConfig.getCertificatePathSync()) + .detail("KeyPath", tlsConfig.getKeyPathSync()) + .detail("HasPassword", !loaded.getPassword().empty()) + .detail("VerifyPeers", boost::algorithm::join(loaded.getVerifyPeers(), "|")); ConfigureSSLContext( tlsConfig.loadSync(), &newContext, onPolicyFailure ); sslContextVar.set(ReferencedObject::from(std::move(newContext))); - backgroundCertRefresh = reloadCertificatesOnChange( tlsConfig, onPolicyFailure, &sslContextVar ); } catch (Error& e) { TraceEvent("Net2TLSInitError").error(e); - throw tls_error(); } + backgroundCertRefresh = reloadCertificatesOnChange( tlsConfig, onPolicyFailure, &sslContextVar ); #endif tlsInitialized = true; } @@ -1199,6 +1228,10 @@ void Net2::run() { TraceEvent("SomewhatSlowRunLoopBottom").detail("Elapsed", nnow - now); // This includes the time spent running tasks } + for ( auto& fn : stopCallbacks ) { + fn(); + } + #ifdef WIN32 timeEndPeriod(1); #endif diff --git a/flow/Platform.cpp b/flow/Platform.cpp index be0111b6dc..4ba02770a9 100644 --- a/flow/Platform.cpp +++ b/flow/Platform.cpp @@ -2836,7 +2836,25 @@ void crashHandler(int sig) { fprintf(stderr, "SIGNAL: %s (%d)\n", strsignal(sig), sig); fprintf(stderr, "Trace: %s\n", backtrace.c_str()); - _exit(128 + sig); + struct sigaction sa; + sa.sa_handler = SIG_DFL; + if (sigemptyset(&sa.sa_mask)) { + int err = errno; + fprintf(stderr, "sigemptyset failed: %s\n", strerror(err)); + _exit(sig + 128); + } + sa.sa_flags = 0; + if (sigaction(sig, &sa, NULL)) { + int err = errno; + fprintf(stderr, "sigaction failed: %s\n", strerror(err)); + _exit(sig + 128); + } + if (kill(getpid(), sig)) { + int err = errno; + fprintf(stderr, "kill failed: %s\n", strerror(err)); + _exit(sig + 128); + } + // Rely on kill to end the process #else // No crash handler for other platforms! #endif diff --git a/flow/TLSConfig.actor.cpp b/flow/TLSConfig.actor.cpp index f432229ec9..73a336e38a 100644 --- a/flow/TLSConfig.actor.cpp +++ b/flow/TLSConfig.actor.cpp @@ -25,6 +25,32 @@ // To force typeinfo to only be emitted once. TLSPolicy::~TLSPolicy() {} +namespace TLS { + +void DisableOpenSSLAtExitHandler() { +#ifdef TLS_DISABLED + return; +#else + static bool once = false; + if (!once) { + once = true; + int success = OPENSSL_init_crypto(OPENSSL_INIT_NO_ATEXIT, nullptr); + if (!success) { + throw tls_error(); + } + } +#endif +} + +void DestroyOpenSSLGlobalState() { +#ifdef TLS_DISABLED + return; +#else + OPENSSL_cleanup(); +#endif +} + +} // namespace TLS #ifdef TLS_DISABLED void LoadedTLSConfig::print(FILE *fp) { diff --git a/flow/TLSConfig.actor.h b/flow/TLSConfig.actor.h index 820c90d5c9..aa07e27fde 100644 --- a/flow/TLSConfig.actor.h +++ b/flow/TLSConfig.actor.h @@ -36,6 +36,22 @@ #include "flow/Knobs.h" #include "flow/flow.h" +namespace TLS { + +// Force OpenSSL to not register an atexit handler to clean up global state before process exit. 
diff --git a/flow/TLSConfig.actor.cpp b/flow/TLSConfig.actor.cpp
index f432229ec9..73a336e38a 100644
--- a/flow/TLSConfig.actor.cpp
+++ b/flow/TLSConfig.actor.cpp
@@ -25,6 +25,32 @@
 // To force typeinfo to only be emitted once.
 TLSPolicy::~TLSPolicy() {}
 
+namespace TLS {
+
+void DisableOpenSSLAtExitHandler() {
+#ifdef TLS_DISABLED
+	return;
+#else
+	static bool once = false;
+	if (!once) {
+		once = true;
+		int success = OPENSSL_init_crypto(OPENSSL_INIT_NO_ATEXIT, nullptr);
+		if (!success) {
+			throw tls_error();
+		}
+	}
+#endif
+}
+
+void DestroyOpenSSLGlobalState() {
+#ifdef TLS_DISABLED
+	return;
+#else
+	OPENSSL_cleanup();
+#endif
+}
+
+} // namespace TLS
+
 #ifdef TLS_DISABLED
 
 void LoadedTLSConfig::print(FILE *fp) {
diff --git a/flow/TLSConfig.actor.h b/flow/TLSConfig.actor.h
index 820c90d5c9..aa07e27fde 100644
--- a/flow/TLSConfig.actor.h
+++ b/flow/TLSConfig.actor.h
@@ -36,6 +36,22 @@
 #include "flow/Knobs.h"
 #include "flow/flow.h"
 
+namespace TLS {
+
+// Force OpenSSL to not register an atexit handler to clean up global state before process exit.
+// If you call this, you must also call DestroyOpenSSLGlobalState() before the program exits.
+// Calls OPENSSL_init_crypto with OPENSSL_INIT_NO_ATEXIT.
+// Must be called before any other OpenSSL function.
+void DisableOpenSSLAtExitHandler();
+
+// Frees all global state maintained by OpenSSL.
+// Calls OPENSSL_cleanup.
+// Must be called before program exit if using DisableOpenSSLAtExitHandler.
+// No OpenSSL code may be run after calling this function.
+void DestroyOpenSSLGlobalState();
+
+} // namespace TLS
+
 #ifndef TLS_DISABLED
 #include
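The header comments spell out a strict pairing contract for the two new `TLS::` entry points. A sketch of a process honoring it, with stubbed stand-ins for the real functions so the example runs on its own (the `printf` bodies are illustrative only):

```cpp
#include <cstdio>

// Sketch only: stand-ins for the real TLS:: functions declared above.
namespace TLS {
void DisableOpenSSLAtExitHandler() { printf("OPENSSL_init_crypto(OPENSSL_INIT_NO_ATEXIT)\n"); }
void DestroyOpenSSLGlobalState() { printf("OPENSSL_cleanup()\n"); }
}

int main() {
	TLS::DisableOpenSSLAtExitHandler(); // must precede any other OpenSSL call
	// ... run the server; all OpenSSL usage happens in between ...
	TLS::DestroyOpenSSLGlobalState();   // last OpenSSL-related call before exit
}
```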
diff --git a/flow/flat_buffers.cpp b/flow/flat_buffers.cpp
index 89fb058f98..8871fa438e 100644
--- a/flow/flat_buffers.cpp
+++ b/flow/flat_buffers.cpp
@@ -31,10 +31,12 @@ namespace detail {
 
 namespace {
 
-std::vector<int> mWriteToOffsetsMemoy;
+thread_local std::vector<int> gWriteToOffsetsMemory;
 
 }
 
-std::vector<int>* writeToOffsetsMemory = &mWriteToOffsetsMemoy;
+void swapWithThreadLocalGlobal(std::vector<int>& writeToOffsets) {
+	gWriteToOffsetsMemory.swap(writeToOffsets);
+}
 
 VTable generate_vtable(size_t numMembers, const std::vector<unsigned>& sizesAlignments) {
 	if (numMembers == 0) {
@@ -488,10 +490,6 @@ TEST_CASE("/flow/FlatBuffers/Standalone") {
 
 // Meant to be run with valgrind or asan, to catch heap buffer overflows
 TEST_CASE("/flow/FlatBuffers/Void") {
 	Standalone<StringRef> msg = ObjectWriter::toValue(Void(), Unversioned());
-	// Manually verified to be a valid flatbuffers message. This is technically brittle since there are other valid
-	// encodings of this message, but our implementation is unlikely to change.
-	ASSERT(msg == LiteralStringRef("\x14\x00\x00\x00J\xad\x1e\x00\x00\x00\x04\x00\x04\x00\x06\x00\x08\x00\x04\x00\x06"
-	                               "\x00\x00\x00\x04\x00\x00\x00\x12\x00\x00\x00"));
 	auto buffer = std::make_unique<uint8_t[]>(msg.size()); // Make a heap allocation of precisely the right size, so
 	                                                       // that asan or valgrind will catch any overflows
 	memcpy(buffer.get(), msg.begin(), msg.size());
diff --git a/flow/flat_buffers.h b/flow/flat_buffers.h
index 5847c40bf2..88f0f6dc3c 100644
--- a/flow/flat_buffers.h
+++ b/flow/flat_buffers.h
@@ -348,15 +348,16 @@ struct _SizeOf {
 	static constexpr unsigned int align = fb_align<T>;
 };
 
-extern std::vector<int>* writeToOffsetsMemory;
+// Re-use this intermediate memory to avoid frequent new/delete
+void swapWithThreadLocalGlobal(std::vector<int>& writeToOffsets);
 
 template <class Context>
 struct PrecomputeSize : Context {
 	PrecomputeSize(const Context& context) : Context(context) {
-		writeToOffsets.swap(*writeToOffsetsMemory);
+		swapWithThreadLocalGlobal(writeToOffsets);
 		writeToOffsets.clear();
 	}
 
-	~PrecomputeSize() { writeToOffsets.swap(*writeToOffsetsMemory); }
+	~PrecomputeSize() { swapWithThreadLocalGlobal(writeToOffsets); }
 
 	// |offset| is measured from the end of the buffer. Precondition: len <=
 	// offset.
 	void write(const void*, int offset, int /*len*/) { current_buffer_size = std::max(current_buffer_size, offset); }
@@ -496,7 +497,7 @@ extern VTable generate_vtable(size_t numMembers, const std::vector<unsigned>& sizesAlignments);
 
 template <unsigned... MembersAndAlignments>
 const VTable* gen_vtable3() {
-	static VTable table =
+	static thread_local VTable table =
 	    generate_vtable(sizeof...(MembersAndAlignments) / 2, std::vector{ MembersAndAlignments... });
 	return &table;
 }
@@ -624,7 +625,7 @@ VTableSet get_vtableset_impl(const Root& root, const Context& context) {
 
 template <class Root, class Context>
 const VTableSet* get_vtableset(const Root& root, const Context& context) {
-	static VTableSet result = get_vtableset_impl(root, context);
+	static thread_local VTableSet result = get_vtableset_impl(root, context);
 	return &result;
 }
diff --git a/flow/genericactors.actor.cpp b/flow/genericactors.actor.cpp
index 64a5d60940..9a70c659d3 100644
--- a/flow/genericactors.actor.cpp
+++ b/flow/genericactors.actor.cpp
@@ -69,6 +69,8 @@ ACTOR Future<Void> timeoutWarningCollector( FutureStream<Void> input, double logDelay, const char* context, UID id ) {
 ACTOR Future<bool> quorumEqualsTrue( std::vector<Future<bool>> futures, int required ) {
 	state std::vector< Future<Void> > true_futures;
 	state std::vector< Future<Void> > false_futures;
+	true_futures.reserve(futures.size());
+	false_futures.reserve(futures.size());
 	for(int i=0; i<futures.size(); i++) {
@@ -82,6 +84,7 @@ ACTOR Future<bool> quorumEqualsTrue( std::vector<Future<bool>> futures, int required ) {
 
 ACTOR Future<bool> shortCircuitAny( std::vector<Future<bool>> f )
 {
 	std::vector<Future<Void>> sc;
+	sc.reserve(f.size());
 	for(Future<bool> fut : f) {
 		sc.push_back(returnIfTrue(fut));
 	}
@@ -96,7 +99,7 @@ ACTOR Future<bool> shortCircuitAny( std::vector<Future<bool>> f )
 		// Handle a possible race condition? If the _last_ term to
 		// be evaluated triggers the waitForAll before bubbling
 		// out of the returnIfTrue quorum
-		for ( auto fut : f ) {
+		for (const auto& fut : f) {
 			if ( fut.get() ) {
 				return true;
 			}
diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h
index 0150d17855..728e877dc2 100644
--- a/flow/genericactors.actor.h
+++ b/flow/genericactors.actor.h
@@ -312,8 +312,8 @@ template <class T, class F>
 std::vector<Future<std::invoke_result_t<F, T>>> mapAsync(std::vector<Future<T>> const& what, F const& actorFunc)
 {
 	std::vector<Future<std::invoke_result_t<F, T>>> ret;
-	for(auto f : what)
-		ret.push_back(mapAsync( f, actorFunc ));
+	ret.reserve(what.size());
+	for (const auto& f : what) ret.push_back(mapAsync(f, actorFunc));
 	return ret;
 }
@@ -371,8 +371,8 @@ template <class T, class F>
 std::vector<Future<std::invoke_result_t<F, T>>> map(std::vector<Future<T>> const& what, F const& func)
 {
 	std::vector<Future<std::invoke_result_t<F, T>>> ret;
-	for(auto f : what)
-		ret.push_back(map( f, func ));
+	ret.reserve(what.size());
+	for (const auto& f : what) ret.push_back(map(f, func));
 	return ret;
 }
@@ -585,6 +585,7 @@ public:
 	}
 	std::vector<K> getKeys() {
 		std::vector<K> keys;
+		keys.reserve(items.size());
 		for(auto i = items.begin(); i != items.end(); ++i)
 			keys.push_back( i->first );
 		return keys;
@@ -887,6 +888,7 @@ Future<Void> streamHelper( PromiseStream<T> output, PromiseStream<Error> errors, Future<T> input ) {
 
 template <class T>
 Future<Void> makeStream( const std::vector<Future<T>>& futures, PromiseStream<T>& stream, PromiseStream<Error>& errors ) {
 	std::vector<Future<Void>> forwarders;
+	forwarders.reserve(futures.size());
 	for(int f=0; f<futures.size(); f++)
@@ ... @@ ACTOR template <class T>
 Future<std::vector<T>> getAll( std::vector<Future<T>> input ) {
 	wait( quorum( input, input.size() ) );
 
 	std::vector<T> output;
+	output.reserve(input.size());
 	for(int i=0; i<input.size(); i++)
@@ ... @@ ACTOR template <class T>
 Future<std::vector<T>> appendAll( std::vector<Future<std::vector<T>>> input ) {
 	wait( quorum( input, input.size() ) );
 
 	std::vector<T> output;
+	size_t sz = 0;
+	for (const auto& f : input) {
+		sz += f.get().size();
+	}
+	output.reserve(sz);
+
 	for(int i=0; i<input.size(); i++) {
@@ ... @@ template <class T>
 Future<T> operator &&( Future<T> const& lhs, Future<T> const& rhs ) {
 		else return lhs;
 	}
 
-	std::vector<Future<T>> v;
-	v.push_back( lhs );
-	v.push_back( rhs );
-	return waitForAll(v);
+	return waitForAll(std::vector<Future<T>>{ lhs, rhs });
 }
@@ -1626,8 +1632,7 @@ public:
 			return futures[0];
 
 		Future<Void> f = waitForAll(futures);
-		futures = std::vector<Future<Void>>();
-		futures.push_back(f);
+		futures = std::vector<Future<Void>>{ f };
 		return f;
 	}
diff --git a/flow/network.h b/flow/network.h
index be898058f1..b3b132613d 100644
--- a/flow/network.h
+++ b/flow/network.h
@@ -485,6 +485,10 @@ public:
 	virtual void stop() = 0;
 	// Terminate the program
 
+	virtual void addStopCallback( std::function<void()> fn ) = 0;
+	// Calls `fn` when stop() is called.
+	// addStopCallback can be called more than once, and each added `fn` will be run once.
+
 	virtual bool isSimulated() const = 0;
 	// Returns true if this network is a local simulation
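`addStopCallback` mirrors the `Net2::run()` change earlier in this diff: callbacks are collected in registration order and each runs exactly once when the run loop exits (fdbserver uses it for `Net2FileSystem::stop`). A toy model of the contract (illustration only; the callback bodies are hypothetical):

```cpp
#include <cstdio>
#include <functional>
#include <vector>

// Toy model of INetwork::stop()/addStopCallback(): callbacks registered in
// order, each run exactly once when the run loop exits.
struct MiniNetwork {
	std::vector<std::function<void()>> stopCallbacks;
	void addStopCallback(std::function<void()> fn) { stopCallbacks.emplace_back(std::move(fn)); }
	void run() {
		// ... event loop would go here ...
		for (auto& fn : stopCallbacks) fn();  // mirrors the Net2::run() change above
	}
};

int main() {
	MiniNetwork net;
	net.addStopCallback([] { printf("flushing metrics\n"); });    // hypothetical cleanup
	net.addStopCallback([] { printf("stopping filesystem\n"); }); // cf. Net2FileSystem::stop
	net.run();
}
```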
diff --git a/packaging/msi/FDBInstaller.wxs b/packaging/msi/FDBInstaller.wxs
index a9bfc62418..dad6bb058e 100644
--- a/packaging/msi/FDBInstaller.wxs
+++ b/packaging/msi/FDBInstaller.wxs
@@ -32,7 +32,7 @@