Merged from upstream master

Young Liu 2020-06-13 16:47:12 -07:00
commit f211a54593
122 changed files with 5973 additions and 1339 deletions

View File

@ -536,3 +536,51 @@ sse2neon Authors (sse2neon)
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
rte_memcpy.h (from DPDK):
SPDX-License-Identifier: BSD-3-Clause
Copyright(c) 2010-2014 Intel Corporation
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
folly_memcpy:
Copyright (c) Facebook, Inc. and its affiliates.
Author: Bin Liu <binliu@fb.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -158,15 +158,15 @@ namespace FDB {
void reset() override;
TransactionImpl() : tr(NULL) {}
TransactionImpl(TransactionImpl&& r) BOOST_NOEXCEPT {
tr = r.tr;
r.tr = NULL;
}
TransactionImpl& operator=(TransactionImpl&& r) BOOST_NOEXCEPT {
tr = r.tr;
r.tr = NULL;
TransactionImpl(TransactionImpl&& r) noexcept {
tr = r.tr;
r.tr = NULL;
}
TransactionImpl& operator=(TransactionImpl&& r) noexcept {
tr = r.tr;
r.tr = NULL;
return *this;
}
}
private:
FDBTransaction* tr;

View File

@ -35,6 +35,7 @@ import com.apple.foundationdb.Transaction;
*
*/
public class ByteArrayUtil extends FastByteComparisons {
private static final byte[] EMPTY_BYTES = new byte[0];
/**
* Joins a set of byte arrays into a larger array. The {@code interlude} is placed
@ -45,36 +46,46 @@ public class ByteArrayUtil extends FastByteComparisons {
* concatenated elements.
* @param parts the pieces to be joined. May be {@code null}, but does not allow
* for elements in the list to be {@code null}.
*
*
* @return a newly created concatenation of the input
*/
public static byte[] join(byte[] interlude, List<byte[]> parts) {
return interludeJoin(interlude, parts.toArray(new byte[0][]));
}
/**
* Joins a set of byte arrays into a larger array. The {@code interlude} is placed
* between each of the elements, but not at the beginning or end. In the case that
* the array is empty or {@code null}, a zero-length byte array will be returned.
*
* @param interlude can be {@code null} or zero length. Placed internally between
* concatenated elements.
* @param parts the pieces to be joined. May be {@code null}, but does not allow
* for elements in the array to be {@code null}.
*
* @return a newly created concatenation of the input
*/
public static byte[] interludeJoin(byte[] interlude, byte[][] parts) {
if(parts == null)
return new byte[0];
int partCount = parts.size();
int partCount = parts.length;
if(partCount == 0)
return new byte[0];
return EMPTY_BYTES;
if(interlude == null)
interlude = new byte[0];
interlude = EMPTY_BYTES;
int elementTotals = 0;
int interludeSize = interlude.length;
for(byte[] e : parts) {
elementTotals += e.length;
for (int i = 0; i < partCount; i++) {
elementTotals += parts[i].length;
}
byte[] dest = new byte[(interludeSize * (partCount - 1)) + elementTotals];
//System.out.println(" interlude -> " + ArrayUtils.printable(interlude));
int startByte = 0;
int index = 0;
for(byte[] part : parts) {
//System.out.println(" section -> " + ArrayUtils.printable(parts.get(i)));
int length = part.length;
for (int i = 0; i < partCount; i++) {
int length = parts[i].length;
if(length > 0) {
System.arraycopy(part, 0, dest, startByte, length);
System.arraycopy(parts[i], 0, dest, startByte, length);
startByte += length;
}
if(index < partCount - 1 && interludeSize > 0) {
@ -84,8 +95,6 @@ public class ByteArrayUtil extends FastByteComparisons {
}
index++;
}
//System.out.println(" complete -> " + ArrayUtils.printable(dest));
return dest;
}
@ -97,7 +106,7 @@ public class ByteArrayUtil extends FastByteComparisons {
* @return a newly created concatenation of the input
*/
public static byte[] join(byte[]... parts) {
return join(null, Arrays.asList(parts));
return interludeJoin(null, parts);
}
/**

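To make the joining behavior documented above concrete, here is a minimal usage sketch (not part of this commit; the byte values and class name are hypothetical):

import java.util.Arrays;

import com.apple.foundationdb.tuple.ByteArrayUtil;

public class JoinExample {
    public static void main(String[] args) {
        byte[] interlude = ",".getBytes();
        byte[][] parts = { "a".getBytes(), "b".getBytes(), "c".getBytes() };

        // Array-based entry point added in this change: the interlude is placed
        // between elements, never at the beginning or end.
        byte[] joined = ByteArrayUtil.interludeJoin(interlude, parts); // bytes of "a,b,c"

        // The List-based overload now simply delegates to interludeJoin.
        byte[] same = ByteArrayUtil.join(interlude, Arrays.asList(parts));

        // Empty (or null) parts yield a zero-length array rather than throwing.
        byte[] empty = ByteArrayUtil.interludeJoin(null, new byte[0][]);

        System.out.println(new String(joined) + " " + same.length + " " + empty.length);
    }
}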
View File

@ -761,10 +761,6 @@ public class AsyncStackTester {
c.run();
//System.out.println("Done with test.");
/*byte[] key = Tuple.from("test_results".getBytes(), 5).pack();
byte[] bs = db.createTransaction().get(key).get();
System.out.println("output of " + ByteArrayUtil.printable(key) + " as: " + ByteArrayUtil.printable(bs));*/
db.close();
System.gc();

View File

@ -45,17 +45,16 @@ public class LocalityTests {
long start = System.currentTimeMillis();
CloseableAsyncIterator<byte[]> keys = LocalityUtil.getBoundaryKeys(database, new byte[0], new byte[]{(byte) 255});
CompletableFuture<List<byte[]>> collection = AsyncUtil.collectRemaining(keys);
List<byte[]> list = collection.join();
System.out.println("Took " + (System.currentTimeMillis() - start) + "ms to get " +
list.size() + " items");
try(CloseableAsyncIterator<byte[]> keys = LocalityUtil.getBoundaryKeys(database, new byte[0], new byte[]{(byte) 255})) {
CompletableFuture<List<byte[]>> collection = AsyncUtil.collectRemaining(keys);
List<byte[]> list = collection.join();
System.out.println("Took " + (System.currentTimeMillis() - start) + "ms to get " +
list.size() + " items");
keys.close();
int i = 0;
for(byte[] key : collection.join()) {
System.out.println(i++ + ": " + ByteArrayUtil.printable(key));
int i = 0;
for(byte[] key : collection.join()) {
System.out.println(i++ + ": " + ByteArrayUtil.printable(key));
}
}
}
}

View File

@ -64,7 +64,9 @@ public class RangeTest {
System.out.println("First transaction was successful");
checkRange(db.createTransaction());
try(Transaction tr = db.createTransaction()) {
checkRange(tr);
}
long version;
try(Transaction tr = db.createTransaction()) {
@ -184,7 +186,6 @@ public class RangeTest {
String value = new String(kv.getValue());
System.out.println(" -- " + key + " -> " + value);
}
}
private RangeTest() {}

View File

@ -88,6 +88,7 @@ public class SerialInsertion {
tr.set(buf.array(), value);
}
tr.commit().join();
tr.close();
tr = db.createTransaction();
done += i;
}

View File

@ -649,9 +649,10 @@ public class StackTester {
}
}
catch(FDBException e) {
Transaction tr = db.createTransaction();
tr.onError(e).join();
return false;
try(Transaction tr = db.createTransaction()) {
tr.onError(e).join();
return false;
}
}
}
}

View File

@ -15,8 +15,8 @@ EOF
s.email = 'fdb-dist@apple.com'
s.files = ["${CMAKE_SOURCE_DIR}/LICENSE", "${CMAKE_CURRENT_SOURCE_DIR}/lib/fdb.rb", "${CMAKE_CURRENT_SOURCE_DIR}/lib/fdbdirectory.rb", "${CMAKE_CURRENT_SOURCE_DIR}/lib/fdbimpl.rb", "${CMAKE_CURRENT_SOURCE_DIR}/lib/fdblocality.rb", "${CMAKE_CURRENT_SOURCE_DIR}/lib/fdboptions.rb", "${CMAKE_CURRENT_SOURCE_DIR}/lib/fdbsubspace.rb", "${CMAKE_CURRENT_SOURCE_DIR}/lib/fdbtuple.rb", "${CMAKE_CURRENT_SOURCE_DIR}/lib/fdbimpl_v609.rb"]
s.homepage = 'https://www.foundationdb.org'
s.license = 'Apache v2'
s.add_dependency('ffi', '>= 1.1.5')
s.license = 'Apache-2.0'
s.add_dependency('ffi', '~> 1.1', '>= 1.1.5')
s.required_ruby_version = '>= 1.9.3'
s.requirements << 'These bindings require the FoundationDB client. The client can be obtained from https://www.foundationdb.org/download/.'
end

View File

@ -15,8 +15,8 @@ EOF
s.email = 'fdb-dist@apple.com'
s.files = ["LICENSE", "lib/fdb.rb", "lib/fdbdirectory.rb", "lib/fdbimpl.rb", "lib/fdblocality.rb", "lib/fdboptions.rb", "lib/fdbsubspace.rb", "lib/fdbtuple.rb", "lib/fdbimpl_v609.rb"]
s.homepage = 'https://www.foundationdb.org'
s.license = 'Apache v2'
s.add_dependency('ffi', '>= 1.1.5')
s.license = 'Apache-2.0'
s.add_dependency('ffi', '~> 1.1', '>= 1.1.5')
s.required_ruby_version = '>= 1.9.3'
s.requirements << 'These bindings require the FoundationDB client. The client can be obtained from https://www.foundationdb.org/download/.'
end

View File

@ -74,6 +74,14 @@ services:
<<: *snapshot-bindings-cmake
snapshot-cmake: &snapshot-testpackages
<<: *build-setup
command: scl enable devtoolset-8 rh-python36 rh-ruby24 -- bash -c 'mkdir -p "$${BUILD_DIR}" && cd "$${BUILD_DIR}" && cmake -G "Ninja" -DFDB_RELEASE=0 /__this_is_some_very_long_name_dir_needed_to_fix_a_bug_with_debug_rpms__/foundationdb && ninja -v -j "$${MAKEJOBS}"'
prb-testpackages:
<<: *snapshot-testpackages
snapshot-ctest: &snapshot-ctest
<<: *build-setup
command: scl enable devtoolset-8 rh-python36 rh-ruby24 -- bash -c 'mkdir -p "$${BUILD_DIR}" && cd "$${BUILD_DIR}" && cmake -G "Ninja" -DFDB_RELEASE=1 /__this_is_some_very_long_name_dir_needed_to_fix_a_bug_with_debug_rpms__/foundationdb && ninja -v -j "$${MAKEJOBS}" && ctest -L fast -j "$${MAKEJOBS}" --output-on-failure'

View File

@ -151,10 +151,15 @@ if(NOT WIN32)
set(TEST_PACKAGE_ADD_DIRECTORIES "" CACHE STRING "A ;-separated list of directories. All files within each directory will be added to the test package")
endif()
function(create_test_package)
if(WIN32)
return()
endif()
# This sets up a directory with the correctness files common to all correctness packages.
# This function should be called with the following arguments:
#
# - OUT_DIR the directory where files will be staged
# - CONTEXT the type of correctness package being built (e.g. 'valgrind correctness')
function(stage_correctness_package)
set(oneValueArgs OUT_DIR CONTEXT)
cmake_parse_arguments(STAGE "" "${oneValueArgs}" "" "${ARGN}")
file(MAKE_DIRECTORY ${STAGE_OUT_DIR}/bin)
string(LENGTH "${CMAKE_SOURCE_DIR}/tests/" base_length)
foreach(test IN LISTS TEST_NAMES)
if(("${TEST_TYPE_${test}}" STREQUAL "simulation") AND
@ -162,12 +167,14 @@ function(create_test_package)
(NOT ${test} MATCHES ${TEST_PACKAGE_EXCLUDE}))
foreach(file IN LISTS TEST_FILES_${test})
string(SUBSTRING ${file} ${base_length} -1 rel_out_file)
set(out_file ${CMAKE_BINARY_DIR}/packages/tests/${rel_out_file})
list(APPEND out_files ${out_file})
set(out_file ${STAGE_OUT_DIR}/tests/${rel_out_file})
list(APPEND test_files ${out_file})
add_custom_command(
OUTPUT ${out_file}
DEPENDS ${file}
COMMAND ${CMAKE_COMMAND} -E copy ${file} ${out_file})
COMMAND ${CMAKE_COMMAND} -E copy ${file} ${out_file}
COMMENT "Copying ${STAGE_CONTEXT} test file ${rel_out_file}"
)
endforeach()
endif()
endforeach()
@ -181,68 +188,83 @@ function(create_test_package)
# SUBSTRING will fail
set(src_dir "${src_dir}/")
string(SUBSTRING ${src_dir} ${dir_len} -1 dest_dir)
string(SUBSTRING ${file} ${dir_len} -1 out_file)
list(APPEND external_files ${CMAKE_BINARY_DIR}/packages/${out_file})
file(COPY ${file} DESTINATION ${CMAKE_BINARY_DIR}/packages/${dest_dir})
string(SUBSTRING ${file} ${dir_len} -1 rel_out_file)
set(out_file ${STAGE_OUT_DIR}/${rel_out_file})
list(APPEND external_files ${out_file})
add_custom_command(
OUTPUT ${out_file}
DEPENDS ${file}
COMMAND ${CMAKE_COMMAND} -E copy ${file} ${out_file}
COMMENT "Copying ${STAGE_CONTEXT} external file ${file}"
)
endforeach()
endforeach()
if(NOT USE_VALGRIND)
set(tar_file ${CMAKE_BINARY_DIR}/packages/correctness-${CMAKE_PROJECT_VERSION}.tar.gz)
add_custom_command(
OUTPUT ${tar_file}
DEPENDS ${out_files}
${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe
${CMAKE_BINARY_DIR}/packages/bin/TraceLogHelper.dll
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
${external_files}
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
${CMAKE_BINARY_DIR}/packages/joshua_test
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
${CMAKE_BINARY_DIR}/packages/joshua_timeout
COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file} ${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe
${CMAKE_BINARY_DIR}/packages/bin/TraceLogHelper.dll
${CMAKE_BINARY_DIR}/packages/joshua_test
${CMAKE_BINARY_DIR}/packages/joshua_timeout
${out_files}
${external_files}
COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_BINARY_DIR}/packages/joshua_test ${CMAKE_BINARY_DIR}/packages/joshua_timeout
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/packages
COMMENT "Package correctness archive"
)
add_custom_target(package_tests ALL DEPENDS ${tar_file})
# seems make needs this dependency while this does nothing with ninja
add_dependencies(package_tests strip_only_fdbserver TestHarness)
endif()
list(APPEND package_files ${STAGE_OUT_DIR}/bin/fdbserver
${STAGE_OUT_DIR}/bin/TestHarness.exe
${STAGE_OUT_DIR}/bin/TraceLogHelper.dll
${STAGE_OUT_DIR}/CMakeCache.txt
)
add_custom_command(
OUTPUT ${package_files}
DEPENDS ${CMAKE_BINARY_DIR}/CMakeCache.txt
${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe
${CMAKE_BINARY_DIR}/packages/bin/TraceLogHelper.dll
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/CMakeCache.txt ${STAGE_OUT_DIR}
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe
${CMAKE_BINARY_DIR}/packages/bin/TraceLogHelper.dll
${STAGE_OUT_DIR}/bin
COMMENT "Copying files for ${STAGE_CONTEXT} package"
)
list(APPEND package_files ${test_files} ${external_files})
set(package_files ${package_files} PARENT_SCOPE)
endfunction()
function(create_correctness_package)
if(WIN32)
return()
endif()
set(out_dir "${CMAKE_BINARY_DIR}/correctness")
stage_correctness_package(OUT_DIR ${out_dir} CONTEXT "correctness")
set(tar_file ${CMAKE_BINARY_DIR}/packages/correctness-${CMAKE_PROJECT_VERSION}.tar.gz)
add_custom_command(
OUTPUT ${tar_file}
DEPENDS ${package_files}
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
${out_dir}/joshua_test
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
${out_dir}/joshua_timeout
COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file} *
WORKING_DIRECTORY ${out_dir}
COMMENT "Package correctness archive"
)
add_custom_target(package_tests ALL DEPENDS ${tar_file})
add_dependencies(package_tests strip_only_fdbserver TestHarness)
endfunction()
function(create_valgrind_correctness_package)
if(WIN32)
return()
endif()
if(USE_VALGRIND)
set(out_dir "${CMAKE_BINARY_DIR}/valgrind_correctness")
stage_correctness_package(OUT_DIR ${out_dir} CONTEXT "valgrind correctness")
set(tar_file ${CMAKE_BINARY_DIR}/packages/valgrind-${CMAKE_PROJECT_VERSION}.tar.gz)
add_custom_command(
OUTPUT ${tar_file}
DEPENDS ${out_files}
${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe
${CMAKE_BINARY_DIR}/packages/bin/TraceLogHelper.dll
DEPENDS ${package_files}
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/valgrindTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/valgrindTimeout.sh
${external_files}
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/valgrindTest.sh
${CMAKE_BINARY_DIR}/packages/joshua_test
${out_dir}/joshua_test
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/valgrindTimeout.sh
${CMAKE_BINARY_DIR}/packages/joshua_timeout
COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file}
${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe
${CMAKE_BINARY_DIR}/packages/bin/TraceLogHelper.dll
${CMAKE_BINARY_DIR}/packages/joshua_test
${CMAKE_BINARY_DIR}/packages/joshua_timeout
${out_files}
${external_files}
COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_BINARY_DIR}/packages/joshua_test ${CMAKE_BINARY_DIR}/packages/joshua_timeout
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/packages
COMMENT "Package correctness archive"
${out_dir}/joshua_timeout
COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file} *
WORKING_DIRECTORY ${out_dir}
COMMENT "Package valgrind correctness archive"
)
add_custom_target(package_valgrind_tests ALL DEPENDS ${tar_file})
add_dependencies(package_valgrind_tests strip_only_fdbserver TestHarness)
@ -262,7 +284,8 @@ function(package_bindingtester)
set(outfiles ${bdir}/fdbcli ${bdir}/fdbserver ${bdir}/${fdbcName} ${bdir}/joshua_test ${bdir}/joshua_timeout)
add_custom_command(
OUTPUT ${outfiles}
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/packages/bin/fdbcli
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/CMakeCache.txt
${CMAKE_BINARY_DIR}/packages/bin/fdbcli
${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/lib/${fdbcName}
${bdir}
@ -270,7 +293,7 @@ function(package_bindingtester)
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/bindingTimeout.sh ${bdir}/joshua_timeout
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/localClusterStart.sh ${bdir}/localClusterStart.sh
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/bindingTestScript.sh ${bdir}/bindingTestScript.sh
COMMENT "Copy executes to bindingtester dir")
COMMENT "Copy executables and scripts to bindingtester dir")
file(GLOB_RECURSE test_files ${CMAKE_SOURCE_DIR}/bindings/*)
add_custom_command(
OUTPUT "${CMAKE_BINARY_DIR}/bindingtester.touch"

View File

@ -209,6 +209,25 @@ else()
# -mavx
# -msse4.2)
# Tentatively re-enabling vector instructions
set(USE_AVX512F OFF CACHE BOOL "Enable AVX 512F instructions")
if (USE_AVX512F)
add_compile_options(-mavx512f)
endif()
set(USE_AVX ON CACHE BOOL "Enable AVX instructions")
if (USE_AVX)
add_compile_options(-mavx)
endif()
# Intentionally using builtin memcpy. G++ does a good job on small memcpy's when the size is known at compile time.
# If the size is not known, then it falls back on the memcpy that's available at runtime (rte_memcpy, as of this
# writing; see flow.cpp).
#
# The downside of the builtin memcpy is that it's slower at large copies, so if we spend a lot of time on large
# copies of sizes that are known at compile time, this might not be a win. See the output of performance/memcpy
# for more information.
#add_compile_options(-fno-builtin-memcpy)
if (USE_VALGRIND)
add_compile_options(-DVALGRIND -DUSE_VALGRIND)
endif()
@ -254,7 +273,6 @@ else()
endif()
if (GCC)
add_compile_options(-Wno-pragmas)
# Otherwise `state [[maybe_unused]] int x;` will issue a warning.
# https://stackoverflow.com/questions/50646334/maybe-unused-on-member-variable-gcc-warns-incorrectly-that-attribute-is
add_compile_options(-Wno-attributes)
@ -268,6 +286,7 @@ else()
-fvisibility=hidden
-Wreturn-type
-fPIC)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-Wclass-memaccess>)
if (GPERFTOOLS_FOUND AND GCC)
add_compile_options(
-fno-builtin-malloc

View File

@ -493,7 +493,7 @@ If a process has had more than 10 TCP segments retransmitted in the last 5 secon
10.0.4.1:4500 ( 3% cpu; 2% machine; 0.004 Gbps; 0% disk; REXMIT! 2.5 GB / 4.1 GB RAM )
Machine-readable status
--------------------------------
-----------------------
The status command can provide a complete summary of statistics about the cluster and the database with the ``json`` argument. Full documentation for ``status json`` output can be found :doc:`here <mr-status>`.
From the output of ``status json``, operators can find useful health metrics to determine whether or not their cluster is hitting performance limits.
@ -505,6 +505,72 @@ Durable version lag ``cluster.qos.worst_durability_lag_storage_server`` cont
Transaction log queue ``cluster.qos.worst_queue_bytes_log_server`` contains the maximum size in bytes of the mutations stored on a transaction log that have not yet been popped by storage servers. A large transaction log queue size can potentially cause the ratekeeper to increase throttling.
====================== ==============================================================================================================
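As an illustrative sketch (not part of this commit), the same health metrics can be read programmatically: every binding can fetch the status document through the special key ``\xff\xff/status/json``. The key is real, but the Java code below, its class name, and the API version are assumptions, and JSON parsing is left to whichever library you prefer::

import java.nio.charset.StandardCharsets;

import com.apple.foundationdb.Database;
import com.apple.foundationdb.FDB;
import com.apple.foundationdb.tuple.ByteArrayUtil;

public class QosMetricsSketch {
    public static void main(String[] args) {
        FDB fdb = FDB.selectAPIVersion(630);
        try (Database db = fdb.open()) {
            // \xff\xff/status/json returns the same document as `status json` in fdbcli.
            byte[] statusKey = ByteArrayUtil.join(new byte[] { (byte) 0xff, (byte) 0xff },
                                                  "/status/json".getBytes(StandardCharsets.US_ASCII));
            byte[] raw = db.read(tr -> tr.get(statusKey).join());
            String statusJson = new String(raw, StandardCharsets.UTF_8);
            // Hand statusJson to any JSON parser and read, for example,
            // cluster.qos.worst_durability_lag_storage_server or cluster.qos.worst_queue_bytes_log_server.
            System.out.println(statusJson.length() + " bytes of status JSON");
        }
    }
}

For most operational checks, running ``status json`` from ``fdbcli`` remains the simpler route; the sketch above only shows where the fields listed in the table live.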
Server-side latency band tracking
---------------------------------
As part of the status document, ``status json`` provides some sampled latency metrics obtained by running probe transactions internally. While this can often be useful, it does not necessarily reflect the distribution of latencies for requests originated by clients.
FoundationDB additionally provides optional functionality to measure the latencies of all incoming get read version (GRV), read, and commit requests and report some basic details about those requests. The latencies are measured from the time the server receives the request to the point when it replies, and will therefore not include time spent in transit between the client and server or delays in the client process itself.
The latency band tracking works by configuring various latency thresholds and counting the number of requests that occur in each band (i.e. between two consecutive thresholds). For example, if you wanted to define a service-level objective (SLO) for your cluster where 99.9% of read requests were answered within N seconds, you could set a read latency threshold at N. You could then count the number of requests below and above the threshold and determine whether the required percentage of requests are answered sufficiently quickly.
Configuration of server-side latency bands is performed by setting the ``\xff\x02/latencyBandConfig`` key to a string encoding the following JSON document::
{
"get_read_version" : {
"bands" : [ 0.01, 0.1]
},
"read" : {
"bands" : [ 0.01, 0.1],
"max_key_selector_offset" : 1000,
"max_read_bytes" : 1000000
},
"commit" : {
"bands" : [ 0.01, 0.1],
"max_commit_bytes" : 1000000
}
}
Every field in this configuration is optional, and any missing fields will be left unset (i.e. no bands will be tracked or limits will not apply). The configuration takes the following arguments:
* ``bands`` - a list of thresholds (in seconds) to be measured for the given request type (``get_read_version``, ``read``, or ``commit``)
* ``max_key_selector_offset`` - an integer specifying the maximum key selector offset a read request can have and still be counted
* ``max_read_bytes`` - an integer specifying the maximum size in bytes of a read response that will be counted
* ``max_commit_bytes`` - an integer specifying the maximum size in bytes of a commit request that will be counted
Setting this configuration key to a value that changes the configuration will result in the cluster controller server process logging a ``LatencyBandConfigChanged`` event. This event will indicate whether a configuration is present or not using its ``Present`` field. Specifying an invalid configuration will result in the latency band feature being unconfigured, and the server process running the cluster controller will log an ``InvalidLatencyBandConfiguration`` trace event.
.. note:: GRV requests are counted only at default and immediate priority. Batch priority GRV requests are ignored for the purposes of latency band tracking.
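Because the configuration key lives in the system keyspace, it must be written by a client that enables access to system keys. The following is a minimal sketch using the Java bindings; the key and the ``ACCESS_SYSTEM_KEYS`` transaction option are real, while the class name, API version, and JSON payload are illustrative only::

import java.nio.charset.StandardCharsets;

import com.apple.foundationdb.Database;
import com.apple.foundationdb.FDB;
import com.apple.foundationdb.tuple.ByteArrayUtil;

public class SetLatencyBandsSketch {
    public static void main(String[] args) {
        String config = "{\"read\": {\"bands\": [0.01, 0.1], \"max_read_bytes\": 1000000}}";
        byte[] configKey = ByteArrayUtil.join(new byte[] { (byte) 0xff, 0x02 },
                                              "/latencyBandConfig".getBytes(StandardCharsets.US_ASCII));
        FDB fdb = FDB.selectAPIVersion(630);
        try (Database db = fdb.open()) {
            db.run(tr -> {
                tr.options().setAccessSystemKeys(); // required for writes under \xff
                tr.set(configKey, config.getBytes(StandardCharsets.UTF_8));
                return null;
            });
        }
    }
}

After the write, the cluster controller should log the ``LatencyBandConfigChanged`` event described above.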
When configured, the ``status json`` output will include additional fields to report the number of requests in each latency band located at ``cluster.processes.<ID>.roles[N].*_latency_bands``::
"grv_latency_bands" : {
0.01: 10,
0.1: 0,
inf: 1,
filtered: 0
},
"read_latency_bands" : {
0.01: 12,
0.1: 1,
inf: 0,
filtered: 0
},
"commit_latency_bands" : {
0.01: 5,
0.1: 5,
inf: 2,
filtered: 1
}
The ``grv_latency_bands`` and ``commit_latency_bands`` objects will only be logged for ``proxy`` roles, and ``read_latency_bands`` will only be logged for storage roles. Each threshold is represented as a key in the map, and its associated value will be the total number of requests in the lifetime of the process with a latency smaller than the threshold but larger than the next smaller threshold.
For example, ``0.1: 1`` in ``read_latency_bands`` indicates that there has been 1 read request with a latency in the range ``[0.01, 0.1)``. For the smallest specified threshold, the lower bound is 0 (e.g. ``[0, 0.01)`` in the example above). Requests that took longer than any defined latency band will be reported in the ``inf`` (infinity) band. Requests that were filtered by the configuration (e.g. using ``max_read_bytes``) are reported in the ``filtered`` category.
Because each threshold reports latencies strictly in the range between the next lower threshold and itself, it may be necessary to sum up the counts for multiple bands to determine the total number of requests below a certain threshold.
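For instance, with the sample ``read_latency_bands`` counts shown earlier, a hypothetical "99.9% of reads complete within 0.1 seconds" objective could be checked by accumulating the bands below the threshold (a sketch with illustrative numbers only)::

public class SloCheckSketch {
    public static void main(String[] args) {
        // Counts copied from the sample read_latency_bands output above.
        long band001 = 12, band01 = 1, inf = 0;      // [0, 0.01), [0.01, 0.1), and slower requests
        long total = band001 + band01 + inf;         // filtered requests are excluded here
        long under01 = band001 + band01;             // cumulative count of reads faster than 0.1 s
        double fraction = total == 0 ? 1.0 : (double) under01 / total; // 13 / 13 = 1.0 for this sample
        System.out.println("SLO met: " + (fraction >= 0.999));
    }
}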
.. note:: No history of request counts is recorded for processes that ran in the past. This includes the history prior to restart for a process that has been restarted, for which the counts get reset to 0. For this reason, it is recommended that you collect this information periodically if you need to be able to track requests from such processes.
.. _administration_fdbmonitor:
``fdbmonitor`` and ``fdbserver``

View File

@ -10,38 +10,38 @@ macOS
The macOS installation package is supported on macOS 10.7+. It includes the client and (optionally) the server.
* `FoundationDB-6.3.0.pkg <https://www.foundationdb.org/downloads/6.3.0/macOS/installers/FoundationDB-6.3.0.pkg>`_
* `FoundationDB-6.3.1.pkg <https://www.foundationdb.org/downloads/6.3.1/macOS/installers/FoundationDB-6.3.1.pkg>`_
Ubuntu
------
The Ubuntu packages are supported on 64-bit Ubuntu 12.04+, but beware of the Linux kernel bug in Ubuntu 12.x.
* `foundationdb-clients-6.3.0-1_amd64.deb <https://www.foundationdb.org/downloads/6.3.0/ubuntu/installers/foundationdb-clients_6.3.0-1_amd64.deb>`_
* `foundationdb-server-6.3.0-1_amd64.deb <https://www.foundationdb.org/downloads/6.3.0/ubuntu/installers/foundationdb-server_6.3.0-1_amd64.deb>`_ (depends on the clients package)
* `foundationdb-clients-6.3.1-1_amd64.deb <https://www.foundationdb.org/downloads/6.3.1/ubuntu/installers/foundationdb-clients_6.3.1-1_amd64.deb>`_
* `foundationdb-server-6.3.1-1_amd64.deb <https://www.foundationdb.org/downloads/6.3.1/ubuntu/installers/foundationdb-server_6.3.1-1_amd64.deb>`_ (depends on the clients package)
RHEL/CentOS EL6
---------------
The RHEL/CentOS EL6 packages are supported on 64-bit RHEL/CentOS 6.x.
* `foundationdb-clients-6.3.0-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.3.0/rhel6/installers/foundationdb-clients-6.3.0-1.el6.x86_64.rpm>`_
* `foundationdb-server-6.3.0-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.3.0/rhel6/installers/foundationdb-server-6.3.0-1.el6.x86_64.rpm>`_ (depends on the clients package)
* `foundationdb-clients-6.3.1-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.3.1/rhel6/installers/foundationdb-clients-6.3.1-1.el6.x86_64.rpm>`_
* `foundationdb-server-6.3.1-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.3.1/rhel6/installers/foundationdb-server-6.3.1-1.el6.x86_64.rpm>`_ (depends on the clients package)
RHEL/CentOS EL7
---------------
The RHEL/CentOS EL7 packages are supported on 64-bit RHEL/CentOS 7.x.
* `foundationdb-clients-6.3.0-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.3.0/rhel7/installers/foundationdb-clients-6.3.0-1.el7.x86_64.rpm>`_
* `foundationdb-server-6.3.0-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.3.0/rhel7/installers/foundationdb-server-6.3.0-1.el7.x86_64.rpm>`_ (depends on the clients package)
* `foundationdb-clients-6.3.1-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.3.1/rhel7/installers/foundationdb-clients-6.3.1-1.el7.x86_64.rpm>`_
* `foundationdb-server-6.3.1-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.3.1/rhel7/installers/foundationdb-server-6.3.1-1.el7.x86_64.rpm>`_ (depends on the clients package)
Windows
-------
The Windows installer is supported on 64-bit Windows XP and later. It includes the client and (optionally) the server.
* `foundationdb-6.3.0-x64.msi <https://www.foundationdb.org/downloads/6.3.0/windows/installers/foundationdb-6.3.0-x64.msi>`_
* `foundationdb-6.3.1-x64.msi <https://www.foundationdb.org/downloads/6.3.1/windows/installers/foundationdb-6.3.1-x64.msi>`_
API Language Bindings
=====================
@ -58,18 +58,18 @@ On macOS and Windows, the FoundationDB Python API bindings are installed as part
If you need to use the FoundationDB Python API from other Python installations or paths, use the Python package manager ``pip`` (``pip install foundationdb``) or download the Python package:
* `foundationdb-6.3.0.tar.gz <https://www.foundationdb.org/downloads/6.3.0/bindings/python/foundationdb-6.3.0.tar.gz>`_
* `foundationdb-6.3.1.tar.gz <https://www.foundationdb.org/downloads/6.3.1/bindings/python/foundationdb-6.3.1.tar.gz>`_
Ruby 1.9.3/2.0.0+
-----------------
* `fdb-6.3.0.gem <https://www.foundationdb.org/downloads/6.3.0/bindings/ruby/fdb-6.3.0.gem>`_
* `fdb-6.3.1.gem <https://www.foundationdb.org/downloads/6.3.1/bindings/ruby/fdb-6.3.1.gem>`_
Java 8+
-------
* `fdb-java-6.3.0.jar <https://www.foundationdb.org/downloads/6.3.0/bindings/java/fdb-java-6.3.0.jar>`_
* `fdb-java-6.3.0-javadoc.jar <https://www.foundationdb.org/downloads/6.3.0/bindings/java/fdb-java-6.3.0-javadoc.jar>`_
* `fdb-java-6.3.1.jar <https://www.foundationdb.org/downloads/6.3.1/bindings/java/fdb-java-6.3.1.jar>`_
* `fdb-java-6.3.1-javadoc.jar <https://www.foundationdb.org/downloads/6.3.1/bindings/java/fdb-java-6.3.1-javadoc.jar>`_
Go 1.11+
--------

View File

@ -2,6 +2,15 @@
Release Notes
#############
6.2.22
======
Fixes
-----
* Coordinator class processes could be recruited as the cluster controller. `(PR #3282) <https://github.com/apple/foundationdb/pull/3282>`_
* HTTPS requests made by backup would fail (introduced in 6.2.21). `(PR #3284) <https://github.com/apple/foundationdb/pull/3284>`_
6.2.21
======

View File

@ -0,0 +1,123 @@
#############
Release Notes
#############
6.3.1
=====
Features
--------
* Added the ability to set arbitrary tags on transactions. Tags can be specifically throttled using ``fdbcli``, and certain types of tags can be automatically throttled by ratekeeper. `(PR #2942) <https://github.com/apple/foundationdb/pull/2942>`_
* Add an option for transactions to report conflicting keys by calling ``getRange`` with the special key prefix ``\xff\xff/transaction/conflicting_keys/``. `(PR #2257) <https://github.com/apple/foundationdb/pull/2257>`_
* Added the ``exclude failed`` command to ``fdbcli``. This command designates that a process is dead and will never come back, so the transaction logs can forget about mutations sent to that process. `(PR #1955) <https://github.com/apple/foundationdb/pull/1955>`_
* A new fast restore system that can restore a database to a point in time from backup files. It is a Spark-like parallel processing framework that processes backup data asynchronously, in parallel, and in a pipelined fashion. `(Fast Restore Project) <https://github.com/apple/foundationdb/projects/7>`_
* Added backup workers for pulling mutations from transaction logs and uploading them to blob storage. Switching from the previous backup implementation will double a cluster's maximum write bandwidth. `(PR #1625) <https://github.com/apple/foundationdb/pull/1625>`_ `(PR #2588) <https://github.com/apple/foundationdb/pull/2588>`_ `(PR #2642) <https://github.com/apple/foundationdb/pull/2642>`_
* Added a new API in all bindings that can be used to query the estimated byte size of a given range. `(PR #2537) <https://github.com/apple/foundationdb/pull/2537>`_
* Added the ``lock`` and ``unlock`` commands to ``fdbcli`` which lock or unlock a cluster. `(PR #2890) <https://github.com/apple/foundationdb/pull/2890>`_
* Add a framework which helps to add client functions using special keys (keys within ``[\xff\xff, \xff\xff\xff)``). `(PR #2662) <https://github.com/apple/foundationdb/pull/2662>`_
Performance
-----------
* Improved the client's load balancing algorithm so that each proxy processes an equal number of requests. `(PR #2520) <https://github.com/apple/foundationdb/pull/2520>`_
* Significantly reduced the amount of work done on the cluster controller by removing the centralized failure monitoring. `(PR #2518) <https://github.com/apple/foundationdb/pull/2518>`_
* Improved master recovery speeds by more efficiently broadcasting the recovery state between processes. `(PR #2941) <https://github.com/apple/foundationdb/pull/2941>`_
* Significantly reduced the number of network connections opened to the coordinators. `(PR #3069) <https://github.com/apple/foundationdb/pull/3069>`_
* Improve GRV tail latencies, particularly as the transaction rate gets nearer the ratekeeper limit. `(PR #2735) <https://github.com/apple/foundationdb/pull/2735>`_
* The proxies are now more responsive to changes in workload when unthrottling lower priority transactions. `(PR #2735) <https://github.com/apple/foundationdb/pull/2735>`_
* Removed a lot of unnecessary copying across the codebase. `(PR #2986) <https://github.com/apple/foundationdb/pull/2986>`_ `(PR #2915) <https://github.com/apple/foundationdb/pull/2915>`_ `(PR #3024) <https://github.com/apple/foundationdb/pull/3024>`_ `(PR #2999) <https://github.com/apple/foundationdb/pull/2999>`_
* Optimized the performance of the storage server. `(PR #1988) <https://github.com/apple/foundationdb/pull/1988>`_ `(PR #3103) <https://github.com/apple/foundationdb/pull/3103>`_
* Optimized the performance of the resolver. `(PR #2648) <https://github.com/apple/foundationdb/pull/2648>`_
* Replaced most uses of hashlittle2 with crc32 for better performance. `(PR #2538) <https://github.com/apple/foundationdb/pull/2538>`_
* Significantly reduced the serialized size of conflict ranges and single key clears. `(PR #2513) <https://github.com/apple/foundationdb/pull/2513>`_
* Improved range read performance when the reads overlap recently cleared key ranges. `(PR #2028) <https://github.com/apple/foundationdb/pull/2028>`_
* Reduced the number of comparisons used by various map implementations. `(PR #2882) <https://github.com/apple/foundationdb/pull/2882>`_
* Reduced the serialized size of empty strings. `(PR #3063) <https://github.com/apple/foundationdb/pull/3063>`_
* Reduced the serialized size of various interfaces by 10x. `(PR #3068) <https://github.com/apple/foundationdb/pull/3068>`_
Reliability
-----------
* Connections that disconnect frequently are not immediately marked available. `(PR #2932) <https://github.com/apple/foundationdb/pull/2932>`_
* The data distributor will consider storage servers that are continually lagging behind as if they were failed. `(PR #2917) <https://github.com/apple/foundationdb/pull/2917>`_
* Changing the storage engine type of a cluster will no longer cause the cluster to run out of memory. Instead, the cluster will gracefully migrate storage server processes to the new storage engine one by one. `(PR #1985) <https://github.com/apple/foundationdb/pull/1985>`_
* Batch priority transactions which are being throttled by ratekeeper will get a ``batch_transaction_throttled`` error instead of hanging indefinitely. `(PR #1868) <https://github.com/apple/foundationdb/pull/1868>`_
* Avoid using too much memory on the transaction logs when multiple types of transaction logs exist in the same process. `(PR #2213) <https://github.com/apple/foundationdb/pull/2213>`_
Fixes
-----
* The ``SetVersionstampedKey`` atomic operation no longer conflicts with versions smaller than the current read version of the transaction. `(PR #2557) <https://github.com/apple/foundationdb/pull/2557>`_
* Ratekeeper would measure durability lag a few seconds higher than reality. `(PR #2499) <https://github.com/apple/foundationdb/pull/2499>`_
* In very rare scenarios, the data distributor process could get stuck in an infinite loop. `(PR #2228) <https://github.com/apple/foundationdb/pull/2228>`_
* If the number of configured transaction logs were reduced at the exact same time a change to the system keyspace took place, it was possible for the transaction state store to become corrupted. `(PR #3051) <https://github.com/apple/foundationdb/pull/3051>`_
* Fix multiple data races between threads on the client. `(PR #3026) <https://github.com/apple/foundationdb/pull/3026>`_
* Transaction logs configured to spill by reference had an unintended delay between each spilled batch. `(PR #3153) <https://github.com/apple/foundationdb/pull/3153>`_
* Added guards to honor ``DISABLE_POSIX_KERNEL_AIO``. `(PR #2888) <https://github.com/apple/foundationdb/pull/2888>`_
Status
------
* A process's ``memory.available_bytes`` can no longer exceed the memory limit of the process. For purposes of this statistic, processes on the same machine will be allocated memory proportionally based on the size of their memory limits. `(PR #3174) <https://github.com/apple/foundationdb/pull/3174>`_
* Replaced ``cluster.database_locked`` status field with ``cluster.database_lock_state``, which contains two subfields: ``locked`` (boolean) and ``lock_uid`` (which contains the database lock uid if the database is locked). `(PR #2058) <https://github.com/apple/foundationdb/pull/2058>`_
* Removed fields ``worst_version_lag_storage_server`` and ``limiting_version_lag_storage_server`` from the ``cluster.qos`` section. The ``worst_data_lag_storage_server`` and ``limiting_data_lag_storage_server`` objects can be used instead. `(PR #3196) <https://github.com/apple/foundationdb/pull/3196>`_
* If a process is unable to flush trace logs to disk, the problem will now be reported via the output of ``status`` command inside ``fdbcli``. `(PR #2605) <https://github.com/apple/foundationdb/pull/2605>`_ `(PR #2820) <https://github.com/apple/foundationdb/pull/2820>`_
Bindings
--------
* API version updated to 630. See the :ref:`API version upgrade guide <api-version-upgrade-guide-630>` for upgrade details.
* Python: The ``@fdb.transactional`` decorator will now throw an error if the decorated function returns a generator. `(PR #1724) <https://github.com/apple/foundationdb/pull/1724>`_
* Java: Add caching for various JNI objects to improve performance. `(PR #2809) <https://github.com/apple/foundationdb/pull/2809>`_
* Java: Optimize byte array comparisons in ``ByteArrayUtil``. `(PR #2823) <https://github.com/apple/foundationdb/pull/2823>`_
* Java: Add ``FDB.disableShutdownHook`` that can be used to prevent the default shutdown hook from running. Users of this new function should make sure to call ``stopNetwork`` before terminating a client process. `(PR #2635) <https://github.com/apple/foundationdb/pull/2635>`_
* Java: Introduced ``keyAfter`` utility function that can be used to create the immediate next key for a given byte array. `(PR #2458) <https://github.com/apple/foundationdb/pull/2458>`_
* Golang: The ``Transact`` function will unwrap errors that have been wrapped using ``xerrors`` to determine if a retryable FoundationDB error is in the error chain. `(PR #3131) <https://github.com/apple/foundationdb/pull/3131>`_
* Golang: Added ``Subspace.PackWithVersionstamp`` that can be used to pack a ``Tuple`` that contains a versionstamp. `(PR #2243) <https://github.com/apple/foundationdb/pull/2243>`_
* Golang: Implement ``Stringer`` interface for ``Tuple``, ``Subspace``, ``UUID``, and ``Versionstamp``. `(PR #3032) <https://github.com/apple/foundationdb/pull/3032>`_
* C: The ``FDBKeyValue`` struct's ``key`` and ``value`` members have changed type from ``void*`` to ``uint8_t*``. `(PR #2622) <https://github.com/apple/foundationdb/pull/2622>`_
* Deprecated ``enable_slow_task_profiling`` network option and replaced it with ``enable_run_loop_profiling``. `(PR #2608) <https://github.com/apple/foundationdb/pull/2608>`_
Other Changes
-------------
* Small key ranges which are being heavily read will be reported in the logs using the trace event ``ReadHotRangeLog``. `(PR #2046) <https://github.com/apple/foundationdb/pull/2046>`_ `(PR #2378) <https://github.com/apple/foundationdb/pull/2378>`_ `(PR #2532) <https://github.com/apple/foundationdb/pull/2532>`_
* Added the read version, commit version, and datacenter locality to the client transaction information. `(PR #3079) <https://github.com/apple/foundationdb/pull/3079>`_ `(PR #3205) <https://github.com/apple/foundationdb/pull/3205>`_
* Added a network option ``TRACE_FILE_IDENTIFIER`` that can be used to provide a custom identifier string that will be part of the file name for all trace log files created on the client. `(PR #2869) <https://github.com/apple/foundationdb/pull/2869>`_
* It is now possible to use the ``TRACE_LOG_GROUP`` option on a client process after the database has been created. `(PR #2862) <https://github.com/apple/foundationdb/pull/2862>`_
* Added a network option ``TRACE_CLOCK_SOURCE`` that can be used to switch the trace event timestamps to use a realtime clock source. `(PR #2329) <https://github.com/apple/foundationdb/pull/2329>`_
* The ``INCLUDE_PORT_IN_ADDRESS`` transaction option is now on by default. This means ``get_addresses_for_key`` will always return ports in the address strings. `(PR #2639) <https://github.com/apple/foundationdb/pull/2639>`_
* Added the ``getversion`` command to ``fdbcli`` which returns the current read version of the cluster. `(PR #2882) <https://github.com/apple/foundationdb/pull/2882>`_
* Added the ``advanceversion`` command to ``fdbcli`` which increases the current version of a cluster. `(PR #2965) <https://github.com/apple/foundationdb/pull/2965>`_
* Improved the slow task profiler to also report backtraces for periods when the run loop is saturated. `(PR #2608) <https://github.com/apple/foundationdb/pull/2608>`_
* Double the number of shard locations that the client will cache locally. `(PR #2198) <https://github.com/apple/foundationdb/pull/2198>`_
* Replaced the ``-add_prefix`` and ``-remove_prefix`` options with ``--add_prefix`` and ``--remove_prefix`` in ``fdbrestore``. `(PR #3206) <https://github.com/apple/foundationdb/pull/3206>`_
* Data distribution metrics can now be read using the special keyspace ``\xff\xff/metrics/data_distribution_stats``. `(PR #2547) <https://github.com/apple/foundationdb/pull/2547>`_
* The ``\xff\xff/worker_interfaces/`` keyspace now begins at a key which includes a trailing ``/`` (previously ``\xff\xff/worker_interfaces``). Range reads to this range now respect the end key passed into the range and include the keyspace prefix in the resulting keys. `(PR #3095) <https://github.com/apple/foundationdb/pull/3095>`_
* Added FreeBSD support. `(PR #2634) <https://github.com/apple/foundationdb/pull/2634>`_
* Updated boost to 1.72. `(PR #2684) <https://github.com/apple/foundationdb/pull/2684>`_
Earlier release notes
---------------------
* :doc:`6.2 (API Version 620) </old-release-notes/release-notes-620>`
* :doc:`6.1 (API Version 610) </old-release-notes/release-notes-610>`
* :doc:`6.0 (API Version 600) </old-release-notes/release-notes-600>`
* :doc:`5.2 (API Version 520) </old-release-notes/release-notes-520>`
* :doc:`5.1 (API Version 510) </old-release-notes/release-notes-510>`
* :doc:`5.0 (API Version 500) </old-release-notes/release-notes-500>`
* :doc:`4.6 (API Version 460) </old-release-notes/release-notes-460>`
* :doc:`4.5 (API Version 450) </old-release-notes/release-notes-450>`
* :doc:`4.4 (API Version 440) </old-release-notes/release-notes-440>`
* :doc:`4.3 (API Version 430) </old-release-notes/release-notes-430>`
* :doc:`4.2 (API Version 420) </old-release-notes/release-notes-420>`
* :doc:`4.1 (API Version 410) </old-release-notes/release-notes-410>`
* :doc:`4.0 (API Version 400) </old-release-notes/release-notes-400>`
* :doc:`3.0 (API Version 300) </old-release-notes/release-notes-300>`
* :doc:`2.0 (API Version 200) </old-release-notes/release-notes-200>`
* :doc:`1.0 (API Version 100) </old-release-notes/release-notes-100>`
* :doc:`Beta 3 (API Version 23) </old-release-notes/release-notes-023>`
* :doc:`Beta 2 (API Version 22) </old-release-notes/release-notes-022>`
* :doc:`Beta 1 (API Version 21) </old-release-notes/release-notes-021>`
* :doc:`Alpha 6 (API Version 16) </old-release-notes/release-notes-016>`
* :doc:`Alpha 5 (API Version 14) </old-release-notes/release-notes-014>`

View File

@ -2,7 +2,7 @@
Release Notes
#############
6.3.0
6.3.2
=====
Features
@ -98,8 +98,14 @@ Other Changes
* Added FreeBSD support. `(PR #2634) <https://github.com/apple/foundationdb/pull/2634>`_
* Updated boost to 1.72. `(PR #2684) <https://github.com/apple/foundationdb/pull/2684>`_
Fixes only impacting 6.3.0+
---------------------------
* Renamed ``MIN_DELAY_STORAGE_CANDIDACY_SECONDS`` knob to ``MIN_DELAY_CC_WORST_FIT_CANDIDACY_SECONDS``. [6.3.2] `(PR #3327) <https://github.com/apple/foundationdb/pull/3327>`_
Earlier release notes
---------------------
* :doc:`6.3 (API Version 630) </old-release-notes/release-notes-630>`
* :doc:`6.2 (API Version 620) </old-release-notes/release-notes-620>`
* :doc:`6.1 (API Version 610) </old-release-notes/release-notes-610>`
* :doc:`6.0 (API Version 600) </old-release-notes/release-notes-600>`

View File

@ -955,7 +955,7 @@ static void printBackupUsage(bool devhelp) {
printf(" -e ERRORLIMIT The maximum number of errors printed by status (default is 10).\n");
printf(" -k KEYS List of key ranges to backup.\n"
" If not specified, the entire database will be backed up.\n");
printf(" -p, --partitioned_log Starts with new type of backup system using partitioned logs.\n");
printf(" --partitioned_log_experimental Starts with new type of backup system using partitioned logs.\n");
printf(" -n, --dryrun For backup start or restore start, performs a trial run with no actual changes made.\n");
printf(" --log Enables trace file logging for the CLI session.\n"
" --logdir PATH Specifes the output directory for trace files. If\n"

View File

@ -247,14 +247,11 @@ class FileBackupAgent : public BackupAgentBase {
public:
FileBackupAgent();
FileBackupAgent( FileBackupAgent&& r ) BOOST_NOEXCEPT :
subspace( std::move(r.subspace) ),
config( std::move(r.config) ),
lastRestorable( std::move(r.lastRestorable) ),
taskBucket( std::move(r.taskBucket) ),
futureBucket( std::move(r.futureBucket) ) {}
FileBackupAgent(FileBackupAgent&& r) noexcept
: subspace(std::move(r.subspace)), config(std::move(r.config)), lastRestorable(std::move(r.lastRestorable)),
taskBucket(std::move(r.taskBucket)), futureBucket(std::move(r.futureBucket)) {}
void operator=( FileBackupAgent&& r ) BOOST_NOEXCEPT {
void operator=(FileBackupAgent&& r) noexcept {
subspace = std::move(r.subspace);
config = std::move(r.config);
lastRestorable = std::move(r.lastRestorable),
@ -381,19 +378,13 @@ public:
DatabaseBackupAgent();
explicit DatabaseBackupAgent(Database src);
DatabaseBackupAgent( DatabaseBackupAgent&& r ) BOOST_NOEXCEPT :
subspace( std::move(r.subspace) ),
states( std::move(r.states) ),
config( std::move(r.config) ),
errors( std::move(r.errors) ),
ranges( std::move(r.ranges) ),
tagNames( std::move(r.tagNames) ),
taskBucket( std::move(r.taskBucket) ),
futureBucket( std::move(r.futureBucket) ),
sourceStates( std::move(r.sourceStates) ),
sourceTagNames( std::move(r.sourceTagNames) ) {}
DatabaseBackupAgent(DatabaseBackupAgent&& r) noexcept
: subspace(std::move(r.subspace)), states(std::move(r.states)), config(std::move(r.config)),
errors(std::move(r.errors)), ranges(std::move(r.ranges)), tagNames(std::move(r.tagNames)),
taskBucket(std::move(r.taskBucket)), futureBucket(std::move(r.futureBucket)),
sourceStates(std::move(r.sourceStates)), sourceTagNames(std::move(r.sourceTagNames)) {}
void operator=( DatabaseBackupAgent&& r ) BOOST_NOEXCEPT {
void operator=(DatabaseBackupAgent&& r) noexcept {
subspace = std::move(r.subspace);
states = std::move(r.states);
config = std::move(r.config);
@ -883,7 +874,7 @@ public:
}
TraceEvent t(SevWarn, "FileBackupError");
t.error(e).detail("BackupUID", uid).detail("Description", details).detail("TaskInstance", (uint64_t)taskInstance);
// These should not happen
// key_not_found could happen
if(e.code() == error_code_key_not_found)
t.backtrace();

View File

@ -20,6 +20,11 @@
#ifndef DatabaseContext_h
#define DatabaseContext_h
#include "flow/FastAlloc.h"
#include "flow/FastRef.h"
#include "fdbclient/StorageServerInterface.h"
#include "flow/genericactors.actor.h"
#include <vector>
#pragma once
#include "fdbclient/NativeAPI.actor.h"
@ -44,7 +49,25 @@ private:
StorageServerInfo( DatabaseContext *cx, StorageServerInterface const& interf, LocalityData const& locality ) : cx(cx), ReferencedInterface<StorageServerInterface>(interf, locality) {}
};
typedef MultiInterface<ReferencedInterface<StorageServerInterface>> LocationInfo;
struct LocationInfo : MultiInterface<ReferencedInterface<StorageServerInterface>>, FastAllocated<LocationInfo> {
using Locations = MultiInterface<ReferencedInterface<StorageServerInterface>>;
explicit LocationInfo(const std::vector<Reference<ReferencedInterface<StorageServerInterface>>>& v)
: Locations(v)
{}
LocationInfo(const std::vector<Reference<ReferencedInterface<StorageServerInterface>>>& v, bool hasCaches)
: Locations(v)
, hasCaches(hasCaches)
{}
LocationInfo(const LocationInfo&) = delete;
LocationInfo(LocationInfo&&) = delete;
LocationInfo& operator=(const LocationInfo&) = delete;
LocationInfo& operator=(LocationInfo&&) = delete;
bool hasCaches = false;
Reference<Locations> locations() {
return Reference<Locations>::addRef(this);
}
};
typedef ModelInterface<MasterProxyInterface> ProxyInfo;
class ClientTagThrottleData : NonCopyable {
@ -131,7 +154,7 @@ public:
Database clone() const { return Database(new DatabaseContext( connectionFile, clientInfo, clientInfoMonitor, taskID, clientLocality, enableLocalityLoadBalance, lockAware, internal, apiVersion, switchable )); }
std::pair<KeyRange,Reference<LocationInfo>> getCachedLocation( const KeyRef&, bool isBackward = false );
std::pair<KeyRange, Reference<LocationInfo>> getCachedLocation( const KeyRef&, bool isBackward = false );
bool getCachedLocations( const KeyRangeRef&, vector<std::pair<KeyRange,Reference<LocationInfo>>>&, int limit, bool reverse );
Reference<LocationInfo> setCachedLocation( const KeyRangeRef&, const vector<struct StorageServerInterface>& );
void invalidateCache( const KeyRef&, bool isBackward = false );
@ -200,11 +223,13 @@ public:
bool enableLocalityLoadBalance;
struct VersionRequest {
SpanID spanContext;
Promise<GetReadVersionReply> reply;
TagSet tags;
Optional<UID> debugID;
VersionRequest(TagSet tags = TagSet(), Optional<UID> debugID = Optional<UID>()) : tags(tags), debugID(debugID) {}
VersionRequest(SpanID spanContext, TagSet tags = TagSet(), Optional<UID> debugID = Optional<UID>())
: spanContext(spanContext), tags(tags), debugID(debugID) {}
};
// Transaction start request batching
@ -232,7 +257,7 @@ public:
// Cache of location information
int locationCacheSize;
CoalescedKeyRangeMap< Reference<LocationInfo> > locationCache;
CoalescedKeyRangeMap<Reference<LocationInfo>> locationCache;
std::map< UID, StorageServerInfo* > server_interf;
@ -314,7 +339,8 @@ public:
double detailedHealthMetricsLastUpdated;
UniqueOrderedOptionList<FDBTransactionOptions> transactionDefaults;
Future<Void> cacheListMonitor;
AsyncTrigger updateCache;
std::vector<std::unique_ptr<SpecialKeyRangeBaseImpl>> specialKeySpaceModules;
std::unique_ptr<SpecialKeySpace> specialKeySpace;
void registerSpecialKeySpaceModule(SpecialKeySpace::MODULE module, std::unique_ptr<SpecialKeyRangeBaseImpl> impl);

View File

@ -26,6 +26,7 @@
#include <string>
#include <vector>
#include "flow/Arena.h"
#include "flow/flow.h"
#include "fdbclient/Knobs.h"
@ -35,6 +36,7 @@ typedef uint64_t Sequence;
typedef StringRef KeyRef;
typedef StringRef ValueRef;
typedef int64_t Generation;
typedef UID SpanID;
enum {
tagLocalitySpecial = -1,
@ -77,6 +79,10 @@ struct Tag {
serializer(ar, locality, id);
}
};
template <>
struct flow_ref<Tag> : std::integral_constant<bool, false> {};
#pragma pack(pop)
template <class Ar> void load( Ar& ar, Tag& tag ) { tag.serialize_unversioned(ar); }

View File

@ -235,7 +235,7 @@ public:
}
TraceEvent t(SevWarn, "FileRestoreError");
t.error(e).detail("RestoreUID", uid).detail("Description", details).detail("TaskInstance", (uint64_t)taskInstance);
// These should not happen
// key_not_found could happen
if(e.code() == error_code_key_not_found)
t.backtrace();
@ -3580,33 +3580,38 @@ public:
// Parallel restore
ACTOR static Future<Void> parallelRestoreFinish(Database cx, UID randomUID) {
state ReadYourWritesTransaction tr(cx);
state Future<Void> watchForRestoreRequestDone;
state bool restoreDone = false;
state Optional<Value> restoreRequestDoneKeyValue;
TraceEvent("FastRestoreAgentWaitForRestoreToFinish").detail("DBLock", randomUID);
// TODO: register watch first and then check if the key exist
loop {
try {
tr.reset();
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
Optional<Value> restoreRequestDoneKeyValue = wait(tr.get(restoreRequestDoneKey));
Optional<Value> _restoreRequestDoneKeyValue = wait(tr.get(restoreRequestDoneKey));
restoreRequestDoneKeyValue = _restoreRequestDoneKeyValue;
// Restore may finish before restoreAgent waits on the restore finish event.
if (restoreRequestDoneKeyValue.present()) {
restoreDone = true; // In case commit clears the key but in unknown_state
tr.clear(restoreRequestDoneKey);
wait(tr.commit());
break;
} else if (!restoreDone) {
watchForRestoreRequestDone = tr.watch(restoreRequestDoneKey);
} else {
state Future<Void> watchForRestoreRequestDone = tr.watch(restoreRequestDoneKey);
wait(tr.commit());
wait(watchForRestoreRequestDone);
} else {
break;
}
} catch (Error& e) {
wait(tr.onError(e));
}
}
TraceEvent("FastRestoreAgentRestoreFinished")
.detail("ClearRestoreRequestDoneKey", restoreRequestDoneKeyValue.present());
// Only this agent can clear the restoreRequestDoneKey
wait(runRYWTransaction(cx, [](Reference<ReadYourWritesTransaction> tr) -> Future<Void> {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->clear(restoreRequestDoneKey);
return Void();
}));
TraceEvent("FastRestoreAgentRestoreFinished").detail("UnlockDBStart", randomUID);
try {
wait(unlockDatabase(cx, randomUID));
@ -3671,18 +3676,18 @@ public:
TraceEvent("FastRestoreAgentSubmitRestoreRequests").detail("DBIsLocked", randomUID);
break;
} catch (Error& e) {
TraceEvent("FastRestoreAgentSubmitRestoreRequests").detail("CheckLockError", e.what());
TraceEvent(numTries > 50 ? SevError : SevWarnAlways, "FastRestoreMayFail")
TraceEvent(numTries > 50 ? SevError : SevWarnAlways, "FastRestoreAgentSubmitRestoreRequestsMayFail")
.detail("Reason", "DB is not properly locked")
.detail("ExpectedLockID", randomUID);
.detail("ExpectedLockID", randomUID)
.error(e);
numTries++;
wait(delay(5.0));
wait(tr->onError(e));
}
}
// set up restore request
tr->reset();
loop {
tr->reset();
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
try {
@ -4444,7 +4449,10 @@ public:
return r;
}
ACTOR static Future<Version> restore(FileBackupAgent* backupAgent, Database cx, Optional<Database> cxOrig, Key tagName, Key url, Standalone<VectorRef<KeyRangeRef>> ranges, bool waitForComplete, Version targetVersion, bool verbose, Key addPrefix, Key removePrefix, bool lockDB, UID randomUid) {
ACTOR static Future<Version> restore(FileBackupAgent* backupAgent, Database cx, Optional<Database> cxOrig,
Key tagName, Key url, Standalone<VectorRef<KeyRangeRef>> ranges,
bool waitForComplete, Version targetVersion, bool verbose, Key addPrefix,
Key removePrefix, bool lockDB, UID randomUid) {
state Reference<IBackupContainer> bc = IBackupContainer::openContainer(url.toString());
state BackupDescription desc = wait(bc->describeBackup());

View File

@ -352,6 +352,9 @@ namespace HTTP {
send_start = timer();
loop {
wait(conn->onWritable());
wait( delay( 0, TaskPriority::WriteSocket ) );
// If we already got a response before finishing sending the request, then close the connection,
// set the Connection header to "close" as a hint to the caller that this connection can't be used
// again, and break out of the send loop.
@ -372,11 +375,6 @@ namespace HTTP {
pContent->sent(len);
if(pContent->empty())
break;
if(len == 0) {
wait(conn->onWritable());
wait( delay( 0, TaskPriority::WriteSocket ) );
}
}
wait(responseReading);

View File

@ -36,7 +36,10 @@ template <class Val, class Metric=int, class MetricFunc = ConstantMetric<Metric>
class KeyRangeMap : public RangeMap<Key,Val,KeyRangeRef,Metric,MetricFunc>, NonCopyable, public ReferenceCounted<KeyRangeMap<Val>> {
public:
explicit KeyRangeMap(Val v=Val(), Key endKey = allKeys.end) : RangeMap<Key,Val,KeyRangeRef,Metric,MetricFunc>(endKey, v), mapEnd(endKey) {}
void operator=(KeyRangeMap&& r) noexcept {
mapEnd = std::move(r.mapEnd);
RangeMap<Key, Val, KeyRangeRef, Metric, MetricFunc>::operator=(std::move(r));
}
void insert( const KeyRangeRef& keys, const Val& value ) { RangeMap<Key,Val,KeyRangeRef,Metric,MetricFunc>::insert(keys, value); }
void insert( const KeyRef& key, const Val& value ) { RangeMap<Key,Val,KeyRangeRef,Metric,MetricFunc>::insert( singleKeyRange(key), value); }
std::vector<KeyRangeWith<Val>> getAffectedRangesAfterInsertion( const KeyRangeRef& keys, const Val &insertionValue = Val());
@ -67,7 +70,10 @@ template <class Val, class Metric=int, class MetricFunc = ConstantMetric<Metric>
class CoalescedKeyRefRangeMap : public RangeMap<KeyRef,Val,KeyRangeRef,Metric,MetricFunc>, NonCopyable {
public:
explicit CoalescedKeyRefRangeMap(Val v=Val(), Key endKey = allKeys.end) : RangeMap<KeyRef,Val,KeyRangeRef,Metric,MetricFunc>(endKey, v), mapEnd(endKey) {}
void operator=(CoalescedKeyRefRangeMap&& r) noexcept {
mapEnd = std::move(r.mapEnd);
RangeMap<KeyRef, Val, KeyRangeRef, Metric, MetricFunc>::operator=(std::move(r));
}
void insert( const KeyRangeRef& keys, const Val& value );
void insert( const KeyRef& key, const Val& value, Arena& arena );
Key mapEnd;
@ -77,7 +83,10 @@ template <class Val, class Metric=int, class MetricFunc = ConstantMetric<Metric>
class CoalescedKeyRangeMap : public RangeMap<Key,Val,KeyRangeRef,Metric,MetricFunc>, NonCopyable {
public:
explicit CoalescedKeyRangeMap(Val v=Val(), Key endKey = allKeys.end) : RangeMap<Key,Val,KeyRangeRef,Metric,MetricFunc>(endKey, v), mapEnd(endKey) {}
void operator=(CoalescedKeyRangeMap&& r) noexcept {
mapEnd = std::move(r.mapEnd);
RangeMap<Key, Val, KeyRangeRef, Metric, MetricFunc>::operator=(std::move(r));
}
void insert( const KeyRangeRef& keys, const Val& value );
void insert( const KeyRef& key, const Val& value );
Key mapEnd;

View File

@ -19,7 +19,12 @@
*/
#include <cinttypes>
#include <vector>
#include "flow/Arena.h"
#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/SystemData.h"
@ -1858,6 +1863,69 @@ ACTOR Future<Void> waitForPrimaryDC( Database cx, StringRef dcId ) {
}
}
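// Marks (add == true) or unmarks (add == false) the given range as cached by rewriting the
// storageCache/cacheKeys system keys; the entries adjacent to the range are consulted so that
// cached and uncached markers keep alternating across the keyspace.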
ACTOR Future<Void> changeCachedRange(Database cx, KeyRangeRef range, bool add) {
state ReadYourWritesTransaction tr(cx);
state KeyRange sysRange = KeyRangeRef(storageCacheKey(range.begin), storageCacheKey(range.end));
state KeyRange sysRangeClear = KeyRangeRef(storageCacheKey(range.begin), keyAfter(storageCacheKey(range.end)));
state KeyRange privateRange = KeyRangeRef(cacheKeysKey(0, range.begin), cacheKeysKey(0, range.end));
state Value trueValue = storageCacheValue(std::vector<uint16_t>{ 0 });
state Value falseValue = storageCacheValue(std::vector<uint16_t>{});
loop {
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
tr.clear(sysRangeClear);
tr.clear(privateRange);
tr.addReadConflictRange(privateRange);
Standalone<RangeResultRef> previous =
wait(tr.getRange(KeyRangeRef(storageCachePrefix, sysRange.begin), 1, true));
bool prevIsCached = false;
if (!previous.empty()) {
std::vector<uint16_t> prevVal;
decodeStorageCacheValue(previous[0].value, prevVal);
prevIsCached = !prevVal.empty();
}
if (prevIsCached && !add) {
// we need to uncache from here
tr.set(sysRange.begin, falseValue);
tr.set(privateRange.begin, serverKeysFalse);
} else if (!prevIsCached && add) {
// we need to cache, starting from here
tr.set(sysRange.begin, trueValue);
tr.set(privateRange.begin, serverKeysTrue);
}
Standalone<RangeResultRef> after =
wait(tr.getRange(KeyRangeRef(sysRange.end, storageCacheKeys.end), 1, false));
bool afterIsCached = false;
if (!after.empty()) {
std::vector<uint16_t> afterVal;
decodeStorageCacheValue(after[0].value, afterVal);
afterIsCached = afterVal.empty();
}
if (afterIsCached && !add) {
tr.set(sysRange.end, trueValue);
tr.set(privateRange.end, serverKeysTrue);
} else if (!afterIsCached && add) {
tr.set(sysRange.end, falseValue);
tr.set(privateRange.end, serverKeysFalse);
}
wait(tr.commit());
return Void();
} catch (Error& e) {
state Error err = e;
wait(tr.onError(err));
TraceEvent(SevDebug, "ChangeCachedRangeError").error(err);
}
}
}
Future<Void> addCachedRange(const Database& cx, KeyRangeRef range) {
return changeCachedRange(cx, range, true);
}
Future<Void> removeCachedRange(const Database& cx, KeyRangeRef range) {
return changeCachedRange(cx, range, false);
}
json_spirit::Value_type normJSONType(json_spirit::Value_type type) {
if (type == json_spirit::int_type)
return json_spirit::real_type;

View File

@ -201,5 +201,8 @@ bool schemaMatch( json_spirit::mValue const& schema, json_spirit::mValue const&
// storage nodes
ACTOR Future<Void> mgmtSnapCreate(Database cx, Standalone<StringRef> snapCmd, UID snapUID);
Future<Void> addCachedRange(const Database& cx, KeyRangeRef range);
Future<Void> removeCachedRange(const Database& cx, KeyRangeRef range);
#include "flow/unactorcompiler.h"
#endif

View File

@ -153,6 +153,7 @@ struct CommitTransactionRequest : TimedRequest {
bool firstInBatch() const { return (flags & FLAG_FIRST_IN_BATCH) != 0; }
Arena arena;
SpanID spanContext;
CommitTransactionRef transaction;
ReplyPromise<CommitID> reply;
uint32_t flags;
@ -162,7 +163,7 @@ struct CommitTransactionRequest : TimedRequest {
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, transaction, reply, arena, flags, debugID, spanContext);
}
};
@ -209,6 +210,7 @@ struct GetReadVersionRequest : TimedRequest {
FLAG_PRIORITY_MASK = PRIORITY_SYSTEM_IMMEDIATE,
};
SpanID spanContext;
uint32_t transactionCount;
uint32_t flags;
TransactionPriority priority;
@ -219,9 +221,11 @@ struct GetReadVersionRequest : TimedRequest {
ReplyPromise<GetReadVersionReply> reply;
GetReadVersionRequest() : transactionCount(1), flags(0) {}
GetReadVersionRequest(SpanID spanContext, uint32_t transactionCount, TransactionPriority priority,
uint32_t flags = 0, TransactionTagMap<uint32_t> tags = TransactionTagMap<uint32_t>(),
Optional<UID> debugID = Optional<UID>())
: spanContext(spanContext), transactionCount(transactionCount), priority(priority), flags(flags), tags(tags),
debugID(debugID) {
flags = flags & ~FLAG_PRIORITY_MASK;
switch(priority) {
case TransactionPriority::BATCH:
@ -237,12 +241,12 @@ struct GetReadVersionRequest : TimedRequest {
ASSERT(false);
}
}
bool operator < (GetReadVersionRequest const& rhs) const { return priority < rhs.priority; }
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, transactionCount, flags, tags, debugID, reply, spanContext);
if(ar.isDeserializing) {
if((flags & PRIORITY_SYSTEM_IMMEDIATE) == PRIORITY_SYSTEM_IMMEDIATE) {
@ -275,6 +279,7 @@ struct GetKeyServerLocationsReply {
struct GetKeyServerLocationsRequest {
constexpr static FileIdentifier file_identifier = 9144680;
Arena arena;
SpanID spanContext;
KeyRef begin;
Optional<KeyRef> end;
int limit;
@ -282,24 +287,28 @@ struct GetKeyServerLocationsRequest {
ReplyPromise<GetKeyServerLocationsReply> reply;
GetKeyServerLocationsRequest() : limit(0), reverse(false) {}
GetKeyServerLocationsRequest(SpanID spanContext, KeyRef const& begin, Optional<KeyRef> const& end, int limit,
bool reverse, Arena const& arena)
: spanContext(spanContext), begin(begin), end(end), limit(limit), reverse(reverse), arena(arena) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, begin, end, limit, reverse, reply, spanContext, arena);
}
};
struct GetRawCommittedVersionRequest {
constexpr static FileIdentifier file_identifier = 12954034;
SpanID spanContext;
Optional<UID> debugID;
ReplyPromise<GetReadVersionReply> reply;
explicit GetRawCommittedVersionRequest(SpanID spanContext, Optional<UID> const& debugID = Optional<UID>()) : spanContext(spanContext), debugID(debugID) {}
explicit GetRawCommittedVersionRequest() : spanContext(), debugID() {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, debugID, reply, spanContext);
}
};

View File

@ -20,14 +20,23 @@
#include "fdbclient/NativeAPI.actor.h"
#include <algorithm>
#include <iterator>
#include <regex>
#include <unordered_set>
#include <tuple>
#include <utility>
#include <vector>
#include "fdbclient/FDBTypes.h"
#include "fdbrpc/FailureMonitor.h"
#include "fdbrpc/MultiInterface.h"
#include "fdbclient/Atomic.h"
#include "fdbclient/ClusterInterface.h"
#include "fdbclient/CoordinationInterface.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/KeyRangeMap.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/ManagementAPI.actor.h"
@ -38,18 +47,23 @@
#include "fdbclient/SpecialKeySpace.actor.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/versions.h"
#include "fdbrpc/LoadBalance.h"
#include "fdbrpc/Net2FileSystem.h"
#include "fdbrpc/simulator.h"
#include "flow/Arena.h"
#include "flow/ActorCollection.h"
#include "flow/DeterministicRandom.h"
#include "flow/Error.h"
#include "flow/flow.h"
#include "flow/genericactors.actor.h"
#include "flow/Knobs.h"
#include "flow/Platform.h"
#include "flow/SystemMonitor.h"
#include "flow/TLSConfig.actor.h"
#include "flow/Tracing.h"
#include "flow/UnitTest.h"
#include "fdbclient/versions.h"
#include "flow/serialize.h"
#ifdef WIN32
#define WIN32_LEAN_AND_MEAN
@ -67,6 +81,33 @@ using std::max;
using std::min;
using std::pair;
namespace {
ACTOR template <class T, class Fun>
Future<T> runAfter(Future<T> in, Fun func) {
T res = wait(in);
return func(res);
}
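// Cache-aware wrapper around loadBalance: when the chosen location set is not yet known to contain
// storage caches but a reply reports that it was served from a cache, trigger updateCache so the
// client refreshes its view of the caches.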
template <class Interface, class Request>
Future<REPLY_TYPE(Request)> loadBalance(
DatabaseContext* ctx, const Reference<LocationInfo> alternatives, RequestStream<Request> Interface::*channel,
const Request& request = Request(), TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint,
bool atMostOnce = false, // if true, throws request_maybe_delivered() instead of retrying automatically
QueueModel* model = NULL) {
if (alternatives->hasCaches) {
return loadBalance(alternatives->locations(), channel, request, taskID, atMostOnce, model);
}
return runAfter(loadBalance(alternatives->locations(), channel, request, taskID, atMostOnce, model),
[ctx](auto res) {
if (res.cached) {
ctx->updateCache.trigger();
}
return res;
});
}
} // namespace
NetworkOptions networkOptions;
TLSConfig tlsConfig(TLSEndpointType::CLIENT);
@ -454,6 +495,166 @@ ACTOR static Future<Void> monitorMasterProxiesChange(Reference<AsyncVar<ClientDB
}
}
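// Rebuilds every location cache entry that already references storage caches, dropping the
// interfaces of removed cache servers and appending the newly added ones.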
void updateLocationCacheWithCaches(DatabaseContext* self, const std::map<UID, StorageServerInterface>& removed,
const std::map<UID, StorageServerInterface>& added) {
// TODO: this needs to be more clever in the future
auto ranges = self->locationCache.ranges();
for (auto iter = ranges.begin(); iter != ranges.end(); ++iter) {
if (iter->value() && iter->value()->hasCaches) {
auto& val = iter->value();
std::vector<Reference<ReferencedInterface<StorageServerInterface>>> interfaces;
interfaces.reserve(val->size() - removed.size() + added.size());
for (int i = 0; i < val->size(); ++i) {
const auto& interf = (*val)[i];
if (removed.count(interf->interf.id()) == 0) {
interfaces.emplace_back(interf);
}
}
for (const auto& p : added) {
interfaces.emplace_back(Reference<ReferencedInterface<StorageServerInterface>>{new ReferencedInterface<StorageServerInterface>{p.second}});
}
iter->value() = Reference<LocationInfo>{ new LocationInfo(interfaces, true) };
}
}
}
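// Returns a copy of loc extended with the given cache interfaces and flagged as cache-backed.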
Reference<LocationInfo> addCaches(const Reference<LocationInfo>& loc,
const std::vector<Reference<ReferencedInterface<StorageServerInterface>>>& other) {
std::vector<Reference<ReferencedInterface<StorageServerInterface>>> interfaces;
interfaces.reserve(loc->size() + other.size());
for (int i = 0; i < loc->size(); ++i) {
interfaces.emplace_back((*loc)[i]);
}
interfaces.insert(interfaces.end(), other.begin(), other.end());
return Reference<LocationInfo>{ new LocationInfo{ interfaces, true } };
}
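// Whenever updateCache fires, re-reads the cached ranges from \xff/storageCache/ and splices the
// known cache server interfaces into the client's location cache for every range marked as cached.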
ACTOR Future<Void> updateCachedRanges(DatabaseContext* self, std::map<UID, StorageServerInterface>* cacheServers) {
state Database db(self);
state ReadYourWritesTransaction tr(db);
state Value trueValue = storageCacheValue(std::vector<uint16_t>{ 0 });
state Value falseValue = storageCacheValue(std::vector<uint16_t>{});
try {
loop {
wait(self->updateCache.onTrigger());
tr.reset();
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
try {
Standalone<RangeResultRef> range = wait(tr.getRange(storageCacheKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT(!range.more);
std::vector<Reference<ReferencedInterface<StorageServerInterface>>> cacheInterfaces;
cacheInterfaces.reserve(cacheServers->size());
for (const auto& p : *cacheServers) {
cacheInterfaces.emplace_back(Reference<ReferencedInterface<StorageServerInterface>>{
new ReferencedInterface<StorageServerInterface>{ p.second } });
}
bool currCached = false;
KeyRef begin, end;
for (const auto& kv : range) {
// These booleans have to flip consistently
ASSERT(currCached == (kv.value == falseValue));
if (kv.value == trueValue) {
begin = kv.key.substr(storageCacheKeys.begin.size());
currCached = true;
} else {
currCached = false;
end = kv.key.substr(storageCacheKeys.begin.size());
KeyRangeRef cachedRange{begin, end};
auto ranges = self->locationCache.containedRanges(cachedRange);
KeyRef containedRangesBegin, containedRangesEnd, prevKey;
if (!ranges.empty()) {
containedRangesBegin = ranges.begin().range().begin;
}
for (auto iter = ranges.begin(); iter != ranges.end(); ++iter) {
// We probably don't want to do the code below? Otherwise we would never
// fetch the corresponding storage servers, which would give us different semantics
//if (containedRangesEnd > iter->range().begin) {
// self->locationCache.insert(
// KeyRangeRef{ containedRangesEnd, iter->range().begin },
// Reference<LocationInfo>{ new LocationInfo{ cacheInterfaces, true } });
//}
containedRangesEnd = iter->range().end;
if (iter->value() && !iter->value()->hasCaches) {
iter->value() = addCaches(iter->value(), cacheInterfaces);
}
}
auto iter = self->locationCache.rangeContaining(begin);
if (iter->value() && !iter->value()->hasCaches) {
if (end>=iter->range().end) {
self->locationCache.insert(KeyRangeRef{ begin, iter->range().end },
addCaches(iter->value(), cacheInterfaces));
} else {
self->locationCache.insert(KeyRangeRef{ begin, end },
addCaches(iter->value(), cacheInterfaces));
}
}
iter = self->locationCache.rangeContainingKeyBefore(end);
if (iter->value() && !iter->value()->hasCaches) {
self->locationCache.insert(KeyRangeRef{iter->range().begin, end}, addCaches(iter->value(), cacheInterfaces));
}
}
}
wait(delay(2.0)); // we want to wait at least some small amount of time before
// updating this list again
} catch (Error& e) {
wait(tr.onError(e));
}
}
} catch (Error& e) {
TraceEvent(SevError, "UpdateCachedRangesFailed")
.error(e);
throw;
}
}
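// Periodically reads \xff/cacheServer/ to discover storage cache servers joining or leaving and
// feeds the difference into updateLocationCacheWithCaches; updateCachedRanges runs alongside to
// keep the cached key ranges current.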
ACTOR Future<Void> monitorCacheList(DatabaseContext* self) {
state Database db(self);
state Transaction tr(db);
state std::map<UID, StorageServerInterface> cacheServerMap;
state Future<Void> updateRanges = updateCachedRanges(self, &cacheServerMap);
// if no caches are configured, we don't want to run this actor at all
// so we just wait for the first trigger from a storage server
wait(self->updateCache.onTrigger());
try {
loop {
tr.reset();
try {
Standalone<RangeResultRef> cacheList =
wait(tr.getRange(storageCacheServerKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT(!cacheList.more);
bool hasChanges = false;
std::map<UID, StorageServerInterface> allCacheServers;
for (auto kv : cacheList) {
auto ssi = BinaryReader::fromStringRef<StorageServerInterface>(kv.value, IncludeVersion());
allCacheServers.emplace(ssi.id(), ssi);
}
std::map<UID, StorageServerInterface> newCacheServers;
std::map<UID, StorageServerInterface> deletedCacheServers;
std::set_difference(allCacheServers.begin(), allCacheServers.end(), cacheServerMap.begin(),
cacheServerMap.end(),
std::insert_iterator<std::map<UID, StorageServerInterface>>(
newCacheServers, newCacheServers.begin()));
std::set_difference(cacheServerMap.begin(), cacheServerMap.end(), allCacheServers.begin(),
allCacheServers.end(),
std::insert_iterator<std::map<UID, StorageServerInterface>>(
deletedCacheServers, deletedCacheServers.begin()));
hasChanges = !(newCacheServers.empty() && deletedCacheServers.empty());
if (hasChanges) {
updateLocationCacheWithCaches(self, deletedCacheServers, newCacheServers);
}
cacheServerMap = std::move(allCacheServers);
wait(delay(5.0));
} catch (Error& e) {
wait(tr.onError(e));
}
}
} catch (Error& e) {
TraceEvent(SevError, "MonitorCacheListFailed").error(e);
throw;
}
}
ACTOR static Future<HealthMetrics> getHealthMetricsActor(DatabaseContext *cx, bool detailed) {
if (now() - cx->healthMetricsLastUpdated < CLIENT_KNOBS->AGGREGATE_HEALTH_METRICS_MAX_STALENESS) {
if (detailed) {
@ -600,6 +801,7 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<ClusterConnectionF
monitorMasterProxiesInfoChange = monitorMasterProxiesChange(clientInfo, &masterProxiesChangeTrigger);
clientStatusUpdater.actor = clientStatusUpdateActor(this);
cacheListMonitor = monitorCacheList(this);
if (apiVersionAtLeast(630)) {
registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::TRANSACTION, std::make_unique<ConflictingKeysImpl>(conflictingKeysRange));
registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::TRANSACTION, std::make_unique<ReadConflictRangeImpl>(readConflictRangeKeysRange));
@ -681,14 +883,15 @@ Database DatabaseContext::create(Reference<AsyncVar<ClientDBInfo>> clientInfo, F
}
DatabaseContext::~DatabaseContext() {
cacheListMonitor.cancel();
monitorMasterProxiesInfoChange.cancel();
for(auto it = server_interf.begin(); it != server_interf.end(); it = server_interf.erase(it))
it->second->notifyContextDestroyed();
ASSERT_ABORT( server_interf.empty() );
locationCache.insert(allKeys, Reference<LocationInfo>());
}
pair<KeyRange, Reference<LocationInfo>> DatabaseContext::getCachedLocation( const KeyRef& key, bool isBackward ) {
if( isBackward ) {
auto range = locationCache.rangeContainingKeyBefore(key);
return std::make_pair(range->range(), range->value());
@ -740,23 +943,24 @@ Reference<LocationInfo> DatabaseContext::setCachedLocation( const KeyRangeRef& k
attempts++;
auto r = locationCache.randomRange();
Key begin = r.begin(), end = r.end(); // insert invalidates r, so can't be passed a mere reference into it
locationCache.insert(KeyRangeRef(begin, end), Reference<LocationInfo>());
}
locationCache.insert( keys, loc );
return loc;
}
void DatabaseContext::invalidateCache( const KeyRef& key, bool isBackward ) {
if( isBackward ) {
locationCache.rangeContainingKeyBefore(key)->value() = Reference<LocationInfo>();
} else {
locationCache.rangeContaining(key)->value() = Reference<LocationInfo>();
}
}
void DatabaseContext::invalidateCache( const KeyRangeRef& keys ) {
auto rs = locationCache.intersectingRanges(keys);
Key begin = rs.begin().begin(), end = rs.end().begin(); // insert invalidates rs, so can't be passed a mere reference into it
locationCache.insert(KeyRangeRef(begin, end), Reference<LocationInfo>());
}
Future<Void> DatabaseContext::onMasterProxiesChanged() {
@ -1335,6 +1539,7 @@ ACTOR Future<Optional<vector<StorageServerInterface>>> transactionalGetServerInt
//If isBackward == true, returns the shard containing the key before 'key' (an infinitely long, inexpressible key). Otherwise returns the shard containing key
ACTOR Future< pair<KeyRange,Reference<LocationInfo>> > getKeyLocation_internal( Database cx, Key key, TransactionInfo info, bool isBackward = false ) {
state Span span("NAPI:getKeyLocation"_loc, { info.span->context });
if (isBackward) {
ASSERT( key != allKeys.begin && key <= allKeys.end );
} else {
@ -1348,7 +1553,10 @@ ACTOR Future< pair<KeyRange,Reference<LocationInfo>> > getKeyLocation_internal(
++cx->transactionKeyServerLocationRequests;
choose {
when ( wait( cx->onMasterProxiesChanged() ) ) {}
when(GetKeyServerLocationsReply rep = wait(basicLoadBalance(
cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations,
GetKeyServerLocationsRequest(span->context, key, Optional<KeyRef>(), 100, isBackward, key.arena()),
TaskPriority::DefaultPromiseEndpoint))) {
++cx->transactionKeyServerLocationRequestsCompleted;
if( info.debugID.present() )
g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocation.After");
@ -1362,7 +1570,11 @@ ACTOR Future< pair<KeyRange,Reference<LocationInfo>> > getKeyLocation_internal(
}
template <class F>
Future<pair<KeyRange, Reference<LocationInfo>>> getKeyLocation(Database const& cx, Key const& key,
F StorageServerInterface::*member,
TransactionInfo const& info,
bool isBackward = false) {
// we first check whether this range is cached
auto ssi = cx->getCachedLocation( key, isBackward );
if (!ssi.second) {
return getKeyLocation_internal( cx, key, info, isBackward );
@ -1380,6 +1592,7 @@ Future<pair<KeyRange, Reference<LocationInfo>>> getKeyLocation( Database const&
}
ACTOR Future< vector< pair<KeyRange,Reference<LocationInfo>> > > getKeyRangeLocations_internal( Database cx, KeyRange keys, int limit, bool reverse, TransactionInfo info ) {
state Span span("NAPI:getKeyRangeLocations"_loc, { info.span->context });
if( info.debugID.present() )
g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocations.Before");
@ -1387,7 +1600,10 @@ ACTOR Future< vector< pair<KeyRange,Reference<LocationInfo>> > > getKeyRangeLoca
++cx->transactionKeyServerLocationRequests;
choose {
when ( wait( cx->onMasterProxiesChanged() ) ) {}
when(GetKeyServerLocationsReply _rep = wait(basicLoadBalance(
cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations,
GetKeyServerLocationsRequest(span->context, keys.begin, keys.end, limit, reverse, keys.arena()),
TaskPriority::DefaultPromiseEndpoint))) {
++cx->transactionKeyServerLocationRequestsCompleted;
state GetKeyServerLocationsReply rep = _rep;
if( info.debugID.present() )
@ -1478,6 +1694,7 @@ Future<Void> Transaction::warmRange(Database cx, KeyRange keys) {
ACTOR Future<Optional<Value>> getValue( Future<Version> version, Key key, Database cx, TransactionInfo info, Reference<TransactionLogInfo> trLogInfo, TagSet tags )
{
state Version ver = wait( version );
state Span span("NAPI:getValue"_loc, { info.span->context });
cx->validateVersion(ver);
loop {
@ -1510,10 +1727,12 @@ ACTOR Future<Optional<Value>> getValue( Future<Version> version, Key key, Databa
}
choose {
when(wait(cx->connectionFileChanged())) { throw transaction_too_old(); }
when(GetValueReply _reply = wait(
loadBalance(cx.getPtr(), ssi.second, &StorageServerInterface::getValue,
GetValueRequest(span->context, key, ver,
cx->sampleReadTags() ? tags : Optional<TagSet>(), getValueID),
TaskPriority::DefaultPromiseEndpoint, false,
cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr))) {
reply = _reply;
}
}
@ -1571,6 +1790,7 @@ ACTOR Future<Key> getKey( Database cx, KeySelector k, Future<Version> version, T
wait(success(version));
state Optional<UID> getKeyID = Optional<UID>();
state Span span("NAPI:getKey"_loc, { info.span->context });
if( info.debugID.present() ) {
getKeyID = nondeterministicRandom()->randomUniqueID();
@ -1599,9 +1819,11 @@ ACTOR Future<Key> getKey( Database cx, KeySelector k, Future<Version> version, T
choose {
when(wait(cx->connectionFileChanged())) { throw transaction_too_old(); }
when(GetKeyReply _reply =
wait(loadBalance(cx.getPtr(), ssi.second, &StorageServerInterface::getKey,
GetKeyRequest(span->context, k, version.get(),
cx->sampleReadTags() ? tags : Optional<TagSet>(), getKeyID),
TaskPriority::DefaultPromiseEndpoint, false,
cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr))) {
reply = _reply;
}
}
@ -1634,12 +1856,15 @@ ACTOR Future<Key> getKey( Database cx, KeySelector k, Future<Version> version, T
}
}
ACTOR Future<Version> waitForCommittedVersion( Database cx, Version version, SpanID spanContext ) {
state Span span("NAPI:waitForCommittedVersion"_loc, { spanContext });
try {
loop {
choose {
when ( wait( cx->onMasterProxiesChanged() ) ) {}
when(GetReadVersionReply v = wait(basicLoadBalance(
cx->getMasterProxies(false), &MasterProxyInterface::getConsistentReadVersion,
GetReadVersionRequest(span->context, 0, TransactionPriority::IMMEDIATE), cx->taskID))) {
cx->minAcceptableReadVersion = std::min(cx->minAcceptableReadVersion, v.version);
if (v.version >= version)
@ -1655,11 +1880,14 @@ ACTOR Future<Version> waitForCommittedVersion( Database cx, Version version ) {
}
}
ACTOR Future<Version> getRawVersion( Database cx, SpanID spanContext ) {
state Span span("NAPI:getRawVersion"_loc, { spanContext });
loop {
choose {
when ( wait( cx->onMasterProxiesChanged() ) ) {}
when(GetReadVersionReply v =
wait(basicLoadBalance(cx->getMasterProxies(false), &MasterProxyInterface::getConsistentReadVersion,
GetReadVersionRequest(spanContext, 0, TransactionPriority::IMMEDIATE), cx->taskID))) {
return v.version;
}
}
@ -1673,6 +1901,7 @@ ACTOR Future<Void> readVersionBatcher(
ACTOR Future<Void> watchValue(Future<Version> version, Key key, Optional<Value> value, Database cx,
TransactionInfo info, TagSet tags) {
state Version ver = wait( version );
state Span span(deterministicRandom()->randomUniqueID(), "NAPI:watchValue"_loc, { info.span->context });
cx->validateVersion(ver);
ASSERT(ver != latestVersion);
@ -1689,9 +1918,11 @@ ACTOR Future<Void> watchValue(Future<Version> version, Key key, Optional<Value>
}
state WatchValueReply resp;
choose {
when(WatchValueReply r = wait(
loadBalance(cx.getPtr(), ssi.second, &StorageServerInterface::watchValue,
WatchValueRequest(span->context, key, value, ver,
cx->sampleReadTags() ? tags : Optional<TagSet>(), watchValueID),
TaskPriority::DefaultPromiseEndpoint))) {
resp = r;
}
when(wait(cx->connectionFile ? cx->connectionFile->onChange() : Never())) { wait(Never()); }
@ -1702,7 +1933,7 @@ ACTOR Future<Void> watchValue(Future<Version> version, Key key, Optional<Value>
//FIXME: wait for known committed version on the storage server before replying,
//cannot do this until the storage server is notified on knownCommittedVersion changes from tlog (faster than the current update loop)
Version v = wait(waitForCommittedVersion(cx, resp.version, span->context));
//TraceEvent("WatcherCommitted").detail("CommittedVersion", v).detail("WatchVersion", resp.version).detail("Key", key ).detail("Value", value);
@ -1755,6 +1986,7 @@ ACTOR Future<Standalone<RangeResultRef>> getExactRange( Database cx, Version ver
KeyRange keys, GetRangeLimits limits, bool reverse, TransactionInfo info, TagSet tags )
{
state Standalone<RangeResultRef> output;
state Span span("NAPI:getExactRange"_loc, { info.span->context });
//printf("getExactRange( '%s', '%s' )\n", keys.begin.toString().c_str(), keys.end.toString().c_str());
loop {
@ -1768,6 +2000,7 @@ ACTOR Future<Standalone<RangeResultRef>> getExactRange( Database cx, Version ver
req.version = version;
req.begin = firstGreaterOrEqual( range.begin );
req.end = firstGreaterOrEqual( range.end );
req.spanContext = span->context;
transformRangeLimits(limits, reverse, req);
ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse);
@ -1793,10 +2026,10 @@ ACTOR Future<Standalone<RangeResultRef>> getExactRange( Database cx, Version ver
try {
choose {
when(wait(cx->connectionFileChanged())) { throw transaction_too_old(); }
when(GetKeyValuesReply _rep = wait(
loadBalance(cx.getPtr(), locations[shard].second, &StorageServerInterface::getKeyValues,
req, TaskPriority::DefaultPromiseEndpoint, false,
cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr))) {
rep = _rep;
}
}
@ -2012,6 +2245,7 @@ ACTOR Future<Standalone<RangeResultRef>> getRange( Database cx, Reference<Transa
state KeySelector originalBegin = begin;
state KeySelector originalEnd = end;
state Standalone<RangeResultRef> output;
state Span span("NAPI:getRange"_loc, info.span);
try {
state Version version = wait( fVersion );
@ -2064,6 +2298,7 @@ ACTOR Future<Standalone<RangeResultRef>> getRange( Database cx, Reference<Transa
req.tags = cx->sampleReadTags() ? tags : Optional<TagSet>();
req.debugID = info.debugID;
req.spanContext = span->context;
try {
if( info.debugID.present() ) {
g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getRange.Before");
@ -2092,7 +2327,10 @@ ACTOR Future<Standalone<RangeResultRef>> getRange( Database cx, Reference<Transa
transaction_too_old(), future_version()
});
}
GetKeyValuesReply _rep =
wait(loadBalance(cx.getPtr(), beginServer.second, &StorageServerInterface::getKeyValues, req,
TaskPriority::DefaultPromiseEndpoint, false,
cx->enableLocalityLoadBalance ? &cx->queueModel : NULL));
rep = _rep;
++cx->transactionPhysicalReadsCompleted;
} catch(Error&) {
@ -2268,7 +2506,7 @@ Transaction::~Transaction() {
cancelWatches();
}
void Transaction::operator=(Transaction&& r) noexcept {
flushTrLogsIfEnabled();
cx = std::move(r.cx);
tr = std::move(r.tr);
@ -2364,7 +2602,6 @@ void Watch::setWatch(Future<Void> watchFuture) {
//FIXME: This seems pretty horrible. Now a Database can't die until all of its watches do...
ACTOR Future<Void> watch(Reference<Watch> watch, Database cx, TagSet tags, TransactionInfo info) {
cx->addWatch();
try {
choose {
// RYOW write to value that is being watched (if applicable)
@ -2399,7 +2636,7 @@ ACTOR Future<Void> watch(Reference<Watch> watch, Database cx, TagSet tags, Trans
}
Future<Version> Transaction::getRawReadVersion() {
return ::getRawVersion(cx, info.span->context);
}
Future< Void > Transaction::watch( Reference<Watch> watch ) {
@ -2753,6 +2990,7 @@ void Transaction::reset() {
void Transaction::fullReset() {
reset();
info.span = Span(info.span->location);
backoff = CLIENT_KNOBS->DEFAULT_BACKOFF;
}
@ -2869,6 +3107,8 @@ ACTOR void checkWrites( Database cx, Future<Void> committed, Promise<Void> outCo
ACTOR static Future<Void> commitDummyTransaction( Database cx, KeyRange range, TransactionInfo info, TransactionOptions options ) {
state Transaction tr(cx);
state int retries = 0;
state Span span("NAPI:dummyTransaction"_loc, info.span);
tr.info.span->parents.insert(span->context);
loop {
try {
TraceEvent("CommitDummyTransaction").detail("Key", range.begin).detail("Retries", retries);
@ -2915,6 +3155,8 @@ void Transaction::setupWatches() {
ACTOR static Future<Void> tryCommit( Database cx, Reference<TransactionLogInfo> trLogInfo, CommitTransactionRequest req, Future<Version> readVersion, TransactionInfo info, Version* pCommittedVersion, Transaction* tr, TransactionOptions options) {
state TraceInterval interval( "TransactionCommit" );
state double startTime = now();
state Span span("NAPI:tryCommit"_loc, { info.span->context });
req.spanContext = span->context;
if (info.debugID.present())
TraceEvent(interval.begin()).detail( "Parent", info.debugID.get() );
try {
@ -3338,6 +3580,14 @@ void Transaction::setOption( FDBTransactionOptions::Option option, Optional<Stri
options.readTags.addTag(value.get());
break;
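// SPAN_PARENT expects a 16 byte value that is decoded as a UID and recorded as an additional
// parent of this transaction's tracing span.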
case FDBTransactionOptions::SPAN_PARENT:
validateOptionValue(value, true);
if (value.get().size() != 16) {
throw invalid_option_value();
}
info.span->parents.emplace(BinaryReader::fromStringRef<UID>(value.get(), Unversioned()));
break;
case FDBTransactionOptions::REPORT_CONFLICTING_KEYS:
validateOptionValue(value, false);
options.reportConflictingKeys = true;
@ -3348,13 +3598,16 @@ void Transaction::setOption( FDBTransactionOptions::Option option, Optional<Stri
}
}
ACTOR Future<GetReadVersionReply> getConsistentReadVersion(Span parentSpan, DatabaseContext* cx, uint32_t transactionCount,
TransactionPriority priority, uint32_t flags,
TransactionTagMap<uint32_t> tags, Optional<UID> debugID) {
state Span span("NAPI:getConsistentReadVersion"_loc, parentSpan);
try {
++cx->transactionReadVersionBatches;
if( debugID.present() )
g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getConsistentReadVersion.Before");
loop {
state GetReadVersionRequest req( span->context, transactionCount, priority, flags, tags, debugID );
choose {
when ( wait( cx->onMasterProxiesChanged() ) ) {}
when ( GetReadVersionReply v = wait( basicLoadBalance( cx->getMasterProxies(flags & GetReadVersionRequest::FLAG_USE_PROVISIONAL_PROXIES), &MasterProxyInterface::getConsistentReadVersion, req, cx->taskID ) ) ) {
@ -3405,6 +3658,7 @@ ACTOR Future<Void> readVersionBatcher( DatabaseContext *cx, FutureStream<Databas
state PromiseStream<double> replyTimes;
state PromiseStream<Error> _errorStream;
state double batchTime = 0;
state Span span("NAPI:readVersionBatcher"_loc);
loop {
send_batch = false;
choose {
@ -3415,6 +3669,7 @@ ACTOR Future<Void> readVersionBatcher( DatabaseContext *cx, FutureStream<Databas
}
g_traceBatch.addAttach("TransactionAttachID", req.debugID.get().first(), debugID.get().first());
}
span->parents.insert(req.spanContext);
requests.push_back(req.reply);
for(auto tag : req.tags) {
++tags[tag];
@ -3442,9 +3697,10 @@ ACTOR Future<Void> readVersionBatcher( DatabaseContext *cx, FutureStream<Databas
addActor.send(ready(timeReply(GRVReply.getFuture(), replyTimes)));
Future<Void> batch = incrementalBroadcastWithError(
getConsistentReadVersion(span, cx, count, priority, flags, std::move(tags), std::move(debugID)),
std::move(requests), CLIENT_KNOBS->BROADCAST_BATCH_SIZE);
span = Span("NAPI:readVersionBatcher"_loc);
tags.clear();
debugID = Optional<UID>();
requests.clear();
@ -3454,7 +3710,11 @@ ACTOR Future<Void> readVersionBatcher( DatabaseContext *cx, FutureStream<Databas
}
}
ACTOR Future<Version> extractReadVersion(Span parentSpan, DatabaseContext* cx, TransactionPriority priority,
Reference<TransactionLogInfo> trLogInfo, Future<GetReadVersionReply> f,
bool lockAware, double startTime, Promise<Optional<Value>> metadataVersion,
TagSet tags) {
// parentSpan here is only used to keep the parent alive until the request completes
GetReadVersionReply rep = wait(f);
double latency = now() - startTime;
cx->GRVLatencies.addSample(latency);
@ -3576,10 +3836,12 @@ Future<Version> Transaction::getReadVersion(uint32_t flags) {
batcher.actor = readVersionBatcher( cx.getPtr(), batcher.stream.getFuture(), options.priority, flags );
}
Span span("NAPI:getReadVersion"_loc, info.span);
auto const req = DatabaseContext::VersionRequest(span->context, options.tags, info.debugID);
batcher.stream.send(req);
startTime = now();
readVersion = extractReadVersion(span, cx.getPtr(), options.priority, trLogInfo, req.reply.getFuture(),
options.lockAware, startTime, metadataVersion, options.tags);
}
return readVersion;
}
@ -3660,7 +3922,7 @@ ACTOR Future<StorageMetrics> doGetStorageMetrics(Database cx, KeyRangeRef keys,
req.min.bytes = 0;
req.max.bytes = -1;
StorageMetrics m = wait(
loadBalance(locationInfo->locations(), &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution));
return m;
} catch (Error& e) {
if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
@ -3702,8 +3964,8 @@ ACTOR Future<Void> trackBoundedStorageMetrics(
try {
loop {
WaitMetricsRequest req( keys, x - halfError, x + halfError );
StorageMetrics nextX = wait(loadBalance(location->locations(), &StorageServerInterface::waitMetrics, req));
deltaStream.send(nextX - x);
x = nextX;
}
} catch (Error& e) {
@ -3728,8 +3990,8 @@ ACTOR Future<StorageMetrics> waitStorageMetricsMultipleLocations(
WaitMetricsRequest req(locations[i].first, StorageMetrics(), StorageMetrics());
req.min.bytes = 0;
req.max.bytes = -1;
fx[i] = loadBalance(locations[i].second->locations(), &StorageServerInterface::waitMetrics, req,
TaskPriority::DataDistribution);
}
wait(waitForAll(fx));
@ -3777,7 +4039,7 @@ ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getReadHotRanges(Database cx, K
state vector<Future<ReadHotSubRangeReply>> fReplies(nLocs);
for (int i = 0; i < nLocs; i++) {
ReadHotSubRangeRequest req(locations[i].first);
fReplies[i] = loadBalance(locations[i].second->locations(), &StorageServerInterface::getReadHotRanges, req,
TaskPriority::DataDistribution);
}
@ -3823,7 +4085,8 @@ ACTOR Future< std::pair<Optional<StorageMetrics>, int> > waitStorageMetrics(
fx = waitStorageMetricsMultipleLocations(locations, min, max, permittedError);
} else {
WaitMetricsRequest req( keys, min, max );
fx = loadBalance(locations[0].second->locations(), &StorageServerInterface::waitMetrics, req,
TaskPriority::DataDistribution);
}
StorageMetrics x = wait(fx);
return std::make_pair(x,-1);
@ -3911,8 +4174,12 @@ ACTOR Future< Standalone<VectorRef<KeyRef>> > splitStorageMetrics( Database cx,
state int i = 0;
for(; i<locations.size(); i++) {
SplitMetricsRequest req( locations[i].first, limit, used, estimated, i == locations.size() - 1 );
SplitMetricsReply res =
wait(loadBalance(locations[i].second->locations(), &StorageServerInterface::splitMetrics, req,
TaskPriority::DataDistribution));
if (res.splits.size() &&
res.splits[0] <= results.back()) { // split points are out of order, possibly because of moving
// data, throw error to retry
ASSERT_WE_THINK(false); // FIXME: This seems impossible and doesn't seem to be covered by testing
throw all_alternatives_failed();
}

View File

@ -19,6 +19,8 @@
*/
#pragma once
#include "flow/IRandom.h"
#include "flow/Tracing.h"
#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_NATIVEAPI_ACTOR_G_H)
#define FDBCLIENT_NATIVEAPI_ACTOR_G_H
#include "fdbclient/NativeAPI.actor.g.h"
@ -77,8 +79,8 @@ public:
Database() {} // an uninitialized database can be destructed or reassigned safely; that's it
void operator= ( Database const& rhs ) { db = rhs.db; }
Database( Database const& rhs ) : db(rhs.db) {}
Database(Database&& r) noexcept : db(std::move(r.db)) {}
void operator=(Database&& r) noexcept { db = std::move(r.db); }
// For internal use by the native client:
explicit Database(Reference<DatabaseContext> cx) : db(cx) {}
@ -147,13 +149,16 @@ class ReadYourWritesTransaction; // workaround cyclic dependency
struct TransactionInfo {
Optional<UID> debugID;
TaskPriority taskID;
Span span;
bool useProvisionalProxies;
// Used to save conflicting keys if FDBTransactionOptions::REPORT_CONFLICTING_KEYS is enabled
// prefix/<key1> : '1' - any keys equal or larger than this key are (probably) conflicting keys
// prefix/<key2> : '0' - any keys equal or larger than this key are (definitely) not conflicting keys
std::shared_ptr<CoalescedKeyRangeMap<Value>> conflictingKeys;
explicit TransactionInfo(TaskPriority taskID)
: taskID(taskID), span(deterministicRandom()->randomUniqueID(), "Transaction"_loc), useProvisionalProxies(false) {
}
};
struct TransactionLogInfo : public ReferenceCounted<TransactionLogInfo>, NonCopyable {
@ -279,7 +284,7 @@ public:
// These are to permit use as state variables in actors:
Transaction() : info( TaskPriority::DefaultEndpoint ) {}
void operator=(Transaction&& r) noexcept;
void reset();
void fullReset();
@ -329,7 +334,7 @@ private:
Future<Void> committing;
};
ACTOR Future<Version> waitForCommittedVersion(Database cx, Version version, SpanID spanContext);
ACTOR Future<Standalone<VectorRef<DDMetricsRef>>> waitDataDistributionMetricsList(Database cx, KeyRange keys,
int shardLimit);

View File

@ -73,8 +73,8 @@ struct Notified {
void operator=(const ValueType& v) { set(v); }
Notified(Notified&& r) noexcept : waiting(std::move(r.waiting)), val(std::move(r.val)) {}
void operator=(Notified&& r) noexcept {
waiting = std::move(r.waiting);
val = std::move(r.val);
}

View File

@ -1119,8 +1119,7 @@ public:
}
bool retry_limit_hit = ryw->options.maxRetries != -1 && ryw->retries >= ryw->options.maxRetries;
if (ryw->retries < std::numeric_limits<int>::max()) ryw->retries++;
if(retry_limit_hit) {
throw e;
}
@ -1130,7 +1129,7 @@ public:
ryw->debugLogRetries(e);
ryw->resetRyow();
return Void();
} catch( Error &e ) {
if ( !ryw->resetPromise.isSet() ) {
if(ryw->tr.apiVersionAtLeast(610)) {
@ -2025,7 +2024,7 @@ void ReadYourWritesTransaction::setOptionImpl( FDBTransactionOptions::Option opt
tr.setOption( option, value );
}
void ReadYourWritesTransaction::operator=(ReadYourWritesTransaction&& r) noexcept {
cache = std::move( r.cache );
writes = std::move( r.writes );
arena = std::move( r.arena );
@ -2051,21 +2050,12 @@ void ReadYourWritesTransaction::operator=(ReadYourWritesTransaction&& r) BOOST_N
versionStampKeys = std::move(r.versionStampKeys);
}
ReadYourWritesTransaction::ReadYourWritesTransaction(ReadYourWritesTransaction&& r) noexcept
: cache(std::move(r.cache)), writes(std::move(r.writes)), arena(std::move(r.arena)), reading(std::move(r.reading)),
retries(r.retries), approximateSize(r.approximateSize), creationTime(r.creationTime),
deferredError(std::move(r.deferredError)), timeoutActor(std::move(r.timeoutActor)),
resetPromise(std::move(r.resetPromise)), commitStarted(r.commitStarted), options(r.options),
transactionDebugInfo(r.transactionDebugInfo) {
cache.arena = &arena;
writes.arena = &arena;
tr = std::move( r.tr );

View File

@ -110,8 +110,8 @@ public:
// These are to permit use as state variables in actors:
ReadYourWritesTransaction() : cache(&arena), writes(&arena) {}
void operator=(ReadYourWritesTransaction&& r) noexcept;
ReadYourWritesTransaction(ReadYourWritesTransaction&& r) noexcept;
virtual void addref() { ReferenceCounted<ReadYourWritesTransaction>::addref(); }
virtual void delref() { ReferenceCounted<ReadYourWritesTransaction>::delref(); }

View File

@ -292,8 +292,12 @@ public:
entries.insert( Entry( allKeys.end, afterAllKeys, VectorRef<KeyValueRef>() ), NoMetric(), true );
}
// Visual Studio refuses to generate these, apparently despite the standard
SnapshotCache(SnapshotCache&& r) noexcept : entries(std::move(r.entries)), arena(r.arena) {}
SnapshotCache& operator=(SnapshotCache&& r) noexcept {
entries = std::move(r.entries);
arena = r.arena;
return *this;
}
bool empty() const {
// Returns true iff anything is known about the contents of the snapshot

View File

@ -72,7 +72,6 @@ struct StorageServerInterface {
RequestStream<ReplyPromise<KeyValueStoreType>> getKeyValueStoreType;
RequestStream<struct WatchValueRequest> watchValue;
RequestStream<struct ReadHotSubRangeRequest> getReadHotRanges;
explicit StorageServerInterface(UID uid) : uniqueID( uid ) {}
StorageServerInterface() : uniqueID( deterministicRandom()->randomUniqueID() ) {}
NetworkAddress address() const { return getValue.getEndpoint().getPrimaryAddress(); }
@ -157,18 +156,20 @@ struct ServerCacheInfo {
struct GetValueReply : public LoadBalancedReply {
constexpr static FileIdentifier file_identifier = 1378929;
Optional<Value> value;
bool cached;
GetValueReply() : cached(false) {}
GetValueReply(Optional<Value> value, bool cached) : value(value), cached(cached) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, LoadBalancedReply::penalty, LoadBalancedReply::error, value, cached);
}
};
struct GetValueRequest : TimedRequest {
constexpr static FileIdentifier file_identifier = 8454530;
SpanID spanContext;
Key key;
Version version;
Optional<TagSet> tags;
@ -176,11 +177,12 @@ struct GetValueRequest : TimedRequest {
ReplyPromise<GetValueReply> reply;
GetValueRequest(){}
GetValueRequest(SpanID spanContext, const Key& key, Version ver, Optional<TagSet> tags, Optional<UID> debugID)
: spanContext(spanContext), key(key), version(ver), tags(tags), debugID(debugID) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, key, version, tags, debugID, reply, spanContext);
}
};
@ -188,17 +190,19 @@ struct WatchValueReply {
constexpr static FileIdentifier file_identifier = 3;
Version version;
bool cached = false;
WatchValueReply() = default;
explicit WatchValueReply(Version version) : version(version) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, version, cached);
}
};
struct WatchValueRequest {
constexpr static FileIdentifier file_identifier = 14747733;
SpanID spanContext;
Key key;
Optional<Value> value;
Version version;
@ -207,11 +211,13 @@ struct WatchValueRequest {
ReplyPromise<WatchValueReply> reply;
WatchValueRequest(){}
WatchValueRequest(SpanID spanContext, const Key& key, Optional<Value> value, Version ver, Optional<TagSet> tags,
Optional<UID> debugID)
: spanContext(spanContext), key(key), value(value), version(ver), tags(tags), debugID(debugID) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, key, value, version, tags, debugID, reply, spanContext);
}
};
@ -221,18 +227,19 @@ struct GetKeyValuesReply : public LoadBalancedReply {
VectorRef<KeyValueRef, VecSerStrategy::String> data;
Version version; // useful when latestVersion was requested
bool more;
bool cached = false;
GetKeyValuesReply() : version(invalidVersion), more(false), cached(false) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, LoadBalancedReply::penalty, LoadBalancedReply::error, data, version, more, cached, arena);
}
};
struct GetKeyValuesRequest : TimedRequest {
constexpr static FileIdentifier file_identifier = 6795746;
SpanID spanContext;
Arena arena;
KeySelectorRef begin, end;
Version version; // or latestVersion
@ -245,25 +252,27 @@ struct GetKeyValuesRequest : TimedRequest {
GetKeyValuesRequest() : isFetchKeys(false) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, begin, end, version, limit, limitBytes, isFetchKeys, tags, debugID, reply, spanContext, arena);
}
};
struct GetKeyReply : public LoadBalancedReply {
constexpr static FileIdentifier file_identifier = 11226513;
KeySelector sel;
bool cached;
GetKeyReply() : cached(false) {}
GetKeyReply(KeySelector sel, bool cached) : sel(sel), cached(cached) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, LoadBalancedReply::penalty, LoadBalancedReply::error, sel, cached);
}
};
struct GetKeyRequest : TimedRequest {
constexpr static FileIdentifier file_identifier = 10457870;
SpanID spanContext;
Arena arena;
KeySelectorRef sel;
Version version; // or latestVersion
@ -272,11 +281,13 @@ struct GetKeyRequest : TimedRequest {
ReplyPromise<GetKeyReply> reply;
GetKeyRequest() {}
GetKeyRequest(SpanID spanContext, KeySelectorRef const& sel, Version version, Optional<TagSet> tags,
Optional<UID> debugID)
: spanContext(spanContext), sel(sel), version(version), debugID(debugID) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, sel, version, tags, debugID, reply, spanContext, arena);
}
};

View File

@ -19,10 +19,12 @@
*/
#include "fdbclient/SystemData.h"
#include "fdbclient/StorageServerInterface.h"
#include "flow/TDMetric.actor.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/StorageServerInterface.h"
#include "flow/Arena.h"
#include "flow/TDMetric.actor.h"
#include "flow/serialize.h"
const KeyRef systemKeysPrefix = LiteralStringRef("\xff");
const KeyRangeRef normalKeys(KeyRef(), systemKeysPrefix);
@ -200,6 +202,29 @@ const KeyRangeRef writeConflictRangeKeysRange =
KeyRangeRef(LiteralStringRef("\xff\xff/transaction/write_conflict_range/"),
LiteralStringRef("\xff\xff/transaction/write_conflict_range/\xff\xff"));
// "\xff/cacheServer/[[UID]] := StorageServerInterface"
// This will be added by the cache server on initialization and removed by DD
// TODO[mpilman]: We will need a way to map uint16_t ids to UIDs in future
// versions. For now caches simply cache everything so the ids
// are not yet meaningful.
const KeyRangeRef storageCacheServerKeys(LiteralStringRef("\xff/cacheServer/"),
LiteralStringRef("\xff/cacheServer0"));
const KeyRef storageCacheServersPrefix = storageCacheServerKeys.begin;
const KeyRef storageCacheServersEnd = storageCacheServerKeys.end;
const Key storageCacheServerKey(UID id) {
BinaryWriter wr(Unversioned());
wr.serializeBytes(storageCacheServersPrefix);
wr << id;
return wr.toValue();
}
const Value storageCacheServerValue(const StorageServerInterface& ssi) {
BinaryWriter wr(IncludeVersion());
wr << ssi;
return wr.toValue();
}
const KeyRangeRef ddStatsRange = KeyRangeRef(LiteralStringRef("\xff\xff/metrics/data_distribution_stats/"),
LiteralStringRef("\xff\xff/metrics/data_distribution_stats/\xff\xff"));
@ -526,6 +551,7 @@ StorageServerInterface decodeServerListValue( ValueRef const& value ) {
return s;
}
// processClassKeys.contains(k) iff k.startsWith( processClassKeys.begin ) because '/'+1 == '0'
const KeyRangeRef processClassKeys(
LiteralStringRef("\xff/processClass/"),

View File

@ -62,6 +62,12 @@ void decodeKeyServersValue( Standalone<RangeResultRef> result, const ValueRef& v
void decodeKeyServersValue( std::map<Tag, UID> const& tag_uid, const ValueRef& value,
std::vector<UID>& src, std::vector<UID>& dest );
// "\xff/storageCacheServer/[[UID]] := StorageServerInterface"
extern const KeyRangeRef storageCacheServerKeys;
extern const KeyRef storageCacheServersPrefix, storageCacheServersEnd;
const Key storageCacheServerKey(UID id);
const Value storageCacheServerValue(const StorageServerInterface& ssi);
// "\xff/storageCache/[[begin]]" := "[[vector<uint16_t>]]"
extern const KeyRangeRef storageCacheKeys;
extern const KeyRef storageCachePrefix;

View File

@ -326,12 +326,12 @@ ThreadFuture<Void> ThreadSafeTransaction::onError( Error const& e ) {
return onMainThread( [tr, e](){ return tr->onError(e); } );
}
void ThreadSafeTransaction::operator=(ThreadSafeTransaction&& r) BOOST_NOEXCEPT {
void ThreadSafeTransaction::operator=(ThreadSafeTransaction&& r) noexcept {
tr = r.tr;
r.tr = NULL;
}
ThreadSafeTransaction::ThreadSafeTransaction(ThreadSafeTransaction&& r) BOOST_NOEXCEPT {
ThreadSafeTransaction::ThreadSafeTransaction(ThreadSafeTransaction&& r) noexcept {
tr = r.tr;
r.tr = NULL;
}
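This hunk, like the other BOOST_NOEXCEPT to noexcept hunks throughout this diff, only drops the boost macro: on compilers with C++11 support BOOST_NOEXCEPT already expands to noexcept, so behaviour is unchanged. Declaring move operations noexcept is what lets containers such as std::vector move elements during reallocation instead of copying them (std::move_if_noexcept). A standalone sketch of the pattern, using a hypothetical handle-owning class rather than ThreadSafeTransaction itself:
	// Hypothetical pointer-owning class, not from this patch; mirrors the
	// pattern above: the moved-from object is left holding a null handle so
	// releasing it later is harmless.
	class HandleOwner {
	public:
		HandleOwner() : h(nullptr) {}
		HandleOwner(HandleOwner&& r) noexcept : h(r.h) { r.h = nullptr; }
		HandleOwner& operator=(HandleOwner&& r) noexcept {
			h = r.h;       // as in the originals, any previously held handle is simply overwritten
			r.h = nullptr;
			return *this;
		}
	private:
		void* h; // stands in for the raw FDBTransaction* held by the classes above
	};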

View File

@ -97,8 +97,8 @@ public:
// These are to permit use as state variables in actors:
ThreadSafeTransaction() : tr(NULL) {}
void operator=(ThreadSafeTransaction&& r) BOOST_NOEXCEPT;
ThreadSafeTransaction(ThreadSafeTransaction&& r) BOOST_NOEXCEPT;
void operator=(ThreadSafeTransaction&& r) noexcept;
ThreadSafeTransaction(ThreadSafeTransaction&& r) noexcept;
void reset() override;

View File

@ -538,7 +538,8 @@ namespace PTreeImpl {
return;
}
if (p->updated && p->lastUpdateVersion <= newOldestVersion) {
/* If the node has been updated, figure out which pointer was repalced. And delete that pointer */
/* If the node has been updated, figure out which pointer was replaced. And replace that pointer with the updated pointer.
Then we can get rid of the updated child pointer and make room in the node for future updates */
auto which = p->replacedPointer;
p->pointer[which] = p->pointer[2];
p->updated = false;
@ -611,9 +612,9 @@ public:
VersionedMap() : oldestVersion(0), latestVersion(0) {
roots.emplace_back(0, Tree());
}
VersionedMap( VersionedMap&& v ) BOOST_NOEXCEPT : oldestVersion(v.oldestVersion), latestVersion(v.latestVersion), roots(std::move(v.roots)) {
}
void operator = (VersionedMap && v) BOOST_NOEXCEPT {
VersionedMap(VersionedMap&& v) noexcept
: oldestVersion(v.oldestVersion), latestVersion(v.latestVersion), roots(std::move(v.roots)) {}
void operator=(VersionedMap&& v) noexcept {
oldestVersion = v.oldestVersion;
latestVersion = v.latestVersion;
roots = std::move(v.roots);

View File

@ -145,8 +145,17 @@ public:
PTreeImpl::insert( writes, ver, WriteMapEntry( afterAllKeys, OperationStack(), false, false, false, false, false ) );
}
WriteMap(WriteMap&& r) BOOST_NOEXCEPT : writeMapEmpty(r.writeMapEmpty), writes(std::move(r.writes)), ver(r.ver), scratch_iterator(std::move(r.scratch_iterator)), arena(r.arena) {}
WriteMap& operator=(WriteMap&& r) BOOST_NOEXCEPT { writeMapEmpty = r.writeMapEmpty; writes = std::move(r.writes); ver = r.ver; scratch_iterator = std::move(r.scratch_iterator); arena = r.arena; return *this; }
WriteMap(WriteMap&& r) noexcept
: writeMapEmpty(r.writeMapEmpty), writes(std::move(r.writes)), ver(r.ver),
scratch_iterator(std::move(r.scratch_iterator)), arena(r.arena) {}
WriteMap& operator=(WriteMap&& r) noexcept {
writeMapEmpty = r.writeMapEmpty;
writes = std::move(r.writes);
ver = r.ver;
scratch_iterator = std::move(r.scratch_iterator);
arena = r.arena;
return *this;
}
//a write with addConflict false on top of an existing write with a conflict range will not remove the conflict
void mutate( KeyRef key, MutationRef::Type operation, ValueRef param, bool addConflict ) {

View File

@ -268,6 +268,8 @@ description is not currently required but encouraged.
description="Adds a tag to the transaction that can be used to apply manual targeted throttling. At most 5 tags can be set on a transaction." />
<Option name="auto_throttle_tag" code="801" paramType="String" paramDescription="String identifier used to associated this transaction with a throttling group. Must not exceed 16 characters."
description="Adds a tag to the transaction that can be used to apply manual or automatic targeted throttling. At most 5 tags can be set on a transaction." />
<Option name="span_parent" code="900" paramType="Bytes" paramDescription="A byte string of length 16 used to associate the span of this transaction with a parent"
description="Adds a parent to the Span of this transaction. Used for transaction tracing. A span can be identified with any 16 bytes"/>
</Scope>
<!-- The enumeration values matter - do not change them without

View File

@ -132,7 +132,7 @@ struct OpenFileInfo : NonCopyable {
Future<Reference<IAsyncFile>> opened; // Only valid until the file is fully opened
OpenFileInfo() : f(0) {}
OpenFileInfo(OpenFileInfo && r) BOOST_NOEXCEPT : f(r.f), opened(std::move(r.opened)) { r.f = 0; }
OpenFileInfo(OpenFileInfo&& r) noexcept : f(r.f), opened(std::move(r.opened)) { r.f = 0; }
Future<Reference<IAsyncFile>> get() {
if (f) return Reference<IAsyncFile>::addRef(f);

View File

@ -22,6 +22,11 @@
#define FLOW_MULTIINTERFACE_H
#pragma once
#include "flow/FastRef.h"
#include "fdbrpc/Locality.h"
#include <vector>
extern uint64_t debug_lastLoadBalanceResultEndpointToken;
template <class K, class V>
@ -168,7 +173,7 @@ class MultiInterface : public ReferenceCounted<MultiInterface<T>> {
template <class T>
class MultiInterface<ReferencedInterface<T>> : public ReferenceCounted<MultiInterface<ReferencedInterface<T>>> {
public:
MultiInterface( const vector<Reference<ReferencedInterface<T>>>& v ) : alternatives(v), bestCount(0) {
MultiInterface( const std::vector<Reference<ReferencedInterface<T>>>& v ) : alternatives(v), bestCount(0) {
deterministicRandom()->randomShuffle(alternatives);
if ( LBLocalityData<T>::Present ) {
std::stable_sort( alternatives.begin(), alternatives.end(), ReferencedInterface<T>::sort_by_distance );
@ -204,6 +209,18 @@ public:
T const& getInterface(int index) { return alternatives[index]->interf; }
UID getId( int index ) const { return alternatives[index]->interf.id(); }
bool hasInterface(UID id) const {
for (const auto& ref : alternatives) {
if (ref->interf.id() == id) {
return true;
}
}
return false;
}
Reference<ReferencedInterface<T>>& operator[](int i) { return alternatives[i]; }
const Reference<ReferencedInterface<T>>& operator[](int i) const { return alternatives[i]; }
virtual ~MultiInterface() {}
@ -211,7 +228,7 @@ public:
return describe( alternatives );
}
private:
vector<Reference<ReferencedInterface<T>>> alternatives;
std::vector<Reference<ReferencedInterface<T>>> alternatives;
int16_t bestCount;
};

View File

@ -150,7 +150,7 @@ public:
void coalesce( const Range& k );
void validateCoalesced();
void operator=(RangeMap&& r) BOOST_NOEXCEPT { map = std::move(r.map); }
void operator=(RangeMap&& r) noexcept { map = std::move(r.map); }
//void clear( const Val& value ) { ranges.clear(); ranges.insert(std::make_pair(Key(),value)); }
void insert( const Range& keys, const Val& value );

View File

@ -121,7 +121,7 @@ public:
bool isValid() const { return sav != NULL; }
ReplyPromise() : sav(new NetSAV<T>(0, 1)) {}
ReplyPromise(const ReplyPromise& rhs) : sav(rhs.sav) { sav->addPromiseRef(); }
ReplyPromise(ReplyPromise&& rhs) BOOST_NOEXCEPT : sav(rhs.sav) { rhs.sav = 0; }
ReplyPromise(ReplyPromise&& rhs) noexcept : sav(rhs.sav) { rhs.sav = 0; }
~ReplyPromise() { if (sav) sav->delPromiseRef(); }
ReplyPromise(const Endpoint& endpoint) : sav(new NetSAV<T>(0, 1, endpoint)) {}
@ -132,7 +132,7 @@ public:
if (sav) sav->delPromiseRef();
sav = rhs.sav;
}
void operator=(ReplyPromise && rhs) BOOST_NOEXCEPT {
void operator=(ReplyPromise&& rhs) noexcept {
if (sav != rhs.sav) {
if (sav) sav->delPromiseRef();
sav = rhs.sav;
@ -363,13 +363,13 @@ public:
FutureStream<T> getFuture() const { queue->addFutureRef(); return FutureStream<T>(queue); }
RequestStream() : queue(new NetNotifiedQueue<T>(0, 1)) {}
RequestStream(const RequestStream& rhs) : queue(rhs.queue) { queue->addPromiseRef(); }
RequestStream(RequestStream&& rhs) BOOST_NOEXCEPT : queue(rhs.queue) { rhs.queue = 0; }
RequestStream(RequestStream&& rhs) noexcept : queue(rhs.queue) { rhs.queue = 0; }
void operator=(const RequestStream& rhs) {
rhs.queue->addPromiseRef();
if (queue) queue->delPromiseRef();
queue = rhs.queue;
}
void operator=(RequestStream&& rhs) BOOST_NOEXCEPT {
void operator=(RequestStream&& rhs) noexcept {
if (queue != rhs.queue) {
if (queue) queue->delPromiseRef();
queue = rhs.queue;

View File

@ -1630,10 +1630,18 @@ public:
Promise<Void> action;
Task( double time, TaskPriority taskID, uint64_t stable, ProcessInfo* machine, Promise<Void>&& action ) : time(time), taskID(taskID), stable(stable), machine(machine), action(std::move(action)) {}
Task( double time, TaskPriority taskID, uint64_t stable, ProcessInfo* machine, Future<Void>& future ) : time(time), taskID(taskID), stable(stable), machine(machine) { future = action.getFuture(); }
Task(Task&& rhs) BOOST_NOEXCEPT : time(rhs.time), taskID(rhs.taskID), stable(rhs.stable), machine(rhs.machine), action(std::move(rhs.action)) {}
Task(Task&& rhs) noexcept
: time(rhs.time), taskID(rhs.taskID), stable(rhs.stable), machine(rhs.machine),
action(std::move(rhs.action)) {}
void operator= ( Task const& rhs ) { taskID = rhs.taskID; time = rhs.time; stable = rhs.stable; machine = rhs.machine; action = rhs.action; }
Task( Task const& rhs ) : taskID(rhs.taskID), time(rhs.time), stable(rhs.stable), machine(rhs.machine), action(rhs.action) {}
void operator= (Task&& rhs) BOOST_NOEXCEPT { time = rhs.time; taskID = rhs.taskID; stable = rhs.stable; machine = rhs.machine; action = std::move(rhs.action); }
void operator=(Task&& rhs) noexcept {
time = rhs.time;
taskID = rhs.taskID;
stable = rhs.stable;
machine = rhs.machine;
action = std::move(rhs.action);
}
bool operator < (Task const& rhs) const {
// Ordering is reversed for priority_queue

View File

@ -144,7 +144,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
{
MutationRef privatized = m;
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
TraceEvent(SevDebug, "SendingPrivateMutation", dbgid).detail("Original", m.toString()).detail("Privatized", privatized.toString());
//TraceEvent(SevDebug, "SendingPrivateMutation", dbgid).detail("Original", m.toString()).detail("Privatized", privatized.toString());
cachedRangeInfo[k] = privatized;
}
if(k != allKeys.end) {
@ -161,7 +161,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
if(toCommit) {
MutationRef privatized = m;
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
TraceEvent(SevDebug, "SendingPrivateMutation", dbgid).detail("Original", m.toString()).detail("Privatized", privatized.toString());
//TraceEvent(SevDebug, "SendingPrivateMutation", dbgid).detail("Original", m.toString()).detail("Privatized", privatized.toString());
toCommit->addTag( cacheTag );
toCommit->addTypedMessage(privatized);
}
@ -276,6 +276,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
allTags.insert(decodeServerTagValue(kv.value));
}
}
allTags.insert(cacheTag);
if (m.param1 == lastEpochEndKey) {
toCommit->addTags(allTags);
@ -494,14 +495,24 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
keyBegin = itr->first;
mutationBegin = itr->second;
++itr;
keyEnd = itr->first;
mutationEnd = itr->second;
if (itr != cachedRangeInfo.end()) {
keyEnd = itr->first;
mutationEnd = itr->second;
} else {
//TraceEvent(SevDebug, "EndKeyNotFound", dbgid).detail("KeyBegin", keyBegin.toString());
break;
}
} else {
keyEnd = itr->first;
mutationEnd = itr->second;
++itr;
keyBegin = itr->first;
mutationBegin = itr->second;
if (itr != cachedRangeInfo.end()) {
keyBegin = itr->first;
mutationBegin = itr->second;
} else {
//TraceEvent(SevDebug, "BeginKeyNotFound", dbgid).detail("KeyEnd", keyEnd.toString());
break;
}
}
// Now get all the storage server tags for the cached key-ranges

View File

@ -115,7 +115,7 @@ std::map<std::tuple<LogEpoch, Version, int>, std::map<Tag, Version>> BackupProgr
// ASSERT(info.logRouterTags == epochTags[rit->first]);
updateTagVersions(&tagVersions, &tags, rit->second, info.epochEnd, adjustedBeginVersion, epoch);
break;
if (tags.empty()) break;
}
rit++;
}

View File

@ -32,6 +32,8 @@
#include "fdbserver/WorkerInterface.actor.h"
#include "flow/Error.h"
#include "flow/IRandom.h"
#include "flow/Tracing.h"
#include "flow/actorcompiler.h" // This must be the last #include.
#define SevDebugMemory SevVerbose
@ -429,8 +431,9 @@ struct BackupData {
}
ACTOR static Future<Version> _getMinKnownCommittedVersion(BackupData* self) {
state Span span(deterministicRandom()->randomUniqueID(), "BA:GetMinCommittedVersion"_loc);
loop {
GetReadVersionRequest request(1, TransactionPriority::DEFAULT,
GetReadVersionRequest request(span->context, 1, TransactionPriority::DEFAULT,
GetReadVersionRequest::FLAG_USE_MIN_KNOWN_COMMITTED_VERSION);
choose {
when(wait(self->cx->onMasterProxiesChanged())) {}

View File

@ -124,6 +124,7 @@ set(FDBSERVER_SRCS
workloads/BackupToDBUpgrade.actor.cpp
workloads/BulkLoad.actor.cpp
workloads/BulkSetup.actor.h
workloads/Cache.actor.cpp
workloads/ChangeConfig.actor.cpp
workloads/ClientTransactionProfileCorrectness.actor.cpp
workloads/TriggerRecovery.actor.cpp

View File

@ -61,17 +61,17 @@ struct WorkerInfo : NonCopyable {
WorkerDetails details;
Future<Void> haltRatekeeper;
Future<Void> haltDistributor;
Optional<uint16_t> storageCacheInfo;
Standalone<VectorRef<StringRef>> issues;
WorkerInfo() : gen(-1), reboots(0), priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown) {}
WorkerInfo( Future<Void> watcher, ReplyPromise<RegisterWorkerReply> reply, Generation gen, WorkerInterface interf, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, bool degraded, Standalone<VectorRef<StringRef>> issues ) :
watcher(watcher), reply(reply), gen(gen), reboots(0), initialClass(initialClass), priorityInfo(priorityInfo), details(interf, processClass, degraded), issues(issues) {}
WorkerInfo( WorkerInfo&& r ) BOOST_NOEXCEPT : watcher(std::move(r.watcher)), reply(std::move(r.reply)), gen(r.gen),
reboots(r.reboots), initialClass(r.initialClass), priorityInfo(r.priorityInfo), details(std::move(r.details)),
haltRatekeeper(r.haltRatekeeper), haltDistributor(r.haltDistributor), storageCacheInfo(r.storageCacheInfo), issues(r.issues) {}
void operator=( WorkerInfo&& r ) BOOST_NOEXCEPT {
WorkerInfo(WorkerInfo&& r) noexcept
: watcher(std::move(r.watcher)), reply(std::move(r.reply)), gen(r.gen), reboots(r.reboots),
initialClass(r.initialClass), priorityInfo(r.priorityInfo), details(std::move(r.details)),
haltRatekeeper(r.haltRatekeeper), haltDistributor(r.haltDistributor), issues(r.issues) {}
void operator=(WorkerInfo&& r) noexcept {
watcher = std::move(r.watcher);
reply = std::move(r.reply);
gen = r.gen;
@ -81,7 +81,6 @@ struct WorkerInfo : NonCopyable {
details = std::move(r.details);
haltRatekeeper = r.haltRatekeeper;
haltDistributor = r.haltDistributor;
storageCacheInfo = r.storageCacheInfo;
issues = r.issues;
}
};
@ -111,7 +110,6 @@ public:
Database db;
int unfinishedRecoveries;
int logGenerations;
std::map<uint16_t, std::pair<Optional<StorageServerInterface>, Optional<Key>>> cacheInterfaces;
bool cachePopulated;
std::map<NetworkAddress, std::pair<double, OpenDatabaseRequest>> clientStatus;
@ -138,28 +136,6 @@ public:
serverInfo->set( newInfo );
}
void setStorageCache(uint16_t id, const StorageServerInterface& interf) {
auto newInfo = serverInfo->get();
bool found = false;
for(auto& it : newInfo.storageCaches) {
if(it.first == id) {
if(it.second != interf) {
newInfo.id = deterministicRandom()->randomUniqueID();
newInfo.infoGeneration = ++dbInfoCount;
it.second = interf;
}
found = true;
break;
}
}
if(!found) {
newInfo.id = deterministicRandom()->randomUniqueID();
newInfo.infoGeneration = ++dbInfoCount;
newInfo.storageCaches.push_back(std::make_pair(id, interf));
}
serverInfo->set( newInfo );
}
void clearInterf(ProcessClass::ClassType t) {
auto newInfo = serverInfo->get();
newInfo.id = deterministicRandom()->randomUniqueID();
@ -172,18 +148,6 @@ public:
serverInfo->set( newInfo );
}
void clearStorageCache(uint16_t id) {
auto newInfo = serverInfo->get();
for(auto it = newInfo.storageCaches.begin(); it != newInfo.storageCaches.end(); ++it) {
if(it->first == id) {
newInfo.id = deterministicRandom()->randomUniqueID();
newInfo.infoGeneration = ++dbInfoCount;
newInfo.storageCaches.erase(it);
break;
}
}
serverInfo->set( newInfo );
}
};
struct UpdateWorkerList {
@ -365,7 +329,7 @@ public:
logServerMap->add(worker.interf.locality, &worker);
}
}
if (logServerSet->size() < (addingDegraded == 0 ? desired : required)) {
}
else if (logServerSet->size() == required || logServerSet->size() <= desired) {
@ -1441,7 +1405,6 @@ ACTOR Future<Void> clusterWatchDatabase( ClusterControllerData* cluster, Cluster
dbInfo.clusterInterface = db->serverInfo->get().clusterInterface;
dbInfo.distributor = db->serverInfo->get().distributor;
dbInfo.ratekeeper = db->serverInfo->get().ratekeeper;
dbInfo.storageCaches = db->serverInfo->get().storageCaches;
dbInfo.latencyBandConfig = db->serverInfo->get().latencyBandConfig;
TraceEvent("CCWDB", cluster->id).detail("Lifetime", dbInfo.masterLifetime.toString()).detail("ChangeID", dbInfo.id);
@ -1496,7 +1459,7 @@ ACTOR Future<Void> clusterOpenDatabase(ClusterControllerData::DBInfo* db, OpenDa
if(db->clientStatus.size() > 10000) {
TraceEvent(SevWarnAlways, "TooManyClientStatusEntries").suppressFor(1.0);
}
while (db->clientInfo->get().id == req.knownClientInfoID) {
choose {
when (wait( db->clientInfo->onChange() )) {}
@ -1747,27 +1710,9 @@ ACTOR Future<Void> workerAvailabilityWatch( WorkerInterface worker, ProcessClass
}
when( wait( failed ) ) { // remove workers that have failed
WorkerInfo& failedWorkerInfo = cluster->id_worker[ worker.locality.processId() ];
if(failedWorkerInfo.storageCacheInfo.present()) {
bool found = false;
for(auto& it : cluster->id_worker) {
if(!it.second.storageCacheInfo.present() && it.second.details.processClass == ProcessClass::StorageCacheClass) {
found = true;
it.second.storageCacheInfo = failedWorkerInfo.storageCacheInfo;
cluster->db.cacheInterfaces[failedWorkerInfo.storageCacheInfo.get()] = std::make_pair(Optional<StorageServerInterface>(), it.first);
if(!it.second.reply.isSet()) {
it.second.reply.send( RegisterWorkerReply(it.second.details.processClass, it.second.priorityInfo, failedWorkerInfo.storageCacheInfo) );
}
break;
}
}
if(!found) {
cluster->db.cacheInterfaces[failedWorkerInfo.storageCacheInfo.get()] = std::make_pair(Optional<StorageServerInterface>(), Optional<Key>());
}
cluster->db.clearStorageCache(failedWorkerInfo.storageCacheInfo.get());
}
if (!failedWorkerInfo.reply.isSet()) {
failedWorkerInfo.reply.send( RegisterWorkerReply(failedWorkerInfo.details.processClass, failedWorkerInfo.priorityInfo, Optional<uint16_t>()) );
failedWorkerInfo.reply.send( RegisterWorkerReply(failedWorkerInfo.details.processClass, failedWorkerInfo.priorityInfo) );
}
if (worker.locality.processId() == cluster->masterProcessId) {
cluster->masterProcessId = Optional<Key>();
@ -2055,7 +2000,7 @@ void clusterRegisterMaster( ClusterControllerData* self, RegisterMasterRequest c
if ( it.second.priorityInfo.isExcluded != isExcludedFromConfig ) {
it.second.priorityInfo.isExcluded = isExcludedFromConfig;
if( !it.second.reply.isSet() ) {
it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo, it.second.storageCacheInfo ) );
it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo ) );
}
}
}
@ -2228,56 +2173,10 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) {
}
}
}
Optional<uint16_t> newStorageCache = req.storageCacheInterf.present() ? req.storageCacheInterf.get().first : Optional<uint16_t>();
auto& cacheInfo = self->id_worker[w.locality.processId()].storageCacheInfo;
if (req.storageCacheInterf.present()) {
auto it = self->db.cacheInterfaces.find(req.storageCacheInterf.get().first);
if(it == self->db.cacheInterfaces.end()) {
if(self->db.cachePopulated) {
if(cacheInfo.present()) {
self->db.clearStorageCache(cacheInfo.get());
}
newStorageCache = Optional<uint16_t>();
cacheInfo = Optional<uint16_t>();
} else {
self->db.setStorageCache(req.storageCacheInterf.get().first, req.storageCacheInterf.get().second);
self->db.cacheInterfaces[req.storageCacheInterf.get().first] = std::make_pair(req.storageCacheInterf.get().second, w.locality.processId());
cacheInfo = req.storageCacheInterf.get().first;
}
} else {
if(!it->second.second.present() || (cacheInfo.present() && cacheInfo.get() == it->first) ) {
self->db.setStorageCache(req.storageCacheInterf.get().first, req.storageCacheInterf.get().second);
it->second = std::make_pair(req.storageCacheInterf.get().second, w.locality.processId());
cacheInfo = req.storageCacheInterf.get().first;
}
else {
if(cacheInfo.present()) {
self->db.clearStorageCache(cacheInfo.get());
}
newStorageCache = Optional<uint16_t>();
cacheInfo = Optional<uint16_t>();
}
}
} else {
newStorageCache = cacheInfo;
}
if(self->gotProcessClasses && newProcessClass == ProcessClass::StorageCacheClass && !newStorageCache.present()) {
for(auto& it : self->db.cacheInterfaces) {
if(!it.second.second.present()) {
it.second.second = w.locality.processId();
self->id_worker[w.locality.processId()].storageCacheInfo = it.first;
newStorageCache = it.first;
break;
}
}
}
// Notify the worker to register again with new process class/exclusive property
if ( !req.reply.isSet() && ( newPriorityInfo != req.priorityInfo ||
newStorageCache.present() != req.storageCacheInterf.present() ||
(newStorageCache.present() && newStorageCache.get() != req.storageCacheInterf.get().first) ) ) {
req.reply.send( RegisterWorkerReply(newProcessClass, newPriorityInfo, newStorageCache) );
if ( !req.reply.isSet() && newPriorityInfo != req.priorityInfo ) {
req.reply.send( RegisterWorkerReply(newProcessClass, newPriorityInfo) );
}
}
@ -2504,7 +2403,7 @@ ACTOR Future<Void> monitorProcessClasses(ClusterControllerData *self) {
w.second.details.processClass = newProcessClass;
w.second.priorityInfo.processClassFitness = newProcessClass.machineClassFitness(ProcessClass::ClusterController);
if (!w.second.reply.isSet()) {
w.second.reply.send( RegisterWorkerReply(w.second.details.processClass, w.second.priorityInfo, w.second.storageCacheInfo) );
w.second.reply.send( RegisterWorkerReply(w.second.details.processClass, w.second.priorityInfo) );
}
}
}
@ -2558,81 +2457,7 @@ ACTOR Future<Void> monitorServerInfoConfig(ClusterControllerData::DBInfo* db) {
break;
}
catch (Error &e) {
wait(tr.onError(e));
}
}
}
}
ACTOR Future<Void> monitorStorageCache(ClusterControllerData* self) {
loop {
state ReadYourWritesTransaction tr(self->db.db);
loop {
try {
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
Optional<Value> changeVal = wait(tr.get(cacheChangeKey));
Standalone<RangeResultRef> changeKeys = wait(tr.getRange(cacheChangeKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT( !changeKeys.more && changeKeys.size() < CLIENT_KNOBS->TOO_MANY );
std::set<uint16_t> changeIDs;
for(auto& it : changeKeys) {
changeIDs.insert(cacheChangeKeyDecodeIndex(it.key));
}
for(auto& it : changeIDs) {
if(!self->db.cacheInterfaces.count(it)) {
self->db.cacheInterfaces[it] = std::make_pair(Optional<StorageServerInterface>(), Optional<Key>());
}
}
std::vector<uint16_t> removeIDs;
for(auto& it : self->db.cacheInterfaces) {
if(!changeIDs.count(it.first)) {
removeIDs.push_back(it.first);
if(it.second.second.present()) {
self->id_worker[it.second.second.get()].storageCacheInfo = Optional<uint16_t>();
}
self->db.clearStorageCache(it.first);
}
}
for(auto& it : removeIDs) {
self->db.cacheInterfaces.erase(it);
}
for(auto& c : self->db.cacheInterfaces) {
if(!c.second.second.present()) {
bool found = false;
for(auto& it : self->id_worker) {
if(!it.second.storageCacheInfo.present() && it.second.details.processClass == ProcessClass::StorageCacheClass) {
found = true;
it.second.storageCacheInfo = c.first;
c.second.second = it.first;
if(!it.second.reply.isSet()) {
it.second.reply.send( RegisterWorkerReply(it.second.details.processClass, it.second.priorityInfo, c.first) );
}
break;
}
}
if(!found) {
break;
}
}
}
state Future<Void> configChangeFuture = tr.watch(cacheChangeKey);
self->db.cachePopulated = true;
wait(tr.commit());
wait(configChangeFuture);
break;
}
catch (Error &e) {
wait(tr.onError(e));
wait(tr.onError(e));
}
}
}
@ -2688,7 +2513,7 @@ ACTOR Future<Void> updatedChangingDatacenters(ClusterControllerData *self) {
if ( worker.priorityInfo.dcFitness > newFitness ) {
worker.priorityInfo.dcFitness = newFitness;
if(!worker.reply.isSet()) {
worker.reply.send( RegisterWorkerReply( worker.details.processClass, worker.priorityInfo, worker.storageCacheInfo ) );
worker.reply.send( RegisterWorkerReply( worker.details.processClass, worker.priorityInfo ) );
}
} else {
state int currentFit = ProcessClass::BestFit;
@ -2701,7 +2526,7 @@ ACTOR Future<Void> updatedChangingDatacenters(ClusterControllerData *self) {
updated = true;
it.second.priorityInfo.dcFitness = fitness;
if(!it.second.reply.isSet()) {
it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo, it.second.storageCacheInfo ) );
it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo ) );
}
}
}
@ -2740,7 +2565,7 @@ ACTOR Future<Void> updatedChangedDatacenters(ClusterControllerData *self) {
if( worker.priorityInfo.dcFitness != newFitness ) {
worker.priorityInfo.dcFitness = newFitness;
if(!worker.reply.isSet()) {
worker.reply.send( RegisterWorkerReply( worker.details.processClass, worker.priorityInfo, worker.storageCacheInfo ) );
worker.reply.send( RegisterWorkerReply( worker.details.processClass, worker.priorityInfo ) );
}
}
} else {
@ -2754,7 +2579,7 @@ ACTOR Future<Void> updatedChangedDatacenters(ClusterControllerData *self) {
updated = true;
it.second.priorityInfo.dcFitness = fitness;
if(!it.second.reply.isSet()) {
it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo, it.second.storageCacheInfo ) );
it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo ) );
}
}
}
@ -2908,7 +2733,7 @@ ACTOR Future<DataDistributorInterface> startDataDistributor( ClusterControllerDa
if (self->onMasterIsBetter(worker, ProcessClass::DataDistributor)) {
worker = self->id_worker[self->masterProcessId.get()].details;
}
InitializeDataDistributorRequest req(deterministicRandom()->randomUniqueID());
TraceEvent("CCDataDistributorRecruit", self->id).detail("Addr", worker.interf.address());
@ -3091,7 +2916,6 @@ ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf,
self.addActor.send( handleForcedRecoveries(&self, interf) );
self.addActor.send( monitorDataDistributor(&self) );
self.addActor.send( monitorRatekeeper(&self) );
self.addActor.send( monitorStorageCache(&self) );
self.addActor.send( dbInfoUpdater(&self) );
self.addActor.send( traceCounters("ClusterControllerMetrics", self.id, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &self.clusterControllerMetrics, self.id.toString() + "/ClusterControllerMetrics") );
self.addActor.send( traceRole(Role::CLUSTER_CONTROLLER, interf.id()) );

View File

@ -20,6 +20,10 @@
#include <set>
#include <sstream>
#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/ManagementAPI.actor.h"
@ -35,9 +39,11 @@
#include "fdbserver/TLogInterface.h"
#include "fdbserver/WaitFailure.h"
#include "flow/ActorCollection.h"
#include "flow/Arena.h"
#include "flow/Trace.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // This must be the last #include.
#include "flow/serialize.h"
class TCTeamInfo;
struct TCMachineInfo;
@ -4853,6 +4859,56 @@ ACTOR Future<Void> ddExclusionSafetyCheck(DistributorExclusionSafetyCheckRequest
return Void();
}
ACTOR Future<Void> waitFailCacheServer(Database* db, StorageServerInterface ssi) {
state Transaction tr(*db);
state Key key = storageCacheServerKey(ssi.id());
wait(waitFailureClient(ssi.waitFailure));
loop {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
tr.addReadConflictRange(storageCacheServerKeys);
tr.clear(key);
wait(tr.commit());
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
return Void();
}
ACTOR Future<Void> cacheServerWatcher(Database* db) {
state Transaction tr(*db);
state ActorCollection actors(false);
state std::set<UID> knownCaches;
loop {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
Standalone<RangeResultRef> range = wait(tr.getRange(storageCacheServerKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT(!range.more);
std::set<UID> caches;
for (auto& kv : range) {
UID id;
BinaryReader reader{kv.key.removePrefix(storageCacheServersPrefix), Unversioned()};
reader >> id;
caches.insert(id);
if (knownCaches.find(id) == knownCaches.end()) {
StorageServerInterface ssi;
BinaryReader reader{kv.value, IncludeVersion()};
reader >> ssi;
actors.add(waitFailCacheServer(db, ssi));
}
}
knownCaches = std::move(caches);
tr.reset();
wait(delay(5.0) || actors.getResult());
ASSERT(!actors.getResult().isReady());
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncVar<struct ServerDBInfo>> db ) {
state Reference<DataDistributorData> self( new DataDistributorData(db, di.id()) );
state Future<Void> collection = actorCollection( self->addActor.getFuture() );
@ -4865,6 +4921,7 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
try {
TraceEvent("DataDistributorRunning", di.id());
self->addActor.send( waitFailureServer(di.waitFailure.getFuture()) );
self->addActor.send(cacheServerWatcher(&cx));
state Future<Void> distributor = reportErrorsExcept( dataDistribution(self, getShardMetricsList), "DataDistribution", di.id(), &normalDataDistributorErrors() );
loop choose {

View File

@ -1013,7 +1013,7 @@ private:
ASSERT( nextPageSeq%sizeof(Page)==0 );
auto& p = backPage();
memset(&p, 0, sizeof(Page)); // FIXME: unnecessary?
memset(static_cast<void*>(&p), 0, sizeof(Page)); // FIXME: unnecessary?
p.magic = 0xFDB;
switch (diskQueueVersion) {
case DiskQueueVersion::V0:

View File

@ -565,8 +565,8 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( DEGRADED_WARNING_RESET_DELAY, 7*24*60*60 );
init( TRACE_LOG_FLUSH_FAILURE_CHECK_INTERVAL_SECONDS, 10 );
init( TRACE_LOG_PING_TIMEOUT_SECONDS, 5.0 );
init( MIN_DELAY_STORAGE_CANDIDACY_SECONDS, 10.0 );
init( MAX_DELAY_STORAGE_CANDIDACY_SECONDS, 30.0 );
init( MIN_DELAY_CC_WORST_FIT_CANDIDACY_SECONDS, 10.0 );
init( MAX_DELAY_CC_WORST_FIT_CANDIDACY_SECONDS, 30.0 );
init( DBINFO_FAILED_DELAY, 1.0 );
// Test harness
@ -630,6 +630,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( REDWOOD_DEFAULT_PAGE_SIZE, 4096 );
init( REDWOOD_KVSTORE_CONCURRENT_READS, 64 );
init( REDWOOD_COMMIT_CONCURRENT_READS, 64 );
init( REDWOOD_PAGE_REBUILD_FILL_FACTOR, 0.66 );
init( REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES, 10 );
init( REDWOOD_LAZY_CLEAR_MIN_PAGES, 0 );

View File

@ -494,8 +494,8 @@ public:
double DEGRADED_WARNING_RESET_DELAY;
int64_t TRACE_LOG_FLUSH_FAILURE_CHECK_INTERVAL_SECONDS;
double TRACE_LOG_PING_TIMEOUT_SECONDS;
double MIN_DELAY_STORAGE_CANDIDACY_SECONDS; // Listen for a leader for N seconds, and if not heard, then try to become the leader.
double MAX_DELAY_STORAGE_CANDIDACY_SECONDS;
double MIN_DELAY_CC_WORST_FIT_CANDIDACY_SECONDS; // Listen for a leader for N seconds, and if not heard, then try to become the leader.
double MAX_DELAY_CC_WORST_FIT_CANDIDACY_SECONDS;
double DBINFO_FAILED_DELAY;
// Test harness
@ -562,6 +562,7 @@ public:
int REDWOOD_DEFAULT_PAGE_SIZE; // Page size for new Redwood files
int REDWOOD_KVSTORE_CONCURRENT_READS; // Max number of simultaneous point or range reads in progress.
int REDWOOD_COMMIT_CONCURRENT_READS; // Max number of concurrent reads done to support commit operations
double REDWOOD_PAGE_REBUILD_FILL_FACTOR; // When rebuilding pages, start a new page after this capacity
int REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES; // Number of pages to try to pop from the lazy delete queue and process at once
int REDWOOD_LAZY_CLEAR_MIN_PAGES; // Minimum number of pages to free before ending a lazy clear cycle, unless the queue is empty
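A small worked example of the new fill-factor knob (illustrative only; exactly which capacity the factor is applied against is an assumption here): with the defaults above, REDWOOD_DEFAULT_PAGE_SIZE = 4096 and REDWOOD_PAGE_REBUILD_FILL_FACTOR = 0.66, a rebuild would start a new page once roughly 4096 * 0.66, about 2703 bytes, of the page are used.
	// Illustrative arithmetic only, not code from this patch.
	int pageSize = 4096;                                 // REDWOOD_DEFAULT_PAGE_SIZE
	double fillFactor = 0.66;                            // REDWOOD_PAGE_REBUILD_FILL_FACTOR
	int rebuildFillLimit = int(pageSize * fillFactor);   // ~2703 bytes before starting a new page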

View File

@ -42,8 +42,10 @@ struct LogRouterData {
TagData( Tag tag, Version popped, Version durableKnownCommittedVersion ) : tag(tag), popped(popped), durableKnownCommittedVersion(durableKnownCommittedVersion) {}
TagData(TagData&& r) BOOST_NOEXCEPT : version_messages(std::move(r.version_messages)), tag(r.tag), popped(r.popped), durableKnownCommittedVersion(r.durableKnownCommittedVersion) {}
void operator= (TagData&& r) BOOST_NOEXCEPT {
TagData(TagData&& r) noexcept
: version_messages(std::move(r.version_messages)), tag(r.tag), popped(r.popped),
durableKnownCommittedVersion(r.durableKnownCommittedVersion) {}
void operator=(TagData&& r) noexcept {
version_messages = std::move(r.version_messages);
tag = r.tag;
popped = r.popped;

View File

@ -738,10 +738,10 @@ void ILogSystem::SetPeekCursor::advanceTo(LogMessageVersion n) {
ACTOR Future<Void> setPeekGetMore(ILogSystem::SetPeekCursor* self, LogMessageVersion startVersion, TaskPriority taskID) {
loop {
//TraceEvent("LPC_GetMore1", self->randomID).detail("Start", startVersion.toString()).detail("Tag", self->tag);
//TraceEvent("LPC_GetMore1", self->randomID).detail("Start", startVersion.toString()).detail("Tag", self->tag.toString());
if(self->bestServer >= 0 && self->bestSet >= 0 && self->serverCursors[self->bestSet][self->bestServer]->isActive()) {
ASSERT(!self->serverCursors[self->bestSet][self->bestServer]->hasMessage());
//TraceEvent("LPC_GetMore2", self->randomID).detail("Start", startVersion.toString()).detail("Tag", self->tag);
//TraceEvent("LPC_GetMore2", self->randomID).detail("Start", startVersion.toString()).detail("Tag", self->tag.toString());
wait( self->serverCursors[self->bestSet][self->bestServer]->getMore(taskID) || self->serverCursors[self->bestSet][self->bestServer]->onFailed() );
self->useBestSet = true;
} else {
@ -778,7 +778,7 @@ ACTOR Future<Void> setPeekGetMore(ILogSystem::SetPeekCursor* self, LogMessageVer
} else {
//FIXME: this will peek way too many cursors when satellites exist, and does not need to peek bestSet cursors since we cannot get any more data from them
vector<Future<Void>> q;
//TraceEvent("LPC_GetMore4", self->randomID).detail("Start", startVersion.toString()).detail("Tag", self->tag);
//TraceEvent("LPC_GetMore4", self->randomID).detail("Start", startVersion.toString()).detail("Tag", self->tag.toString());
for(auto& cursors : self->serverCursors) {
for (auto& c :cursors) {
if (!c->hasMessage()) {

View File

@ -163,18 +163,21 @@ struct GetCommitVersionReply {
struct GetCommitVersionRequest {
constexpr static FileIdentifier file_identifier = 16683181;
SpanID spanContext;
uint64_t requestNum;
uint64_t mostRecentProcessedRequestNum;
UID requestingProxy;
ReplyPromise<GetCommitVersionReply> reply;
GetCommitVersionRequest() { }
GetCommitVersionRequest(uint64_t requestNum, uint64_t mostRecentProcessedRequestNum, UID requestingProxy)
: requestNum(requestNum), mostRecentProcessedRequestNum(mostRecentProcessedRequestNum), requestingProxy(requestingProxy) {}
GetCommitVersionRequest(SpanID spanContext, uint64_t requestNum, uint64_t mostRecentProcessedRequestNum,
UID requestingProxy)
: spanContext(spanContext), requestNum(requestNum), mostRecentProcessedRequestNum(mostRecentProcessedRequestNum),
requestingProxy(requestingProxy) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, requestNum, mostRecentProcessedRequestNum, requestingProxy, reply);
serializer(ar, requestNum, mostRecentProcessedRequestNum, requestingProxy, reply, spanContext);
}
};

View File

@ -44,10 +44,13 @@
#include "fdbserver/WaitFailure.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "flow/ActorCollection.h"
#include "flow/IRandom.h"
#include "flow/Knobs.h"
#include "flow/Stats.h"
#include "flow/TDMetric.actor.h"
#include "flow/Tracing.h"
#include "flow/actorcompiler.h" // This must be the last #include.
#include <tuple>
ACTOR Future<Void> broadcastTxnRequest(TxnStateRequest req, int sendAmount, bool sendReply) {
state ReplyPromise<Void> reply = req.reply;
@ -287,9 +290,9 @@ ACTOR Future<Void> getRate(UID myID, Reference<AsyncVar<ServerDBInfo>> db, int64
ACTOR Future<Void> queueTransactionStartRequests(
Reference<AsyncVar<ServerDBInfo>> db,
Deque<GetReadVersionRequest> *systemQueue,
Deque<GetReadVersionRequest> *defaultQueue,
Deque<GetReadVersionRequest> *batchQueue,
SpannedDeque<GetReadVersionRequest> *systemQueue,
SpannedDeque<GetReadVersionRequest> *defaultQueue,
SpannedDeque<GetReadVersionRequest> *batchQueue,
FutureStream<GetReadVersionRequest> readVersionRequests,
PromiseStream<Void> GRVTimer, double *lastGRVTime,
double *GRVBatchTime, FutureStream<double> replyTimes,
@ -326,9 +329,11 @@ ACTOR Future<Void> queueTransactionStartRequests(
if (req.priority >= TransactionPriority::IMMEDIATE) {
stats->txnSystemPriorityStartIn += req.transactionCount;
systemQueue->push_back(req);
systemQueue->span->parents.insert(req.spanContext);
} else if (req.priority >= TransactionPriority::DEFAULT) {
stats->txnDefaultPriorityStartIn += req.transactionCount;
defaultQueue->push_back(req);
defaultQueue->span->parents.insert(req.spanContext);
} else {
// Return error for batch_priority GRV requests
int64_t proxiesCount = std::max((int)db->get().client.proxies.size(), 1);
@ -340,6 +345,7 @@ ACTOR Future<Void> queueTransactionStartRequests(
stats->txnBatchPriorityStartIn += req.transactionCount;
batchQueue->push_back(req);
batchQueue->span->parents.insert(req.spanContext);
}
}
}
@ -505,8 +511,11 @@ struct ResolutionRequestBuilder {
// [CommitTransactionRef_Index][Resolver_Index][Read_Conflict_Range_Index_on_Resolver]
// -> read_conflict_range's original index in the commitTransactionRef
ResolutionRequestBuilder( ProxyCommitData* self, Version version, Version prevVersion, Version lastReceivedVersion) : self(self), requests(self->resolvers.size()) {
for(auto& req : requests) {
ResolutionRequestBuilder(ProxyCommitData* self, Version version, Version prevVersion, Version lastReceivedVersion,
Span& parentSpan)
: self(self), requests(self->resolvers.size()) {
for (auto& req : requests) {
req.spanContext = parentSpan->context;
req.prevVersion = prevVersion;
req.version = version;
req.lastReceivedVersion = lastReceivedVersion;
@ -790,6 +799,7 @@ ACTOR Future<Void> commitBatch(
state Optional<UID> debugID;
state bool forceRecovery = false;
state int batchOperations = 0;
state Span span("MP:commitBatch"_loc);
int64_t batchBytes = 0;
for (int t = 0; t<trs.size(); t++) {
batchOperations += trs[t].transaction.mutations.size();
@ -812,6 +822,7 @@ ACTOR Future<Void> commitBatch(
debugID = nondeterministicRandom()->randomUniqueID();
g_traceBatch.addAttach("CommitAttachID", trs[t].debugID.get().first(), debugID.get().first());
}
span->parents.insert(trs[t].spanContext);
}
if(localBatchNumber == 2 && !debugID.present() && self->firstProxy && !g_network->isSimulated()) {
@ -832,7 +843,7 @@ ACTOR Future<Void> commitBatch(
if (debugID.present())
g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "MasterProxyServer.commitBatch.GettingCommitVersion");
GetCommitVersionRequest req(self->commitVersionRequestNumber++, self->mostRecentProcessedRequestNumber, self->dbgid);
GetCommitVersionRequest req(span->context, self->commitVersionRequestNumber++, self->mostRecentProcessedRequestNumber, self->dbgid);
GetCommitVersionReply versionReply = wait( brokenPromiseToNever(self->master.getCommitVersion.getReply(req, TaskPriority::ProxyMasterVersionReply)) );
self->mostRecentProcessedRequestNumber = versionReply.requestNum;
@ -853,7 +864,7 @@ ACTOR Future<Void> commitBatch(
if (debugID.present())
g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "MasterProxyServer.commitBatch.GotCommitVersion");
ResolutionRequestBuilder requests( self, commitVersion, prevVersion, self->version );
ResolutionRequestBuilder requests( self, commitVersion, prevVersion, self->version, span );
int conflictRangeCount = 0;
state int64_t maxTransactionBytes = 0;
for (int t = 0; t<trs.size(); t++) {
@ -1166,27 +1177,32 @@ ACTOR Future<Void> commitBatch(
// We prevent this by limiting the number of versions which are semi-committed but not fully committed to be less than the MVCC window
if(self->committedVersion.get() < commitVersion - SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS) {
computeDuration += g_network->timer() - computeStart;
state Span waitVersionSpan;
while (self->committedVersion.get() < commitVersion - SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS) {
// This should be *extremely* rare in the real world, but knob buggification should make it happen in simulation
TEST(true); // Semi-committed pipeline limited by MVCC window
//TraceEvent("ProxyWaitingForCommitted", self->dbgid).detail("CommittedVersion", self->committedVersion.get()).detail("NeedToCommit", commitVersion);
waitVersionSpan = Span(deterministicRandom()->randomUniqueID(), "MP:overMaxReadTransactionLifeVersions"_loc, {span->context});
choose{
when(wait(self->committedVersion.whenAtLeast(commitVersion - SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS))) {
wait(yield());
break;
}
when(GetReadVersionReply v = wait(self->getConsistentReadVersion.getReply(GetReadVersionRequest(0, TransactionPriority::IMMEDIATE, GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY)))) {
if(v.version > self->committedVersion.get()) {
when(GetReadVersionReply v = wait(self->getConsistentReadVersion.getReply(
GetReadVersionRequest(waitVersionSpan->context, 0, TransactionPriority::IMMEDIATE,
GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY)))) {
if (v.version > self->committedVersion.get()) {
self->locked = v.locked;
self->metadataVersion = v.metadataVersion;
self->committedVersion.set(v.version);
}
if (self->committedVersion.get() < commitVersion - SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS)
wait(delay(SERVER_KNOBS->PROXY_SPIN_DELAY));
}
}
}
waitVersionSpan = Span{};
computeStart = g_network->timer();
}
@ -1386,21 +1402,22 @@ ACTOR Future<Void> updateLastCommit(ProxyCommitData* self, Optional<UID> debugID
return Void();
}
ACTOR Future<GetReadVersionReply> getLiveCommittedVersion(ProxyCommitData* commitData, uint32_t flags, vector<MasterProxyInterface> *otherProxies, Optional<UID> debugID,
ACTOR Future<GetReadVersionReply> getLiveCommittedVersion(Span parentSpan, ProxyCommitData* commitData, uint32_t flags, vector<MasterProxyInterface> *otherProxies, Optional<UID> debugID,
int transactionCount, int systemTransactionCount, int defaultPriTransactionCount, int batchPriTransactionCount)
{
// Returns a version which (1) is committed, and (2) is >= the latest version reported committed (by a commit response) when this request was sent
// (1) The version returned is the committedVersion of some proxy at some point before the request returns, so it is committed.
// (2) No proxy on our list reported committed a higher version before this request was received, because then its committedVersion would have been higher,
// and no other proxy could have already committed anything without first ending the epoch
state Span span("MP:getLiveCommittedVersion"_loc, parentSpan);
++commitData->stats.txnStartBatch;
state vector<Future<GetReadVersionReply>> proxyVersions;
state Future<GetReadVersionReply> replyFromMasterFuture;
if (SERVER_KNOBS->ASK_READ_VERSION_FROM_MASTER) {
replyFromMasterFuture = commitData->master.getLiveCommittedVersion.getReply(GetRawCommittedVersionRequest(debugID), TaskPriority::ProxyMasterVersionReply);
replyFromMasterFuture = commitData->master.getLiveCommittedVersion.getReply(GetRawCommittedVersionRequest(span->context, debugID), TaskPriority::ProxyMasterVersionReply);
} else {
for (auto const& p : *otherProxies)
proxyVersions.push_back(brokenPromiseToNever(p.getRawCommittedVersion.getReply(GetRawCommittedVersionRequest(debugID), TaskPriority::TLogConfirmRunningReply)));
proxyVersions.push_back(brokenPromiseToNever(p.getRawCommittedVersion.getReply(GetRawCommittedVersionRequest(span->context, debugID), TaskPriority::TLogConfirmRunningReply)));
}
if (!SERVER_KNOBS->ALWAYS_CAUSAL_READ_RISKY && !(flags&GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY)) {
@ -1508,15 +1525,16 @@ ACTOR static Future<Void> transactionStarter(
state TransactionRateInfo normalRateInfo(10);
state TransactionRateInfo batchRateInfo(0);
state Deque<GetReadVersionRequest> systemQueue;
state Deque<GetReadVersionRequest> defaultQueue;
state Deque<GetReadVersionRequest> batchQueue;
state SpannedDeque<GetReadVersionRequest> systemQueue("MP:transactionStarterSystemQueue"_loc);
state SpannedDeque<GetReadVersionRequest> defaultQueue("MP:transactionStarterDefaultQueue"_loc);
state SpannedDeque<GetReadVersionRequest> batchQueue("MP:transactionStarterBatchQueue"_loc);
state vector<MasterProxyInterface> otherProxies;
state TransactionTagMap<uint64_t> transactionTagCounter;
state PrioritizedTransactionTagMap<ClientTagThrottleLimits> throttledTags;
state PromiseStream<double> replyTimes;
state Span span;
addActor.send(getRate(proxy.id(), db, &transactionCount, &batchTransactionCount, &normalRateInfo, &batchRateInfo, healthMetricsReply, detailedHealthMetricsReply, &transactionTagCounter, &throttledTags));
addActor.send(queueTransactionStartRequests(db, &systemQueue, &defaultQueue, &batchQueue, proxy.getConsistentReadVersion.getFuture(),
@ -1558,13 +1576,16 @@ ACTOR static Future<Void> transactionStarter(
int requestsToStart = 0;
while (requestsToStart < SERVER_KNOBS->START_TRANSACTION_MAX_REQUESTS_TO_START) {
Deque<GetReadVersionRequest>* transactionQueue;
SpannedDeque<GetReadVersionRequest>* transactionQueue;
if(!systemQueue.empty()) {
transactionQueue = &systemQueue;
span = systemQueue.resetSpan();
} else if(!defaultQueue.empty()) {
transactionQueue = &defaultQueue;
span = defaultQueue.resetSpan();
} else if(!batchQueue.empty()) {
transactionQueue = &batchQueue;
span = batchQueue.resetSpan();
} else {
break;
}
@ -1629,7 +1650,9 @@ ACTOR static Future<Void> transactionStarter(
for (int i = 0; i < start.size(); i++) {
if (start[i].size()) {
Future<GetReadVersionReply> readVersionReply = getLiveCommittedVersion(commitData, i, &otherProxies, debugID, transactionsStarted[i], systemTransactionsStarted[i], defaultPriTransactionsStarted[i], batchPriTransactionsStarted[i]);
Future<GetReadVersionReply> readVersionReply = getLiveCommittedVersion(
span, commitData, i, &otherProxies, debugID, transactionsStarted[i], systemTransactionsStarted[i],
defaultPriTransactionsStarted[i], batchPriTransactionsStarted[i]);
addActor.send(sendGrvReplies(readVersionReply, start[i], &commitData->stats,
commitData->minKnownCommittedVersion, throttledTags));
@ -1639,6 +1662,7 @@ ACTOR static Future<Void> transactionStarter(
}
}
}
span.reset();
}
}
@ -2097,6 +2121,7 @@ ACTOR Future<Void> masterProxyServerCore(
}
when(GetRawCommittedVersionRequest req = waitNext(proxy.getRawCommittedVersion.getFuture())) {
//TraceEvent("ProxyGetRCV", proxy.id());
Span span("MP:getRawCommittedReadVersion"_loc, { req.spanContext });
if (req.debugID.present())
g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion");
GetReadVersionReply rep;
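The SpannedDeque used above is not defined in this excerpt; judging from its call sites (construction from a "..."_loc location, empty(), push_back(), a public span whose parents collect the queued requests' contexts, and resetSpan() handing the accumulated span to the caller), it is presumably a thin wrapper along the following lines. This is an assumed shape reconstructed from usage, not the actual definition, and it assumes the location literal type from flow/Tracing.h is named Location:
	// Assumed shape of SpannedDeque, reconstructed from the call sites above.
	template <class T>
	struct SpannedDeque {
		Deque<T> queue;
		Span span; // accumulates parent contexts of queued requests
		explicit SpannedDeque(Location loc) : span(loc), location(loc) {}
		bool empty() const { return queue.empty(); }
		void push_back(const T& t) { queue.push_back(t); }
		// Hand the span (with its collected parents) to the caller and start a fresh one.
		Span resetSpan() {
			Span s = std::move(span);
			span = Span(location);
			return s;
		}
	private:
		Location location;
	};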

View File

@ -324,17 +324,19 @@ namespace oldTLog_4_6 {
TagData( Version popped, bool nothing_persistent, bool popped_recently, OldTag tag ) : nothing_persistent(nothing_persistent), popped(popped), popped_recently(popped_recently), update_version_sizes(tag != txsTagOld) {}
TagData(TagData&& r) BOOST_NOEXCEPT : version_messages(std::move(r.version_messages)), nothing_persistent(r.nothing_persistent), popped_recently(r.popped_recently), popped(r.popped), update_version_sizes(r.update_version_sizes) {}
void operator= (TagData&& r) BOOST_NOEXCEPT {
version_messages = std::move(r.version_messages);
nothing_persistent = r.nothing_persistent;
TagData(TagData&& r) noexcept
: version_messages(std::move(r.version_messages)), nothing_persistent(r.nothing_persistent),
popped_recently(r.popped_recently), popped(r.popped), update_version_sizes(r.update_version_sizes) {}
void operator=(TagData&& r) noexcept {
version_messages = std::move(r.version_messages);
nothing_persistent = r.nothing_persistent;
popped_recently = r.popped_recently;
popped = r.popped;
update_version_sizes = r.update_version_sizes;
}
}
// Erase messages not needed to update *from* versions >= before (thus, messages with toversion <= before)
ACTOR Future<Void> eraseMessagesBefore( TagData *self, Version before, int64_t* gBytesErased, Reference<LogData> tlogData, TaskPriority taskID ) {
// Erase messages not needed to update *from* versions >= before (thus, messages with toversion <= before)
ACTOR Future<Void> eraseMessagesBefore( TagData *self, Version before, int64_t* gBytesErased, Reference<LogData> tlogData, TaskPriority taskID ) {
while(!self->version_messages.empty() && self->version_messages.front().first < before) {
Version version = self->version_messages.front().first;
std::pair<int, int> &sizes = tlogData->version_sizes[version];

View File

@ -310,8 +310,10 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
TagData( Tag tag, Version popped, bool nothingPersistent, bool poppedRecently, bool unpoppedRecovered ) : tag(tag), nothingPersistent(nothingPersistent), popped(popped), poppedRecently(poppedRecently), unpoppedRecovered(unpoppedRecovered) {}
TagData(TagData&& r) BOOST_NOEXCEPT : versionMessages(std::move(r.versionMessages)), nothingPersistent(r.nothingPersistent), poppedRecently(r.poppedRecently), popped(r.popped), tag(r.tag), unpoppedRecovered(r.unpoppedRecovered) {}
void operator= (TagData&& r) BOOST_NOEXCEPT {
TagData(TagData&& r) noexcept
: versionMessages(std::move(r.versionMessages)), nothingPersistent(r.nothingPersistent),
poppedRecently(r.poppedRecently), popped(r.popped), tag(r.tag), unpoppedRecovered(r.unpoppedRecovered) {}
void operator=(TagData&& r) noexcept {
versionMessages = std::move(r.versionMessages);
nothingPersistent = r.nothingPersistent;
poppedRecently = r.poppedRecently;

View File

@ -375,8 +375,12 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
TagData( Tag tag, Version popped, IDiskQueue::location poppedLocation, bool nothingPersistent, bool poppedRecently, bool unpoppedRecovered ) : tag(tag), nothingPersistent(nothingPersistent), poppedRecently(poppedRecently), popped(popped), persistentPopped(0), versionForPoppedLocation(0), poppedLocation(poppedLocation), unpoppedRecovered(unpoppedRecovered) {}
TagData(TagData&& r) BOOST_NOEXCEPT : versionMessages(std::move(r.versionMessages)), nothingPersistent(r.nothingPersistent), poppedRecently(r.poppedRecently), popped(r.popped), persistentPopped(r.persistentPopped), versionForPoppedLocation(r.versionForPoppedLocation), poppedLocation(r.poppedLocation), tag(r.tag), unpoppedRecovered(r.unpoppedRecovered) {}
void operator= (TagData&& r) BOOST_NOEXCEPT {
TagData(TagData&& r) noexcept
: versionMessages(std::move(r.versionMessages)), nothingPersistent(r.nothingPersistent),
poppedRecently(r.poppedRecently), popped(r.popped), persistentPopped(r.persistentPopped),
versionForPoppedLocation(r.versionForPoppedLocation), poppedLocation(r.poppedLocation), tag(r.tag),
unpoppedRecovered(r.unpoppedRecovered) {}
void operator=(TagData&& r) noexcept {
versionMessages = std::move(r.versionMessages);
nothingPersistent = r.nothingPersistent;
poppedRecently = r.poppedRecently;

View File

@ -20,9 +20,15 @@
#ifndef FDBSERVER_RESOLVERINTERFACE_H
#define FDBSERVER_RESOLVERINTERFACE_H
#include "fdbclient/CommitTransaction.h"
#include "fdbrpc/Locality.h"
#include "fdbrpc/fdbrpc.h"
#pragma once
#include "fdbrpc/Locality.h"
#include "fdbrpc/fdbrpc.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/CommitTransaction.h"
struct ResolverInterface {
constexpr static FileIdentifier file_identifier = 1755944;
@ -91,17 +97,19 @@ struct ResolveTransactionBatchRequest {
constexpr static FileIdentifier file_identifier = 16462858;
Arena arena;
SpanID spanContext;
Version prevVersion;
Version version; // FIXME: ?
Version lastReceivedVersion;
VectorRef<CommitTransactionRef> transactions;
VectorRef<struct CommitTransactionRef> transactions;
VectorRef<int> txnStateTransactions; // Offsets of elements of transactions that have (transaction subsystem state) mutations
ReplyPromise<ResolveTransactionBatchReply> reply;
Optional<UID> debugID;
template <class Archive>
void serialize(Archive& ar) {
serializer(ar, prevVersion, version, lastReceivedVersion, transactions, txnStateTransactions, reply, arena, debugID);
serializer(ar, prevVersion, version, lastReceivedVersion, transactions, txnStateTransactions, reply, arena,
debugID, spanContext);
}
};

View File

@ -83,6 +83,7 @@ ACTOR Future<Void> restoreApplierCore(RestoreApplierInterface applierInterf, int
updateProcessStats(self);
updateProcessStatsTimer = delay(SERVER_KNOBS->FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL);
}
when(wait(actors.getResult())) {}
when(wait(exitRole)) {
TraceEvent("RestoreApplierCoreExitRole", self->id());
break;
@ -92,6 +93,7 @@ ACTOR Future<Void> restoreApplierCore(RestoreApplierInterface applierInterf, int
TraceEvent(SevWarn, "FastRestoreApplierError", self->id())
.detail("RequestType", requestTypeStr)
.error(e, true);
actors.clear(false);
break;
}
}
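The two added lines change how background work inside the applier surfaces failures: with ActorCollection(false), getResult() stays unready during normal operation (the ASSERT in cacheServerWatcher above relies on the same property) and only completes by rethrowing an error from one of the collected actors, so waiting on it inside the choose propagates those errors; the added actors.clear(false) then drops the collected actors before breaking out. A minimal sketch of the pattern (hypothetical actor, assumes flow's ACTOR compiler; not code from this patch):
	ACTOR Future<Void> runWithBackground(Future<Void> mainWork, Future<Void> background) {
		state ActorCollection actors(false);
		actors.add(background);
		choose {
			when(wait(mainWork)) {}
			when(wait(actors.getResult())) {} // unready unless 'background' throws; rethrows its error
		}
		return Void();
	}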
@ -179,7 +181,6 @@ ACTOR static Future<Void> applyClearRangeMutations(Standalone<VectorRef<KeyRange
.detail("DelayTime", delayTime);
loop {
try {
tr->reset();
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
for (auto& range : ranges) {
@ -216,47 +217,50 @@ ACTOR static Future<Void> getAndComputeStagingKeys(
std::map<Key, std::map<Key, StagingKey>::iterator> incompleteStagingKeys, double delayTime, Database cx,
UID applierID, int batchIndex) {
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
state std::vector<Future<Optional<Value>>> fValues;
state std::vector<Future<Optional<Value>>> fValues(incompleteStagingKeys.size(), Never());
state int retries = 0;
state UID randomID = deterministicRandom()->randomUniqueID();
wait(delay(delayTime + deterministicRandom()->random01() * delayTime));
TraceEvent("FastRestoreApplierGetAndComputeStagingKeysStart", applierID)
.detail("RandomUID", randomID)
.detail("BatchIndex", batchIndex)
.detail("GetKeys", incompleteStagingKeys.size())
.detail("DelayTime", delayTime);
loop {
try {
tr->reset();
int i = 0;
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
for (auto& key : incompleteStagingKeys) {
fValues.push_back(tr->get(key.first));
fValues[i++] = tr->get(key.first);
}
wait(waitForAll(fValues));
break;
} catch (Error& e) {
if (retries++ > 10) { // TODO: Can we stop retry at the first error?
TraceEvent(SevWarn, "FastRestoreApplierGetAndComputeStagingKeysGetKeysStuck", applierID)
if (retries++ > incompleteStagingKeys.size()) {
TraceEvent(SevWarnAlways, "GetAndComputeStagingKeys", applierID)
.suppressFor(1.0)
.detail("RandomUID", randomID)
.detail("BatchIndex", batchIndex)
.detail("GetKeys", incompleteStagingKeys.size())
.error(e);
break;
}
wait(tr->onError(e));
fValues.clear();
}
}
ASSERT(fValues.size() == incompleteStagingKeys.size());
int i = 0;
for (auto& key : incompleteStagingKeys) {
if (!fValues[i].get().present()) { // Debug info to understand which key does not exist in DB
if (!fValues[i].get().present()) { // Key does not exist in DB
// if condition: fValues[i].Valid() && fValues[i].isReady() && !fValues[i].isError() &&
TraceEvent(SevWarn, "FastRestoreApplierGetAndComputeStagingKeysNoBaseValueInDB", applierID)
.detail("BatchIndex", batchIndex)
.detail("Key", key.first)
.detail("Reason", "Not found in DB")
.detail("IsReady", fValues[i].isReady())
.detail("PendingMutations", key.second->second.pendingMutations.size())
.detail("StagingKeyType", (int)key.second->second.type);
.detail("StagingKeyType", getTypeString(key.second->second.type));
for (auto& vm : key.second->second.pendingMutations) {
TraceEvent(SevWarn, "FastRestoreApplierGetAndComputeStagingKeysNoBaseValueInDB")
.detail("PendingMutationVersion", vm.first.toString())
@ -274,8 +278,10 @@ ACTOR static Future<Void> getAndComputeStagingKeys(
}
TraceEvent("FastRestoreApplierGetAndComputeStagingKeysDone", applierID)
.detail("RandomUID", randomID)
.detail("BatchIndex", batchIndex)
.detail("GetKeys", incompleteStagingKeys.size());
.detail("GetKeys", incompleteStagingKeys.size())
.detail("DelayTime", delayTime);
return Void();
}
@ -404,7 +410,6 @@ ACTOR static Future<Void> applyStagingKeysBatch(std::map<Key, StagingKey>::itera
TraceEvent("FastRestoreApplierPhaseApplyStagingKeysBatch", applierID).detail("Begin", begin->first);
loop {
try {
tr->reset();
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
std::map<Key, StagingKey>::iterator iter = begin;
@ -502,6 +507,7 @@ ACTOR static Future<Void> handleApplyToDBRequest(RestoreVersionBatchRequest req,
.detail("FinishedBatch", self->finishedBatch.get());
// Ensure batch (i-1) is applied before batch i
// TODO: Add a counter to warn when too many requests are waiting on the actor
wait(self->finishedBatch.whenAtLeast(req.batchIndex - 1));
state bool isDuplicated = true;
@ -523,6 +529,8 @@ ACTOR static Future<Void> handleApplyToDBRequest(RestoreVersionBatchRequest req,
}
ASSERT(batchData->dbApplier.present());
ASSERT(!batchData->dbApplier.get().isError()); // writeMutationsToDB actor cannot have error.
// We cannot blindly retry because it is not idempotent
wait(batchData->dbApplier.get());
@ -578,4 +586,4 @@ Value applyAtomicOp(Optional<StringRef> existingValue, Value value, MutationRef:
ASSERT(false);
}
return Value();
}
}

View File

@ -123,7 +123,8 @@ struct StagingKey {
.detail("Value", val)
.detail("MType", type < MutationRef::MAX_ATOMIC_OP ? getTypeString(type) : "[Unset]")
.detail("LargestPendingVersion",
(pendingMutations.empty() ? "[none]" : pendingMutations.rbegin()->first.toString()));
(pendingMutations.empty() ? "[none]" : pendingMutations.rbegin()->first.toString()))
.detail("PendingMutations", pendingMutations.size());
std::map<LogMessageVersion, Standalone<MutationRef>>::iterator lb = pendingMutations.lower_bound(version);
if (lb == pendingMutations.end()) {
return;

View File

@ -122,7 +122,7 @@ Future<Void> RestoreConfigFR::logError(Database cx, Error e, std::string const&
}
TraceEvent t(SevWarn, "FileRestoreError");
t.error(e).detail("RestoreUID", uid).detail("Description", details).detail("TaskInstance", (uint64_t)taskInstance);
// These should not happen
// key_not_found could happen
if (e.code() == error_code_key_not_found) t.backtrace();
return updateErrorInfo(cx, e, details);

View File

@ -111,13 +111,17 @@ ACTOR Future<Void> restoreLoaderCore(RestoreLoaderInterface loaderInterf, int no
updateProcessStats(self);
updateProcessStatsTimer = delay(SERVER_KNOBS->FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL);
}
when(wait(actors.getResult())) {}
when(wait(exitRole)) {
TraceEvent("FastRestoreLoaderCoreExitRole", self->id());
break;
}
}
} catch (Error& e) {
TraceEvent(SevWarn, "FastRestoreLoader", self->id()).detail("RequestType", requestTypeStr).error(e, true);
TraceEvent(SevWarn, "FastRestoreLoaderError", self->id())
.detail("RequestType", requestTypeStr)
.error(e, true);
actors.clear(false);
break;
}
}

View File

@ -198,17 +198,25 @@ ACTOR Future<Void> startProcessRestoreRequests(Reference<RestoreMasterData> self
TraceEvent("FastRestoreMasterWaitOnRestoreRequests", self->id()).detail("RestoreRequests", restoreRequests.size());
// DB has been locked where restore request is submitted
wait(clearDB(cx));
// TODO: Sanity check restoreRequests' key ranges do not overlap
// Step: Perform the restore requests
try {
for (restoreIndex = 0; restoreIndex < restoreRequests.size(); restoreIndex++) {
RestoreRequest& request = restoreRequests[restoreIndex];
state RestoreRequest request = restoreRequests[restoreIndex];
TraceEvent("FastRestoreMasterProcessRestoreRequests", self->id())
.detail("RestoreRequestInfo", request.toString());
// TODO: Initialize MasterData and all loaders and appliers' data for each restore request!
self->resetPerRestoreRequest();
// clear the key range that will be restored
wait(runRYWTransaction(cx, [=](Reference<ReadYourWritesTransaction> tr) -> Future<Void> {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->clear(request.range);
return Void();
}));
wait(success(processRestoreRequest(self, cx, request)));
wait(notifyRestoreCompleted(self, false));
}
@ -637,7 +645,6 @@ ACTOR static Future<Standalone<VectorRef<RestoreRequest>>> collectRestoreRequest
loop {
try {
TraceEvent("FastRestoreMasterPhaseCollectRestoreRequestsWait");
tr.reset();
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
@ -866,6 +873,7 @@ ACTOR static Future<Void> notifyApplierToApplyMutations(Reference<MasterBatchDat
}
ASSERT(batchData->applyToDB.present());
ASSERT(!batchData->applyToDB.get().isError());
wait(batchData->applyToDB.get());
// Sanity check all appliers have applied data to destination DB
@ -943,7 +951,7 @@ ACTOR static Future<Void> notifyRestoreCompleted(Reference<RestoreMasterData> se
ACTOR static Future<Void> signalRestoreCompleted(Reference<RestoreMasterData> self, Database cx) {
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
wait(notifyRestoreCompleted(self, true));
wait(notifyRestoreCompleted(self, true)); // notify workers the restore has completed
wait(delay(5.0)); // Give some time for loaders and appliers to exit

View File

@ -249,7 +249,6 @@ ACTOR Future<Void> startRestoreWorker(Reference<RestoreWorkerData> self, Restore
}
ACTOR static Future<Void> waitOnRestoreRequests(Database cx, UID nodeID = UID()) {
state Future<Void> watch4RestoreRequest;
state ReadYourWritesTransaction tr(cx);
state Optional<Value> numRequests;
@ -263,10 +262,10 @@ ACTOR static Future<Void> waitOnRestoreRequests(Database cx, UID nodeID = UID())
Optional<Value> _numRequests = wait(tr.get(restoreRequestTriggerKey));
numRequests = _numRequests;
if (!numRequests.present()) {
watch4RestoreRequest = tr.watch(restoreRequestTriggerKey);
state Future<Void> watchForRestoreRequest = tr.watch(restoreRequestTriggerKey);
wait(tr.commit());
TraceEvent(SevInfo, "FastRestoreWaitOnRestoreRequestTriggerKey", nodeID);
wait(watch4RestoreRequest);
wait(watchForRestoreRequest);
TraceEvent(SevInfo, "FastRestoreDetectRestoreRequestTriggerKeyChanged", nodeID);
} else {
TraceEvent(SevInfo, "FastRestoreRestoreRequestTriggerKey", nodeID)

View File

@ -55,7 +55,6 @@ struct ServerDBInfo {
LogSystemConfig logSystemConfig;
std::vector<UID> priorCommittedLogServers; // If !fullyRecovered and logSystemConfig refers to a new log system which may not have been committed to the coordinated state yet, then priorCommittedLogServers are the previous, fully committed generation which need to stay alive in case this recovery fails
Optional<LatencyBandConfig> latencyBandConfig;
std::vector<std::pair<uint16_t,StorageServerInterface>> storageCaches;
int64_t infoGeneration;
ServerDBInfo() : recoveryCount(0), recoveryState(RecoveryState::UNINITIALIZED), logSystemConfig(0), infoGeneration(0) {}
@ -65,7 +64,7 @@ struct ServerDBInfo {
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, id, clusterInterface, client, distributor, master, ratekeeper, resolvers, recoveryCount, recoveryState, masterLifetime, logSystemConfig, priorCommittedLogServers, latencyBandConfig, storageCaches, infoGeneration);
serializer(ar, id, clusterInterface, client, distributor, master, ratekeeper, resolvers, recoveryCount, recoveryState, masterLifetime, logSystemConfig, priorCommittedLogServers, latencyBandConfig, infoGeneration);
}
};

View File

@ -1220,9 +1220,9 @@ void setupSimulatedSystem(vector<Future<Void>>* systemActors, std::string baseFo
//FIXME: temporarily code to test storage cache
//TODO: caching disabled for this merge
//if(dc==0) {
// machines++;
//}
if(dc==0) {
machines++;
}
int useSeedForMachine = deterministicRandom()->randomInt(0, machines);
Standalone<StringRef> zoneId;
@ -1249,10 +1249,10 @@ void setupSimulatedSystem(vector<Future<Void>>* systemActors, std::string baseFo
//FIXME: temporarily code to test storage cache
//TODO: caching disabled for this merge
//if(machine==machines-1 && dc==0) {
// processClass = ProcessClass(ProcessClass::StorageCacheClass, ProcessClass::CommandLineSource);
// nonVersatileMachines++;
//}
if(machine==machines-1 && dc==0) {
processClass = ProcessClass(ProcessClass::StorageCacheClass, ProcessClass::CommandLineSource);
nonVersatileMachines++;
}
std::vector<IPAddress> ips;
for (int i = 0; i < processesPerMachine; i++) {

View File

@ -393,8 +393,8 @@ public:
}
}
~SkipList() { destroy(); }
SkipList(SkipList&& other) BOOST_NOEXCEPT : header(other.header) { other.header = NULL; }
void operator=(SkipList&& other) BOOST_NOEXCEPT {
SkipList(SkipList&& other) noexcept : header(other.header) { other.header = NULL; }
void operator=(SkipList&& other) noexcept {
destroy();
header = other.header;
other.header = NULL;

File diff suppressed because it is too large

View File

@ -28,6 +28,7 @@
#include "fdbclient/SystemData.h"
#include "fdbclient/FDBTypes.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/LogProtocolMessage.h"
#include "fdbserver/TLogInterface.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/IKeyValueStore.h"
@ -373,8 +374,12 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
TagData( Tag tag, Version popped, IDiskQueue::location poppedLocation, bool nothingPersistent, bool poppedRecently, bool unpoppedRecovered ) : tag(tag), nothingPersistent(nothingPersistent), poppedRecently(poppedRecently), popped(popped), persistentPopped(0), versionForPoppedLocation(0), poppedLocation(poppedLocation), unpoppedRecovered(unpoppedRecovered) {}
TagData(TagData&& r) BOOST_NOEXCEPT : versionMessages(std::move(r.versionMessages)), nothingPersistent(r.nothingPersistent), poppedRecently(r.poppedRecently), popped(r.popped), persistentPopped(r.persistentPopped), versionForPoppedLocation(r.versionForPoppedLocation), poppedLocation(r.poppedLocation), tag(r.tag), unpoppedRecovered(r.unpoppedRecovered) {}
void operator= (TagData&& r) BOOST_NOEXCEPT {
TagData(TagData&& r) noexcept
: versionMessages(std::move(r.versionMessages)), nothingPersistent(r.nothingPersistent),
poppedRecently(r.poppedRecently), popped(r.popped), persistentPopped(r.persistentPopped),
versionForPoppedLocation(r.versionForPoppedLocation), poppedLocation(r.poppedLocation), tag(r.tag),
unpoppedRecovered(r.unpoppedRecovered) {}
void operator=(TagData&& r) noexcept {
versionMessages = std::move(r.versionMessages);
nothingPersistent = r.nothingPersistent;
poppedRecently = r.poppedRecently;
@ -1677,7 +1682,10 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
reply.end = endVersion;
reply.onlySpilled = onlySpilled;
//TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress());
//TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("Tag", req.tag.toString()).
// detail("BeginVer", req.begin).detail("EndVer", reply.end).
// detail("MsgBytes", reply.messages.expectedSize()).
// detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress());
if(req.sequence.present()) {
auto& trackerData = logData->peekTracker[peekId];

View File

@ -28,6 +28,7 @@
#include "fdbrpc/Replication.h"
#include "fdbrpc/ReplicationUtils.h"
#include "fdbserver/RecoveryState.h"
#include "fdbserver/LogProtocolMessage.h"
#include "flow/actorcompiler.h" // This must be the last #include.
ACTOR Future<Version> minVersionWhenReady(Future<Void> f, std::vector<Future<TLogCommitReply>> replies) {
@ -690,10 +691,10 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
}
Reference<IPeekCursor> peekLocal( UID dbgid, Tag tag, Version begin, Version end, bool useMergePeekCursors, int8_t peekLocality = tagLocalityInvalid ) {
if(tag.locality >= 0 || tag.locality == tagLocalityUpgraded) {
if(tag.locality >= 0 || tag.locality == tagLocalityUpgraded || tag.locality == tagLocalitySpecial) {
peekLocality = tag.locality;
}
ASSERT(peekLocality >= 0 || peekLocality == tagLocalityUpgraded);
ASSERT(peekLocality >= 0 || peekLocality == tagLocalityUpgraded || tag.locality == tagLocalitySpecial);
int bestSet = -1;
bool foundSpecial = false;
@ -702,7 +703,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
if(tLogs[t]->logServers.size() && tLogs[t]->locality != tagLocalitySatellite) {
logCount++;
}
if(tLogs[t]->logServers.size() && (tLogs[t]->locality == tagLocalitySpecial || tLogs[t]->locality == tagLocalityUpgraded || tLogs[t]->locality == peekLocality || peekLocality == tagLocalityUpgraded)) {
if(tLogs[t]->logServers.size() && (tLogs[t]->locality == tagLocalitySpecial || tLogs[t]->locality == tagLocalityUpgraded || tLogs[t]->locality == peekLocality || peekLocality == tagLocalityUpgraded || peekLocality == tagLocalitySpecial)) {
if( tLogs[t]->locality == tagLocalitySpecial || tLogs[t]->locality == tagLocalityUpgraded ) {
foundSpecial = true;
}
@ -757,7 +758,8 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
if(oldLogData[i].tLogs[t]->logServers.size() && oldLogData[i].tLogs[t]->locality != tagLocalitySatellite) {
logCount++;
}
if(oldLogData[i].tLogs[t]->logServers.size() && (oldLogData[i].tLogs[t]->locality == tagLocalitySpecial || oldLogData[i].tLogs[t]->locality == tagLocalityUpgraded || oldLogData[i].tLogs[t]->locality == peekLocality || peekLocality == tagLocalityUpgraded)) {
if(oldLogData[i].tLogs[t]->logServers.size() && (oldLogData[i].tLogs[t]->locality == tagLocalitySpecial || oldLogData[i].tLogs[t]->locality == tagLocalityUpgraded || oldLogData[i].tLogs[t]->locality == peekLocality ||
peekLocality == tagLocalityUpgraded || peekLocality == tagLocalitySpecial)) {
if( oldLogData[i].tLogs[t]->locality == tagLocalitySpecial || oldLogData[i].tLogs[t]->locality == tagLocalityUpgraded ) {
nextFoundSpecial = true;
}
@ -783,8 +785,9 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
Version thisBegin = std::max(oldLogData[i].tLogs[bestOldSet]->startVersion, begin);
if(thisBegin < lastBegin) {
if(thisBegin < end) {
TraceEvent("TLogPeekLocalAddingOldBest", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end)
TraceEvent("TLogPeekLocalAddingOldBest", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end).detail("BestOldSet", bestOldSet)
.detail("LogServers", oldLogData[i].tLogs[bestOldSet]->logServerString()).detail("ThisBegin", thisBegin).detail("LastBegin", lastBegin);
//detail("LogId", oldLogData[i].tLogs[bestOldSet]->logServers[tLogs[bestOldSet]->bestLocationFor( tag )]->get().id());
cursors.emplace_back(new ILogSystem::MergedPeekCursor( oldLogData[i].tLogs[bestOldSet]->logServers, oldLogData[i].tLogs[bestOldSet]->bestLocationFor( tag ), oldLogData[i].tLogs[bestOldSet]->logServers.size() + 1 - oldLogData[i].tLogs[bestOldSet]->tLogReplicationFactor, tag,
thisBegin, std::min(lastBegin, end), useMergePeekCursors, oldLogData[i].tLogs[bestOldSet]->tLogLocalities, oldLogData[i].tLogs[bestOldSet]->tLogPolicy, oldLogData[i].tLogs[bestOldSet]->tLogReplicationFactor));
epochEnds.emplace_back(std::min(lastBegin, end));

View File

@ -531,7 +531,7 @@ static int asyncOpen(
if (flags & SQLITE_OPEN_WAL) oflags |= IAsyncFile::OPEN_LARGE_PAGES;
oflags |= IAsyncFile::OPEN_LOCK;
memset(p, 0, sizeof(VFSAsyncFile));
memset(static_cast<void*>(p), 0, sizeof(VFSAsyncFile));
new (p) VFSAsyncFile(zName, flags);
try {
// Note that SQLiteDB::open also opens the db file, so its flags and modes are important, too

View File

@ -2204,6 +2204,7 @@ struct SplitStringRef {
// A BTree "page id" is actually a list of LogicalPageID's whose contents should be concatenated together.
// NOTE: Uses host byte order
typedef VectorRef<LogicalPageID> BTreePageIDRef;
constexpr LogicalPageID maxPageID = (LogicalPageID)-1;
std::string toString(BTreePageIDRef id) {
return std::string("BTreePageID") + toString(id.begin(), id.end());
@ -2246,6 +2247,10 @@ struct RedwoodRecordRef {
inline RedwoodRecordRef withoutValue() const { return RedwoodRecordRef(key, version); }
inline RedwoodRecordRef withMaxPageID() const {
return RedwoodRecordRef(key, version, StringRef((uint8_t *)&maxPageID, sizeof(maxPageID)));
}
// Truncate (key, version, part) tuple to len bytes.
void truncate(int len) {
ASSERT(len <= key.size());
@ -2988,7 +2993,8 @@ public:
VersionedBTree(IPager2* pager, std::string name)
: m_pager(pager), m_writeVersion(invalidVersion), m_lastCommittedVersion(invalidVersion), m_pBuffer(nullptr),
m_name(name) {
m_commitReadLock(SERVER_KNOBS->REDWOOD_COMMIT_CONCURRENT_READS), m_name(name) {
m_lazyClearActor = 0;
m_init = init_impl(this);
m_latestCommit = m_init;
@ -3435,6 +3441,7 @@ private:
Version m_writeVersion;
Version m_lastCommittedVersion;
Version m_newOldestVersion;
FlowLock m_commitReadLock;
Future<Void> m_latestCommit;
Future<Void> m_init;
std::string m_name;
@ -3872,7 +3879,7 @@ private:
// If the decode upper boundary is the subtree upper boundary the pointers will be the same
// For the lower boundary, if the pointers are not the same there is still a possibility
// that the keys are the same. This happens for the first remaining subtree of an internal page
// after the previous first subtree was cleared.
// after the prior subtree(s) were cleared.
return (decodeUpperBound == subtreeUpperBound) &&
(decodeLowerBound == subtreeLowerBound || decodeLowerBound->sameExceptValue(*subtreeLowerBound));
}
@ -4126,8 +4133,13 @@ private:
}
state Version writeVersion = self->getLastCommittedVersion() + 1;
wait(self->m_commitReadLock.take());
state FlowLock::Releaser readLock(self->m_commitReadLock);
state Reference<const IPage> page =
wait(readPage(snapshot, rootID, update->decodeLowerBound, update->decodeUpperBound));
readLock.release();
state BTreePage* btPage = (BTreePage*)page->begin();
ASSERT(isLeaf == btPage->isLeaf());
g_redwoodMetrics.level(btPage->height).pageCommitStart += 1;
@ -4984,6 +4996,246 @@ public:
Future<bool> moveLast() { return move_end(this, false); }
};
// Cursor designed for short lifespans.
// Holds references to all pages touched.
// All record references returned from it are valid until the cursor is destroyed.
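// A minimal usage sketch, assuming a VersionedBTree* named btree and a committed Version v are in
// scope inside an ACTOR (someKey is a placeholder):
//
//   VersionedBTree::BTreeCursor cur;
//   wait(btree->initBTreeCursor(&cur, v));
//   wait(cur.seekGTE(someKey, 0));
//   while (cur.isValid()) {
//       const RedwoodRecordRef& rec = cur.get();  // remains valid until cur is destroyed
//       wait(cur.moveNext());
//   }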
class BTreeCursor {
Arena arena;
Reference<IPagerSnapshot> pager;
std::unordered_map<LogicalPageID, Reference<const IPage>> pages;
VersionedBTree* btree;
bool valid;
struct PathEntry {
BTreePage* btPage;
BTreePage::BinaryTree::Cursor cursor;
};
VectorRef<PathEntry> path;
public:
BTreeCursor() {}
bool isValid() const { return valid; }
std::string toString() const {
std::string r;
for (int i = 0; i < path.size(); ++i) {
r += format("[%d/%d: %s] ", i + 1, path.size(),
path[i].cursor.valid() ? path[i].cursor.get().toString(path[i].btPage->isLeaf()).c_str()
: "<invalid>");
}
if (!valid) {
r += " (invalid) ";
}
return r;
}
const RedwoodRecordRef& get() { return path.back().cursor.get(); }
bool inRoot() const { return path.size() == 1; }
// Pop and return the page cursor at the end of the path.
// This is meant to enable range scans to consume the contents of a leaf page more efficiently.
// Can only be used when inRoot() is true.
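// For example, readRange_impl below seeks with seekGTE(), calls popPath(), and then iterates the
// returned BinaryTree::Cursor over the leaf's records without any further waits.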
BTreePage::BinaryTree::Cursor popPath() {
BTreePage::BinaryTree::Cursor c = path.back().cursor;
path.pop_back();
return c;
}
Future<Void> pushPage(BTreePageIDRef id, const RedwoodRecordRef& lowerBound,
const RedwoodRecordRef& upperBound) {
Reference<const IPage>& page = pages[id.front()];
if (page.isValid()) {
path.push_back(arena, { (BTreePage*)page->begin(), getCursor(page) });
return Void();
}
return map(readPage(pager, id, &lowerBound, &upperBound), [this, &page, id](Reference<const IPage> p) {
page = p;
path.push_back(arena, { (BTreePage*)p->begin(), getCursor(p) });
return Void();
});
}
Future<Void> pushPage(BTreePage::BinaryTree::Cursor c) {
const RedwoodRecordRef& rec = c.get();
auto next = c;
next.moveNext();
BTreePageIDRef id = rec.getChildPage();
return pushPage(id, rec, next.getOrUpperBound());
}
Future<Void> init(VersionedBTree* btree_in, Reference<IPagerSnapshot> pager_in, BTreePageIDRef root) {
btree = btree_in;
pager = pager_in;
path.reserve(arena, 6);
valid = false;
return pushPage(root, dbBegin, dbEnd);
}
// Seeks the cursor to query if it exists, to the record before or after it, or to an undefined and
// invalid position between those records.
// If 0 is returned, then
// If the cursor is valid then it points to query
// If the cursor is not valid then the cursor points to some place in the btree such that
// If there is a record in the tree < query then movePrev() will move to it, and
// If there is a record in the tree > query then moveNext() will move to it.
// If non-zero is returned then the cursor is valid and the return value is logically equivalent
// to query.compare(cursor.get())
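// In other words: 0 with a valid cursor means an exact hit on query; a non-zero value means the
// cursor is valid and sits on a nearby record, with the sign given by query.compare(cursor.get());
// 0 with an invalid cursor means query is absent and movePrev()/moveNext() reach its neighbors.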
ACTOR Future<int> seek_impl(BTreeCursor* self, RedwoodRecordRef query, int prefetchBytes) {
state RedwoodRecordRef internalPageQuery = query.withMaxPageID();
self->path = self->path.slice(0, 1);
debug_printf("seek(%s, %d) start cursor = %s\n", query.toString().c_str(), prefetchBytes,
self->toString().c_str());
loop {
auto& entry = self->path.back();
if (entry.btPage->isLeaf()) {
int cmp = entry.cursor.seek(query);
self->valid = entry.cursor.valid() && !entry.cursor.node->isDeleted();
debug_printf("seek(%s, %d) loop exit cmp=%d cursor=%s\n", query.toString().c_str(), prefetchBytes,
cmp, self->toString().c_str());
return self->valid ? cmp : 0;
}
// Internal page, so seek to the branch where query must be
// Currently, after a subtree deletion internal page boundaries are still strictly adhered
// to and will be updated if anything is inserted into the cleared range, so if the seek fails
// or it finds an entry with a null child page then query does not exist in the BTree.
if (entry.cursor.seekLessThan(internalPageQuery) && entry.cursor.get().value.present()) {
debug_printf("seek(%s, %d) loop seek success cursor=%s\n", query.toString().c_str(), prefetchBytes,
self->toString().c_str());
Future<Void> f = self->pushPage(entry.cursor);
// Prefetch siblings, at least prefetchBytes, at level 2 but without jumping to another level 2
// sibling
if (prefetchBytes != 0 && entry.btPage->height == 2) {
auto c = entry.cursor;
bool fwd = prefetchBytes > 0;
prefetchBytes = abs(prefetchBytes);
// While we should still preload more bytes and a move in the target direction is successful
while (prefetchBytes > 0 && (fwd ? c.moveNext() : c.movePrev())) {
// If there is a page link, preload it.
if (c.get().value.present()) {
BTreePageIDRef childPage = c.get().getChildPage();
preLoadPage(self->pager.getPtr(), childPage);
prefetchBytes -= self->btree->m_blockSize * childPage.size();
}
}
}
wait(f);
} else {
self->valid = false;
debug_printf("seek(%s, %d) loop exit cmp=0 cursor=%s\n", query.toString().c_str(), prefetchBytes,
self->toString().c_str());
return 0;
}
}
}
Future<int> seek(RedwoodRecordRef query, int prefetchBytes) { return seek_impl(this, query, prefetchBytes); }
ACTOR Future<Void> seekGTE_impl(BTreeCursor* self, RedwoodRecordRef query, int prefetchBytes) {
debug_printf("seekGTE(%s, %d) start\n", query.toString().c_str(), prefetchBytes);
int cmp = wait(self->seek(query, prefetchBytes));
if (cmp > 0 || (cmp == 0 && !self->isValid())) {
wait(self->moveNext());
}
return Void();
}
Future<Void> seekGTE(RedwoodRecordRef query, int prefetchBytes) {
return seekGTE_impl(this, query, prefetchBytes);
}
ACTOR Future<Void> seekLT_impl(BTreeCursor* self, RedwoodRecordRef query, int prefetchBytes) {
debug_printf("seekLT(%s, %d) start\n", query.toString().c_str(), prefetchBytes);
int cmp = wait(self->seek(query, prefetchBytes));
if (cmp <= 0) {
wait(self->movePrev());
}
return Void();
}
Future<Void> seekLT(RedwoodRecordRef query, int prefetchBytes) {
return seekLT_impl(this, query, -prefetchBytes);
}
ACTOR Future<Void> move_impl(BTreeCursor* self, bool forward) {
// Try to move the cursor at the end of the path in the correct direction
debug_printf("move%s() start cursor=%s\n", forward ? "Next" : "Prev", self->toString().c_str());
while (1) {
debug_printf("move%s() first loop cursor=%s\n", forward ? "Next" : "Prev", self->toString().c_str());
auto& entry = self->path.back();
bool success;
if(entry.cursor.valid()) {
success = forward ? entry.cursor.moveNext() : entry.cursor.movePrev();
} else {
success = forward ? entry.cursor.moveFirst() : false;
}
// Skip over internal page entries that do not link to child pages. There should never be two in a row.
if (success && !entry.btPage->isLeaf() && !entry.cursor.get().value.present()) {
success = forward ? entry.cursor.moveNext() : entry.cursor.movePrev();
ASSERT(!success || entry.cursor.get().value.present());
}
// Stop if successful
if (success) {
break;
}
if (self->path.size() == 1) {
self->valid = false;
return Void();
}
// Move to parent
self->path = self->path.slice(0, self->path.size() - 1);
}
// While not on a leaf page, move down to get to one.
while (1) {
debug_printf("move%s() second loop cursor=%s\n", forward ? "Next" : "Prev", self->toString().c_str());
auto& entry = self->path.back();
if (entry.btPage->isLeaf()) {
break;
}
// The last entry in an internal page could be a null link, if so move back
if (!forward && !entry.cursor.get().value.present()) {
ASSERT(entry.cursor.movePrev());
ASSERT(entry.cursor.get().value.present());
}
wait(self->pushPage(entry.cursor));
auto& newEntry = self->path.back();
ASSERT(forward ? newEntry.cursor.moveFirst() : newEntry.cursor.moveLast());
}
self->valid = true;
debug_printf("move%s() exit cursor=%s\n", forward ? "Next" : "Prev", self->toString().c_str());
return Void();
}
Future<Void> moveNext() { return move_impl(this, true); }
Future<Void> movePrev() { return move_impl(this, false); }
};
Future<Void> initBTreeCursor(BTreeCursor* cursor, Version snapshotVersion) {
// Only committed versions can be read.
ASSERT(snapshotVersion <= m_lastCommittedVersion);
Reference<IPagerSnapshot> snapshot = m_pager->getReadSnapshot(snapshotVersion);
// This is a ref because snapshot will continue to hold the metakey value memory
KeyRef m = snapshot->getMetaKey();
return cursor->init(this, snapshot, ((MetaKey*)m.begin())->root.get());
}
// Cursor is for reading and iterating over user-visible KV pairs at a specific version
// KeyValueRefs returned become invalid once the cursor is moved
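// so a caller that needs a result after advancing must copy it first (e.g. into its own Arena)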
class Cursor : public IStoreCursor, public ReferenceCounted<Cursor>, public FastAllocated<Cursor>, NonCopyable {
@ -5264,10 +5516,13 @@ public:
ACTOR static Future<Standalone<RangeResultRef>> readRange_impl(KeyValueStoreRedwoodUnversioned* self, KeyRange keys,
int rowLimit, int byteLimit) {
state VersionedBTree::BTreeCursor cur;
wait(self->m_tree->initBTreeCursor(&cur, self->m_tree->getLastCommittedVersion()));
wait(self->m_concurrentReads.take());
state FlowLock::Releaser releaser(self->m_concurrentReads);
++g_redwoodMetrics.opGetRange;
state Standalone<RangeResultRef> result;
state int accumulatedBytes = 0;
ASSERT(byteLimit > 0);
@ -5276,33 +5531,58 @@ public:
return result;
}
state Reference<IStoreCursor> cur = self->m_tree->readAtVersion(self->m_tree->getLastCommittedVersion());
// Prefetch is currently only done in the forward direction
state int prefetchBytes = rowLimit > 1 ? byteLimit : 0;
// Prefetch is disabled for now pending some decent logic for deciding how much to fetch
state int prefetchBytes = 0;
if (rowLimit > 0) {
wait(cur->findFirstEqualOrGreater(keys.begin, prefetchBytes));
while (cur->isValid() && cur->getKey() < keys.end) {
KeyValueRef kv(KeyRef(result.arena(), cur->getKey()), ValueRef(result.arena(), cur->getValue()));
accumulatedBytes += kv.expectedSize();
result.push_back(result.arena(), kv);
if (--rowLimit == 0 || accumulatedBytes >= byteLimit) {
wait(cur.seekGTE(keys.begin, prefetchBytes));
while (cur.isValid()) {
// Read page contents without using waits
bool isRoot = cur.inRoot();
BTreePage::BinaryTree::Cursor leafCursor = cur.popPath();
while(leafCursor.valid()) {
KeyValueRef kv = leafCursor.get().toKeyValueRef();
if(kv.key >= keys.end) {
break;
}
accumulatedBytes += kv.expectedSize();
result.push_back_deep(result.arena(), kv);
if (--rowLimit == 0 || accumulatedBytes >= byteLimit) {
break;
}
leafCursor.moveNext();
}
// Stop if the leaf cursor is still valid which means we hit a key or size limit or
// if we started in the root page
if(leafCursor.valid() || isRoot) {
break;
}
wait(cur->next());
wait(cur.moveNext());
}
} else {
wait(cur->findLastLessOrEqual(keys.end));
if (cur->isValid() && cur->getKey() == keys.end) wait(cur->prev());
while (cur->isValid() && cur->getKey() >= keys.begin) {
KeyValueRef kv(KeyRef(result.arena(), cur->getKey()), ValueRef(result.arena(), cur->getValue()));
accumulatedBytes += kv.expectedSize();
result.push_back(result.arena(), kv);
if (++rowLimit == 0 || accumulatedBytes >= byteLimit) {
wait(cur.seekLT(keys.end, prefetchBytes));
while (cur.isValid()) {
// Read page contents without using waits
bool isRoot = cur.inRoot();
BTreePage::BinaryTree::Cursor leafCursor = cur.popPath();
while(leafCursor.valid()) {
KeyValueRef kv = leafCursor.get().toKeyValueRef();
if(kv.key < keys.begin) {
break;
}
accumulatedBytes += kv.expectedSize();
result.push_back_deep(result.arena(), kv);
if (++rowLimit == 0 || accumulatedBytes >= byteLimit) {
break;
}
leafCursor.movePrev();
}
// Stop if the leaf cursor is still valid which means we hit a key or size limit or
// if we started in the root page
if(leafCursor.valid() || isRoot) {
break;
}
wait(cur->prev());
wait(cur.movePrev());
}
}
@ -5316,15 +5596,16 @@ public:
ACTOR static Future<Optional<Value>> readValue_impl(KeyValueStoreRedwoodUnversioned* self, Key key,
Optional<UID> debugID) {
state VersionedBTree::BTreeCursor cur;
wait(self->m_tree->initBTreeCursor(&cur, self->m_tree->getLastCommittedVersion()));
wait(self->m_concurrentReads.take());
state FlowLock::Releaser releaser(self->m_concurrentReads);
++g_redwoodMetrics.opGet;
state Reference<IStoreCursor> cur = self->m_tree->readAtVersion(self->m_tree->getLastCommittedVersion());
wait(cur->findEqual(key));
if (cur->isValid()) {
return cur->getValue();
wait(cur.seekGTE(key, 0));
if (cur.isValid() && cur.get().key == key) {
return cur.get().value.get();
}
return Optional<Value>();
}
@ -5335,18 +5616,20 @@ public:
ACTOR static Future<Optional<Value>> readValuePrefix_impl(KeyValueStoreRedwoodUnversioned* self, Key key,
int maxLength, Optional<UID> debugID) {
state VersionedBTree::BTreeCursor cur;
wait(self->m_tree->initBTreeCursor(&cur, self->m_tree->getLastCommittedVersion()));
wait(self->m_concurrentReads.take());
state FlowLock::Releaser releaser(self->m_concurrentReads);
++g_redwoodMetrics.opGet;
state Reference<IStoreCursor> cur = self->m_tree->readAtVersion(self->m_tree->getLastCommittedVersion());
wait(cur->findEqual(key));
if (cur->isValid()) {
Value v = cur->getValue();
wait(cur.seekGTE(key, 0));
if (cur.isValid() && cur.get().key == key) {
Value v = cur.get().value.get();
int len = std::min(v.size(), maxLength);
return Value(cur->getValue().substr(0, len));
return Value(v.substr(0, len));
}
return Optional<Value>();
}
@ -5411,6 +5694,157 @@ KeyValue randomKV(int maxKeySize = 10, int maxValueSize = 5) {
return kv;
}
// Verify a range using a BTreeCursor.
// Assumes that the BTree holds a single data version and the version is 0.
ACTOR Future<int> verifyRangeBTreeCursor(VersionedBTree* btree, Key start, Key end, Version v,
std::map<std::pair<std::string, Version>, Optional<std::string>>* written,
int* pErrorCount) {
state int errors = 0;
if (end <= start) end = keyAfter(start);
state std::map<std::pair<std::string, Version>, Optional<std::string>>::const_iterator i =
written->lower_bound(std::make_pair(start.toString(), 0));
state std::map<std::pair<std::string, Version>, Optional<std::string>>::const_iterator iEnd =
written->upper_bound(std::make_pair(end.toString(), 0));
state std::map<std::pair<std::string, Version>, Optional<std::string>>::const_iterator iLast;
state VersionedBTree::BTreeCursor cur;
wait(btree->initBTreeCursor(&cur, v));
debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Start\n", v, start.printable().c_str(), end.printable().c_str());
// Randomly use the cursor for something else first.
if (deterministicRandom()->coinflip()) {
state Key randomKey = randomKV().key;
debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Dummy seek to '%s'\n", v, start.printable().c_str(),
end.printable().c_str(), randomKey.toString().c_str());
wait(success(cur.seek(randomKey, 0)));
}
debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Actual seek\n", v, start.printable().c_str(),
end.printable().c_str());
wait(cur.seekGTE(start, 0));
state std::vector<KeyValue> results;
while (cur.isValid() && cur.get().key < end) {
// Find the next written kv pair that would be present at this version
while (1) {
iLast = i;
if (i == iEnd) break;
++i;
if (iLast->first.second <= v && iLast->second.present() &&
(i == iEnd || i->first.first != iLast->first.first || i->first.second > v)) {
debug_printf("VerifyRange(@%" PRId64 ", %s, %s) Found key in written map: %s\n", v,
start.printable().c_str(), end.printable().c_str(), iLast->first.first.c_str());
break;
}
}
if (iLast == iEnd) {
++errors;
++*pErrorCount;
printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs nothing in written map.\n", v,
start.printable().c_str(), end.printable().c_str(), cur.get().key.toString().c_str());
break;
}
if (cur.get().key != iLast->first.first) {
++errors;
++*pErrorCount;
printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' but expected '%s'\n", v,
start.printable().c_str(), end.printable().c_str(), cur.get().key.toString().c_str(),
iLast->first.first.c_str());
break;
}
if (cur.get().value.get() != iLast->second.get()) {
++errors;
++*pErrorCount;
printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' has tree value '%s' but expected '%s'\n", v,
start.printable().c_str(), end.printable().c_str(), cur.get().key.toString().c_str(),
cur.get().value.get().toString().c_str(), iLast->second.get().c_str());
break;
}
ASSERT(errors == 0);
results.push_back(KeyValue(KeyValueRef(cur.get().key, cur.get().value.get())));
wait(cur.moveNext());
}
// Make sure there are no further written kv pairs that would be present at this version.
while (1) {
iLast = i;
if (i == iEnd) break;
++i;
if (iLast->first.second <= v && iLast->second.present() &&
(i == iEnd || i->first.first != iLast->first.first || i->first.second > v))
break;
}
if (iLast != iEnd) {
++errors;
++*pErrorCount;
printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree range ended but written has @%" PRId64 " '%s'\n", v,
start.printable().c_str(), end.printable().c_str(), iLast->first.second, iLast->first.first.c_str());
}
debug_printf("VerifyRangeReverse(@%" PRId64 ", %s, %s): start\n", v, start.printable().c_str(),
end.printable().c_str());
// Randomly use a new cursor at the same version for the reverse range read, if the version is still available for
// opening new cursors
if (v >= btree->getOldestVersion() && deterministicRandom()->coinflip()) {
cur = VersionedBTree::BTreeCursor();
wait(btree->initBTreeCursor(&cur, v));
}
// Now read the range from the tree in reverse order and compare to the saved results
wait(cur.seekLT(end, 0));
state std::vector<KeyValue>::const_reverse_iterator r = results.rbegin();
while (cur.isValid() && cur.get().key >= start) {
if (r == results.rend()) {
++errors;
++*pErrorCount;
printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs nothing in written map.\n", v,
start.printable().c_str(), end.printable().c_str(), cur.get().key.toString().c_str());
break;
}
if (cur.get().key != r->key) {
++errors;
++*pErrorCount;
printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' but expected '%s'\n", v,
start.printable().c_str(), end.printable().c_str(), cur.get().key.toString().c_str(),
r->key.toString().c_str());
break;
}
if (cur.get().value.get() != r->value) {
++errors;
++*pErrorCount;
printf("VerifyRangeReverse(@%" PRId64
", %s, %s) ERROR: Tree key '%s' has tree value '%s' but expected '%s'\n",
v, start.printable().c_str(), end.printable().c_str(), cur.get().key.toString().c_str(),
cur.get().value.get().toString().c_str(), r->value.toString().c_str());
break;
}
++r;
wait(cur.movePrev());
}
if (r != results.rend()) {
++errors;
++*pErrorCount;
printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree range ended but written has '%s'\n", v,
start.printable().c_str(), end.printable().c_str(), r->key.toString().c_str());
}
return errors;
}
ACTOR Future<int> verifyRange(VersionedBTree* btree, Key start, Key end, Version v,
std::map<std::pair<std::string, Version>, Optional<std::string>>* written,
int* pErrorCount) {
@ -5607,6 +6041,58 @@ ACTOR Future<int> seekAll(VersionedBTree* btree, Version v,
return errors;
}
// Verify the result of point reads for every set or cleared key at the given version
ACTOR Future<int> seekAllBTreeCursor(VersionedBTree* btree, Version v,
std::map<std::pair<std::string, Version>, Optional<std::string>>* written, int* pErrorCount) {
state std::map<std::pair<std::string, Version>, Optional<std::string>>::const_iterator i = written->cbegin();
state std::map<std::pair<std::string, Version>, Optional<std::string>>::const_iterator iEnd = written->cend();
state int errors = 0;
state VersionedBTree::BTreeCursor cur;
wait(btree->initBTreeCursor(&cur, v));
while (i != iEnd) {
state std::string key = i->first.first;
state Version ver = i->first.second;
if (ver == v) {
state Optional<std::string> val = i->second;
debug_printf("Verifying @%" PRId64 " '%s'\n", ver, key.c_str());
state Arena arena;
wait(cur.seekGTE(RedwoodRecordRef(KeyRef(arena, key), 0), 0));
bool foundKey = cur.isValid() && cur.get().key == key;
bool hasValue = foundKey && cur.get().value.present();
if (val.present()) {
bool valueMatch = hasValue && cur.get().value.get() == val.get();
if (!foundKey || !hasValue || !valueMatch) {
++errors;
++*pErrorCount;
if (!foundKey) {
printf("Verify ERROR: key_not_found: '%s' -> '%s' @%" PRId64 "\n", key.c_str(),
val.get().c_str(), ver);
}
else if (!hasValue) {
printf("Verify ERROR: value_not_found: '%s' -> '%s' @%" PRId64 "\n", key.c_str(),
val.get().c_str(), ver);
}
else if (!valueMatch) {
printf("Verify ERROR: value_incorrect: for '%s' found '%s' expected '%s' @%" PRId64 "\n",
key.c_str(), cur.get().value.get().toString().c_str(), val.get().c_str(),
ver);
}
}
} else if (foundKey && hasValue) {
++errors;
++*pErrorCount;
printf("Verify ERROR: cleared_key_found: '%s' -> '%s' @%" PRId64 "\n", key.c_str(),
cur.get().value.get().toString().c_str(), ver);
}
}
++i;
}
return errors;
}
ACTOR Future<Void> verify(VersionedBTree* btree, FutureStream<Version> vStream,
std::map<std::pair<std::string, Version>, Optional<std::string>>* written, int* pErrorCount,
bool serial) {
@ -5637,7 +6123,13 @@ ACTOR Future<Void> verify(VersionedBTree* btree, FutureStream<Version> vStream,
state Reference<IStoreCursor> cur = btree->readAtVersion(v);
debug_printf("Verifying entire key range at version %" PRId64 "\n", v);
fRangeAll = verifyRange(btree, LiteralStringRef(""), LiteralStringRef("\xff\xff"), v, written, pErrorCount);
if(deterministicRandom()->coinflip()) {
fRangeAll = verifyRange(btree, LiteralStringRef(""), LiteralStringRef("\xff\xff"), v, written,
pErrorCount);
} else {
fRangeAll = verifyRangeBTreeCursor(btree, LiteralStringRef(""), LiteralStringRef("\xff\xff"), v, written,
pErrorCount);
}
if (serial) {
wait(success(fRangeAll));
}
@ -5646,13 +6138,21 @@ ACTOR Future<Void> verify(VersionedBTree* btree, FutureStream<Version> vStream,
Key end = randomKV().key;
debug_printf("Verifying range (%s, %s) at version %" PRId64 "\n", toString(begin).c_str(),
toString(end).c_str(), v);
fRangeRandom = verifyRange(btree, begin, end, v, written, pErrorCount);
if(deterministicRandom()->coinflip()) {
fRangeRandom = verifyRange(btree, begin, end, v, written, pErrorCount);
} else {
fRangeRandom = verifyRangeBTreeCursor(btree, begin, end, v, written, pErrorCount);
}
if (serial) {
wait(success(fRangeRandom));
}
debug_printf("Verifying seeks to each changed key at version %" PRId64 "\n", v);
fSeekAll = seekAll(btree, v, written, pErrorCount);
if(deterministicRandom()->coinflip()) {
fSeekAll = seekAll(btree, v, written, pErrorCount);
} else {
fSeekAll = seekAllBTreeCursor(btree, v, written, pErrorCount);
}
if (serial) {
wait(success(fSeekAll));
}
@ -6485,11 +6985,11 @@ TEST_CASE("!/redwood/correctness/btree") {
state int maxKeySize = deterministicRandom()->randomInt(1, pageSize * 2);
state int maxValueSize = randomSize(pageSize * 25);
state int maxCommitSize = shortTest ? 1000 : randomSize(std::min<int>((maxKeySize + maxValueSize) * 20000, 10e6));
state int mutationBytesTarget = shortTest ? 100000 : randomSize(std::min<int>(maxCommitSize * 100, 100e6));
state int mutationBytesTarget = shortTest ? 100000 : randomSize(std::min<int>(maxCommitSize * 100, pageSize * 100000));
state double clearProbability = deterministicRandom()->random01() * .1;
state double clearSingleKeyProbability = deterministicRandom()->random01();
state double clearPostSetProbability = deterministicRandom()->random01() * .1;
state double coldStartProbability = pagerMemoryOnly ? 0 : deterministicRandom()->random01();
state double coldStartProbability = pagerMemoryOnly ? 0 : (deterministicRandom()->random01() * 0.3);
state double advanceOldVersionProbability = deterministicRandom()->random01();
state double maxDuration = 60;
state int64_t cacheSizeBytes =

View File

@ -163,14 +163,13 @@ struct RegisterWorkerReply {
constexpr static FileIdentifier file_identifier = 16475696;
ProcessClass processClass;
ClusterControllerPriorityInfo priorityInfo;
Optional<uint16_t> storageCache;
RegisterWorkerReply() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown) {}
RegisterWorkerReply(ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Optional<uint16_t> storageCache) : processClass(processClass), priorityInfo(priorityInfo), storageCache(storageCache) {}
RegisterWorkerReply(ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo) : processClass(processClass), priorityInfo(priorityInfo) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, processClass, priorityInfo, storageCache);
serializer(ar, processClass, priorityInfo);
}
};
@ -302,19 +301,18 @@ struct RegisterWorkerRequest {
Generation generation;
Optional<DataDistributorInterface> distributorInterf;
Optional<RatekeeperInterface> ratekeeperInterf;
Optional<std::pair<uint16_t,StorageServerInterface>> storageCacheInterf;
Standalone<VectorRef<StringRef>> issues;
std::vector<NetworkAddress> incompatiblePeers;
ReplyPromise<RegisterWorkerReply> reply;
bool degraded;
RegisterWorkerRequest() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown), degraded(false) {}
RegisterWorkerRequest(WorkerInterface wi, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Generation generation, Optional<DataDistributorInterface> ddInterf, Optional<RatekeeperInterface> rkInterf, Optional<std::pair<uint16_t,StorageServerInterface>> storageCacheInterf, bool degraded) :
wi(wi), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo), generation(generation), distributorInterf(ddInterf), ratekeeperInterf(rkInterf), storageCacheInterf(storageCacheInterf), degraded(degraded) {}
RegisterWorkerRequest(WorkerInterface wi, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Generation generation, Optional<DataDistributorInterface> ddInterf, Optional<RatekeeperInterface> rkInterf, bool degraded) :
wi(wi), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo), generation(generation), distributorInterf(ddInterf), ratekeeperInterf(rkInterf), degraded(degraded) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, wi, initialClass, processClass, priorityInfo, generation, distributorInterf, ratekeeperInterf, storageCacheInterf, issues, incompatiblePeers, reply, degraded);
serializer(ar, wi, initialClass, processClass, priorityInfo, generation, distributorInterf, ratekeeperInterf, issues, incompatiblePeers, reply, degraded);
}
};
@ -712,7 +710,7 @@ ACTOR Future<Void> logRouter(TLogInterface interf, InitializeLogRouterRequest re
Reference<AsyncVar<ServerDBInfo>> db);
ACTOR Future<Void> dataDistributor(DataDistributorInterface ddi, Reference<AsyncVar<ServerDBInfo>> db);
ACTOR Future<Void> ratekeeper(RatekeeperInterface rki, Reference<AsyncVar<ServerDBInfo>> db);
ACTOR Future<Void> storageCache(StorageServerInterface interf, uint16_t id, Reference<AsyncVar<ServerDBInfo>> db);
ACTOR Future<Void> storageCacheServer(StorageServerInterface interf, uint16_t id, Reference<AsyncVar<ServerDBInfo>> db);
ACTOR Future<Void> backupWorker(BackupInterface bi, InitializeBackupRequest req, Reference<AsyncVar<ServerDBInfo>> db);
void registerThreadForProfiling();

View File

@ -20,6 +20,9 @@
// There's something in one of the files below that defines a macro
// that makes boost interprocess break on Windows.
#include "flow/Tracing.h"
#include <cctype>
#include <iterator>
#define BOOST_DATE_TIME_NO_LIB
#include <boost/interprocess/managed_shared_memory.hpp>
#include <boost/algorithm/string.hpp>
@ -78,7 +81,7 @@
// clang-format off
enum {
OPT_CONNFILE, OPT_SEEDCONNFILE, OPT_SEEDCONNSTRING, OPT_ROLE, OPT_LISTEN, OPT_PUBLICADDR, OPT_DATAFOLDER, OPT_LOGFOLDER, OPT_PARENTPID, OPT_NEWCONSOLE,
OPT_CONNFILE, OPT_SEEDCONNFILE, OPT_SEEDCONNSTRING, OPT_ROLE, OPT_LISTEN, OPT_PUBLICADDR, OPT_DATAFOLDER, OPT_LOGFOLDER, OPT_PARENTPID, OPT_TRACER, OPT_NEWCONSOLE,
OPT_NOBOX, OPT_TESTFILE, OPT_RESTARTING, OPT_RESTORING, OPT_RANDOMSEED, OPT_KEY, OPT_MEMLIMIT, OPT_STORAGEMEMLIMIT, OPT_CACHEMEMLIMIT, OPT_MACHINEID,
OPT_DCID, OPT_MACHINE_CLASS, OPT_BUGGIFY, OPT_VERSION, OPT_CRASHONERROR, OPT_HELP, OPT_NETWORKIMPL, OPT_NOBUFSTDOUT, OPT_BUFSTDOUTERR, OPT_TRACECLOCK,
OPT_NUMTESTERS, OPT_DEVHELP, OPT_ROLLSIZE, OPT_MAXLOGS, OPT_MAXLOGSSIZE, OPT_KNOB, OPT_TESTSERVERS, OPT_TEST_ON_SERVERS, OPT_METRICSCONNFILE,
@ -111,6 +114,7 @@ CSimpleOpt::SOption g_rgOptions[] = {
{ OPT_MAXLOGSSIZE, "--maxlogssize", SO_REQ_SEP },
{ OPT_LOGGROUP, "--loggroup", SO_REQ_SEP },
{ OPT_PARENTPID, "--parentpid", SO_REQ_SEP },
{ OPT_TRACER, "--tracer", SO_REQ_SEP },
#ifdef _WIN32
{ OPT_NEWCONSOLE, "-n", SO_NONE },
{ OPT_NEWCONSOLE, "--newconsole", SO_NONE },
@ -514,6 +518,9 @@ static void printUsage( const char *name, bool devhelp ) {
printf(" --trace_format FORMAT\n"
" Select the format of the log files. xml (the default) and json\n"
" are supported.\n");
printf(" --tracer TRACER\n"
" Select a tracer for transaction tracing. Currently disabled\n"
" (the default) and log_file are supported.\n");
printf(" -i ID, --machine_id ID\n"
" Machine and zone identifier key (up to 16 hex characters).\n"
" Defaults to a random value shared by all fdbserver processes\n"
@ -884,7 +891,7 @@ struct CLIOptions {
double fileIoTimeout = 0.0;
bool fileIoWarnOnly = false;
uint64_t rsssize = -1;
std::vector<std::string> blobCredentials; // used for fast restore workers
std::vector<std::string> blobCredentials; // used for fast restore workers & backup workers
const char* blobCredsFromENV = nullptr;
Reference<ClusterConnectionFile> connectionFile;
@ -1169,6 +1176,22 @@ private:
break;
}
#endif
case OPT_TRACER:
{
std::string arg = args.OptionArg();
std::string tracer;
std::transform(arg.begin(), arg.end(), std::back_inserter(tracer), [](char c) { return tolower(c); });
if (tracer == "none" || tracer == "disabled") {
openTracer(TracerType::DISABLED);
} else if (tracer == "logfile" || tracer == "file" || tracer == "log_file") {
openTracer(TracerType::LOG_FILE);
} else {
fprintf(stderr, "ERROR: Unknown or unsupported tracer: `%s'", args.OptionArg());
printHelpTeaser(argv[0]);
flushAndExit(FDB_EXIT_ERROR);
}
break;
}
case OPT_TESTFILE:
testFile = args.OptionArg();
break;
@ -1789,6 +1812,16 @@ int main(int argc, char* argv[]) {
setupAndRun(dataFolder, opts.testFile, opts.restarting, (isRestoring >= 1), opts.whitelistBinPaths);
g_simulator.run();
} else if (role == FDBD) {
// Update the global blob credential files list so that both fast
// restore workers and backup workers can access blob storage.
std::vector<std::string>* pFiles =
(std::vector<std::string>*)g_network->global(INetwork::enBlobCredentialFiles);
if (pFiles != nullptr) {
for (auto& f : opts.blobCredentials) {
pFiles->push_back(f);
}
}
// Call fast restore for the class FastRestoreClass. This is a short-cut to run fast restore in circus
if (opts.processClass == ProcessClass::FastRestoreClass) {
printf("Run as fast restore worker\n");
@ -1797,15 +1830,6 @@ int main(int argc, char* argv[]) {
if (!dataFolder.size())
dataFolder = format("fdb/%d/", opts.publicAddresses.address.port); // SOMEDAY: Better default
// Update the global blob credential files list
std::vector<std::string>* pFiles =
(std::vector<std::string>*)g_network->global(INetwork::enBlobCredentialFiles);
if (pFiles != nullptr) {
for (auto& f : opts.blobCredentials) {
pFiles->push_back(f);
}
}
vector<Future<Void>> actors(listenErrors.begin(), listenErrors.end());
actors.push_back(restoreWorker(opts.connectionFile, opts.localities, dataFolder));
f = stopAfter(waitForAll(actors));

View File

@ -53,8 +53,12 @@ struct ProxyVersionReplies {
std::map<uint64_t, GetCommitVersionReply> replies;
NotifiedVersion latestRequestNum;
ProxyVersionReplies(ProxyVersionReplies&& r) BOOST_NOEXCEPT : replies(std::move(r.replies)), latestRequestNum(std::move(r.latestRequestNum)) {}
void operator=(ProxyVersionReplies&& r) BOOST_NOEXCEPT { replies = std::move(r.replies); latestRequestNum = std::move(r.latestRequestNum); }
ProxyVersionReplies(ProxyVersionReplies&& r) noexcept
: replies(std::move(r.replies)), latestRequestNum(std::move(r.latestRequestNum)) {}
void operator=(ProxyVersionReplies&& r) noexcept {
replies = std::move(r.replies);
latestRequestNum = std::move(r.latestRequestNum);
}
ProxyVersionReplies() : latestRequestNum(0) {}
};
@ -157,7 +161,7 @@ private:
} else {
self->fullyRecovered.send(Void());
}
return Void();
}
};
@ -379,7 +383,7 @@ ACTOR Future<Void> newSeedServers( Reference<MasterData> self, RecruitFromConfig
dcId_tags[recruits.storageServers[idx].locality.dcId()] = Tag(nextLocality, 0);
nextLocality++;
}
Tag& tag = dcId_tags[recruits.storageServers[idx].locality.dcId()];
tag.id++;
idx++;
@ -588,7 +592,7 @@ ACTOR Future<vector<Standalone<CommitTransactionRef>>> recruitEverything( Refere
.detail("DesiredResolvers", self->configuration.getDesiredResolvers())
.detail("StoreType", self->configuration.storageServerStoreType)
.trackLatest("MasterRecoveryState");
//FIXME: we only need log routers for the same locality as the master
int maxLogRouters = self->cstate.prevDBState.logRouterTags;
for(auto& old : self->cstate.prevDBState.oldTLogData) {
@ -917,6 +921,7 @@ ACTOR Future<Void> recoverFrom( Reference<MasterData> self, Reference<ILogSystem
}
ACTOR Future<Void> getVersion(Reference<MasterData> self, GetCommitVersionRequest req) {
state Span span("M:getVersion"_loc, { req.spanContext });
state std::map<UID, ProxyVersionReplies>::iterator proxyItr = self->lastProxyVersionReplies.find(req.requestingProxy); // lastProxyVersionReplies never changes
if (proxyItr == self->lastProxyVersionReplies.end()) {
@ -1539,15 +1544,6 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self ) {
tr.set(recoveryCommitRequest.arena, coordinatorsKey, self->coordinators.ccf->getConnectionString().toString());
tr.set(recoveryCommitRequest.arena, logsKey, self->logSystem->getLogsValue());
tr.set(recoveryCommitRequest.arena, primaryDatacenterKey, self->myInterface.locality.dcId().present() ? self->myInterface.locality.dcId().get() : StringRef());
//FIXME: remove this code, caching the entire normal keyspace as a test of functionality
//TODO: caching disabled for this merge
//tr.set(recoveryCommitRequest.arena, storageCacheKey(normalKeys.begin), storageCacheValue({0}));
//tr.set(recoveryCommitRequest.arena, storageCacheKey(normalKeys.end), storageCacheValue({}));
//tr.set(recoveryCommitRequest.arena, cacheKeysKey(0, normalKeys.begin), serverKeysTrue);
//tr.set(recoveryCommitRequest.arena, cacheKeysKey(0, normalKeys.end), serverKeysFalse);
//tr.set(recoveryCommitRequest.arena, cacheChangeKeyFor(0), BinaryWriter::toValue(deterministicRandom()->randomUniqueID(),Unversioned()));
//tr.set(recoveryCommitRequest.arena, cacheChangeKey, BinaryWriter::toValue(deterministicRandom()->randomUniqueID(),Unversioned()));
tr.clear(recoveryCommitRequest.arena, tLogDatacentersKeys);
for(auto& dc : self->primaryDcId) {
@ -1689,7 +1685,7 @@ ACTOR Future<Void> masterServer( MasterInterface mi, Reference<AsyncVar<ServerDB
while(!self->addActor.isEmpty()) {
self->addActor.getFuture().pop();
}
TEST(err.code() == error_code_master_tlog_failed); // Master: terminated because of a tLog failure
TEST(err.code() == error_code_master_proxy_failed); // Master: terminated because of a proxy failure
TEST(err.code() == error_code_master_resolver_failed); // Master: terminated because of a resolver failure

View File

@ -21,6 +21,9 @@
#include <cinttypes>
#include "fdbrpc/fdbrpc.h"
#include "fdbrpc/LoadBalance.h"
#include "flow/Arena.h"
#include "flow/IRandom.h"
#include "flow/Tracing.h"
#include "flow/IndexedSet.h"
#include "flow/Hash3.h"
#include "flow/ActorCollection.h"
@ -712,7 +715,7 @@ public:
}
template<class Request, class HandleFunction>
Future<Void> readGuard(const Request& request, const HandleFunction& fun) {
Future<Void> readGuard(const Span& parentSpan, const Request& request, const HandleFunction& fun) {
auto rate = currentRate();
if (rate < SERVER_KNOBS->STORAGE_DURABILITY_LAG_REJECT_THRESHOLD && deterministicRandom()->random01() > std::max(SERVER_KNOBS->STORAGE_DURABILITY_LAG_MIN_RATE, rate/SERVER_KNOBS->STORAGE_DURABILITY_LAG_REJECT_THRESHOLD)) {
//request.error = future_version();
@ -720,7 +723,7 @@ public:
++counters.readsRejected;
return Void();
}
return fun(this, request);
return fun(this, request, parentSpan);
}
};
@ -846,7 +849,8 @@ updateProcessStats(StorageServer* self)
#pragma region Queries
#endif
ACTOR Future<Version> waitForVersionActor(StorageServer* data, Version version) {
ACTOR Future<Version> waitForVersionActor(StorageServer* data, Version version, SpanID spanContext) {
state Span span("SS.WaitForVersion"_loc, { spanContext });
choose {
when(wait(data->version.whenAtLeast(version))) {
// FIXME: A bunch of these can block with or without the following delay 0.
@ -865,7 +869,7 @@ ACTOR Future<Version> waitForVersionActor(StorageServer* data, Version version)
}
}
Future<Version> waitForVersion(StorageServer* data, Version version) {
Future<Version> waitForVersion(StorageServer* data, Version version, SpanID spanContext) {
if (version == latestVersion) {
version = std::max(Version(1), data->version.get());
}
@ -883,7 +887,7 @@ Future<Version> waitForVersion(StorageServer* data, Version version) {
if (deterministicRandom()->random01() < 0.001) {
TraceEvent("WaitForVersion1000x");
}
return waitForVersionActor(data, version);
return waitForVersionActor(data, version, spanContext);
}
ACTOR Future<Version> waitForVersionNoTooOld( StorageServer* data, Version version ) {
@ -907,7 +911,7 @@ ACTOR Future<Version> waitForVersionNoTooOld( StorageServer* data, Version versi
}
}
ACTOR Future<Void> getValueQ( StorageServer* data, GetValueRequest req ) {
ACTOR Future<Void> getValueQ( StorageServer* data, GetValueRequest req, Span span ) {
state int64_t resultSize = 0;
try {
@ -924,7 +928,7 @@ ACTOR Future<Void> getValueQ( StorageServer* data, GetValueRequest req ) {
g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.DoRead"); //.detail("TaskID", g_network->getCurrentTask());
state Optional<Value> v;
state Version version = wait( waitForVersion( data, req.version ) );
state Version version = wait( waitForVersion( data, req.version, req.spanContext ) );
if( req.debugID.present() )
g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.AfterVersion"); //.detail("TaskID", g_network->getCurrentTask());
@ -982,7 +986,12 @@ ACTOR Future<Void> getValueQ( StorageServer* data, GetValueRequest req ) {
if( req.debugID.present() )
g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.AfterRead"); //.detail("TaskID", g_network->getCurrentTask());
GetValueReply reply(v);
// Check if the desired key might be cached
auto cached = data->cachedRangeMap[req.key];
//if (cached)
// TraceEvent(SevDebug, "SSGetValueCached").detail("Key", req.key);
GetValueReply reply(v, cached);
reply.penalty = data->getPenalty();
req.reply.send(reply);
} catch (Error& e) {
@ -1003,7 +1012,8 @@ ACTOR Future<Void> getValueQ( StorageServer* data, GetValueRequest req ) {
return Void();
};
ACTOR Future<Void> watchValue_impl( StorageServer* data, WatchValueRequest req ) {
ACTOR Future<Void> watchValue_impl( StorageServer* data, WatchValueRequest req, SpanID parent ) {
state Span span("SS:WatchValueImpl"_loc, { parent });
try {
++data->counters.watchQueries;
@ -1018,9 +1028,11 @@ ACTOR Future<Void> watchValue_impl( StorageServer* data, WatchValueRequest req )
try {
state Version latest = data->data().latestVersion;
state Future<Void> watchFuture = data->watches.onChange(req.key);
GetValueRequest getReq( req.key, latest, req.tags, req.debugID );
state Future<Void> getValue = getValueQ( data, getReq ); //we are relying on the delay zero at the top of getValueQ, if removed we need one here
state Span getValueSpan(deterministicRandom()->randomUniqueID(), "SS:GetValue"_loc, { span->context });
GetValueRequest getReq( getValueSpan->context, req.key, latest, req.tags, req.debugID );
state Future<Void> getValue = getValueQ( data, getReq, span ); //we are relying on the delay zero at the top of getValueQ, if removed we need one here
GetValueReply reply = wait( getReq.reply.getFuture() );
getValueSpan.reset();
//TraceEvent("WatcherCheckValue").detail("Key", req.key ).detail("Value", req.value ).detail("CurrentValue", v ).detail("Ver", latest);
if(reply.error.present()) {
@ -1067,8 +1079,8 @@ ACTOR Future<Void> watchValue_impl( StorageServer* data, WatchValueRequest req )
return Void();
}
ACTOR Future<Void> watchValueQ( StorageServer* data, WatchValueRequest req ) {
state Future<Void> watch = watchValue_impl( data, req );
ACTOR Future<Void> watchValueQ( StorageServer* data, WatchValueRequest req, Span span ) {
state Future<Void> watch = watchValue_impl( data, req, span->context );
state double startTime = now();
loop {
@ -1173,7 +1185,7 @@ void merge( Arena& arena, VectorRef<KeyValueRef, VecSerStrategy::String>& output
// If limit>=0, it returns the first rows in the range (sorted ascending), otherwise the last rows (sorted descending).
// readRange has O(|result|) + O(log |data|) cost
ACTOR Future<GetKeyValuesReply> readRange( StorageServer* data, Version version, KeyRange range, int limit, int* pLimitBytes ) {
ACTOR Future<GetKeyValuesReply> readRange( StorageServer* data, Version version, KeyRange range, int limit, int* pLimitBytes, Span parentSpan ) {
state GetKeyValuesReply result;
state StorageServer::VersionedData::ViewAtVersion view = data->data().at(version);
state StorageServer::VersionedData::iterator vCurrent = view.end();
@ -1181,6 +1193,7 @@ ACTOR Future<GetKeyValuesReply> readRange( StorageServer* data, Version version,
state KeyRef readEnd;
state Key readBeginTemp;
state int vCount = 0;
state Span span("SS:readRange"_loc, parentSpan);
// for caching the storage queue results during the first PTree traversal
state VectorRef<KeyValueRef> resultCache;
@ -1190,11 +1203,14 @@ ACTOR Future<GetKeyValuesReply> readRange( StorageServer* data, Version version,
state int pos = 0;
// Check if the desired key-range intersects the cached key-ranges
// TODO Find a more efficient way to do it
// TODO Also need this check in single key/value lookup
auto cached = data->cachedRangeMap.intersectingRanges(range);
result.cached = (cached.begin() != cached.end());
// Check if the desired key-range is cached
auto containingRange = data->cachedRangeMap.rangeContaining(range.begin);
if (containingRange.value() && containingRange->range().end >= range.end) {
//TraceEvent(SevDebug, "SSReadRangeCached").detail("Size",data->cachedRangeMap.size()).detail("ContainingRangeBegin",containingRange->range().begin).detail("ContainingRangeEnd",containingRange->range().end).
// detail("Begin", range.begin).detail("End",range.end);
result.cached = true;
} else
result.cached = false;
// if (limit >= 0) we are reading forward, else backward
if (limit >= 0) {
@ -1349,7 +1365,7 @@ ACTOR Future<GetKeyValuesReply> readRange( StorageServer* data, Version version,
// return sel.getKey() >= range.begin && (sel.isBackward() ? sel.getKey() <= range.end : sel.getKey() < range.end);
//}
ACTOR Future<Key> findKey( StorageServer* data, KeySelectorRef sel, Version version, KeyRange range, int* pOffset)
ACTOR Future<Key> findKey( StorageServer* data, KeySelectorRef sel, Version version, KeyRange range, int* pOffset, SpanID parentSpan)
// Attempts to find the key indicated by sel in the data at version, within range.
// Precondition: selectorInRange(sel, range)
// If it is found, offset is set to 0 and a key is returned which falls inside range.
@ -1366,6 +1382,7 @@ ACTOR Future<Key> findKey( StorageServer* data, KeySelectorRef sel, Version vers
state int sign = forward ? +1 : -1;
state bool skipEqualKey = sel.orEqual == forward;
state int distance = forward ? sel.offset : 1-sel.offset;
state Span span("SS.findKey"_loc, { parentSpan });
//Don't limit the number of bytes if this is a trivial key selector (there will be at most two items returned from the read range in this case)
state int maxBytes;
@ -1374,14 +1391,18 @@ ACTOR Future<Key> findKey( StorageServer* data, KeySelectorRef sel, Version vers
else
maxBytes = BUGGIFY ? SERVER_KNOBS->BUGGIFY_LIMIT_BYTES : SERVER_KNOBS->STORAGE_LIMIT_BYTES;
state GetKeyValuesReply rep = wait( readRange( data, version, forward ? KeyRangeRef(sel.getKey(), range.end) : KeyRangeRef(range.begin, keyAfter(sel.getKey())), (distance + skipEqualKey)*sign, &maxBytes ) );
state GetKeyValuesReply rep = wait(
readRange(data, version,
forward ? KeyRangeRef(sel.getKey(), range.end) : KeyRangeRef(range.begin, keyAfter(sel.getKey())),
(distance + skipEqualKey) * sign, &maxBytes, span));
state bool more = rep.more && rep.data.size() != distance + skipEqualKey;
//If we get only one result in the reverse direction as a result of the data being too large, we could get stuck in a loop
if(more && !forward && rep.data.size() == 1) {
TEST(true); //Reverse key selector returned only one result in range read
maxBytes = std::numeric_limits<int>::max();
GetKeyValuesReply rep2 = wait( readRange( data, version, KeyRangeRef(range.begin, keyAfter(sel.getKey())), -2, &maxBytes ) );
GetKeyValuesReply rep2 =
wait(readRange(data, version, KeyRangeRef(range.begin, keyAfter(sel.getKey())), -2, &maxBytes, span));
rep = rep2;
more = rep.more && rep.data.size() != distance + skipEqualKey;
ASSERT(rep.data.size() == 2 || !more);
@ -1436,7 +1457,7 @@ KeyRange getShardKeyRange( StorageServer* data, const KeySelectorRef& sel )
return i->range();
}
ACTOR Future<Void> getKeyValuesQ( StorageServer* data, GetKeyValuesRequest req )
ACTOR Future<Void> getKeyValuesQ( StorageServer* data, GetKeyValuesRequest req, Span span )
// Throws a wrong_shard_server if the keys in the request or result depend on data outside this server OR if a large selector offset prevents
// all data from being read in one range read
{
@ -1461,7 +1482,7 @@ ACTOR Future<Void> getKeyValuesQ( StorageServer* data, GetKeyValuesRequest req )
try {
if( req.debugID.present() )
g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.Before");
state Version version = wait( waitForVersion( data, req.version ) );
state Version version = wait( waitForVersion( data, req.version, span->context ) );
state uint64_t changeCounter = data->shardChangeCounter;
// try {
@ -1479,8 +1500,8 @@ ACTOR Future<Void> getKeyValuesQ( StorageServer* data, GetKeyValuesRequest req )
state int offset1;
state int offset2;
state Future<Key> fBegin = req.begin.isFirstGreaterOrEqual() ? Future<Key>(req.begin.getKey()) : findKey( data, req.begin, version, shard, &offset1 );
state Future<Key> fEnd = req.end.isFirstGreaterOrEqual() ? Future<Key>(req.end.getKey()) : findKey( data, req.end, version, shard, &offset2 );
state Future<Key> fBegin = req.begin.isFirstGreaterOrEqual() ? Future<Key>(req.begin.getKey()) : findKey( data, req.begin, version, shard, &offset1, span->context );
state Future<Key> fEnd = req.end.isFirstGreaterOrEqual() ? Future<Key>(req.end.getKey()) : findKey( data, req.end, version, shard, &offset2, span->context );
state Key begin = wait(fBegin);
state Key end = wait(fEnd);
if( req.debugID.present() )
@ -1514,7 +1535,7 @@ ACTOR Future<Void> getKeyValuesQ( StorageServer* data, GetKeyValuesRequest req )
} else {
state int remainingLimitBytes = req.limitBytes;
GetKeyValuesReply _r = wait( readRange(data, version, KeyRangeRef(begin, end), req.limit, &remainingLimitBytes) );
GetKeyValuesReply _r = wait( readRange(data, version, KeyRangeRef(begin, end), req.limit, &remainingLimitBytes, span) );
GetKeyValuesReply r = _r;
if( req.debugID.present() )
@ -1576,7 +1597,7 @@ ACTOR Future<Void> getKeyValuesQ( StorageServer* data, GetKeyValuesRequest req )
return Void();
}
ACTOR Future<Void> getKeyQ( StorageServer* data, GetKeyRequest req ) {
ACTOR Future<Void> getKeyQ( StorageServer* data, GetKeyRequest req, Span span ) {
state int64_t resultSize = 0;
++data->counters.getKeyQueries;
@ -1589,12 +1610,12 @@ ACTOR Future<Void> getKeyQ( StorageServer* data, GetKeyRequest req ) {
wait( delay(0, TaskPriority::DefaultEndpoint) );
try {
state Version version = wait( waitForVersion( data, req.version ) );
state Version version = wait( waitForVersion( data, req.version, req.spanContext ) );
state uint64_t changeCounter = data->shardChangeCounter;
state KeyRange shard = getShardKeyRange( data, req.sel );
state int offset;
Key k = wait( findKey( data, req.sel, version, shard, &offset ) );
Key k = wait( findKey( data, req.sel, version, shard, &offset, req.spanContext ) );
data->checkChangeCounter( changeCounter, KeyRangeRef( std::min<KeyRef>(req.sel.getKey(), k), std::max<KeyRef>(req.sel.getKey(), k) ) );
@ -1610,8 +1631,14 @@ ACTOR Future<Void> getKeyQ( StorageServer* data, GetKeyRequest req ) {
data->counters.bytesQueried += resultSize;
++data->counters.rowsQueried;
GetKeyReply reply(updated);
// Check if the desired key might be cached
auto cached = data->cachedRangeMap[k];
//if (cached)
// TraceEvent(SevDebug, "SSGetKeyCached").detail("Key", k).detail("Begin", shard.begin.printable()).detail("End", shard.end.printable());
GetKeyReply reply(updated, cached);
reply.penalty = data->getPenalty();
req.reply.send(reply);
}
catch (Error& e) {
@ -2584,7 +2611,6 @@ public:
if ((m.type == MutationRef::SetValue) && m.param1.substr(1).startsWith(storageCachePrefix))
applyPrivateCacheData( data, m);
else {
//TraceEvent("PrivateData", data->thisServerID).detail("Mutation", m.toString()).detail("Version", ver);
applyPrivateData( data, m );
}
} else {
@ -2673,7 +2699,7 @@ private:
}
void applyPrivateCacheData( StorageServer* data, MutationRef const& m ) {
TraceEvent(SevDebug, "SSPrivateCacheMutation", data->thisServerID).detail("Mutation", m.toString());
//TraceEvent(SevDebug, "SSPrivateCacheMutation", data->thisServerID).detail("Mutation", m.toString());
if (processedCacheStartKey) {
// Because of the implementation of the krm* functions, we expect changes in pairs, [begin,end)
@ -2681,17 +2707,16 @@ private:
KeyRangeRef keys( cacheStartKey.removePrefix(systemKeys.begin).removePrefix( storageCachePrefix ),
m.param1.removePrefix(systemKeys.begin).removePrefix( storageCachePrefix ));
data->cachedRangeMap.insert(keys, true);
//TraceEvent(SevDebug, "SSPrivateCacheMutation", data->thisServerID).detail("Begin", keys.begin).detail("End", keys.end);
//fprintf(stderr, "applyPrivateCacheData : begin: %s, end: %s\n", printable(keys.begin).c_str(), printable(keys.end).c_str());
//Figure out the affected shard ranges and maintain the cached key-range information in the in-memory map
// TODO revisit- we are not splitting the cached ranges based on shards as of now.
if (0) {
auto cachedRanges = data->shards.intersectingRanges(keys);
for(auto shard = cachedRanges.begin(); shard != cachedRanges.end(); ++shard) {
KeyRangeRef intersectingRange = shard.range() & keys;
data->cachedRangeMap.insert(KeyRangeRef(intersectingRange.begin, intersectingRange.end), true);
}
auto cachedRanges = data->shards.intersectingRanges(keys);
for(auto shard = cachedRanges.begin(); shard != cachedRanges.end(); ++shard) {
KeyRangeRef intersectingRange = shard.range() & keys;
TraceEvent(SevDebug, "SSPrivateCacheMutationInsertUnexpected", data->thisServerID).detail("Begin", intersectingRange.begin).detail("End", intersectingRange.end);
data->cachedRangeMap.insert(intersectingRange, true);
}
}
processedStartKey = false;
} else if ((m.type == MutationRef::SetValue) && m.param1.substr(1).startsWith(storageCachePrefix)) {
@ -2728,7 +2753,6 @@ ACTOR Future<Void> update( StorageServer* data, bool* pReceivedUpdate )
}
state Reference<ILogSystem::IPeekCursor> cursor = data->logCursor;
//TraceEvent("SSUpdatePeeking", data->thisServerID).detail("MyVer", data->version.get()).detail("Epoch", data->updateEpoch).detail("Seq", data->updateSequence);
loop {
wait( cursor->getMore() );
@ -2775,12 +2799,14 @@ ACTOR Future<Void> update( StorageServer* data, bool* pReceivedUpdate )
if (LogProtocolMessage::isNextIn(cloneReader)) {
LogProtocolMessage lpm;
cloneReader >> lpm;
//TraceEvent(SevDebug, "SSReadingLPM", data->thisServerID).detail("Mutation", lpm.toString());
dbgLastMessageWasProtocol = true;
cloneCursor1->setProtocolVersion(cloneReader.protocolVersion());
}
else {
MutationRef msg;
cloneReader >> msg;
//TraceEvent(SevDebug, "SSReadingLog", data->thisServerID).detail("Mutation", msg.toString());
if (firstMutation && msg.param1.startsWith(systemKeys.end))
hasPrivateData = true;
@ -2844,7 +2870,6 @@ ACTOR Future<Void> update( StorageServer* data, bool* pReceivedUpdate )
state Version ver = invalidVersion;
cloneCursor2->setProtocolVersion(data->logProtocol);
//TraceEvent("SSUpdatePeeked", data->thisServerID).detail("FromEpoch", data->updateEpoch).detail("FromSeq", data->updateSequence).detail("ToEpoch", results.end_epoch).detail("ToSeq", results.end_seq).detail("MsgSize", results.messages.size());
for (;cloneCursor2->hasMessage(); cloneCursor2->nextMessage()) {
if(mutationBytes > SERVER_KNOBS->DESIRED_UPDATE_BYTES) {
mutationBytes = 0;
@ -3651,6 +3676,7 @@ ACTOR Future<Void> checkBehind( StorageServer* self ) {
ACTOR Future<Void> serveGetValueRequests( StorageServer* self, FutureStream<GetValueRequest> getValue ) {
loop {
GetValueRequest req = waitNext(getValue);
Span span("SS:getValue"_loc, { req.spanContext });
// Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade before doing real work
if( req.debugID.present() )
g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "storageServer.received"); //.detail("TaskID", g_network->getCurrentTask());
@ -3658,32 +3684,35 @@ ACTOR Future<Void> serveGetValueRequests( StorageServer* self, FutureStream<GetV
if (SHORT_CIRCUT_ACTUAL_STORAGE && normalKeys.contains(req.key))
req.reply.send(GetValueReply());
else
self->actors.add(self->readGuard(req , getValueQ));
self->actors.add(self->readGuard(span, req , getValueQ));
}
}
ACTOR Future<Void> serveGetKeyValuesRequests( StorageServer* self, FutureStream<GetKeyValuesRequest> getKeyValues ) {
loop {
GetKeyValuesRequest req = waitNext(getKeyValues);
Span span("SS:getKeyValues"_loc, { req.spanContext });
// Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade before doing real work
self->actors.add(self->readGuard(req, getKeyValuesQ));
self->actors.add(self->readGuard(span, req, getKeyValuesQ));
}
}
ACTOR Future<Void> serveGetKeyRequests( StorageServer* self, FutureStream<GetKeyRequest> getKey ) {
loop {
GetKeyRequest req = waitNext(getKey);
Span span("SS:getKey"_loc, { req.spanContext });
// Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade before doing real work
self->actors.add(self->readGuard(req , getKeyQ));
self->actors.add(self->readGuard(span, req , getKeyQ));
}
}
ACTOR Future<Void> serveWatchValueRequests( StorageServer* self, FutureStream<WatchValueRequest> watchValue ) {
loop {
WatchValueRequest req = waitNext(watchValue);
Span span("SS:watchValue"_loc, { req.spanContext });
// TODO: fast load balancing?
// SOMEDAY: combine watches for the same key/value into a single watch
self->actors.add(self->readGuard(req, watchValueQ));
self->actors.add(self->readGuard(span, req, watchValueQ));
}
}
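The serve* loops above all follow the same tracing pattern: build a Span whose parent is the SpanID the client attached to the request, then hand that span (or its context) down to the query actor. A condensed sketch, using locations that appear in this diff:
// In the request loop: the span is a child of the client's span.
Span span("SS:getValue"_loc, { req.spanContext });
// In a callee: open a further child from the propagated context.
Span child("SS.WaitForVersion"_loc, { span->context });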


@ -21,6 +21,8 @@
#include <tuple>
#include <boost/lexical_cast.hpp>
#include "fdbrpc/Locality.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbserver/Knobs.h"
#include "flow/ActorCollection.h"
#include "flow/SystemMonitor.h"
@ -453,7 +455,7 @@ ACTOR Future<Void> registrationClient(
state Future<Void> cacheErrorsFuture;
state Optional<double> incorrectTime;
loop {
RegisterWorkerRequest request(interf, initialClass, processClass, asyncPriorityInfo->get(), requestGeneration++, ddInterf->get(), rkInterf->get(), scInterf->get(), degraded->get());
RegisterWorkerRequest request(interf, initialClass, processClass, asyncPriorityInfo->get(), requestGeneration++, ddInterf->get(), rkInterf->get(), degraded->get());
for (auto const& i : issues->get()) {
request.issues.push_back_deep(request.issues.arena(), i);
}
@ -491,41 +493,10 @@ ACTOR Future<Void> registrationClient(
when ( RegisterWorkerReply reply = wait( registrationReply )) {
processClass = reply.processClass;
asyncPriorityInfo->set( reply.priorityInfo );
if(!reply.storageCache.present()) {
cacheProcessFuture.cancel();
scInterf->set(Optional<std::pair<uint16_t,StorageServerInterface>>());
} else if (!scInterf->get().present() || scInterf->get().get().first != reply.storageCache.get()) {
StorageServerInterface recruited;
recruited.locality = locality;
recruited.initEndpoints();
std::map<std::string, std::string> details;
startRole( Role::STORAGE_CACHE, recruited.id(), interf.id(), details );
//DUMPTOKEN(recruited.getVersion);
DUMPTOKEN(recruited.getValue);
DUMPTOKEN(recruited.getKey);
DUMPTOKEN(recruited.getKeyValues);
DUMPTOKEN(recruited.getShardState);
DUMPTOKEN(recruited.waitMetrics);
DUMPTOKEN(recruited.splitMetrics);
DUMPTOKEN(recruited.getReadHotRanges);
DUMPTOKEN(recruited.getStorageMetrics);
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.getQueuingMetrics);
DUMPTOKEN(recruited.getKeyValueStoreType);
DUMPTOKEN(recruited.watchValue);
cacheProcessFuture = storageCache( recruited, reply.storageCache.get(), dbInfo );
cacheErrorsFuture = forwardError(errors, Role::STORAGE_CACHE, recruited.id(), setWhenDoneOrError(cacheProcessFuture, scInterf, Optional<std::pair<uint16_t,StorageServerInterface>>()));
scInterf->set(std::make_pair(reply.storageCache.get(), recruited));
}
}
when ( wait( ccInterface->onChange() )) {}
when ( wait( ddInterf->onChange() ) ) {}
when ( wait( rkInterf->onChange() ) ) {}
when ( wait( scInterf->onChange() ) ) {}
when ( wait( degraded->onChange() ) ) {}
when ( wait( FlowTransport::transport().onIncompatibleChanged() ) ) {}
when ( wait( issues->onChange() ) ) {}
@ -712,6 +683,41 @@ ACTOR Future<Void> storageServerRollbackRebooter( Future<Void> prevStorageServer
}
}
ACTOR Future<Void> storageCacheRollbackRebooter( Future<Void> prevStorageCache, UID id, LocalityData locality, Reference<AsyncVar<ServerDBInfo>> db) {
loop {
ErrorOr<Void> e = wait( errorOr( prevStorageCache) );
if (!e.isError()) {
TraceEvent("StorageCacheRequestedReboot1", id);
return Void();
}
else if (e.getError().code() != error_code_please_reboot && e.getError().code() != error_code_worker_removed) {
TraceEvent("StorageCacheRequestedReboot2", id).detail("Code",e.getError().code());
throw e.getError();
}
TraceEvent("StorageCacheRequestedReboot", id);
StorageServerInterface recruited;
recruited.uniqueID = deterministicRandom()->randomUniqueID();// id;
recruited.locality = locality;
recruited.initEndpoints();
DUMPTOKEN(recruited.getValue);
DUMPTOKEN(recruited.getKey);
DUMPTOKEN(recruited.getKeyValues);
DUMPTOKEN(recruited.getShardState);
DUMPTOKEN(recruited.waitMetrics);
DUMPTOKEN(recruited.splitMetrics);
DUMPTOKEN(recruited.getStorageMetrics);
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.getQueuingMetrics);
DUMPTOKEN(recruited.getKeyValueStoreType);
DUMPTOKEN(recruited.watchValue);
prevStorageCache = storageCacheServer(recruited, 0, db);
}
}
// FIXME: This will not work correctly in simulation as all workers would share the same roles map
std::set<std::pair<std::string, std::string>> g_roles;
@ -1049,10 +1055,40 @@ ACTOR Future<Void> workerServer(
}
}
bool hasCache = false;
// start cache role if we have the right process class
if (initialClass.classType() == ProcessClass::StorageCacheClass) {
hasCache = true;
StorageServerInterface recruited;
recruited.locality = locality;
recruited.initEndpoints();
std::map<std::string, std::string> details;
startRole(Role::STORAGE_CACHE, recruited.id(), interf.id(), details);
// DUMPTOKEN(recruited.getVersion);
DUMPTOKEN(recruited.getValue);
DUMPTOKEN(recruited.getKey);
DUMPTOKEN(recruited.getKeyValues);
DUMPTOKEN(recruited.getShardState);
DUMPTOKEN(recruited.waitMetrics);
DUMPTOKEN(recruited.splitMetrics);
DUMPTOKEN(recruited.getStorageMetrics);
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.getQueuingMetrics);
DUMPTOKEN(recruited.getKeyValueStoreType);
DUMPTOKEN(recruited.watchValue);
auto f = storageCacheServer(recruited, 0, dbInfo);
f = storageCacheRollbackRebooter( f, recruited.id(), recruited.locality, dbInfo);
errorForwarders.add(forwardError(errors, Role::STORAGE_CACHE, recruited.id(), f));
}
std::map<std::string, std::string> details;
details["Locality"] = locality.toString();
details["DataFolder"] = folder;
details["StoresPresent"] = format("%d", stores.size());
details["CachePresent"] = hasCache ? "true" : "false";
startRole( Role::WORKER, interf.id(), interf.id(), details );
errorForwarders.add(traceRole(Role::WORKER, interf.id()));
@ -1350,7 +1386,7 @@ ACTOR Future<Void> workerServer(
DUMPTOKEN( recruited.getQueuingMetrics );
DUMPTOKEN( recruited.confirmRunning );
errorForwarders.add( zombie(recruited, forwardError( errors, Role::LOG_ROUTER, recruited.id(),
errorForwarders.add( zombie(recruited, forwardError( errors, Role::LOG_ROUTER, recruited.id(),
logRouter( recruited, req, dbInfo ) ) ) );
req.reply.send(recruited);
}
@ -1672,7 +1708,7 @@ ACTOR Future<Void> monitorLeaderRemotelyWithDelayedCandidacy( Reference<ClusterC
if(currentCC->get().present() && dbInfo->get().clusterInterface == currentCC->get().get() && IFailureMonitor::failureMonitor().getState( currentCC->get().get().registerWorker.getEndpoint() ).isAvailable()) {
timeout = Future<Void>();
} else if(!timeout.isValid()) {
timeout = delay( SERVER_KNOBS->MIN_DELAY_STORAGE_CANDIDACY_SECONDS + (deterministicRandom()->random01()*(SERVER_KNOBS->MAX_DELAY_STORAGE_CANDIDACY_SECONDS-SERVER_KNOBS->MIN_DELAY_STORAGE_CANDIDACY_SECONDS)) );
timeout = delay( SERVER_KNOBS->MIN_DELAY_CC_WORST_FIT_CANDIDACY_SECONDS + (deterministicRandom()->random01()*(SERVER_KNOBS->MAX_DELAY_CC_WORST_FIT_CANDIDACY_SECONDS-SERVER_KNOBS->MIN_DELAY_CC_WORST_FIT_CANDIDACY_SECONDS)) );
}
choose {
when( wait(currentCC->onChange()) ) {}
@ -1729,9 +1765,9 @@ ACTOR Future<Void> fdbd(
Reference<AsyncVar<ServerDBInfo>> dbInfo( new AsyncVar<ServerDBInfo>(ServerDBInfo()) );
actors.push_back(reportErrors(monitorAndWriteCCPriorityInfo(fitnessFilePath, asyncPriorityInfo), "MonitorAndWriteCCPriorityInfo"));
if (processClass == ProcessClass::TesterClass) {
if (processClass.machineClassFitness(ProcessClass::ClusterController) == ProcessClass::NeverAssign) {
actors.push_back( reportErrors( monitorLeader( connFile, cc ), "ClusterController" ) );
} else if (processClass == ProcessClass::StorageClass && SERVER_KNOBS->MAX_DELAY_STORAGE_CANDIDACY_SECONDS > 0) {
} else if (processClass.machineClassFitness(ProcessClass::ClusterController) == ProcessClass::WorstFit && SERVER_KNOBS->MAX_DELAY_CC_WORST_FIT_CANDIDACY_SECONDS > 0) {
actors.push_back( reportErrors( monitorLeaderRemotelyWithDelayedCandidacy( connFile, cc, asyncPriorityInfo, recoveredDiskFiles.getFuture(), localities, dbInfo ), "ClusterController" ) );
} else {
actors.push_back( reportErrors( clusterController( connFile, cc , asyncPriorityInfo, recoveredDiskFiles.getFuture(), localities ), "ClusterController") );
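storageCacheRollbackRebooter above restarts the cache role whenever it asks to be rebooted but lets real failures escape. A hedged sketch of that control flow (restartRole stands in for the re-recruitment step and is not part of the commit):
ACTOR Future<Void> keepRoleAlive(Future<Void> role) {
	loop {
		ErrorOr<Void> e = wait(errorOr(role));
		if (!e.isError()) return Void(); // clean exit
		if (e.getError().code() != error_code_please_reboot &&
		    e.getError().code() != error_code_worker_removed) {
			throw e.getError(); // real failure: propagate
		}
		role = restartRole(); // illustrative: recruit a fresh interface and restart
	}
}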


@ -507,9 +507,9 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
}
// Wait for parallel restore to finish before we can proceed
TraceEvent("FastRestore").detail("BackupAndParallelRestore", "WaitForRestoreToFinish");
TraceEvent("FastRestoreWorkload").detail("WaitForRestoreToFinish", randomID);
wait(backupAgent.parallelRestoreFinish(cx, randomID));
TraceEvent("FastRestore").detail("BackupAndParallelRestore", "RestoreFinished");
TraceEvent("FastRestoreWorkload").detail("RestoreFinished", randomID);
for (auto& restore : restores) {
ASSERT(!restore.isError());
@ -668,7 +668,7 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
g_simulator.backupAgents = ISimulator::NoBackupAgents;
}
} catch (Error& e) {
TraceEvent(SevError, "BackupAndRestoreCorrectness").error(e).GetLastError();
TraceEvent(SevError, "BackupAndParallelRestoreCorrectness").error(e).GetLastError();
throw;
}
return Void();


@ -0,0 +1,33 @@
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbserver/TesterInterface.actor.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
struct CacheWorkload : TestWorkload {
Key keyPrefix;
CacheWorkload(WorkloadContext const& wcx)
: TestWorkload(wcx)
{
keyPrefix = unprintable( getOption(options, LiteralStringRef("keyPrefix"), LiteralStringRef("")).toString() );
}
virtual std::string description() { return "CacheWorkload"; }
virtual Future<Void> setup( Database const& cx ) {
if (clientId == 0) {
//Call management API to cache keys under the given prefix
return addCachedRange(cx, prefixRange(keyPrefix));
}
return Void();
}
virtual Future<Void> start( Database const& cx ) {
return Void();
}
virtual Future<bool> check( Database const& cx ) {
return true;
}
virtual void getMetrics( vector<PerfMetric>& m ) {
}
};
WorkloadFactory<CacheWorkload> CacheWorkloadFactory("Cache");
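The workload's setup is a thin wrapper over the management API; used directly it would look roughly like the sketch below (the actor name and prefix are illustrative, addCachedRange and prefixRange are the calls used above):
ACTOR Future<Void> cacheMyPrefix(Database cx) {
	// Mark every key under the prefix as eligible for the storage cache.
	wait(addCachedRange(cx, prefixRange(LiteralStringRef("myapp/"))));
	return Void();
}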


@ -21,6 +21,7 @@
#include <math.h>
#include "flow/IRandom.h"
#include "flow/Tracing.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbserver/TesterInterface.actor.h"
#include "fdbserver/workloads/workloads.actor.h"
@ -376,12 +377,16 @@ struct ConsistencyCheckWorkload : TestWorkload
state Key begin = keyServersKeys.begin;
state Key end = keyServersKeys.end;
state int limitKeyServers = BUGGIFY ? 1 : 100;
state Span span(deterministicRandom()->randomUniqueID(), "WL:ConsistencyCheck"_loc);
while (begin < end) {
state Reference<ProxyInfo> proxyInfo = wait(cx->getMasterProxiesFuture(false));
keyServerLocationFutures.clear();
for (int i = 0; i < proxyInfo->size(); i++)
keyServerLocationFutures.push_back(proxyInfo->get(i, &MasterProxyInterface::getKeyServersLocations).getReplyUnlessFailedFor(GetKeyServerLocationsRequest(begin, end, limitKeyServers, false, Arena()), 2, 0));
keyServerLocationFutures.push_back(
proxyInfo->get(i, &MasterProxyInterface::getKeyServersLocations)
.getReplyUnlessFailedFor(
GetKeyServerLocationsRequest(span->context, begin, end, limitKeyServers, false, Arena()), 2, 0));
state bool keyServersInsertedForThisIteration = false;
choose {
@ -708,6 +713,7 @@ struct ConsistencyCheckWorkload : TestWorkload
state vector<UID> storageServers = (isRelocating) ? destStorageServers : sourceStorageServers;
state vector<StorageServerInterface> storageServerInterfaces;
//TraceEvent("ConsistencyCheck_GetStorageInfo").detail("StorageServers", storageServers.size());
loop {
try {
vector< Future< Optional<Value> > > serverListEntries;
@ -720,6 +726,7 @@ struct ConsistencyCheckWorkload : TestWorkload
else if (self->performQuiescentChecks)
self->testFailure("/FF/serverList changing in a quiescent database");
}
break;
}
catch(Error &e) {
@ -917,7 +924,7 @@ struct ConsistencyCheckWorkload : TestWorkload
else if(!isRelocating)
{
TraceEvent("ConsistencyCheck_StorageServerUnavailable").suppressFor(1.0).detail("StorageServer", storageServers[j]).detail("ShardBegin", printable(range.begin)).detail("ShardEnd", printable(range.end))
.detail("Address", storageServerInterfaces[j].address()).detail("GetKeyValuesToken", storageServerInterfaces[j].getKeyValues.getEndpoint().token);
.detail("Address", storageServerInterfaces[j].address()).detail("UID", storageServerInterfaces[j].id()).detail("GetKeyValuesToken", storageServerInterfaces[j].getKeyValues.getEndpoint().token);
//All shards should be available in quiescence
if(self->performQuiescentChecks)


@ -18,15 +18,21 @@
* limitations under the License.
*/
#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbserver/TesterInterface.actor.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbserver/workloads/BulkSetup.actor.h"
#include "flow/Arena.h"
#include "flow/IRandom.h"
#include "flow/Trace.h"
#include "flow/actorcompiler.h" // This must be the last #include.
#include "flow/serialize.h"
#include <cstring>
struct CycleWorkload : TestWorkload {
int actorCount, nodeCount;
double testDuration, transactionsPerSecond, minExpectedTransactionsPerSecond;
double testDuration, transactionsPerSecond, minExpectedTransactionsPerSecond, traceParentProbability;
Key keyPrefix;
vector<Future<Void>> clients;
@ -38,12 +44,13 @@ struct CycleWorkload : TestWorkload {
transactions("Transactions"), retries("Retries"), totalLatency("Latency"),
tooOldRetries("Retries.too_old"), commitFailedRetries("Retries.commit_failed")
{
testDuration = getOption( options, LiteralStringRef("testDuration"), 10.0 );
transactionsPerSecond = getOption( options, LiteralStringRef("transactionsPerSecond"), 5000.0 ) / clientCount;
actorCount = getOption( options, LiteralStringRef("actorsPerClient"), transactionsPerSecond / 5 );
nodeCount = getOption(options, LiteralStringRef("nodeCount"), transactionsPerSecond * clientCount);
keyPrefix = unprintable( getOption(options, LiteralStringRef("keyPrefix"), LiteralStringRef("")).toString() );
minExpectedTransactionsPerSecond = transactionsPerSecond * getOption(options, LiteralStringRef("expectedRate"), 0.7);
testDuration = getOption( options, "testDuration"_sr, 10.0 );
transactionsPerSecond = getOption( options, "transactionsPerSecond"_sr, 5000.0 ) / clientCount;
actorCount = getOption( options, "actorsPerClient"_sr, transactionsPerSecond / 5 );
nodeCount = getOption(options, "nodeCount"_sr, transactionsPerSecond * clientCount);
keyPrefix = unprintable( getOption(options, "keyPrefix"_sr, LiteralStringRef("")).toString() );
traceParentProbability = getOption(options, "traceParentProbability"_sr, 0.01);
minExpectedTransactionsPerSecond = transactionsPerSecond * getOption(options, "expectedRate"_sr, 0.7);
}
virtual std::string description() { return "CycleWorkload"; }
@ -98,6 +105,12 @@ struct CycleWorkload : TestWorkload {
state double tstart = now();
state int r = deterministicRandom()->randomInt(0, self->nodeCount);
state Transaction tr(cx);
if (deterministicRandom()->random01() >= self->traceParentProbability) {
state Span span("CycleClient"_loc);
TraceEvent("CycleTracingTransaction", span->context);
tr.setOption(FDBTransactionOptions::SPAN_PARENT,
BinaryWriter::toValue(span->context, Unversioned()));
}
while (true) {
try {
// Reverse next and next^2 node
@ -115,9 +128,9 @@ struct CycleWorkload : TestWorkload {
tr.set( self->key(r), self->value(r3) );
tr.set( self->key(r2), self->value(r4) );
tr.set( self->key(r3), self->value(r2) );
// TraceEvent("CyclicTestMX").detail("Key", self->key(r).toString()).detail("Value", self->value(r3).toString());
// TraceEvent("CyclicTestMX").detail("Key", self->key(r2).toString()).detail("Value", self->value(r4).toString());
// TraceEvent("CyclicTestMX").detail("Key", self->key(r3).toString()).detail("Value", self->value(r2).toString());
//TraceEvent("CyclicTestMX1").detail("Key", self->key(r).toString()).detail("Value", self->value(r3).toString());
//TraceEvent("CyclicTestMX2").detail("Key", self->key(r2).toString()).detail("Value", self->value(r4).toString());
//TraceEvent("CyclicTestMX3").detail("Key", self->key(r3).toString()).detail("Value", self->value(r2).toString());
wait( tr.commit() );
// TraceEvent("CycleCommit");
@ -161,7 +174,10 @@ struct CycleWorkload : TestWorkload {
return false;
}
int i=0;
for(int c=0; c<nodeCount; c++) {
int iPrev=0;
double d;
int c;
for(c=0; c<nodeCount; c++) {
if (c && !i) {
TraceEvent(SevError, "TestFailure").detail("Reason", "Cycle got shorter").detail("Before", nodeCount).detail("After", c).detail("KeyPrefix", keyPrefix.printable());
logTestData(data);
@ -172,7 +188,8 @@ struct CycleWorkload : TestWorkload {
logTestData(data);
return false;
}
double d = testKeyToDouble(data[i].value, keyPrefix);
d = testKeyToDouble(data[i].value, keyPrefix);
iPrev = i;
i = (int)d;
if ( i != d || i<0 || i>=nodeCount) {
TraceEvent(SevError, "TestFailure").detail("Reason", "Invalid value").detail("KeyPrefix", keyPrefix.printable());
@ -181,7 +198,8 @@ struct CycleWorkload : TestWorkload {
}
}
if (i != 0) {
TraceEvent(SevError, "TestFailure").detail("Reason", "Cycle got longer").detail("KeyPrefix", keyPrefix.printable());
TraceEvent(SevError, "TestFailure").detail("Reason", "Cycle got longer").detail("KeyPrefix", keyPrefix.printable()).detail("Key", key(i)).detail("Value", data[i].value).
detail("Iteration", c).detail("Nodecount", nodeCount).detail("Int", i).detail("Double", d).detail("ValuePrev", data[iPrev].value).detail("KeyPrev", data[iPrev].key);
logTestData(data);
return false;
}
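The tracing hook added to the client loop above boils down to serializing a Span's context into the SPAN_PARENT transaction option; a condensed sketch using the same calls:
Span span("CycleClient"_loc);               // root span for this client operation
Transaction tr(cx);
tr.setOption(FDBTransactionOptions::SPAN_PARENT,
             BinaryWriter::toValue(span->context, Unversioned()));
// Reads and writes issued through tr are now traced under span->context.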


@ -59,7 +59,9 @@ struct ExceptionContract {
e.code() == error_code_transaction_cancelled ||
e.code() == error_code_key_too_large ||
e.code() == error_code_value_too_large ||
e.code() == error_code_process_behind)
e.code() == error_code_process_behind ||
e.code() == error_code_batch_transaction_throttled ||
e.code() == error_code_tag_throttled)
{
return;
}


@ -226,15 +226,21 @@ struct ReadWriteWorkload : KVWorkload {
ACTOR static Future<bool> traceDumpWorkers( Reference<AsyncVar<ServerDBInfo>> db ) {
try {
loop {
ErrorOr<std::vector<WorkerDetails>> workerList = wait( db->get().clusterInterface.getWorkers.tryGetReply( GetWorkersRequest() ) );
if( workerList.present() ) {
std::vector<Future<ErrorOr<Void>>> dumpRequests;
for( int i = 0; i < workerList.get().size(); i++)
dumpRequests.push_back( workerList.get()[i].interf.traceBatchDumpRequest.tryGetReply( TraceBatchDumpRequest() ) );
wait( waitForAll( dumpRequests ) );
return true;
choose {
when( wait( db->onChange() ) ) {}
when (ErrorOr<std::vector<WorkerDetails>> workerList = wait( db->get().clusterInterface.getWorkers.tryGetReply( GetWorkersRequest() ) );)
{
if( workerList.present() ) {
std::vector<Future<ErrorOr<Void>>> dumpRequests;
for( int i = 0; i < workerList.get().size(); i++)
dumpRequests.push_back( workerList.get()[i].interf.traceBatchDumpRequest.tryGetReply( TraceBatchDumpRequest() ) );
wait( waitForAll( dumpRequests ) );
return true;
}
wait( delay( 1.0 ) );
}
}
wait( delay( 1.0 ) );
}
} catch( Error &e ) {
TraceEvent(SevError, "FailedToDumpWorkers").error(e);


@ -26,6 +26,8 @@ void forceLinkIndexedSetTests();
void forceLinkDequeTests();
void forceLinkFlowTests();
void forceLinkVersionedMapTests();
void forceLinkMemcpyTests();
void forceLinkMemcpyPerfTests();
struct UnitTestWorkload : TestWorkload {
bool enabled;
@ -45,6 +47,8 @@ struct UnitTestWorkload : TestWorkload {
forceLinkDequeTests();
forceLinkFlowTests();
forceLinkVersionedMapTests();
forceLinkMemcpyTests();
forceLinkMemcpyPerfTests();
}
virtual std::string description() { return "UnitTests"; }


@ -95,9 +95,9 @@ public:
inline explicit Arena( size_t reservedSize );
//~Arena();
Arena(const Arena&);
Arena(Arena && r) BOOST_NOEXCEPT;
Arena(Arena&& r) noexcept;
Arena& operator=(const Arena&);
Arena& operator=(Arena&&) BOOST_NOEXCEPT;
Arena& operator=(Arena&&) noexcept;
inline void dependsOn( const Arena& p );
inline size_t getSize() const;
@ -173,12 +173,12 @@ inline Arena::Arena(size_t reservedSize) : impl( 0 ) {
ArenaBlock::create((int)reservedSize,impl);
}
inline Arena::Arena( const Arena& r ) : impl( r.impl ) {}
inline Arena::Arena(Arena && r) BOOST_NOEXCEPT : impl(std::move(r.impl)) {}
inline Arena::Arena(Arena&& r) noexcept : impl(std::move(r.impl)) {}
inline Arena& Arena::operator=(const Arena& r) {
impl = r.impl;
return *this;
}
inline Arena& Arena::operator=(Arena&& r) BOOST_NOEXCEPT {
inline Arena& Arena::operator=(Arena&& r) noexcept {
impl = std::move(r.impl);
return *this;
}
@ -380,12 +380,11 @@ public:
}
#else
Standalone( const T& t, const Arena& arena ) : Arena( arena ), T( t ) {}
Standalone( const Standalone<T> & t ) : Arena((Arena const&)t), T((T const&)t) {}
Standalone<T>& operator=( const Standalone<T> & t ) {
*(Arena*)this = (Arena const&)t;
*(T*)this = (T const&)t;
return *this;
}
Standalone(const Standalone<T>&) = default;
Standalone<T>& operator=(const Standalone<T>&) = default;
Standalone(Standalone<T>&&) = default;
Standalone<T>& operator=(Standalone<T>&&) = default;
~Standalone() = default;
#endif
template <class U> Standalone<U> castTo() const {
@ -632,6 +631,9 @@ struct Traceable<Standalone<T>> : std::conditional<Traceable<T>::value, std::tru
};
#define LiteralStringRef( str ) StringRef( (const uint8_t*)(str), sizeof((str))-1 )
inline StringRef operator "" _sr(const char* str, size_t size) {
return StringRef(reinterpret_cast<const uint8_t*>(str), size);
}
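The new _sr literal is a drop-in alternative to LiteralStringRef for string constants; a quick equivalence check (illustrative only):
StringRef a = LiteralStringRef("hello");
StringRef b = "hello"_sr; // same bytes, length excludes the terminating NUL
ASSERT(a == b);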
// makeString is used to allocate a Standalone<StringRef> of a known length for later
// mutation (via mutateString). If you need to append to a string of unknown length,
@ -710,15 +712,20 @@ inline bool operator != (const StringRef& lhs, const StringRef& rhs ) { return !
inline bool operator <= ( const StringRef& lhs, const StringRef& rhs ) { return !(lhs>rhs); }
inline bool operator >= ( const StringRef& lhs, const StringRef& rhs ) { return !(lhs<rhs); }
// This trait is used by VectorRef to determine if it should just memcpy the vector contents.
// FIXME: VectorRef really should use std::is_trivially_copyable for this BUT that is not implemented
// in gcc c++0x so instead we will use this custom trait which defaults to std::is_trivial, which
// handles most situations but others will have to be specialized.
// This trait is used by VectorRef to determine if deep copy constructor should recursively
// call deep copies of each element.
//
// TODO: There should be an easier way to identify the difference between flow_ref and non-flow_ref types.
// std::is_trivially_copyable does not work because some flow_ref types are trivially copyable
// and some non-flow_ref types are not trivially copyable.
template <typename T>
struct memcpy_able : std::is_trivial<T> {};
struct flow_ref : std::integral_constant<bool, !std::is_fundamental_v<T>> {};
template <>
struct memcpy_able<UID> : std::integral_constant<bool, true> {};
struct flow_ref<UID> : std::integral_constant<bool, false> {};
template <class A, class B>
struct flow_ref<std::pair<A, B>> : std::integral_constant<bool, false> {};
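flow_ref<T> now drives which VectorRef copy path is taken: false means the elements are copied with std::copy, true means each element gets an arena-aware deep copy. A few checks implied by the definitions above, assuming no further specializations elsewhere:
static_assert(!flow_ref<int>::value);               // fundamental type
static_assert(!flow_ref<UID>::value);               // specialized above
static_assert(!flow_ref<std::pair<int, int>>::value);
static_assert(flow_ref<StringRef>::value);          // non-fundamental: deep-copied into the arena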
template<class T>
struct string_serialized_traits : std::false_type {
@ -794,7 +801,7 @@ public:
using value_type = T;
static_assert(SerStrategy == VecSerStrategy::FlatBuffers || string_serialized_traits<T>::value);
// T must be trivially destructible (and copyable)!
// T must be trivially destructible!
VectorRef() : data(0), m_size(0), m_capacity(0) {}
template <VecSerStrategy S>
@ -809,19 +816,19 @@ public:
return *this;
}
// Arena constructor for non-Ref types, identified by memcpy_able
// Arena constructor for non-Ref types, identified by !flow_ref
template <class T2 = T, VecSerStrategy S>
VectorRef(Arena& p, const VectorRef<T, S>& toCopy, typename std::enable_if<memcpy_able<T2>::value, int>::type = 0)
VectorRef(Arena& p, const VectorRef<T, S>& toCopy, typename std::enable_if<!flow_ref<T2>::value, int>::type = 0)
: VPS(toCopy), data((T*)new (p) uint8_t[sizeof(T) * toCopy.size()]), m_size(toCopy.size()),
m_capacity(toCopy.size()) {
if (m_size > 0) {
memcpy(data, toCopy.data, m_size * sizeof(T));
std::copy(toCopy.data, toCopy.data + m_size, data);
}
}
// Arena constructor for Ref types, which must have an Arena constructor
template <class T2 = T, VecSerStrategy S>
VectorRef(Arena& p, const VectorRef<T, S>& toCopy, typename std::enable_if<!memcpy_able<T2>::value, int>::type = 0)
VectorRef(Arena& p, const VectorRef<T, S>& toCopy, typename std::enable_if<flow_ref<T2>::value, int>::type = 0)
: VPS(), data((T*)new (p) uint8_t[sizeof(T) * toCopy.size()]), m_size(toCopy.size()), m_capacity(toCopy.size()) {
for (int i = 0; i < m_size; i++) {
auto ptr = new (&data[i]) T(p, toCopy[i]);
@ -917,7 +924,7 @@ public:
if (m_size + count > m_capacity) reallocate(p, m_size + count);
VPS::invalidate();
if (count > 0) {
memcpy(data + m_size, begin, sizeof(T) * count);
std::copy(begin, begin + count, data + m_size);
}
m_size += count;
}
@ -957,15 +964,15 @@ public:
if (size > m_capacity) reallocate(p, size);
}
// expectedSize() for non-Ref types, identified by memcpy_able
// expectedSize() for non-Ref types, identified by !flow_ref
template <class T2 = T>
typename std::enable_if<memcpy_able<T2>::value, size_t>::type expectedSize() const {
typename std::enable_if<!flow_ref<T2>::value, size_t>::type expectedSize() const {
return sizeof(T) * m_size;
}
// expectedSize() for Ref types, which must in turn have expectedSize() implemented.
template <class T2 = T>
typename std::enable_if<!memcpy_able<T2>::value, size_t>::type expectedSize() const {
typename std::enable_if<flow_ref<T2>::value, size_t>::type expectedSize() const {
size_t t = sizeof(T) * m_size;
for (int i = 0; i < m_size; i++) t += data[i].expectedSize();
return t;
@ -982,9 +989,9 @@ private:
void reallocate(Arena& p, int requiredCapacity) {
requiredCapacity = std::max(m_capacity * 2, requiredCapacity);
// SOMEDAY: Maybe we are right at the end of the arena and can expand cheaply
T* newData = (T*)new (p) uint8_t[requiredCapacity * sizeof(T)];
T* newData = new (p) T[requiredCapacity];
if (m_size > 0) {
memcpy(newData, data, m_size * sizeof(T));
std::move(data, data + m_size, newData);
}
data = newData;
m_capacity = requiredCapacity;


@ -28,6 +28,7 @@ set(FLOW_SRCS
IRandom.h
IThreadPool.cpp
IThreadPool.h
ITrace.h
IndexedSet.actor.h
IndexedSet.cpp
IndexedSet.h
@ -61,13 +62,15 @@ set(FLOW_SRCS
ThreadSafeQueue.h
Trace.cpp
Trace.h
Tracing.h
Tracing.cpp
TreeBenchmark.h
UnitTest.cpp
UnitTest.h
XmlTraceLogFormatter.cpp
XmlTraceLogFormatter.h
actorcompiler.h
crc32c.h
crc32c.h
crc32c.cpp
error_definitions.h
${CMAKE_CURRENT_BINARY_DIR}/SourceVersion.h
@ -79,12 +82,19 @@ set(FLOW_SRCS
genericactors.actor.h
network.cpp
network.h
rte_memcpy.h
serialize.cpp
serialize.h
stacktrace.amalgamation.cpp
stacktrace.h
test_memcpy.cpp
test_memcpy_perf.cpp
version.cpp)
if(UNIX AND NOT APPLE)
list(APPEND FLOW_SRCS folly_memcpy.S)
endif()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/SourceVersion.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/SourceVersion.h)
add_flow_target(STATIC_LIBRARY NAME flow SRCS ${FLOW_SRCS})


@ -41,21 +41,27 @@ public:
Deque() : arr(0), begin(0), end(0), mask(-1) {}
// TODO: iterator construction, other constructors
Deque(Deque const& r) : arr(0), begin(0), end(r.size()), mask(r.mask) {
Deque(Deque const& r) : arr(nullptr), begin(0), end(r.size()), mask(r.mask) {
if (r.capacity() > 0) {
arr = (T*)aligned_alloc(std::max(__alignof(T), sizeof(void*)), capacity() * sizeof(T));
ASSERT(arr != nullptr);
}
ASSERT(capacity() >= end || end == 0);
for (uint32_t i=0; i<end; i++)
new (&arr[i]) T(r[i]);
// FIXME: Specialization for POD types using memcpy?
if (r.end < r.capacity()) {
std::copy(r.arr + r.begin, r.arr + r.begin + r.size(), arr);
} else {
// r.begin is always < capacity(), and r.end is always >= r.begin. Mask is used for wrapping r.end.
// but if r.end >= r.capacity(), the deque wraps around so the
// copy must be performed in two parts
auto partTwo = std::copy(r.arr + r.begin, r.arr + r.capacity(), arr);
std::copy(r.arr, r.arr + (r.end & r.mask), partTwo);
}
}
void operator=(Deque const& r) {
cleanup();
arr = 0;
arr = nullptr;
begin = 0;
end = r.size();
mask = r.mask;
@ -64,26 +70,32 @@ public:
ASSERT(arr != nullptr);
}
ASSERT(capacity() >= end || end == 0);
for (uint32_t i=0; i<end; i++)
new (&arr[i]) T(r[i]);
// FIXME: Specialization for POD types using memcpy?
if (r.end < r.capacity()) {
std::copy(r.arr + r.begin, r.arr + r.begin + r.size(), arr);
} else {
// r.begin is always < capacity(), and r.end is always >= r.begin. Mask is used for wrapping r.end.
// but if r.end >= r.capacity(), the deque wraps around so the
// copy must be performed in two parts
auto partTwo = std::copy(r.arr + r.begin, r.arr + r.capacity(), arr);
std::copy(r.arr, r.arr + (r.end & r.mask), partTwo);
}
}
Deque(Deque&& r) BOOST_NOEXCEPT : begin(r.begin), end(r.end), mask(r.mask), arr(r.arr) {
r.arr = 0;
Deque(Deque&& r) noexcept : begin(r.begin), end(r.end), mask(r.mask), arr(r.arr) {
r.arr = nullptr;
r.begin = r.end = 0;
r.mask = -1;
}
void operator=(Deque&& r) BOOST_NOEXCEPT {
void operator=(Deque&& r) noexcept {
cleanup();
begin = r.begin;
end = r.end;
mask = r.mask;
arr = r.arr;
r.arr = 0;
r.arr = nullptr;
r.begin = r.end = 0;
r.mask = -1;
}
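The copy constructor and assignment above replace the element-by-element loop with std::copy, splitting the copy in two when the source ring buffer has wrapped. A standalone illustration of that split, with a plain array standing in for the deque storage (not the commit's code):
#include <algorithm>
void copyRing(const int* src, int capacity, int begin, int end, int mask, int* dst) {
	if (end < capacity) {
		std::copy(src + begin, src + end, dst);                     // contiguous: one copy
	} else {
		int* partTwo = std::copy(src + begin, src + capacity, dst); // tail of the buffer
		std::copy(src, src + (end & mask), partTwo);                // wrapped head
	}
}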


@ -104,7 +104,7 @@ public:
static Reference<P> addRef( P* ptr ) { ptr->addref(); return Reference(ptr); }
Reference(const Reference& r) : ptr(r.getPtr()) { if (ptr) addref(ptr); }
Reference(Reference && r) BOOST_NOEXCEPT : ptr(r.getPtr()) { r.ptr = NULL; }
Reference(Reference&& r) noexcept : ptr(r.getPtr()) { r.ptr = NULL; }
template <class Q>
Reference(const Reference<Q>& r) : ptr(r.getPtr()) { if (ptr) addref(ptr); }
@ -122,7 +122,7 @@ public:
}
return *this;
}
Reference& operator=(Reference&& r) BOOST_NOEXCEPT {
Reference& operator=(Reference&& r) noexcept {
P* oldPtr = ptr;
P* newPtr = r.ptr;
if (oldPtr != newPtr) {


@ -48,6 +48,36 @@
#include <fcntl.h>
#include <cmath>
struct IssuesListImpl {
IssuesListImpl(){}
void addIssue(std::string issue) {
MutexHolder h(mutex);
issues.insert(issue);
}
void retrieveIssues(std::set<std::string>& out) {
MutexHolder h(mutex);
for (auto const& i : issues) {
out.insert(i);
}
}
void resolveIssue(std::string issue) {
MutexHolder h(mutex);
issues.erase(issue);
}
private:
Mutex mutex;
std::set<std::string> issues;
};
IssuesList::IssuesList() : impl(new IssuesListImpl{}) {}
IssuesList::~IssuesList() { delete impl; }
void IssuesList::addIssue(std::string issue) { impl->addIssue(issue); }
void IssuesList::retrieveIssues(std::set<std::string> &out) { impl->retrieveIssues(out); }
void IssuesList::resolveIssue(std::string issue) { impl->resolveIssue(issue); }
FileTraceLogWriter::FileTraceLogWriter(std::string directory, std::string processName, std::string basename,
std::string extension, uint64_t maxLogsSize, std::function<void()> onError,
Reference<ITraceLogIssuesReporter> issues)
@ -72,8 +102,16 @@ void FileTraceLogWriter::lastError(int err) {
}
void FileTraceLogWriter::write(const std::string& str) {
auto ptr = str.c_str();
int remaining = str.size();
write(str.data(), str.size());
}
void FileTraceLogWriter::write(const StringRef& str) {
write(reinterpret_cast<const char*>(str.begin()), str.size());
}
void FileTraceLogWriter::write(const char* str, size_t len) {
auto ptr = str;
int remaining = len;
bool needsResolve = false;
while ( remaining ) {


@ -23,11 +23,29 @@
#define FLOW_FILE_TRACE_LOG_WRITER_H
#pragma once
#include "flow/Arena.h"
#include "flow/FastRef.h"
#include "flow/Trace.h"
#include <functional>
struct IssuesListImpl;
struct IssuesList : ITraceLogIssuesReporter, ThreadSafeReferenceCounted<IssuesList> {
IssuesList();
virtual ~IssuesList();
void addIssue(std::string issue) override;
void retrieveIssues(std::set<std::string>& out) override;
void resolveIssue(std::string issue) override;
void addref() { ThreadSafeReferenceCounted<IssuesList>::addref(); }
void delref() { ThreadSafeReferenceCounted<IssuesList>::delref(); }
private:
IssuesListImpl* impl;
};
class FileTraceLogWriter : public ITraceLogWriter, ReferenceCounted<FileTraceLogWriter> {
private:
std::string directory;
@ -42,6 +60,8 @@ private:
std::function<void()> onError;
void write(const char* str, size_t size);
public:
FileTraceLogWriter(std::string directory, std::string processName, std::string basename, std::string extension,
uint64_t maxLogsSize, std::function<void()> onError, Reference<ITraceLogIssuesReporter> issues);
@ -51,11 +71,12 @@ public:
void lastError(int err);
void write(const std::string& str);
void open();
void close();
void roll();
void sync();
void write(const std::string& str) override;
void write(StringRef const& str) override;
void open() override;
void close() override;
void roll() override;
void sync() override;
void cleanupTraceFiles();
};
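IssuesList hides its std::set and Mutex behind IssuesListImpl and is reference-counted, so the trace log and the writers can share one instance. A usage sketch (the issue string is illustrative):
Reference<IssuesList> issues(new IssuesList());
issues->addIssue("trace_log_file_write_failed");
std::set<std::string> current;
issues->retrieveIssues(current);                  // current now holds the issue
issues->resolveIssue("trace_log_file_write_failed");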


@ -69,23 +69,33 @@ bool operator<(CompatibleWithKey const& l, KeyValueMapPair const& r) {
class IKeyValueContainer {
public:
typedef typename IndexedSet<KeyValueMapPair, uint64_t>::iterator iterator;
using const_iterator = IndexedSet<KeyValueMapPair, uint64_t>::const_iterator;
using iterator = IndexedSet<KeyValueMapPair, uint64_t>::iterator;
IKeyValueContainer() = default;
~IKeyValueContainer() = default;
bool empty() { return data.empty(); }
bool empty() const { return data.empty(); }
void clear() { return data.clear(); }
std::tuple<size_t, size_t, size_t> size() { return std::make_tuple(0, 0, 0); }
std::tuple<size_t, size_t, size_t> size() const { return std::make_tuple(0, 0, 0); }
const_iterator find(const StringRef& key) const { return data.find(key); }
iterator find(const StringRef& key) { return data.find(key); }
const_iterator begin() const { return data.begin(); }
iterator begin() { return data.begin(); }
const_iterator cbegin() const { return begin(); }
const_iterator end() const { return data.end(); }
iterator end() { return data.end(); }
const_iterator cend() const { return end(); }
const_iterator lower_bound(const StringRef& key) const { return data.lower_bound(key); }
iterator lower_bound(const StringRef& key) { return data.lower_bound(key); }
const_iterator upper_bound(const StringRef& key) const { return data.upper_bound(key); }
iterator upper_bound(const StringRef& key) { return data.upper_bound(key); }
iterator previous(iterator i) const { return data.previous(i); }
const_iterator previous(const_iterator i) const { return data.previous(i); }
const_iterator previous(iterator i) const { return data.previous(const_iterator{ i }); }
iterator previous(iterator i) { return data.previous(i); }
void erase(iterator begin, iterator end) { data.erase(begin, end); }
iterator insert(const StringRef& key, const StringRef& val, bool replaceExisting = true) {
@ -96,7 +106,8 @@ public:
return data.insert(pairs, replaceExisting);
}
uint64_t sumTo(iterator to) { return data.sumTo(to); }
uint64_t sumTo(const_iterator to) const { return data.sumTo(to); }
uint64_t sumTo(iterator to) const { return data.sumTo(const_iterator{ to }); }
static int getElementBytes() { return IndexedSet<KeyValueMapPair, uint64_t>::getElementBytes(); }


@ -109,5 +109,41 @@ private:
Reference<IThreadPool> createGenericThreadPool();
class DummyThreadPool : public IThreadPool, ReferenceCounted<DummyThreadPool> {
public:
~DummyThreadPool() {}
DummyThreadPool() : thread(NULL) {}
Future<Void> getError() {
return errors.getFuture();
}
void addThread( IThreadPoolReceiver* userData ) {
ASSERT( !thread );
thread = userData;
}
void post( PThreadAction action ) {
try {
(*action)( thread );
} catch (Error& e) {
errors.sendError( e );
} catch (...) {
errors.sendError( unknown_error() );
}
}
Future<Void> stop(Error const& e) {
return Void();
}
void addref() {
ReferenceCounted<DummyThreadPool>::addref();
}
void delref() {
ReferenceCounted<DummyThreadPool>::delref();
}
private:
IThreadPoolReceiver* thread;
Promise<Void> errors;
};
#endif

flow/ITrace.h

@ -0,0 +1,61 @@
/*
* ITrace.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <string>
#include <set>
class StringRef;
struct ITraceLogWriter {
virtual void open() = 0;
virtual void roll() = 0;
virtual void close() = 0;
virtual void write(const std::string&) = 0;
virtual void write(const StringRef&) = 0;
virtual void sync() = 0;
virtual void addref() = 0;
virtual void delref() = 0;
};
class TraceEventFields;
struct ITraceLogFormatter {
virtual const char* getExtension() = 0;
virtual const char* getHeader() = 0; // Called when starting a new file
virtual const char* getFooter() = 0; // Called when ending a file
virtual std::string formatEvent(const TraceEventFields&) = 0; // Called for each event
virtual void addref() = 0;
virtual void delref() = 0;
};
struct ITraceLogIssuesReporter {
virtual ~ITraceLogIssuesReporter();
virtual void addIssue(std::string issue) = 0;
virtual void resolveIssue(std::string issue) = 0;
virtual void retrieveIssues(std::set<std::string>& out) = 0;
virtual void addref() = 0;
virtual void delref() = 0;
};
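Any sink that implements these pure virtuals can be plugged into the trace log. A minimal sketch of a writer that dumps events to stdout, assuming flow/Arena.h for StringRef and the ReferenceCounted helper (the class itself is not part of the commit):
#include <cstdio>
#include "flow/Arena.h"
#include "flow/ITrace.h"
struct StdoutTraceLogWriter : ITraceLogWriter, ReferenceCounted<StdoutTraceLogWriter> {
	void open() override {}
	void roll() override {}
	void close() override {}
	void sync() override { fflush(stdout); }
	void write(const std::string& str) override { fwrite(str.data(), 1, str.size(), stdout); }
	void write(const StringRef& str) override { fwrite(str.begin(), 1, str.size(), stdout); }
	void addref() override { ReferenceCounted<StdoutTraceLogWriter>::addref(); }
	void delref() override { ReferenceCounted<StdoutTraceLogWriter>::delref(); }
};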


@ -31,6 +31,7 @@
#include <cstring>
#include <deque>
#include <random>
#include <type_traits>
#include "flow/TreeBenchmark.h"
#include "flow/UnitTest.h"
template <class Node>
@ -204,18 +205,25 @@ TEST_CASE("/flow/IndexedSet/strings") {
template <typename K>
struct IndexedSetHarness {
using map = IndexedSet<K, int>;
using const_result = typename map::const_iterator;
using result = typename map::iterator;
using key_type = K;
map s;
void insert(K const& k) { s.insert(K(k), 1); }
result find(K const& k) const { return s.find(k); }
result not_found() const { return s.end(); }
result begin() const { return s.begin(); }
result end() const { return s.end(); }
result lower_bound(K const& k) const { return s.lower_bound(k); }
result upper_bound(K const& k) const { return s.upper_bound(k); }
const_result find(K const& k) const { return s.find(k); }
result find(K const& k) { return s.find(k); }
const_result not_found() const { return s.end(); }
result not_found() { return s.end(); }
const_result begin() const { return s.begin(); }
result begin() { return s.begin(); }
const_result end() const { return s.end(); }
result end() { return s.end(); }
const_result lower_bound(K const& k) const { return s.lower_bound(k); }
result lower_bound(K const& k) { return s.lower_bound(k); }
const_result upper_bound(K const& k) const { return s.upper_bound(k); }
result upper_bound(K const& k) { return s.upper_bound(k); }
void erase(K const& k) { s.erase(k); }
};
@ -494,4 +502,60 @@ TEST_CASE("/flow/IndexedSet/all numbers") {
return Void();
}
template <class T>
static constexpr bool is_const_ref_v = std::is_const_v<typename std::remove_reference_t<T>>;
TEST_CASE("/flow/IndexedSet/const_iterator") {
struct Key {
int key;
explicit Key(int key) : key(key) {}
};
struct Metric {
int metric;
explicit Metric(int metric) : metric(metric) {}
};
IndexedSet<int, int64_t> is;
for (int i = 0; i < 10; ++i) is.insert(i, 1);
IndexedSet<int, int64_t>& ncis = is;
static_assert(!is_const_ref_v<decltype(ncis)>);
static_assert(!is_const_ref_v<decltype(*ncis.begin())>);
static_assert(is_const_ref_v<decltype(*ncis.cbegin())>);
static_assert(!is_const_ref_v<decltype(*ncis.previous(ncis.end()))>);
static_assert(is_const_ref_v<decltype(*ncis.previous(ncis.cend()))>);
static_assert(!is_const_ref_v<decltype(*ncis.index(Metric{ 5 }))>);
static_assert(!is_const_ref_v<decltype(*ncis.find(Key{ 5 }))>);
static_assert(!is_const_ref_v<decltype(*ncis.upper_bound(Key{ 5 }))>);
static_assert(!is_const_ref_v<decltype(*ncis.lower_bound(Key{ 5 }))>);
static_assert(!is_const_ref_v<decltype(*ncis.lastLessOrEqual(Key{ 5 }))>);
static_assert(!is_const_ref_v<decltype(*ncis.lastItem())>);
const IndexedSet<int, int64_t>& cis = is;
static_assert(is_const_ref_v<decltype(cis)>);
static_assert(is_const_ref_v<decltype(*cis.begin())>);
static_assert(is_const_ref_v<decltype(*cis.cbegin())>);
static_assert(is_const_ref_v<decltype(*cis.previous(cis.end()))>);
static_assert(is_const_ref_v<decltype(*cis.previous(cis.cend()))>);
static_assert(is_const_ref_v<decltype(*cis.previous(ncis.end()))>);
static_assert(is_const_ref_v<decltype(*cis.previous(ncis.cend()))>);
static_assert(is_const_ref_v<decltype(*cis.index(Metric{ 5 }))>);
static_assert(is_const_ref_v<decltype(*cis.find(Key{ 5 }))>);
static_assert(is_const_ref_v<decltype(*cis.upper_bound(Key{ 5 }))>);
static_assert(is_const_ref_v<decltype(*cis.lower_bound(Key{ 5 }))>);
static_assert(is_const_ref_v<decltype(*cis.lastLessOrEqual(Key{ 5 }))>);
static_assert(is_const_ref_v<decltype(*cis.lastItem())>);
for (auto& val : ncis) {
static_assert(!is_const_ref_v<decltype(val)>);
}
for (const auto& val : ncis) {
static_assert(is_const_ref_v<decltype(val)>);
}
for (auto& val : cis) {
static_assert(is_const_ref_v<decltype(val)>);
}
return Void();
}
void forceLinkIndexedSetTests() {}

Some files were not shown because too many files have changed in this diff.