Merge branch 'main' into granule_merging_batch

This commit is contained in:
Josh Slocum 2022-07-20 07:42:26 -05:00
commit 78b6a96006
162 changed files with 3844 additions and 1270 deletions

View File

@ -467,7 +467,9 @@ elseif(NOT WIN32 AND NOT APPLE AND NOT OPEN_FOR_IDE) # Linux Only
add_test(NAME fdb_c_shim_library_tests
COMMAND $<TARGET_FILE:Python::Interpreter> ${CMAKE_CURRENT_SOURCE_DIR}/test/fdb_c_shim_tests.py
--build-dir ${CMAKE_BINARY_DIR}
--source-dir ${CMAKE_SOURCE_DIR}
--unit-tests-bin $<TARGET_FILE:fdb_c_shim_unit_tests>
--api-tester-bin $<TARGET_FILE:fdb_c_shim_api_tester>
--api-test-dir ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests
)
endif() # End Linux only

View File

@ -87,14 +87,12 @@ class FdbCShimTests:
self.build_dir = Path(args.build_dir).resolve()
assert self.build_dir.exists(), "{} does not exist".format(args.build_dir)
assert self.build_dir.is_dir(), "{} is not a directory".format(args.build_dir)
self.source_dir = Path(args.source_dir).resolve()
assert self.source_dir.exists(), "{} does not exist".format(args.source_dir)
assert self.source_dir.is_dir(), "{} is not a directory".format(args.source_dir)
self.api_tester_bin = self.build_dir.joinpath("bin", "fdb_c_shim_api_tester")
assert self.api_tester_bin.exists(), "{} does not exist".format(self.api_tester_bin)
self.unit_tests_bin = self.build_dir.joinpath("bin", "fdb_c_shim_unit_tests")
self.unit_tests_bin = Path(args.unit_tests_bin).resolve()
assert self.unit_tests_bin.exists(), "{} does not exist".format(self.unit_tests_bin)
self.api_test_dir = self.source_dir.joinpath("bindings", "c", "test", "apitester", "tests")
self.api_tester_bin = Path(args.api_tester_bin).resolve()
assert self.api_tester_bin.exists(), "{} does not exist".format(self.api_tester_bin)
self.api_test_dir = Path(args.api_test_dir).resolve()
assert self.api_test_dir.exists(), "{} does not exist".format(self.api_test_dir)
self.downloader = FdbBinaryDownloader(args.build_dir)
# binary downloads are currently available only for x86_64
self.platform = platform.machine()
@ -196,13 +194,12 @@ if __name__ == "__main__":
help="FDB build directory",
required=True,
)
parser.add_argument(
"--source-dir",
"-s",
metavar="SOURCE_DIRECTORY",
help="FDB source directory",
required=True,
)
parser.add_argument('--unit-tests-bin', type=str,
help='Path to the fdb_c_shim_unit_tests executable.')
parser.add_argument('--api-tester-bin', type=str,
help='Path to the fdb_c_shim_api_tester executable.')
parser.add_argument('--api-test-dir', type=str,
help='Path to a directory with api test definitions.')
args = parser.parse_args()
test = FdbCShimTests(args)
test.run_tests()

View File

@ -628,6 +628,9 @@ def tenants(logger):
assert(len(json_output['tenant']) == 2)
assert('id' in json_output['tenant'])
assert('prefix' in json_output['tenant'])
assert(len(json_output['tenant']['prefix']) == 2)
assert('base64' in json_output['tenant']['prefix'])
assert('printable' in json_output['tenant']['prefix'])
output = run_fdbcli_command('usetenant')
assert output == 'Using the default tenant'

View File

@ -21,6 +21,7 @@
import fdb
import sys
import json
import base64
from fdb.tuple import pack
if __name__ == '__main__':
@ -65,11 +66,11 @@ def test_tenant_operations(db):
t1_entry = tenant_list[0].value
t1_json = json.loads(t1_entry)
p1 = t1_json['prefix'].encode('utf8')
p1 = base64.b64decode(t1_json['prefix']['base64'])
t2_entry = tenant_list[1].value
t2_json = json.loads(t2_entry)
p2 = t2_json['prefix'].encode('utf8')
p2 = base64.b64decode(t2_json['prefix']['base64'])
tenant1 = db.open_tenant(b'tenant1')
tenant2 = db.open_tenant(b'tenant2')
@ -80,12 +81,12 @@ def test_tenant_operations(db):
tenant1_entry = db[b'\xff\xff/management/tenant/map/tenant1']
tenant1_json = json.loads(tenant1_entry)
prefix1 = tenant1_json['prefix'].encode('utf8')
prefix1 = base64.b64decode(tenant1_json['prefix']['base64'])
assert prefix1 == p1
tenant2_entry = db[b'\xff\xff/management/tenant/map/tenant2']
tenant2_json = json.loads(tenant2_entry)
prefix2 = tenant2_json['prefix'].encode('utf8')
prefix2 = base64.b64decode(tenant2_json['prefix']['base64'])
assert prefix2 == p2
assert tenant1[b'tenant_test_key'] == b'tenant1'

View File

@ -9,7 +9,7 @@ function(compile_boost)
# Configure bootstrap command
set(BOOTSTRAP_COMMAND "./bootstrap.sh")
set(BOOTSTRAP_LIBRARIES "context,filesystem")
set(BOOTSTRAP_LIBRARIES "context,filesystem,iostreams")
set(BOOST_CXX_COMPILER "${CMAKE_CXX_COMPILER}")
# Can't build Boost with Intel compiler, use clang instead.
@ -65,7 +65,8 @@ function(compile_boost)
UPDATE_COMMAND ""
BUILD_BYPRODUCTS "${BOOST_INSTALL_DIR}/boost/config.hpp"
"${BOOST_INSTALL_DIR}/lib/libboost_context.a"
"${BOOST_INSTALL_DIR}/lib/libboost_filesystem.a")
"${BOOST_INSTALL_DIR}/lib/libboost_filesystem.a"
"${BOOST_INSTALL_DIR}/lib/libboost_iostreams.a")
add_library(${COMPILE_BOOST_TARGET}_context STATIC IMPORTED)
add_dependencies(${COMPILE_BOOST_TARGET}_context ${COMPILE_BOOST_TARGET}Project)
@ -75,9 +76,13 @@ function(compile_boost)
add_dependencies(${COMPILE_BOOST_TARGET}_filesystem ${COMPILE_BOOST_TARGET}Project)
set_target_properties(${COMPILE_BOOST_TARGET}_filesystem PROPERTIES IMPORTED_LOCATION "${BOOST_INSTALL_DIR}/lib/libboost_filesystem.a")
add_library(${COMPILE_BOOST_TARGET}_iostreams STATIC IMPORTED)
add_dependencies(${COMPILE_BOOST_TARGET}_iostreams ${COMPILE_BOOST_TARGET}Project)
set_target_properties(${COMPILE_BOOST_TARGET}_iostreams PROPERTIES IMPORTED_LOCATION "${BOOST_INSTALL_DIR}/lib/libboost_iostreams.a")
add_library(${COMPILE_BOOST_TARGET} INTERFACE)
target_include_directories(${COMPILE_BOOST_TARGET} SYSTEM INTERFACE ${BOOST_INSTALL_DIR}/include)
target_link_libraries(${COMPILE_BOOST_TARGET} INTERFACE ${COMPILE_BOOST_TARGET}_context ${COMPILE_BOOST_TARGET}_filesystem)
target_link_libraries(${COMPILE_BOOST_TARGET} INTERFACE ${COMPILE_BOOST_TARGET}_context ${COMPILE_BOOST_TARGET}_filesystem ${COMPILE_BOOST_TARGET}_iostreams)
endfunction(compile_boost)
@ -103,11 +108,11 @@ set(Boost_USE_STATIC_LIBS ON)
if (UNIX AND CMAKE_CXX_COMPILER_ID MATCHES "Clang$")
list(APPEND CMAKE_PREFIX_PATH /opt/boost_1_78_0_clang)
set(BOOST_HINT_PATHS /opt/boost_1_78_0_clang)
message(STATUS "Using Clang version of boost::context and boost::filesystem")
message(STATUS "Using Clang version of boost::context boost::filesystem and boost::iostreams")
else ()
list(APPEND CMAKE_PREFIX_PATH /opt/boost_1_78_0)
set(BOOST_HINT_PATHS /opt/boost_1_78_0)
message(STATUS "Using g++ version of boost::context and boost::filesystem")
message(STATUS "Using g++ version of boost::context boost::filesystem and boost::iostreams")
endif ()
if(BOOST_ROOT)
@ -119,18 +124,18 @@ if(WIN32)
# properly for config mode. So we use the old way on Windows
# find_package(Boost 1.72.0 EXACT QUIET REQUIRED CONFIG PATHS ${BOOST_HINT_PATHS})
# I think depending on the cmake version this will cause weird warnings
find_package(Boost 1.72 COMPONENTS filesystem)
find_package(Boost 1.72 COMPONENTS filesystem iostreams)
add_library(boost_target INTERFACE)
target_link_libraries(boost_target INTERFACE Boost::boost Boost::filesystem)
target_link_libraries(boost_target INTERFACE Boost::boost Boost::filesystem Boost::iostreams)
return()
endif()
find_package(Boost 1.78.0 EXACT QUIET COMPONENTS context filesystem iostreams CONFIG PATHS ${BOOST_HINT_PATHS})
set(FORCE_BOOST_BUILD OFF CACHE BOOL "Forces cmake to build boost and ignores any installed boost")
if(Boost_FOUND AND Boost_filesystem_FOUND AND Boost_context_FOUND AND NOT FORCE_BOOST_BUILD)
if(Boost_FOUND AND Boost_filesystem_FOUND AND Boost_context_FOUND AND Boost_iostreams_FOUND AND NOT FORCE_BOOST_BUILD)
add_library(boost_target INTERFACE)
target_link_libraries(boost_target INTERFACE Boost::boost Boost::context Boost::filesystem)
target_link_libraries(boost_target INTERFACE Boost::boost Boost::context Boost::filesystem Boost::iostreams)
elseif(WIN32)
message(FATAL_ERROR "Could not find Boost")
else()

View File

@ -9,6 +9,14 @@ define_property(TARGET PROPERTY COVERAGE_FILTERS
expression in this list will be ignored when the coverage.target.xml file is \
generated. This property is set through the add_flow_target function.")
if(WIN32)
set(compilation_unit_macro_default OFF)
else()
set(compilation_unit_macro_default ON)
endif()
set(PASS_COMPILATION_UNIT "${compilation_unit_macro_default}" CACHE BOOL
"Pass path to compilation unit as macro to each compilation unit (useful for code probes)")
function(generate_coverage_xml)
if(NOT (${ARGC} EQUAL "1"))
@ -259,6 +267,11 @@ function(add_flow_target)
endif()
endif()
endforeach()
if(PASS_COMPILATION_UNIT)
foreach(s IN LISTS sources)
set_source_files_properties("${s}" PROPERTIES COMPILE_DEFINITIONS "COMPILATION_UNIT=${s}")
endforeach()
endif()
if(AFT_EXECUTABLE)
set(strip_target ON)
set(target_type exec)

contrib/ctest_to_joshua.py Normal file → Executable file
View File

@ -1,3 +1,5 @@
#!/usr/bin/env python3
from argparse import ArgumentParser
import glob
import io
@ -31,10 +33,14 @@ class JoshuaBuilder:
if os.path.exists(arg):
if not os.path.relpath(arg, self.build_dir).startswith(".."):
relpath = "build/" + os.path.relpath(arg, self.build_dir)
# Avoid packaging the full build directory.
if relpath != "build/.":
self.files[arg] = relpath
return relpath
elif not os.path.relpath(arg, self.src_dir).startswith(".."):
relpath = "src/" + os.path.relpath(arg, self.src_dir)
# Avoid packaging the full source directory.
if relpath != "src/.":
self.files[arg] = relpath
return relpath
elif os.access(arg, os.X_OK):
@ -61,7 +67,6 @@ class JoshuaBuilder:
def write_tarball(self, output, joshua_test):
with tarfile.open(output, "w:gz") as tar:
for file, arcfile in self.files.items():
if not os.path.isdir(file):
self._add_file(tar, file, arcfile)
tarinfo = tarfile.TarInfo("joshua_test")
tarinfo.mode = 0o755
@ -114,6 +119,7 @@ Unknown arguments are forwarded to ctest, so you may use -R to filter tests e.g.
joshua_builder.add_arg(os.path.join(args.build_dir, "bin/fdbcli"))
joshua_builder.add_arg(os.path.join(args.build_dir, "bin/fdbmonitor"))
joshua_builder.add_arg(os.path.join(args.build_dir, "bin/fdbserver"))
joshua_builder.add_arg(os.path.join(args.build_dir, "bin/mkcert"))
if platform.system() == "Darwin":
joshua_builder.add_arg(os.path.join(args.build_dir, "lib/libfdb_c.dylib"))
else:

View File

@ -87009,7 +87009,7 @@ SQLITE_PRIVATE WhereInfo *sqlite3WhereBegin(
}
sqlite3_query_plan[nQPlan] = 0;
nQPlan = 0;
#endif /* SQLITE_TEST // Testing and debugging use only */
#endif /* SQLITE_TEST // Testing and debugging use only */");
/* Record the continuation address in the WhereInfo structure. Then
** clean up and return.

View File

@ -0,0 +1,227 @@
# Load Balancing in FoundationDB
## Introduction
FoundationDB is a distributed key-value database. A FoundationDB cluster consists of one or more processes on one or more physical machines, where each process is a *worker* that takes on certain *role*s, such as coordinator, proxy, TLog, or storage server, in the system.
The interprocess communications (IPC) between the processes are supported by the [`flow`](https://github.com/apple/foundationdb/tree/main/flow) infrastructure. In the `flow` context, each process exposes one or more *interface*s. Each interface accepts a given type of *request* and *reply*s with `Void`, the requested data, or an error. The interfaces and the corresponding request/reply pairs form the IPC protocol of FoundationDB.
In many cases, the same request can be processed by multiple processes, e.g. all commit proxies can accept commit requests, and multiple storage server processes can provide values for a given key in double/triple redundancy mode. A load balancer (LB) can be used to distribute the requests over such candidate interfaces, preventing one or a few processes from getting overloaded. The interface candidates are also referred to as *alternative*s. The LB is also able to react when one or more interfaces are (temporarily) unavailable, by retrying or re-routing the request to other candidates.
Two LBs are provided in FoundationDB: `basicLoadBalance` and `loadBalance`, both defined in [`LoadBalance.actor.h`](https://github.com/apple/foundationdb/blob/main/fdbrpc/include/fdbrpc/LoadBalance.actor.h). `basicLoadBalance` is a simple load balancer in which each interface is equally likely to be chosen, while `loadBalance` accepts a model object that provides [datacenter](https://apple.github.io/foundationdb/configuration.html#configuring-regions) (DC) aware balancing algorithms, allowing requests to be sent preferentially to interfaces in the same DC.
In the following sections, the two LBs are discussed in detail.
## `basicLoadBalance`
`basicLoadBalance` implements a simple load balancing algorithm. It applies to
* Commit proxy interface
* GetReadVersion proxy interface
* ConfigFollower interface
Here, the interfaces are assumed to be always *fresh*, i.e. the list of servers is fixed.
```mermaid
graph LR
H0{Has alternatives?}
H1[Pick an alternative]
H2[Backoff]
H3[Request]
H4([Reply])
H5([Error])
H6([Never])
H((Start)) --> H0
H0 --No--> H6
H0 --Yes--> H1
H1 --No healthy alternatives--> H2 --Retry--> H1
H1 --Has alternative--> H3 --Success--> H4
H3 --Exception--> H5
H3 --Broken Promise --> H2
```
### Alternative pick algorithm
In `basicLoadBalance`, a *best* alternative is picked and used at the beginning. At this stage, the best alternative is randomly picked among all alternatives. If it does not work, the LB iteratively tries the other interfaces; see [here](#picking-an-alternative-in-basic-load-balancing-algorithm).
## `loadBalance`
`loadBalance` provides a more sophisticated implementation of load balancing. In addition to basic load balancing, it provides a variety of features:
* Support for Test Storage Server ([TSS](https://github.com/apple/foundationdb/blob/main/documentation/sphinx/source/tss.rst))
* Datacenter-aware alternative selection
* Recording the latency and penalty from interfaces, and [prioritizing the interfaces based on previously stored data](#with-queuemodel).
* Handling timeouts and SS exceptions with retries.
Currently it is used for
* Storage Server interface
* BlobWorker interface
```mermaid
graph LR
H((Start))
H0{Has alternatives?}
H1[Choose initial candidates]
H4([Never])
H5[Pick an alternative]
H6[Send request]
H7[Wait for available alternative]
H8([Response])
H9([All alternatives failed])
H --> H0 --No--> H4
H0 --Yes--> H1
H1 --> H5
H5 --Has alternative--> H6
H5 --No alternative-->H7
H6 --Success--> H8
H6 --Failure--> H5
H7 --At least one alternative--> H5
H7 --> H9
```
Note:
* The response could be either a reply, or an `Error`, e.g. `process_behind` or `request_maybe_delivered`.
### Choose initial candidates
Two initial candidates are picked before the requests start; they are selected as the first two alternatives for the load balancer. If both of them fail, the other alternatives are used in a round-robin way.
#### No `QueueModel`
If no `QueueModel` is provided, the initial candidates are picked randomly. The first candidate, or the *best* alternative, will be one in the same DC when possible.
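A minimal sketch of that selection, with an illustrative `Alternative` record and a hypothetical `choose_initial_candidates` helper (not the actual `loadBalance` API):
```python
import random
from dataclasses import dataclass

@dataclass
class Alternative:  # illustrative stand-in for an interface candidate
    name: str
    dc: str

def choose_initial_candidates(alternatives, local_dc):
    """Pick two distinct starting candidates; the best one prefers the local DC."""
    same_dc = [i for i, a in enumerate(alternatives) if a.dc == local_dc]
    best = random.choice(same_dc) if same_dc else random.randrange(len(alternatives))
    second = random.randrange(len(alternatives) - 1)  # any other alternative
    if second >= best:
        second += 1
    return best, second

alts = [Alternative("ss1", "dc1"), Alternative("ss2", "dc1"), Alternative("ss3", "dc2")]
print(choose_initial_candidates(alts, "dc1"))
```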
#### With `QueueModel`
`QueueModel` holds information about each candidate related to future version errors, latency, and penalty.
* If a storage server is returning a future version error, it is marked as unavailable until a certain time.
* Penalty is reported by the storage server in each response (see `storageserver.actor.cpp:StorageServer::getPenalty`). It is determined by the write queue length and the durability lag.
If a `QueueModel` exists, the candidates are picked based on the penalty. Workers with high penalties are avoided when picking the first two candidates.
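A rough sketch of this penalty-aware selection, assuming a hypothetical per-server record with `penalty` and `future_version_until` fields rather than the real `QueueModel` interface:
```python
import time
from collections import namedtuple

ServerInfo = namedtuple("ServerInfo", "penalty future_version_until")

def pick_first_candidates(alternatives, model, now=None):
    """Order candidates by recorded penalty; skip servers still marked
    unavailable after returning a future_version error."""
    now = time.time() if now is None else now
    usable = [a for a in alternatives if model[a].future_version_until <= now]
    usable.sort(key=lambda a: model[a].penalty)
    return usable[:2]  # the first two candidates the LB will try

model = {"ss1": ServerInfo(penalty=2.0, future_version_until=0),
         "ss2": ServerInfo(penalty=1.0, future_version_until=0),
         "ss3": ServerInfo(penalty=0.5, future_version_until=time.time() + 5)}
print(pick_first_candidates(["ss1", "ss2", "ss3"], model))  # -> ['ss2', 'ss1']
```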
### Pick an alternative
When the first two candidates fail, the remaining alternatives are chosen in a round-robin way. If all alternatives have failed, a flag is set, and if the next request fails with `process_behind`, the caller receives the `process_behind` error.
### Send requests to workers
Here it is assumed that there is at least one alternative available. If no alternative is available, the LB will wait (see [Wait for available alternative](#wait-for-available-alternative)).
```mermaid
graph LR
H((start))
H0{Is first request}
H1[Send first request]
H2([Response])
H3[Pick up next alternative]
H4[Send additional request]
H --> H3
H3 -->H0
H0 --Yes--> H1
H1 --Success--> H2
H1 --Timeout--> H3
H0 --No--> H4
H4 --First request succeeds--> H2
H4 --Second request succeeds--> H2
H4 --Additional request fails--> H3
```
The first request has a timeout option. If the LB is not able to retrieve the response within the timeout, more requests are sent to the secondary and other available interfaces. If the first request fails, it is reset and the next request is treated as the first request. Certain types of errors can also be returned as the response, e.g. `request_maybe_delivered` or `process_behind`, which may not trigger a load-balancer retry.
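A much-simplified asyncio sketch of this fan-out pattern; the `send` coroutine, server names, and timeout value are all illustrative:
```python
import asyncio
import random

async def send(server):
    """Stand-in for issuing one flow request to an interface."""
    await asyncio.sleep(random.uniform(0.0, 0.3))
    return "reply from " + server

async def load_balanced_request(servers, first_request_timeout=0.1):
    pending = set()
    for server in servers:
        pending.add(asyncio.ensure_future(send(server)))
        # Wait briefly for any in-flight request; on timeout, fan out an
        # additional request to the next alternative.
        done, pending = await asyncio.wait(
            pending, timeout=first_request_timeout, return_when=asyncio.FIRST_COMPLETED)
        if done:
            for p in pending:
                p.cancel()
            return done.pop().result()
    done, _ = await asyncio.wait(pending)  # all requests issued; wait it out
    return done.pop().result()

print(asyncio.run(load_balanced_request(["ss1", "ss2", "ss3"])))
```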
### Wait for available alternative
When no alternative is available, the load balancer may wait until at least one interface is up.
```mermaid
graph LR
H0((start))
H1{Is first request in-flight}
H2[Wait for the first request]
H3([Response])
H4([Retry])
H5[Wait for alternatives]
H6([all_alternatives_failed])
H0 --> H1
H1 --Yes--> H2
H1 --No--> H5
H5 --Timeout-->H6
H5 --Success-->H4
H2 --Success-->H3
H2 --Failed-->H4
```
Note that "Wait for alternatives" will only timeout if the alternatives are always not fresh, i.e. this only happens when accessing storage servers. LB will throw `all_alternatives_failed` when timeout in this case.
#### Requests
Original requests in `loadBalance` are wrapped by `LoadBalance.actor.h:RequestData`, which provides the following additional operations on top of the original `flow` request:
* TSS support if `QueueModel` is available
* Translate some errors into `request_maybe_delivered`, `process_behind`, or retries
* Update the `QueueModel` information including latency, penalty, etc.
## Appendix
### Picking an alternative in basic load balancing algorithm
The following script simulates the alternative-picking algorithm. The chosen alternatives are printed one by one. The `loadBalance` function uses a similar approach, except that interfaces in the same DC are tried first.
```python
#! /usr/bin/env python3
import random
import time
class Alternatives:
    def __init__(self, num_alternatives):
        self._size = num_alternatives

    def size(self):
        return self._size

    def get_best(self):
        return random.randint(0, self._size - 1)


# Entry
NUM_ALTERNATIVES = 10
alts = Alternatives(NUM_ALTERNATIVES)

best_alt = alts.get_best()
next_alt = random.randint(0, alts.size() - 2)
if next_alt >= best_alt:
    next_alt += 1
start_alt = next_alt
start_distance = (best_alt + alts.size() - start_alt) % alts.size()
use_alt = None

print("best_alt = {}".format(best_alt))
print("start_alt = {}".format(start_alt))
print("start_distance = {}".format(start_distance))

while True:
    for alt_num in range(0, alts.size()):
        use_alt = next_alt
        if next_alt == start_alt:
            print("  Going back to the start_alt")
            use_alt = best_alt
        elif (next_alt + alts.size() - start_alt) % alts.size() <= start_distance:
            print("  Entering start_distance")
            use_alt = (next_alt + alts.size() - 1) % alts.size()
        print("Attempting alt: {}".format(use_alt))

        # Next loop
        next_alt = (next_alt + 1) % alts.size()
        time.sleep(.2)
```

Binary file not shown.

View File

@ -69,10 +69,11 @@ When a data distribution role is created, it recovers the states of the previous
### When to move keys?
Keys can be moved from a server to another for several reasons:
(1) DD moves keys from overutilized servers to underutilized servers, where a server's utilization is defined as the server's disk usage;
(2) DD splits or merges shards in order to rebalance the disk usage of servers;
(3) DD removes redundant teams when the team number is larger than the desired number;
(4) DD repairs the replication factor by duplicating shards from one server to another when servers in a team fail.
(1) DD moves keys from disk-overutilized servers to disk-underutilized servers, where a server's disk utilization is defined as the server's disk space usage;
(2) DD moves keys from read-busy servers to read-cold servers if read-aware data distribution is enabled;
(3) DD splits or merges shards in order to rebalance the disk usage of servers;
(4) DD removes redundant teams when the team number is larger than the desired number;
(5) DD repairs the replication factor by duplicating shards from one server to another when servers in a team fail.
Actors are created to monitor the reasons for key movement:
(1) `MountainChopper` and `ValleyFiller` actors periodically measure a random server team's utilization and rebalance the server's keys among other servers;
@ -93,3 +94,62 @@ The data movement from one server (called source server) to another (called dest
(2) The destination server will issue transactions to read the shard range and write the key-value pairs back. The key-value pairs will be routed to the destination server and saved in the server's storage engine;
(3) DD removes the source server from the shard's ownership by modifying the system keyspace;
(4) DD removes the shard's information owned by the source server from the server's team information (i.e., *shardsAffectedByTeamFailure*).
# Read-aware Data Distribution
## Motivation
Before FDB 7.2, when the data distributor wants to rebalance shards, it only considers write bandwidth when choosing source and destination teams, and the moved shard is chosen randomly. There are several cases where uneven read distribution from users causes a small subset of servers to be busy with read requests. This motivates the data distributor to take read busyness into account in order to minimize read load unevenness.
## When does read rebalance happen
The data distributor periodically checks whether read rebalancing is needed. The conditions for rebalancing are
* the **worst CPU usage of the source team >= 0.15**, which means the source team is somewhat busy;
* the number of ongoing relocations is less than the parallelism budget: `queuedRelocation[ priority ] < countLimit (default 50)`;
* the source team is not throttled from being a data movement source team: `( now() - the last time the source team was selected ) * time volume (default 20) > read sample interval (2 min default)`;
* the read load difference between the source team and the destination team is larger than 30% of the source team load (see the sketch after this list).
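Putting the conditions above together, a rough sketch of the trigger check; the knob names follow the defaults quoted above, and the team records are illustrative:
```python
READ_REBALANCE_SRC_PARALLELISM = 20   # "time volume" knob default from the text
RELOCATION_COUNT_LIMIT = 50
READ_SAMPLE_INTERVAL = 120.0          # seconds (2 min default)

def should_read_rebalance(src, dst, queued_relocations, now):
    """src/dst are dicts with worst_cpu, last_selected and read_load keys."""
    if src["worst_cpu"] < 0.15:                       # source team not busy enough
        return False
    if queued_relocations >= RELOCATION_COUNT_LIMIT:  # out of parallelism budget
        return False
    # Source-team throttling: the team must not have been picked too recently.
    if (now - src["last_selected"]) * READ_REBALANCE_SRC_PARALLELISM <= READ_SAMPLE_INTERVAL:
        return False
    # The read-load gap must exceed 30% of the source team's load.
    return src["read_load"] - dst["read_load"] > 0.3 * src["read_load"]
```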
## Metrics definition
* READ_LOAD = ceil(READ_BYTES_PER_KSECOND / PAGE_SIZE)
* READ_IMBALANCE = MAX(READ_LOAD) / AVG(READ_LOAD)
* MOVE_SCORE = READ_DENSITY = READ_BYTES_PER_KSECOND / SHARD_BYTE
The aim of the read-aware data distributor is to minimize the read imbalance while not harming the disk utilization balance.
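In code form, the metric definitions above might look like the following sketch (the `PAGE_SIZE` value and function names are assumptions):
```python
import math

PAGE_SIZE = 4096  # assumed page size

def read_load(read_bytes_per_ksecond):
    return math.ceil(read_bytes_per_ksecond / PAGE_SIZE)

def read_imbalance(loads):
    return max(loads) / (sum(loads) / len(loads))

def move_score(read_bytes_per_ksecond, shard_bytes):
    return read_bytes_per_ksecond / shard_bytes  # a.k.a. READ_DENSITY

loads = [read_load(b) for b in (8e6, 2e6, 2e6)]
print(read_imbalance(loads))  # > 1 indicates read skew across teams
```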
## Which shard to move
Basically, the MountainChopper handles the distribution of read-hot shards with the following steps:
1. The MountainChopper chooses **the source team** with the largest READ_LOAD while it satisfies HARD_CONSTRAINT, then checks whether rebalance is needed;
* Hard constraint:
* Team is healthy
* The elapsed time since this team was last selected as a source team is larger than (READ_SAMPLE_INTERVAL / MOVEMENT_PER_SAMPLE)
* The worst CPU usage of source team >= 0.15
2. Choose the destination team for moving
* Hard constraint:
* Team is healthy
* The team's available space is larger than the median free space
* Goals
* The destination team has the least LOAD in a random team set while it satisfies HARD_CONSTRAINT;
3. Select K shards on the source team such that
a. `LOAD(shard) < (LOAD(src) - LOAD(dest)) * READ_REBALANCE_MAX_SHARD_FRAC `;
b. `LOAD(shard) > AVG(SourceShardLoad)`;
c. with the highest top-K `MOVE_SCORE`;
We use 3.a and 3.b to set an eligible shard-bandwidth range for read rebalance moves. If the upper bound is too large, we'll just shift the hot shard to another team instead of evening out the read load. If the upper bound is too small, we'll just move some cold shards to other servers, which is also unhelpful. The default value of READ_REBALANCE_MAX_SHARD_FRAC is 0.2 (up to 0.5), which was decided based on skewed-workload tests; the shard filtering in this step is sketched after the note below.
4. Issue a relocation request to move a random shard in the top-K set. If the maximum limit of read-balance movements is reached, give up this relocation.
Note: The ValleyFiller chooses a source team from a random set with the largest LOAD, and a destination team with the least LOAD.
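A sketch of the shard filtering in step 3; the data layout and helper name are illustrative, while the knob default comes from the text:
```python
READ_REBALANCE_MAX_SHARD_FRAC = 0.2  # default; capped at 0.5

def eligible_shards(shards, src_load, dst_load, k):
    """shards: list of (load, move_score) tuples for the source team."""
    avg_load = sum(load for load, _ in shards) / len(shards)
    upper = (src_load - dst_load) * READ_REBALANCE_MAX_SHARD_FRAC
    candidates = [s for s in shards if avg_load < s[0] < upper]  # rules 3.a / 3.b
    candidates.sort(key=lambda s: s[1], reverse=True)            # rule 3.c
    return candidates[:k]   # a random member of this set is actually moved

shards = [(10, 0.5), (40, 2.0), (25, 1.5), (5, 0.1)]
print(eligible_shards(shards, src_load=400, dst_load=100, k=2))
```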
## Performance Test and Summary
### Metrics to measure
1. StorageMetrics trace events report "FinishedQueries", which counts how many read operations the storage server has finished. The rate of FinishedQueries is what we measure first: the better the load balance, the more similar the FinishedQueries rates across all storage servers. CPU utilization is positively related to the FinishedQueries rate; an even FinishedQueries rate generally means even CPU utilization in a read-only scenario.
2. Data movement size. We want to achieve load balance with as little movement as possible;
3. StandardDeviation(FinishedQueries). It indicates how much the read load differs across storage servers.
### Typical Test Setup
120GB data, key=32B, value=200B; single replica; 8 SS (20%) serve 80% of reads; 8 SS serve 60% of writes; 4 servers are both read- and write-hot; TPS=100000, 7 reads/txn + 1 write/txn;
### Test Result Summary and Recommendation
* With intersecting sets of read-hot and write-hot servers, read-aware DD evens out the read + write load on the double-hot (both read- and write-hot) servers, which means the converged write load is similar to that of the disk-rebalance-only algorithm.
* Read-aware DD balances the read workload under read-skew scenarios. Starting from an imbalance of `STD(FinishedQueries per minute) = 16k`, the best result it can achieve is `STD(FinishedQueries per minute) = 2k`.
* The typical movement size under a read-skew scenario is 100M ~ 600M under the default knob values `READ_REBALANCE_MAX_SHARD_FRAC = 0.2, READ_REBALANCE_SRC_PARALLELISM = 20`. Increasing those knobs may accelerate convergence, at the risk of data movement churn that overwhelms the destination and over-cools the source.
* The upper bound of `READ_REBALANCE_MAX_SHARD_FRAC` is 0.5. Any value larger than 0.5 can cause the hot spot to simply switch servers.
* For a deeper diagnosis of read-aware DD, the `BgDDMountainChopper_New` and `BgDDValleyFiller_New` trace events are the place to look.

View File

@ -241,7 +241,10 @@ Included in the output of this command are the ``id`` and ``prefix`` assigned to
{
"tenant": {
"id": 0,
"prefix": "\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"
"prefix": {
"base64": "AAAAAAAAAAU=",
"printable": "\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x05",
}
},
"type": "success"
}
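Client code can recover the raw prefix bytes from the ``base64`` field; for example, in Python (using the sample value above):
```python
import base64, json

tenant = json.loads('{"id": 0, "prefix": {"base64": "AAAAAAAAAAU="}}')
prefix = base64.b64decode(tenant["prefix"]["base64"])
assert prefix == b"\x00\x00\x00\x00\x00\x00\x00\x05"
```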

View File

@ -29,6 +29,7 @@
#include "flow/Arena.h"
#include "flow/FastRef.h"
#include "flow/ThreadHelper.actor.h"
#include "flow/CodeProbe.h"
#include "flow/actorcompiler.h" // This must be the last #include.
namespace fdb_cli {

View File

@ -210,7 +210,7 @@ CommandFactory listTenantsFactory(
"The number of tenants to print can be specified using the [LIMIT] parameter, which defaults to 100."));
// gettenant command
ACTOR Future<bool> getTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens) {
ACTOR Future<bool> getTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens, int apiVersion) {
if (tokens.size() < 2 || tokens.size() > 3 || (tokens.size() == 3 && tokens[2] != "JSON"_sr)) {
printUsage(tokens[0]);
return false;
@ -243,11 +243,16 @@ ACTOR Future<bool> getTenantCommandActor(Reference<IDatabase> db, std::vector<St
int64_t id;
std::string prefix;
doc.get("id", id);
if (apiVersion >= 720) {
doc.get("prefix.printable", prefix);
} else {
doc.get("prefix", prefix);
}
printf(" id: %" PRId64 "\n", id);
printf(" prefix: %s\n", printable(prefix).c_str());
printf(" prefix: %s\n", prefix.c_str());
}
return true;

View File

@ -49,6 +49,7 @@
#include "flow/FastRef.h"
#include "flow/Platform.h"
#include "flow/SystemMonitor.h"
#include "flow/CodeProbe.h"
#include "flow/TLSConfig.actor.h"
#include "flow/ThreadHelper.actor.h"
@ -882,7 +883,7 @@ struct CLIOptions {
std::vector<std::pair<std::string, std::string>> knobs;
// api version, using the latest version by default
int api_version = FDB_API_VERSION;
int apiVersion = FDB_API_VERSION;
CLIOptions(int argc, char* argv[]) {
program_name = argv[0];
@ -927,11 +928,11 @@ struct CLIOptions {
break;
case OPT_API_VERSION: {
char* endptr;
api_version = strtoul((char*)args.OptionArg(), &endptr, 10);
apiVersion = strtoul((char*)args.OptionArg(), &endptr, 10);
if (*endptr != '\0') {
fprintf(stderr, "ERROR: invalid client version %s\n", args.OptionArg());
return 1;
} else if (api_version < 700 || api_version > FDB_API_VERSION) {
} else if (apiVersion < 700 || apiVersion > FDB_API_VERSION) {
// multi-version fdbcli only available after 7.0
fprintf(stderr,
"ERROR: api version %s is not supported. (Min: 700, Max: %d)\n",
@ -1113,7 +1114,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
TraceEvent::setNetworkThread();
try {
localDb = Database::createDatabase(ccf, opt.api_version, IsInternal::False);
localDb = Database::createDatabase(ccf, opt.apiVersion, IsInternal::False);
if (!opt.exec.present()) {
printf("Using cluster file `%s'.\n", ccf->getLocation().c_str());
}
@ -1934,7 +1935,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
}
if (tokencmp(tokens[0], "gettenant")) {
bool _result = wait(makeInterruptable(getTenantCommandActor(db, tokens)));
bool _result = wait(makeInterruptable(getTenantCommandActor(db, tokens, opt.apiVersion)));
if (!_result)
is_error = true;
continue;
@ -2171,7 +2172,7 @@ int main(int argc, char** argv) {
}
try {
API->selectApiVersion(opt.api_version);
API->selectApiVersion(opt.apiVersion);
API->setupNetwork();
opt.setupKnobs();
if (opt.exit_code != -1) {

View File

@ -185,7 +185,7 @@ ACTOR Future<bool> fileConfigureCommandActor(Reference<IDatabase> db,
// force_recovery_with_data_loss command
ACTOR Future<bool> forceRecoveryWithDataLossCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
// gettenant command
ACTOR Future<bool> getTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
ACTOR Future<bool> getTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens, int apiVersion);
// include command
ACTOR Future<bool> includeCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
// kill command

View File

@ -18,16 +18,26 @@
* limitations under the License.
*/
#include <cstring>
#include <vector>
#include "fmt/format.h"
#include "flow/IRandom.h"
#include "flow/serialize.h"
#include "fdbclient/BlobGranuleFiles.h"
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbclient/ClientKnobs.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/SystemData.h" // for allKeys unit test - could remove
#include "flow/BlobCipher.h"
#include "flow/CompressionUtils.h"
#include "flow/DeterministicRandom.h"
#include "flow/IRandom.h"
#include "flow/Trace.h"
#include "flow/serialize.h"
#include "flow/UnitTest.h"
#include "flow/xxhash.h"
#include "fmt/format.h"
#include <cstring>
#include <vector>
#define BG_READ_DEBUG false
@ -73,7 +83,67 @@ struct ChildBlockPointerRef {
};
};
struct IndexBlockRef {
namespace {
BlobGranuleFileEncryptionKeys getEncryptBlobCipherKey(const BlobGranuleCipherKeysCtx cipherKeysCtx) {
BlobGranuleFileEncryptionKeys eKeys;
eKeys.textCipherKey = makeReference<BlobCipherKey>(cipherKeysCtx.textCipherKey.encryptDomainId,
cipherKeysCtx.textCipherKey.baseCipherId,
cipherKeysCtx.textCipherKey.baseCipher.begin(),
cipherKeysCtx.textCipherKey.baseCipher.size(),
cipherKeysCtx.textCipherKey.salt);
eKeys.headerCipherKey = makeReference<BlobCipherKey>(cipherKeysCtx.headerCipherKey.encryptDomainId,
cipherKeysCtx.headerCipherKey.baseCipherId,
cipherKeysCtx.headerCipherKey.baseCipher.begin(),
cipherKeysCtx.headerCipherKey.baseCipher.size(),
cipherKeysCtx.headerCipherKey.salt);
return eKeys;
}
void validateEncryptionHeaderDetails(const BlobGranuleFileEncryptionKeys& eKeys,
const BlobCipherEncryptHeader& header,
const StringRef& ivRef) {
// Validate encryption header 'cipherHeader' details sanity
if (!(header.cipherHeaderDetails.baseCipherId == eKeys.headerCipherKey->getBaseCipherId() &&
header.cipherHeaderDetails.encryptDomainId == eKeys.headerCipherKey->getDomainId() &&
header.cipherHeaderDetails.salt == eKeys.headerCipherKey->getSalt())) {
TraceEvent(SevError, "EncryptionHeader_CipherHeaderMismatch")
.detail("HeaderDomainId", eKeys.headerCipherKey->getDomainId())
.detail("ExpectedHeaderDomainId", header.cipherHeaderDetails.encryptDomainId)
.detail("HeaderBaseCipherId", eKeys.headerCipherKey->getBaseCipherId())
.detail("ExpectedHeaderBaseCipherId", header.cipherHeaderDetails.baseCipherId)
.detail("HeaderSalt", eKeys.headerCipherKey->getSalt())
.detail("ExpectedHeaderSalt", header.cipherHeaderDetails.salt);
throw encrypt_header_metadata_mismatch();
}
// Validate encryption header 'cipherText' details sanity
if (!(header.cipherTextDetails.baseCipherId == eKeys.textCipherKey->getBaseCipherId() &&
header.cipherTextDetails.encryptDomainId == eKeys.textCipherKey->getDomainId() &&
header.cipherTextDetails.salt == eKeys.textCipherKey->getSalt())) {
TraceEvent(SevError, "EncryptionHeader_CipherTextMismatch")
.detail("TextDomainId", eKeys.textCipherKey->getDomainId())
.detail("ExpectedTextDomainId", header.cipherTextDetails.encryptDomainId)
.detail("TextBaseCipherId", eKeys.textCipherKey->getBaseCipherId())
.detail("ExpectedTextBaseCipherId", header.cipherTextDetails.baseCipherId)
.detail("TextSalt", eKeys.textCipherKey->getSalt())
.detail("ExpectedTextSalt", header.cipherTextDetails.salt);
throw encrypt_header_metadata_mismatch();
}
// Validate 'Initialization Vector' sanity
if (memcmp(ivRef.begin(), &header.iv[0], AES_256_IV_LENGTH) != 0) {
TraceEvent(SevError, "EncryptionHeader_IVMismatch")
.detail("IVChecksum", XXH3_64bits(ivRef.begin(), ivRef.size()))
.detail("ExpectedIVChecksum", XXH3_64bits(&header.iv[0], AES_256_IV_LENGTH));
throw encrypt_header_metadata_mismatch();
}
}
} // namespace
struct IndexBlock {
constexpr static FileIdentifier file_identifier = 6525412;
// Serializable fields
VectorRef<ChildBlockPointerRef> children;
template <class Ar>
@ -82,9 +152,281 @@ struct IndexBlockRef {
}
};
struct IndexBlockRef {
constexpr static FileIdentifier file_identifier = 1945731;
// Serialized fields
Optional<StringRef> encryptHeaderRef;
// Encrypted/unencrypted IndexBlock
StringRef buffer;
// Non-serializable fields
IndexBlock block;
void encrypt(const BlobGranuleCipherKeysCtx cipherKeysCtx, Arena& arena) {
BlobGranuleFileEncryptionKeys eKeys = getEncryptBlobCipherKey(cipherKeysCtx);
ASSERT(eKeys.headerCipherKey.isValid() && eKeys.textCipherKey.isValid());
if (BG_ENCRYPT_COMPRESS_DEBUG) {
XXH64_hash_t chksum = XXH3_64bits(buffer.begin(), buffer.size());
TraceEvent(SevDebug, "IndexBlockEncrypt_Before").detail("Chksum", chksum);
}
EncryptBlobCipherAes265Ctr encryptor(eKeys.textCipherKey,
eKeys.headerCipherKey,
cipherKeysCtx.ivRef.begin(),
AES_256_IV_LENGTH,
ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE);
Value serializedBuff = ObjectWriter::toValue(block, Unversioned());
BlobCipherEncryptHeader header;
buffer = encryptor.encrypt(serializedBuff.contents().begin(), serializedBuff.contents().size(), &header, arena)
->toStringRef();
encryptHeaderRef = BlobCipherEncryptHeader::toStringRef(header, arena);
if (BG_ENCRYPT_COMPRESS_DEBUG) {
XXH64_hash_t chksum = XXH3_64bits(buffer.begin(), buffer.size());
TraceEvent(SevDebug, "IndexBlockEncrypt_After").detail("Chksum", chksum);
}
}
static void decrypt(const BlobGranuleCipherKeysCtx cipherKeysCtx, IndexBlockRef& idxRef, Arena& arena) {
BlobGranuleFileEncryptionKeys eKeys = getEncryptBlobCipherKey(cipherKeysCtx);
ASSERT(eKeys.headerCipherKey.isValid() && eKeys.textCipherKey.isValid());
ASSERT(idxRef.encryptHeaderRef.present());
if (BG_ENCRYPT_COMPRESS_DEBUG) {
XXH64_hash_t chksum = XXH3_64bits(idxRef.buffer.begin(), idxRef.buffer.size());
TraceEvent(SevDebug, "IndexBlockEncrypt_Before").detail("Chksum", chksum);
}
BlobCipherEncryptHeader header = BlobCipherEncryptHeader::fromStringRef(idxRef.encryptHeaderRef.get());
validateEncryptionHeaderDetails(eKeys, header, cipherKeysCtx.ivRef);
DecryptBlobCipherAes256Ctr decryptor(eKeys.textCipherKey, eKeys.headerCipherKey, cipherKeysCtx.ivRef.begin());
StringRef decrypted =
decryptor.decrypt(idxRef.buffer.begin(), idxRef.buffer.size(), header, arena)->toStringRef();
if (BG_ENCRYPT_COMPRESS_DEBUG) {
XXH64_hash_t chksum = XXH3_64bits(decrypted.begin(), decrypted.size());
TraceEvent(SevDebug, "IndexBlockEncrypt_After").detail("Chksum", chksum);
}
// TODO: Add version?
ObjectReader dataReader(decrypted.begin(), Unversioned());
dataReader.deserialize(FileIdentifierFor<IndexBlock>::value, idxRef.block, arena);
}
void init(Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx, Arena& arena) {
if (encryptHeaderRef.present()) {
ASSERT(cipherKeysCtx.present());
decrypt(cipherKeysCtx.get(), *this, arena);
} else {
TraceEvent("IndexBlockSize").detail("Sz", buffer.size());
// TODO: Add version?
ObjectReader dataReader(buffer.begin(), Unversioned());
dataReader.deserialize(FileIdentifierFor<IndexBlock>::value, block, arena);
}
}
void finalize(Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx, Arena& arena) {
if (cipherKeysCtx.present()) {
// IndexBlock childBlock pointers offsets are relative to IndexBlock endOffset instead of file start offset.
// Compressing indexBlock will need offset recalculation (circular dependency). IndexBlock size is bounded by
// number of chunks and sizeof(KeyPrefix), 'not' compressing IndexBlock shouldn't cause significant file
// size bloat.
ASSERT(cipherKeysCtx.present());
encrypt(cipherKeysCtx.get(), arena);
} else {
encryptHeaderRef.reset();
buffer = StringRef(arena, ObjectWriter::toValue(block, Unversioned()).contents());
}
TraceEvent(SevDebug, "IndexBlockSize").detail("Sz", buffer.size()).detail("Encrypted", cipherKeysCtx.present());
}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, encryptHeaderRef, buffer);
}
};
// On-disk and/or in-memory representation of a IndexBlobGranuleFile 'chunk'.
//
// Encryption: A 'chunk' gets encrypted before getting persisted if enabled. Encryption header is persisted along with
// the chunk data to assist decryption on reads.
//
// Compression: A 'chunk' gets compressed before getting persisted if enabled. Compression filter (algorithm) information
// is persisted as part of 'chunk metadata' to assist decompression on reads.
struct IndexBlobGranuleFileChunkRef {
constexpr static FileIdentifier file_identifier = 2814019;
// Serialized fields
Optional<CompressionFilter> compressionFilter;
Optional<StringRef> encryptHeaderRef;
// encrypted and/or compressed chunk;
StringRef buffer;
// Non-serialized
Optional<StringRef> chunkBytes;
static void encrypt(const BlobGranuleCipherKeysCtx& cipherKeysCtx,
IndexBlobGranuleFileChunkRef& chunkRef,
Arena& arena) {
BlobGranuleFileEncryptionKeys eKeys = getEncryptBlobCipherKey(cipherKeysCtx);
ASSERT(eKeys.headerCipherKey.isValid() && eKeys.textCipherKey.isValid());
if (BG_ENCRYPT_COMPRESS_DEBUG) {
XXH64_hash_t chksum = XXH3_64bits(chunkRef.buffer.begin(), chunkRef.buffer.size());
TraceEvent(SevDebug, "BlobChunkEncrypt_Before").detail("Chksum", chksum);
}
EncryptBlobCipherAes265Ctr encryptor(eKeys.textCipherKey,
eKeys.headerCipherKey,
cipherKeysCtx.ivRef.begin(),
AES_256_IV_LENGTH,
ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE);
BlobCipherEncryptHeader header;
chunkRef.buffer =
encryptor.encrypt(chunkRef.buffer.begin(), chunkRef.buffer.size(), &header, arena)->toStringRef();
chunkRef.encryptHeaderRef = BlobCipherEncryptHeader::toStringRef(header, arena);
if (BG_ENCRYPT_COMPRESS_DEBUG) {
XXH64_hash_t chksum = XXH3_64bits(chunkRef.buffer.begin(), chunkRef.buffer.size());
TraceEvent(SevDebug, "BlobChunkEncrypt_After").detail("Chksum", chksum);
}
}
static StringRef decrypt(const BlobGranuleCipherKeysCtx& cipherKeysCtx,
const IndexBlobGranuleFileChunkRef& chunkRef,
Arena& arena) {
BlobGranuleFileEncryptionKeys eKeys = getEncryptBlobCipherKey(cipherKeysCtx);
ASSERT(eKeys.headerCipherKey.isValid() && eKeys.textCipherKey.isValid());
ASSERT(chunkRef.encryptHeaderRef.present());
if (BG_ENCRYPT_COMPRESS_DEBUG) {
XXH64_hash_t chksum = XXH3_64bits(chunkRef.buffer.begin(), chunkRef.buffer.size());
TraceEvent(SevDebug, "BlobChunkDecrypt_Before").detail("Chksum", chksum);
}
BlobCipherEncryptHeader header = BlobCipherEncryptHeader::fromStringRef(chunkRef.encryptHeaderRef.get());
validateEncryptionHeaderDetails(eKeys, header, cipherKeysCtx.ivRef);
DecryptBlobCipherAes256Ctr decryptor(eKeys.textCipherKey, eKeys.headerCipherKey, cipherKeysCtx.ivRef.begin());
StringRef decrypted =
decryptor.decrypt(chunkRef.buffer.begin(), chunkRef.buffer.size(), header, arena)->toStringRef();
if (BG_ENCRYPT_COMPRESS_DEBUG) {
XXH64_hash_t chksum = XXH3_64bits(decrypted.begin(), decrypted.size());
TraceEvent(SevDebug, "BlobChunkDecrypt_After").detail("Chksum", chksum);
}
return decrypted;
}
static void compress(IndexBlobGranuleFileChunkRef& chunkRef,
const Value& chunk,
const CompressionFilter compFilter,
Arena& arena) {
chunkRef.compressionFilter = compFilter;
chunkRef.buffer = CompressionUtils::compress(chunkRef.compressionFilter.get(), chunk.contents(), arena);
if (BG_ENCRYPT_COMPRESS_DEBUG) {
XXH64_hash_t chunkChksum = XXH3_64bits(chunk.contents().begin(), chunk.contents().size());
XXH64_hash_t chksum = XXH3_64bits(chunkRef.buffer.begin(), chunkRef.buffer.size());
TraceEvent("CompressBlobChunk")
.detail("Filter", CompressionUtils::toString(chunkRef.compressionFilter.get()))
.detail("ChkSumBefore", chunkChksum)
.detail("ChkSumAfter", chksum);
}
}
static StringRef decompress(const IndexBlobGranuleFileChunkRef& chunkRef, Arena& arena) {
ASSERT(chunkRef.compressionFilter.present());
return CompressionUtils::decompress(chunkRef.compressionFilter.get(), chunkRef.chunkBytes.get(), arena);
}
static Value toBytes(Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx,
Optional<CompressionFilter> compFilter,
const Value& chunk,
Arena& arena) {
IndexBlobGranuleFileChunkRef chunkRef;
if (compFilter.present()) {
IndexBlobGranuleFileChunkRef::compress(chunkRef, chunk, compFilter.get(), arena);
} else {
chunkRef.buffer = StringRef(arena, chunk.contents());
}
if (cipherKeysCtx.present()) {
IndexBlobGranuleFileChunkRef::encrypt(cipherKeysCtx.get(), chunkRef, arena);
}
if (BG_ENCRYPT_COMPRESS_DEBUG) {
TraceEvent(SevDebug, "GenerateBlobGranuleFileChunk")
.detail("Encrypt", cipherKeysCtx.present())
.detail("Compress", compFilter.present())
.detail("CompFilter",
compFilter.present() ? CompressionUtils::toString(compFilter.get())
: CompressionUtils::toString(CompressionFilter::NONE));
}
// TODO: Add version?
return ObjectWriter::toValue(chunkRef, Unversioned());
}
static IndexBlobGranuleFileChunkRef fromBytes(Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx,
StringRef buffer,
Arena& arena) {
IndexBlobGranuleFileChunkRef chunkRef;
// TODO: Add version?
ObjectReader dataReader(buffer.begin(), Unversioned());
dataReader.deserialize(FileIdentifierFor<IndexBlobGranuleFileChunkRef>::value, chunkRef, arena);
if (chunkRef.encryptHeaderRef.present()) {
ASSERT(cipherKeysCtx.present());
chunkRef.chunkBytes = IndexBlobGranuleFileChunkRef::decrypt(cipherKeysCtx.get(), chunkRef, arena);
} else {
chunkRef.chunkBytes = chunkRef.buffer;
}
if (chunkRef.compressionFilter.present()) {
chunkRef.chunkBytes = IndexBlobGranuleFileChunkRef::decompress(chunkRef, arena);
} else if (!chunkRef.chunkBytes.present()) {
// 'Encryption' & 'Compression' aren't enabled.
chunkRef.chunkBytes = chunkRef.buffer;
}
ASSERT(chunkRef.chunkBytes.present());
if (BG_ENCRYPT_COMPRESS_DEBUG) {
TraceEvent(SevDebug, "ParseBlobGranuleFileChunk")
.detail("Encrypted", chunkRef.encryptHeaderRef.present())
.detail("Compressed", chunkRef.compressionFilter.present())
.detail("CompFilter",
chunkRef.compressionFilter.present()
? CompressionUtils::toString(chunkRef.compressionFilter.get())
: CompressionUtils::toString(CompressionFilter::NONE));
}
return chunkRef;
}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, compressionFilter, encryptHeaderRef, buffer);
}
};
/*
* A file header for a key-ordered file that is chunked on disk, where each chunk is a disjoint key range of data.
* FIXME: encryption and compression support
*/
struct IndexedBlobGranuleFile {
constexpr static FileIdentifier file_identifier = 3828201;
@ -93,16 +435,27 @@ struct IndexedBlobGranuleFile {
uint8_t fileType;
Optional<StringRef> filter; // not used currently
// TODO: add encrypted/compressed versions of index block
IndexBlockRef indexBlock;
IndexBlockRef indexBlockRef;
int chunkStartOffset;
// Non-serialized member fields
// TODO: add encryption and compression metadata for whole file
StringRef fileBytes;
static Standalone<IndexedBlobGranuleFile> fromFileBytes(const StringRef& fileBytes) {
// TODO: decrypt/decompress index block here if necessary first
void init(const Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx) {
formatVersion = LATEST_BG_FORMAT_VERSION;
fileType = SNAPSHOT_FILE_TYPE;
chunkStartOffset = -1;
}
void init(const StringRef& fBytes, Arena& arena, const Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx) {
ASSERT(chunkStartOffset > 0);
fileBytes = fBytes;
indexBlockRef.init(cipherKeysCtx, arena);
}
static Standalone<IndexedBlobGranuleFile> fromFileBytes(const StringRef& fileBytes,
const Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx) {
// parse index block at head of file
Arena arena;
IndexedBlobGranuleFile file;
@ -110,7 +463,7 @@ struct IndexedBlobGranuleFile {
ObjectReader dataReader(fileBytes.begin(), Unversioned());
dataReader.deserialize(FileIdentifierFor<IndexedBlobGranuleFile>::value, file, arena);
file.fileBytes = fileBytes;
file.init(fileBytes, arena, cipherKeysCtx);
// do sanity checks
if (file.formatVersion > LATEST_BG_FORMAT_VERSION || file.formatVersion < MIN_SUPPORTED_BG_FORMAT_VERSION) {
@ -128,13 +481,15 @@ struct IndexedBlobGranuleFile {
ChildBlockPointerRef* findStartBlock(const KeyRef& beginKey) const {
ChildBlockPointerRef searchKey(beginKey, 0);
ChildBlockPointerRef* startBlock = (ChildBlockPointerRef*)std::lower_bound(
indexBlock.children.begin(), indexBlock.children.end(), searchKey, ChildBlockPointerRef::OrderByKey());
ChildBlockPointerRef* startBlock = (ChildBlockPointerRef*)std::lower_bound(indexBlockRef.block.children.begin(),
indexBlockRef.block.children.end(),
searchKey,
ChildBlockPointerRef::OrderByKey());
if (startBlock != indexBlock.children.end() && startBlock != indexBlock.children.begin() &&
if (startBlock != indexBlockRef.block.children.end() && startBlock != indexBlockRef.block.children.begin() &&
beginKey < startBlock->key) {
startBlock--;
} else if (startBlock == indexBlock.children.end()) {
} else if (startBlock == indexBlockRef.block.children.end()) {
startBlock--;
}
@ -143,19 +498,31 @@ struct IndexedBlobGranuleFile {
// FIXME: implement some sort of iterator type interface?
template <class ChildType>
Standalone<ChildType> getChild(const ChildBlockPointerRef* childPointer) {
// TODO decrypt/decompress if necessary
ASSERT(childPointer != indexBlock.children.end());
Standalone<ChildType> getChild(const ChildBlockPointerRef* childPointer,
Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx,
int startOffset) {
ASSERT(childPointer != indexBlockRef.block.children.end());
const ChildBlockPointerRef* nextPointer = childPointer + 1;
ASSERT(nextPointer != indexBlock.children.end());
ASSERT(nextPointer != indexBlockRef.block.children.end());
size_t blockSize = nextPointer->offset - childPointer->offset;
StringRef childData(fileBytes.begin() + childPointer->offset, blockSize);
// Account for IndexBlockRef size for chunk offset computation
StringRef childData(fileBytes.begin() + childPointer->offset + startOffset, blockSize);
if (BG_ENCRYPT_COMPRESS_DEBUG) {
TraceEvent(SevDebug, "GetChild")
.detail("BlkSize", blockSize)
.detail("Offset", childPointer->offset)
.detail("StartOffset", chunkStartOffset);
}
Arena childArena;
IndexBlobGranuleFileChunkRef chunkRef =
IndexBlobGranuleFileChunkRef::fromBytes(cipherKeysCtx, childData, childArena);
ChildType child;
// TODO: version?
ObjectReader dataReader(childData.begin(), Unversioned());
ObjectReader dataReader(chunkRef.chunkBytes.get().begin(), Unversioned());
dataReader.deserialize(FileIdentifierFor<ChildType>::value, child, childArena);
// TODO implement some sort of decrypted+decompressed+deserialized cache, if this object gets reused?
@ -164,7 +531,7 @@ struct IndexedBlobGranuleFile {
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, formatVersion, fileType, filter, indexBlock);
serializer(ar, formatVersion, fileType, filter, indexBlockRef, chunkStartOffset);
}
};
@ -172,22 +539,30 @@ struct IndexedBlobGranuleFile {
// serializing once, adding the serialized size to each offset, and serializing again. This relies on the fact that
// ObjectWriter/flatbuffers uses fixed size integers instead of variable size.
Value serializeIndexBlock(Standalone<IndexedBlobGranuleFile>& file) {
Value serializeIndexBlock(Standalone<IndexedBlobGranuleFile>& file, Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx) {
file.indexBlockRef.finalize(cipherKeysCtx, file.arena());
// TODO: version?
Value indexBlock = ObjectWriter::toValue(file, Unversioned());
for (auto& it : file.indexBlock.children) {
it.offset += indexBlock.size();
Value serialized = ObjectWriter::toValue(file, Unversioned());
file.chunkStartOffset = serialized.contents().size();
if (BG_ENCRYPT_COMPRESS_DEBUG) {
TraceEvent(SevDebug, "SerializeIndexBlock").detail("StartOffset", file.chunkStartOffset);
}
return ObjectWriter::toValue(file, Unversioned());
}
// TODO: this should probably be in actor file with yields?
// TODO: optimize memory copying
// TODO: sanity check no oversized files
Value serializeChunkedSnapshot(Standalone<GranuleSnapshot> snapshot, int chunkCount) {
Value serializeChunkedSnapshot(Standalone<GranuleSnapshot> snapshot,
int chunkCount,
Optional<CompressionFilter> compressFilter,
Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx) {
Standalone<IndexedBlobGranuleFile> file;
file.formatVersion = LATEST_BG_FORMAT_VERSION;
file.fileType = SNAPSHOT_FILE_TYPE;
file.init(cipherKeysCtx);
size_t targetChunkBytes = snapshot.expectedSize() / chunkCount;
size_t currentChunkBytesEstimate = 0;
@ -208,17 +583,25 @@ Value serializeChunkedSnapshot(Standalone<GranuleSnapshot> snapshot, int chunkCo
currentChunkBytesEstimate += snapshot[i].expectedSize();
if (currentChunkBytesEstimate >= targetChunkBytes || i == snapshot.size() - 1) {
// TODO: add encryption/compression for each chunk
// TODO: protocol version
Value serialized = ObjectWriter::toValue(currentChunk, Unversioned());
chunks.push_back(serialized);
Value chunkBytes =
IndexBlobGranuleFileChunkRef::toBytes(cipherKeysCtx, compressFilter, serialized, file.arena());
chunks.push_back(chunkBytes);
// TODO remove validation
if (!file.indexBlock.children.empty()) {
ASSERT(file.indexBlock.children.back().key < currentChunk.begin()->key);
if (!file.indexBlockRef.block.children.empty()) {
ASSERT(file.indexBlockRef.block.children.back().key < currentChunk.begin()->key);
}
file.indexBlock.children.emplace_back_deep(file.arena(), currentChunk.begin()->key, previousChunkBytes);
file.indexBlockRef.block.children.emplace_back_deep(
file.arena(), currentChunk.begin()->key, previousChunkBytes);
previousChunkBytes += serialized.size();
if (BG_ENCRYPT_COMPRESS_DEBUG) {
TraceEvent(SevDebug, "ChunkSize")
.detail("ChunkBytes", chunkBytes.size())
.detail("PrvChunkBytes", previousChunkBytes);
}
previousChunkBytes += chunkBytes.size();
currentChunkBytesEstimate = 0;
currentChunk = Standalone<GranuleSnapshot>();
}
@ -226,12 +609,13 @@ Value serializeChunkedSnapshot(Standalone<GranuleSnapshot> snapshot, int chunkCo
ASSERT(currentChunk.empty());
// push back dummy last chunk to get last chunk size, and to know last key in last block without having to read it
if (!snapshot.empty()) {
file.indexBlock.children.emplace_back_deep(file.arena(), keyAfter(snapshot.back().key), previousChunkBytes);
file.indexBlockRef.block.children.emplace_back_deep(
file.arena(), keyAfter(snapshot.back().key), previousChunkBytes);
}
Value indexBlock = serializeIndexBlock(file);
int32_t indexSize = indexBlock.size();
chunks[0] = indexBlock;
Value indexBlockBytes = serializeIndexBlock(file, cipherKeysCtx);
int32_t indexSize = indexBlockBytes.size();
chunks[0] = indexBlockBytes;
// TODO: write this directly to stream to avoid extra copy?
Arena ret;
@ -240,7 +624,15 @@ Value serializeChunkedSnapshot(Standalone<GranuleSnapshot> snapshot, int chunkCo
uint8_t* buffer = new (ret) uint8_t[size];
previousChunkBytes = 0;
int idx = 0;
for (auto& it : chunks) {
if (BG_ENCRYPT_COMPRESS_DEBUG) {
TraceEvent(SevDebug, "SerializeSnapshot")
.detail("ChunkIdx", idx++)
.detail("Size", it.size())
.detail("Offset", previousChunkBytes);
}
memcpy(buffer + previousChunkBytes, it.begin(), it.size());
previousChunkBytes += it.size();
}
@ -252,19 +644,21 @@ Value serializeChunkedSnapshot(Standalone<GranuleSnapshot> snapshot, int chunkCo
// TODO: use redwood prefix trick to optimize cpu comparison
static Arena loadSnapshotFile(const StringRef& snapshotData,
KeyRangeRef keyRange,
std::map<KeyRef, ValueRef>& dataMap) {
std::map<KeyRef, ValueRef>& dataMap,
Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx) {
Arena rootArena;
Standalone<IndexedBlobGranuleFile> file = IndexedBlobGranuleFile::fromFileBytes(snapshotData);
Standalone<IndexedBlobGranuleFile> file = IndexedBlobGranuleFile::fromFileBytes(snapshotData, cipherKeysCtx);
ASSERT(file.fileType == SNAPSHOT_FILE_TYPE);
ASSERT(file.chunkStartOffset > 0);
// empty snapshot file
if (file.indexBlock.children.empty()) {
if (file.indexBlockRef.block.children.empty()) {
return rootArena;
}
ASSERT(file.indexBlock.children.size() >= 2);
ASSERT(file.indexBlockRef.block.children.size() >= 2);
// TODO: refactor this out of delta tree
// int commonPrefixLen = commonPrefixLength(index.dataBlockOffsets.front().first,
@ -275,8 +669,9 @@ static Arena loadSnapshotFile(const StringRef& snapshotData,
// FIXME: optimize cpu comparisons here in first/last partial blocks, doing entire blocks at once based on
// comparison, and using shared prefix for key comparison
while (currentBlock != (file.indexBlock.children.end() - 1) && keyRange.end > currentBlock->key) {
Standalone<GranuleSnapshot> dataBlock = file.getChild<GranuleSnapshot>(currentBlock);
while (currentBlock != (file.indexBlockRef.block.children.end() - 1) && keyRange.end > currentBlock->key) {
Standalone<GranuleSnapshot> dataBlock =
file.getChild<GranuleSnapshot>(currentBlock, cipherKeysCtx, file.chunkStartOffset);
ASSERT(!dataBlock.empty());
ASSERT(currentBlock->key == dataBlock.front().key);
@ -426,7 +821,7 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
}
if (snapshotData.present()) {
Arena snapshotArena = loadSnapshotFile(snapshotData.get(), requestRange, dataMap);
Arena snapshotArena = loadSnapshotFile(snapshotData.get(), requestRange, dataMap, chunk.cipherKeysCtx);
arena.dependsOn(snapshotArena);
}
@ -574,6 +969,40 @@ std::string randomBGFilename(UID blobWorkerID, UID granuleID, Version version, s
suffix;
}
namespace {
const EncryptCipherDomainId encryptDomainId = deterministicRandom()->randomInt64(786, 7860);
const EncryptCipherBaseKeyId encryptBaseCipherId = deterministicRandom()->randomUInt64();
const EncryptCipherRandomSalt encryptSalt = deterministicRandom()->randomUInt64();
Standalone<StringRef> getBaseCipher() {
Standalone<StringRef> baseCipher = makeString(AES_256_KEY_LENGTH);
generateRandomData(mutateString(baseCipher), baseCipher.size());
return baseCipher;
}
Standalone<StringRef> encryptBaseCipher = getBaseCipher();
BlobGranuleCipherKeysCtx getCipherKeysCtx(Arena& arena) {
BlobGranuleCipherKeysCtx cipherKeysCtx;
cipherKeysCtx.textCipherKey.encryptDomainId = encryptDomainId;
cipherKeysCtx.textCipherKey.baseCipherId = encryptBaseCipherId;
cipherKeysCtx.textCipherKey.salt = encryptSalt;
cipherKeysCtx.textCipherKey.baseCipher = StringRef(arena, encryptBaseCipher);
cipherKeysCtx.headerCipherKey.encryptDomainId = SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID;
cipherKeysCtx.headerCipherKey.baseCipherId = encryptBaseCipherId;
cipherKeysCtx.headerCipherKey.salt = encryptSalt;
cipherKeysCtx.headerCipherKey.baseCipher = StringRef(arena, encryptBaseCipher);
cipherKeysCtx.ivRef = makeString(AES_256_IV_LENGTH, arena);
generateRandomData(mutateString(cipherKeysCtx.ivRef), AES_256_IV_LENGTH);
return cipherKeysCtx;
}
} // namespace
TEST_CASE("/blobgranule/files/applyDelta") {
printf("Testing blob granule delta applying\n");
Arena a;
@ -731,14 +1160,18 @@ int randomExp(int minExp, int maxExp) {
return deterministicRandom()->randomInt(val, val * 2);
}
void checkEmpty(const Value& serialized, Key begin, Key end) {
void checkEmpty(const Value& serialized, Key begin, Key end, Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx) {
std::map<KeyRef, ValueRef> result;
Arena ar = loadSnapshotFile(serialized, KeyRangeRef(begin, end), result);
Arena ar = loadSnapshotFile(serialized, KeyRangeRef(begin, end), result, cipherKeysCtx);
ASSERT(result.empty());
}
// endIdx is exclusive
void checkRead(const Standalone<GranuleSnapshot>& snapshot, const Value& serialized, int beginIdx, int endIdx) {
void checkRead(const Standalone<GranuleSnapshot>& snapshot,
const Value& serialized,
int beginIdx,
int endIdx,
Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx) {
ASSERT(beginIdx < endIdx);
ASSERT(endIdx <= snapshot.size());
std::map<KeyRef, ValueRef> result;
@ -746,7 +1179,7 @@ void checkRead(const Standalone<GranuleSnapshot>& snapshot, const Value& seriali
Key endKey = endIdx == snapshot.size() ? keyAfter(snapshot.back().key) : snapshot[endIdx].key;
KeyRangeRef range(beginKey, endKey);
Arena ar = loadSnapshotFile(serialized, range, result);
Arena ar = loadSnapshotFile(serialized, range, result, cipherKeysCtx);
if (result.size() != endIdx - beginIdx) {
fmt::print("Read {0} rows != {1}\n", result.size(), endIdx - beginIdx);
@ -818,7 +1251,21 @@ TEST_CASE("/blobgranule/files/snapshotFormatUnitTest") {
fmt::print(
"Constructing snapshot with {0} rows, {1} bytes, and {2} chunks\n", data.size(), totalDataBytes, targetChunks);
Value serialized = serializeChunkedSnapshot(data, targetChunks);
Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx = Optional<BlobGranuleCipherKeysCtx>();
Arena arena;
if (deterministicRandom()->coinflip()) {
cipherKeysCtx = getCipherKeysCtx(arena);
}
Optional<CompressionFilter> compressFilter;
if (deterministicRandom()->coinflip()) {
#ifdef ZLIB_LIB_SUPPORTED
compressFilter = CompressionFilter::GZIP;
#else
compressFilter = CompressionFilter::NONE;
#endif
}
Value serialized = serializeChunkedSnapshot(data, targetChunks, compressFilter, cipherKeysCtx);
fmt::print("Snapshot serialized! {0} bytes\n", serialized.size());
@ -829,7 +1276,7 @@ TEST_CASE("/blobgranule/files/snapshotFormatUnitTest") {
fmt::print("Initial read starting\n");
checkRead(data, serialized, 0, data.size());
checkRead(data, serialized, 0, data.size(), cipherKeysCtx);
fmt::print("Initial read complete\n");
@ -838,20 +1285,20 @@ TEST_CASE("/blobgranule/files/snapshotFormatUnitTest") {
int width = randomExp(0, maxExp);
ASSERT(width <= data.size());
int start = deterministicRandom()->randomInt(0, data.size() - width);
checkRead(data, serialized, start, start + width);
checkRead(data, serialized, start, start + width, cipherKeysCtx);
}
fmt::print("Doing empty checks\n");
int randomIdx = deterministicRandom()->randomInt(0, data.size() - 1);
checkEmpty(serialized, keyAfter(data[randomIdx].key), data[randomIdx + 1].key);
checkEmpty(serialized, keyAfter(data[randomIdx].key), data[randomIdx + 1].key, cipherKeysCtx);
} else {
fmt::print("Doing empty checks\n");
}
checkEmpty(serialized, normalKeys.begin, data.front().key);
checkEmpty(serialized, normalKeys.begin, LiteralStringRef("\x00"));
checkEmpty(serialized, keyAfter(data.back().key), normalKeys.end);
checkEmpty(serialized, LiteralStringRef("\xfe"), normalKeys.end);
checkEmpty(serialized, normalKeys.begin, data.front().key, cipherKeysCtx);
checkEmpty(serialized, normalKeys.begin, LiteralStringRef("\x00"), cipherKeysCtx);
checkEmpty(serialized, keyAfter(data.back().key), normalKeys.end, cipherKeysCtx);
checkEmpty(serialized, LiteralStringRef("\xfe"), normalKeys.end, cipherKeysCtx);
fmt::print("Snapshot format test done!\n");

View File

@ -71,6 +71,13 @@ if(WITH_AWS_BACKUP)
include(awssdk)
endif()
find_package(ZLIB)
if(ZLIB_FOUND)
add_compile_definitions(ZLIB_LIB_SUPPORTED)
else()
message(STATUS "ZLIB package not found")
endif()
add_flow_target(STATIC_LIBRARY NAME fdbclient SRCS ${FDBCLIENT_SRCS} ADDL_SRCS ${options_srcs})
target_include_directories(fdbclient PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_BINARY_DIR}/include")
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/versions.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/include/fdbclient/versions.h)
@ -89,7 +96,7 @@ if(WIN32)
add_dependencies(fdbclient_sampling_actors fdbclient_actors)
endif()
add_flow_target(LINK_TEST NAME fdbclientlinktest SRCS ${FDBCLIENT_SRCS} LinkTest.cpp ADDL_SRCS ${options_srcs})
add_flow_target(LINK_TEST NAME fdbclientlinktest SRCS LinkTest.cpp)
target_link_libraries(fdbclientlinktest PRIVATE fdbclient rapidxml) # re-link rapidxml due to private link interface
if(BUILD_AZURE_BACKUP)
@ -101,3 +108,4 @@ if(BUILD_AWS_BACKUP)
target_link_libraries(fdbclient PUBLIC awssdk_target)
target_link_libraries(fdbclient_sampling PUBLIC awssdk_target)
endif()

View File

@ -363,7 +363,7 @@ struct BackupRangeTaskFunc : TaskFuncBase {
if ((!prevAdjacent || !nextAdjacent) &&
rangeCount > ((prevAdjacent || nextAdjacent) ? CLIENT_KNOBS->BACKUP_MAP_KEY_UPPER_LIMIT
: CLIENT_KNOBS->BACKUP_MAP_KEY_LOWER_LIMIT)) {
TEST(true); // range insert delayed because too versionMap is too large
CODE_PROBE(true, "range insert delayed because too versionMap is too large");
if (rangeCount > CLIENT_KNOBS->BACKUP_MAP_KEY_UPPER_LIMIT)
TraceEvent(SevWarnAlways, "DBA_KeyRangeMapTooLarge").log();
@ -2780,7 +2780,7 @@ public:
Version destVersion = wait(tr3.getReadVersion());
TraceEvent("DBA_SwitchoverVersionUpgrade").detail("Src", commitVersion).detail("Dest", destVersion);
if (destVersion <= commitVersion) {
TEST(true); // Forcing dest backup cluster to higher version
CODE_PROBE(true, "Forcing dest backup cluster to higher version");
tr3.set(minRequiredCommitVersionKey, BinaryWriter::toValue(commitVersion + 1, Unversioned()));
wait(tr3.commit());
} else {
@ -2933,7 +2933,7 @@ public:
Version applied = BinaryReader::fromStringRef<Version>(lastApplied.get(), Unversioned());
TraceEvent("DBA_AbortVersionUpgrade").detail("Src", applied).detail("Dest", current);
if (current <= applied) {
TEST(true); // Upgrading version of local database.
CODE_PROBE(true, "Upgrading version of local database.");
// The +1 is because we want to make sure that a versionstamped operation can't reuse
// the same version as an already-applied transaction.
tr->set(minRequiredCommitVersionKey, BinaryWriter::toValue(applied + 1, Unversioned()));
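
Note: the bulk of the remaining hunks mechanically rewrite `TEST(cond); // comment` as `CODE_PROBE(cond, "comment")`, promoting the trailing comment to a macro argument so the description travels with the probe. A hedged sketch of what such a macro could look like; FDB's real implementation feeds simulation coverage tooling and is more involved:

#include <cstdio>

// Toy stand-in for FDB's CODE_PROBE: evaluates the condition and, the first
// time it is hit at a given call site, logs the human-readable description.
#define CODE_PROBE(condition, description)                                     \
    do {                                                                       \
        if (condition) {                                                       \
            static bool seen = false;                                          \
            if (!seen) {                                                       \
                seen = true;                                                   \
                std::fprintf(stderr, "probe hit: %s\n", description);          \
            }                                                                  \
        }                                                                      \
    } while (0)

int main() {
    for (int i = 0; i < 3; ++i)
        CODE_PROBE(i == 1, "loop reached i == 1"); // logged exactly once
    return 0;
}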

View File

@ -822,7 +822,7 @@ struct AbortFiveZeroBackupTask : TaskFuncBase {
state FileBackupAgent backupAgent;
state std::string tagName = task->params[BackupAgentBase::keyConfigBackupTag].toString();
TEST(true); // Canceling old backup task
CODE_PROBE(true, "Canceling old backup task");
TraceEvent(SevInfo, "FileBackupCancelOldTask")
.detail("Task", task->params[Task::reservedTaskParamKeyType])
@ -908,7 +908,7 @@ struct AbortFiveOneBackupTask : TaskFuncBase {
state BackupConfig config(task);
state std::string tagName = wait(config.tag().getOrThrow(tr));
TEST(true); // Canceling 5.1 backup task
CODE_PROBE(true, "Canceling 5.1 backup task");
TraceEvent(SevInfo, "FileBackupCancelFiveOneTask")
.detail("Task", task->params[Task::reservedTaskParamKeyType])
@ -1245,7 +1245,7 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase {
// If we've seen a new read version OR hit the end of the stream, then if we were writing a file finish it.
if (values.second != outVersion || done) {
if (outFile) {
TEST(outVersion != invalidVersion); // Backup range task wrote multiple versions
CODE_PROBE(outVersion != invalidVersion, "Backup range task wrote multiple versions");
state Key nextKey = done ? endKey : keyAfter(lastKey);
wait(rangeFile.writeKey(nextKey));
@ -4098,7 +4098,7 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase {
.detail("RestoreVersion", restoreVersion)
.detail("Dest", destVersion);
if (destVersion <= restoreVersion) {
TEST(true); // Forcing restored cluster to higher version
CODE_PROBE(true, "Forcing restored cluster to higher version");
tr->set(minRequiredCommitVersionKey, BinaryWriter::toValue(restoreVersion + 1, Unversioned()));
wait(tr->commit());
} else {

View File

@ -22,6 +22,7 @@
#include <string>
#include <vector>
#include "fdbclient/GenericManagementAPI.actor.h"
#include "fmt/format.h"
#include "fdbclient/Knobs.h"
#include "flow/Arena.h"
@ -1002,8 +1003,8 @@ ACTOR Future<CoordinatorsResult> changeQuorum(Database cx, Reference<IQuorumChan
TraceEvent("AttemptingQuorumChange")
.detail("FromCS", oldClusterConnectionString.toString())
.detail("ToCS", newClusterConnectionString.toString());
TEST(oldClusterKeyName != newClusterKeyName); // Quorum change with new name
TEST(oldClusterKeyName == newClusterKeyName); // Quorum change with unchanged name
CODE_PROBE(oldClusterKeyName != newClusterKeyName, "Quorum change with new name");
CODE_PROBE(oldClusterKeyName == newClusterKeyName, "Quorum change with unchanged name");
state std::vector<Future<Optional<LeaderInfo>>> leaderServers;
state ClientCoordinators coord(Reference<ClusterConnectionMemoryRecord>(
@ -2461,6 +2462,21 @@ bool schemaMatch(json_spirit::mValue const& schemaValue,
}
}
void setStorageQuota(Transaction& tr, StringRef tenantName, uint64_t quota) {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
auto key = storageQuotaKey(tenantName);
tr.set(key, BinaryWriter::toValue<uint64_t>(quota, Unversioned()));
}
ACTOR Future<Optional<uint64_t>> getStorageQuota(Transaction* tr, StringRef tenantName) {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
state Optional<Value> v = wait(tr->get(storageQuotaKey(tenantName)));
if (!v.present()) {
return Optional<uint64_t>();
}
return BinaryReader::fromStringRef<uint64_t>(v.get(), Unversioned());
}
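
Note: the new quota helpers store a uint64_t under the tenant's `\xff/storageQuota/` key (see storageQuotaKey in SystemData further down), serialized by BinaryWriter with Unversioned(), which amounts to the integer's raw bytes. A standalone C++17 sketch of the same encoding, with a plain map standing in for a Transaction and assuming a little-endian host:

#include <cstdint>
#include <cstring>
#include <map>
#include <optional>
#include <string>

static const std::string kQuotaPrefix = "\xff/storageQuota/";

// Mirrors tenantName.withPrefix(storageQuotaPrefix).
std::string storageQuotaKey(const std::string& tenantName) {
    return kQuotaPrefix + tenantName;
}

// BinaryWriter::toValue<uint64_t>(quota, Unversioned()) boils down to the
// integer's raw bytes; memcpy models that here.
std::string encodeQuota(uint64_t quota) {
    std::string v(sizeof(quota), '\0');
    std::memcpy(v.data(), &quota, sizeof(quota));
    return v;
}

std::optional<uint64_t> decodeQuota(const std::string& v) {
    if (v.size() != sizeof(uint64_t))
        return std::nullopt;
    uint64_t q;
    std::memcpy(&q, v.data(), sizeof(q));
    return q;
}

int main() {
    std::map<std::string, std::string> kv; // stand-in for the database
    kv[storageQuotaKey("tenant1")] = encodeQuota(5ull << 30); // 5 GiB quota
    auto q = decodeQuota(kv[storageQuotaKey("tenant1")]);
    return (q && *q == (5ull << 30)) ? 0 : 1;
}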
std::string ManagementAPI::generateErrorMessage(const CoordinatorsResult& res) {
// Note: the error message here should not be changed if possible
// If you do change the message here,

View File

@ -987,8 +987,9 @@ ACTOR Future<MonitorLeaderInfo> monitorProxiesOneGeneration(
successIndex = index;
allConnectionsFailed = false;
} else {
TEST(rep.getError().code() == error_code_failed_to_progress); // Coordinator cant talk to cluster controller
TEST(rep.getError().code() == error_code_lookup_failed); // Coordinator hostname resolving failure
CODE_PROBE(rep.getError().code() == error_code_failed_to_progress,
"Coordinator cant talk to cluster controller");
CODE_PROBE(rep.getError().code() == error_code_lookup_failed, "Coordinator hostname resolving failure");
TraceEvent("MonitorProxiesConnectFailed")
.detail("Error", rep.getError().name())
.detail("Coordinator", clientLeaderServer.getAddressString());

View File

@ -170,7 +170,7 @@ void DatabaseContext::addTssMapping(StorageServerInterface const& ssi, StorageSe
if (result->second.id() == tssi.id()) {
metrics = tssMetrics[tssi.id()];
} else {
TEST(true); // SS now maps to new TSS! This will probably never happen in practice
CODE_PROBE(true, "SS now maps to new TSS! This will probably never happen in practice");
tssMetrics.erase(result->second.id());
metrics = makeReference<TSSMetrics>();
tssMetrics[tssi.id()] = metrics;
@ -444,7 +444,7 @@ void DatabaseContext::validateVersion(Version version) const {
throw client_invalid_operation();
}
if (switchable && version < minAcceptableReadVersion) {
TEST(true); // Attempted to read a version lower than any this client has seen from the current cluster
CODE_PROBE(true, "Attempted to read a version lower than any this client has seen from the current cluster");
throw transaction_too_old();
}
@ -1114,8 +1114,8 @@ ACTOR static Future<Void> handleTssMismatches(DatabaseContext* cx) {
state bool quarantine = CLIENT_KNOBS->QUARANTINE_TSS_ON_MISMATCH;
TraceEvent(SevWarnAlways, quarantine ? "TSS_QuarantineMismatch" : "TSS_KillMismatch")
.detail("TSSID", data.first.toString());
TEST(quarantine); // Quarantining TSS because it got mismatch
TEST(!quarantine); // Killing TSS because it got mismatch
CODE_PROBE(quarantine, "Quarantining TSS because it got mismatch");
CODE_PROBE(!quarantine, "Killing TSS because it got mismatch");
tr = makeReference<ReadYourWritesTransaction>(Database(Reference<DatabaseContext>::addRef(cx)));
state int tries = 0;
@ -1154,7 +1154,7 @@ ACTOR static Future<Void> handleTssMismatches(DatabaseContext* cx) {
// clear out txn so that the extra DatabaseContext ref gets decref'd and we can free cx
tr = makeReference<ReadYourWritesTransaction>();
} else {
TEST(true); // Not handling TSS with mismatch because it's already gone
CODE_PROBE(true, "Not handling TSS with mismatch because it's already gone");
}
}
}
@ -1860,7 +1860,7 @@ bool DatabaseContext::getCachedLocations(const Optional<TenantName>& tenantName,
loop {
auto r = reverse ? end : begin;
if (!r->value()) {
TEST(result.size()); // had some but not all cached locations
CODE_PROBE(result.size(), "had some but not all cached locations");
result.clear();
return false;
}
@ -1907,7 +1907,7 @@ Reference<LocationInfo> DatabaseContext::setCachedLocation(const Optional<Tenant
int maxEvictionAttempts = 100, attempts = 0;
auto loc = makeReference<LocationInfo>(serverRefs);
while (locationCache.size() > locationCacheSize && attempts < maxEvictionAttempts) {
TEST(true); // NativeAPI storage server locationCache entry evicted
CODE_PROBE(true, "NativeAPI storage server locationCache entry evicted");
attempts++;
auto r = locationCache.randomRange();
Key begin = r.begin(), end = r.end(); // insert invalidates r, so can't be passed a mere reference into it
@ -2091,7 +2091,7 @@ Future<Void> DatabaseContext::onConnected() {
ACTOR static Future<Void> switchConnectionRecordImpl(Reference<IClusterConnectionRecord> connRecord,
DatabaseContext* self) {
TEST(true); // Switch connection file
CODE_PROBE(true, "Switch connection file");
TraceEvent("SwitchConnectionRecord")
.detail("ClusterFile", connRecord->toString())
.detail("ConnectionString", connRecord->getConnectionString().toString());
@ -2152,7 +2152,7 @@ void DatabaseContext::expireThrottles() {
for (auto& priorityItr : throttledTags) {
for (auto tagItr = priorityItr.second.begin(); tagItr != priorityItr.second.end();) {
if (tagItr->second.expired()) {
TEST(true); // Expiring client throttle
CODE_PROBE(true, "Expiring client throttle");
tagItr = priorityItr.second.erase(tagItr);
} else {
++tagItr;
@ -2638,7 +2638,7 @@ bool DatabaseContext::isCurrentGrvProxy(UID proxyId) const {
if (proxy.id() == proxyId)
return true;
}
TEST(true); // stale GRV proxy detected
CODE_PROBE(true, "stale GRV proxy detected");
return false;
}
@ -2875,6 +2875,7 @@ ACTOR Future<KeyRangeLocationInfo> getKeyLocation_internal(Database cx,
auto locationInfo =
cx->setCachedLocation(tenant, rep.tenantEntry, rep.results[0].first, rep.results[0].second);
updateTssMappings(cx, rep);
updateTagMappings(cx, rep);
return KeyRangeLocationInfo(
rep.tenantEntry,
@ -3629,13 +3630,13 @@ ACTOR Future<Version> watchValue(Database cx, Reference<const WatchParameters> p
wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, parameters->taskID));
} else if (e.code() == error_code_watch_cancelled || e.code() == error_code_process_behind) {
// clang-format off
TEST(e.code() == error_code_watch_cancelled); // Too many watches on the storage server, poll for changes instead
TEST(e.code() == error_code_process_behind); // The storage servers are all behind
CODE_PROBE(e.code() == error_code_watch_cancelled, "Too many watches on the storage server, poll for changes instead");
CODE_PROBE(e.code() == error_code_process_behind, "The storage servers are all behind");
// clang-format on
wait(delay(CLIENT_KNOBS->WATCH_POLLING_TIME, parameters->taskID));
} else if (e.code() == error_code_timed_out) { // The storage server occasionally times out watches in case
// it was cancelled
TEST(true); // A watch timed out
CODE_PROBE(true, "A watch timed out");
wait(delay(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, parameters->taskID));
} else {
state Error err = e;
@ -3667,7 +3668,8 @@ ACTOR Future<Void> watchStorageServerResp(int64_t tenantId, Key key, Database cx
}
// ABA happens
else {
TEST(true); // ABA issue where the version returned from the server is less than the version in the map
CODE_PROBE(true,
"ABA issue where the version returned from the server is less than the version in the map");
// case 2: version_1 < version_2 and future_count == 1
if (metadata->watchPromise.getFutureReferenceCount() == 1) {
@ -3758,7 +3760,8 @@ Future<Void> getWatchFuture(Database cx, Reference<WatchParameters> parameters)
// case 3: val_1 != val_2 && version_2 > version_1 (received watch with different value and a higher version so
// recreate in SS)
else if (parameters->version > metadata->parameters->version) {
TEST(true); // Setting a watch that has a different value than the one in the map but a higher version (newer)
CODE_PROBE(true,
"Setting a watch that has a different value than the one in the map but a higher version (newer)");
cx->deleteWatchMetadata(parameters->tenant.tenantId, parameters->key);
metadata->watchPromise.send(parameters->version);
@ -3773,10 +3776,10 @@ Future<Void> getWatchFuture(Database cx, Reference<WatchParameters> parameters)
}
// case 5: val_1 != val_2 && version_1 == version_2 (received watch with different value but same version)
else if (metadata->parameters->version == parameters->version) {
TEST(true); // Setting a watch which has a different value than the one in the map but the same version
CODE_PROBE(true, "Setting a watch which has a different value than the one in the map but the same version");
return sameVersionDiffValue(cx, parameters);
}
TEST(true); // Setting a watch which has a different value than the one in the map but a lower version (older)
CODE_PROBE(true, "Setting a watch which has a different value than the one in the map but a lower version (older)");
// case 4: val_1 != val_2 && version_2 < version_1
return Void();
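
Note: a schematic reconstruction of the numbered watch-deduplication cases in the comments above, expressed as a pure decision function. The real code also juggles promises and the watch metadata map, so treat this as an inferred summary rather than the actual logic:

#include <cstdint>
#include <string>

enum class WatchAction {
    ReusePending,       // same value: piggyback on the existing watch
    RecreateNewer,      // case 3: different value, strictly newer version
    CompareSameVersion, // case 5: different value, same version
    IgnoreOlder         // case 4: different value, older version
};

WatchAction decide(const std::string& existingVal, int64_t existingVer,
                   const std::string& newVal, int64_t newVer) {
    if (newVal == existingVal)
        return WatchAction::ReusePending;
    if (newVer > existingVer)
        return WatchAction::RecreateNewer;
    if (newVer == existingVer)
        return WatchAction::CompareSameVersion;
    return WatchAction::IgnoreOlder;
}

int main() {
    return decide("a", 5, "b", 7) == WatchAction::RecreateNewer ? 0 : 1;
}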
@ -3970,7 +3973,7 @@ Future<RangeResultFamily> getExactRange(Reference<TransactionState> trState,
.detail("BlockBytes", rep.data.expectedSize());
ASSERT(false);
}
TEST(true); // GetKeyValuesFamilyReply.more in getExactRange
CODE_PROBE(true, "GetKeyValuesFamilyReply.more in getExactRange");
// Make next request to the same shard with a beginning key just after the last key returned
if (reverse)
locations[shard].range =
@ -3981,7 +3984,7 @@ Future<RangeResultFamily> getExactRange(Reference<TransactionState> trState,
}
if (!more || locations[shard].range.empty()) {
TEST(true); // getExactrange (!more || locations[shard].first.empty())
CODE_PROBE(true, "getExactrange (!more || locations[shard].first.empty())");
if (shard == locations.size() - 1) {
const KeyRangeRef& range = locations[shard].range;
KeyRef begin = reverse ? keys.begin : range.end;
@ -3991,7 +3994,7 @@ Future<RangeResultFamily> getExactRange(Reference<TransactionState> trState,
output.more = false;
return output;
}
TEST(true); // Multiple requests of key locations
CODE_PROBE(true, "Multiple requests of key locations");
keys = KeyRangeRef(begin, end);
break;
@ -4431,7 +4434,7 @@ Future<RangeResultFamily> getRange(Reference<TransactionState> trState,
if (!rep.more) {
ASSERT(modifiedSelectors);
TEST(true); // !GetKeyValuesFamilyReply.more and modifiedSelectors in getRange
CODE_PROBE(true, "!GetKeyValuesFamilyReply.more and modifiedSelectors in getRange");
if (!rep.data.size()) {
RangeResultFamily result = wait(
@ -4455,7 +4458,7 @@ Future<RangeResultFamily> getRange(Reference<TransactionState> trState,
else
begin = firstGreaterOrEqual(shard.end);
} else {
TEST(true); // GetKeyValuesFamilyReply.more in getRange
CODE_PROBE(true, "GetKeyValuesFamilyReply.more in getRange");
if (reverse)
end = firstGreaterOrEqual(output[output.size() - 1].key);
else
@ -4574,7 +4577,7 @@ static Future<Void> tssStreamComparison(Request request,
} else {
tssData.metrics->ssError(e.code());
}
TEST(e.code() != error_code_end_of_stream); // SS got error in TSS stream comparison
CODE_PROBE(e.code() != error_code_end_of_stream, "SS got error in TSS stream comparison");
}
state double sleepTime = std::max(startTime + FLOW_KNOBS->LOAD_BALANCE_TSS_TIMEOUT - now(), 0.0);
@ -4586,7 +4589,7 @@ static Future<Void> tssStreamComparison(Request request,
}
when(wait(delay(sleepTime))) {
++tssData.metrics->tssTimeouts;
TEST(true); // Got TSS timeout in stream comparison
CODE_PROBE(true, "Got TSS timeout in stream comparison");
}
}
} catch (Error& e) {
@ -4601,7 +4604,7 @@ static Future<Void> tssStreamComparison(Request request,
} else {
tssData.metrics->tssError(e.code());
}
TEST(e.code() != error_code_end_of_stream); // TSS got error in TSS stream comparison
CODE_PROBE(e.code() != error_code_end_of_stream, "TSS got error in TSS stream comparison");
}
if (!ssEndOfStream || !tssEndOfStream) {
@ -4614,11 +4617,11 @@ static Future<Void> tssStreamComparison(Request request,
// FIXME: this code is pretty much identical to LoadBalance.h
// TODO could add team check logic in if we added synchronous way to turn this into a fixed getRange request
// and send it to the whole team and compare? I think it's fine to skip that for streaming though
TEST(ssEndOfStream != tssEndOfStream); // SS or TSS stream finished early!
CODE_PROBE(ssEndOfStream != tssEndOfStream, "SS or TSS stream finished early!");
// skip tss comparison if both are end of stream
if ((!ssEndOfStream || !tssEndOfStream) && !TSS_doCompare(ssReply.get(), tssReply.get())) {
TEST(true); // TSS mismatch in stream comparison
CODE_PROBE(true, "TSS mismatch in stream comparison");
TraceEvent mismatchEvent(
(g_network->isSimulated() && g_simulator.tssMode == ISimulator::TSSMode::EnabledDropMutations)
? SevWarnAlways
@ -4630,10 +4633,10 @@ static Future<Void> tssStreamComparison(Request request,
if (tssData.metrics->shouldRecordDetailedMismatch()) {
TSS_traceMismatch(mismatchEvent, request, ssReply.get(), tssReply.get());
TEST(FLOW_KNOBS
->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL); // Tracing Full TSS Mismatch in stream comparison
TEST(!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL); // Tracing Partial TSS Mismatch in stream
// comparison and storing the rest in FDB
CODE_PROBE(FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL,
"Tracing Full TSS Mismatch in stream comparison");
CODE_PROBE(!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL,
"Tracing Partial TSS Mismatch in stream comparison and storing the rest in FDB");
if (!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL) {
mismatchEvent.disable();
@ -4673,7 +4676,7 @@ maybeDuplicateTSSStreamFragment(Request& req, QueueModel* model, RequestStream<R
Optional<TSSEndpointData> tssData = model->getTssData(ssStream->getEndpoint().token.first());
if (tssData.present()) {
TEST(true); // duplicating stream to TSS
CODE_PROBE(true, "duplicating stream to TSS");
resetReply(req);
// FIXME: optimize to avoid creating new netNotifiedQueueWithAcknowledgements for each stream duplication
RequestStream<Request> tssRequestStream(tssData.get().endpoint);
@ -4873,7 +4876,7 @@ ACTOR Future<Void> getRangeStreamFragment(Reference<TransactionState> trState,
.detail("BlockBytes", rep.data.expectedSize());
ASSERT(false);
}
TEST(true); // GetKeyValuesStreamReply.more in getRangeStream
CODE_PROBE(true, "GetKeyValuesStreamReply.more in getRangeStream");
// Make next request to the same shard with a beginning key just after the last key returned
if (reverse)
locations[shard].range =
@ -5271,7 +5274,7 @@ ACTOR Future<Void> watch(Reference<Watch> watch,
when(wait(watch->watchFuture)) { break; }
when(wait(cx->connectionFileChanged())) {
TEST(true); // Recreated a watch after switch
CODE_PROBE(true, "Recreated a watch after switch");
cx->clearWatchMetadata();
watch->watchFuture = watchValueMap(cx->minAcceptableReadVersion,
tenantInfo,
@ -5444,18 +5447,18 @@ Future<RangeResultFamily> Transaction::getRangeInternal(const KeySelector& begin
KeySelector b = begin;
if (b.orEqual) {
TEST(true); // Native begin orEqual==true
CODE_PROBE(true, "Native begin orEqual==true");
b.removeOrEqual(b.arena());
}
KeySelector e = end;
if (e.orEqual) {
TEST(true); // Native end orEqual==true
CODE_PROBE(true, "Native end orEqual==true");
e.removeOrEqual(e.arena());
}
if (b.offset >= e.offset && b.getKey() >= e.getKey()) {
TEST(true); // Native range inverted
CODE_PROBE(true, "Native range inverted");
return RangeResultFamily();
}
@ -5518,18 +5521,18 @@ Future<Void> Transaction::getRangeStream(const PromiseStream<RangeResult>& resul
KeySelector b = begin;
if (b.orEqual) {
TEST(true); // Native stream begin orEqual==true
CODE_PROBE(true, "Native stream begin orEqual==true");
b.removeOrEqual(b.arena());
}
KeySelector e = end;
if (e.orEqual) {
TEST(true); // Native stream end orEqual==true
CODE_PROBE(true, "Native stream end orEqual==true");
e.removeOrEqual(e.arena());
}
if (b.offset >= e.offset && b.getKey() >= e.getKey()) {
TEST(true); // Native stream range inverted
CODE_PROBE(true, "Native stream range inverted");
results.sendError(end_of_stream());
return Void();
}
@ -5632,7 +5635,7 @@ void Transaction::atomicOp(const KeyRef& key,
if (addConflictRange && operationType != MutationRef::SetVersionstampedKey)
t.write_conflict_ranges.push_back(req.arena, r);
TEST(true); // NativeAPI atomic operation
CODE_PROBE(true, "NativeAPI atomic operation");
}
void Transaction::clear(const KeyRangeRef& range, AddConflictRange addConflictRange) {
@ -5718,7 +5721,7 @@ double Transaction::getBackoff(int errCode) {
if (priorityItr != trState->cx->throttledTags.end()) {
auto tagItr = priorityItr->second.find(tag);
if (tagItr != priorityItr->second.end()) {
TEST(true); // Returning throttle backoff
CODE_PROBE(true, "Returning throttle backoff");
returnedBackoff = std::max(
returnedBackoff,
std::min(CLIENT_KNOBS->TAG_THROTTLE_RECHECK_INTERVAL, tagItr->second.throttleDuration()));
@ -6249,7 +6252,7 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
KeyRangeRef selfConflictingRange =
intersects(req.transaction.write_conflict_ranges, req.transaction.read_conflict_ranges).get();
TEST(true); // Waiting for dummy transaction to report commit_unknown_result
CODE_PROBE(true, "Waiting for dummy transaction to report commit_unknown_result");
wait(commitDummyTransaction(trState, singleKeyRange(selfConflictingRange.begin)));
}
@ -6587,7 +6590,7 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optional<Strin
if (value.get().size() != 33) {
throw invalid_option_value();
}
TEST(true); // Adding link in FDBTransactionOptions::SPAN_PARENT
CODE_PROBE(true, "Adding link in FDBTransactionOptions::SPAN_PARENT");
span.setParent(BinaryReader::fromStringRef<SpanContext>(value.get(), IncludeVersion()));
break;
@ -6667,10 +6670,10 @@ ACTOR Future<GetReadVersionReply> getConsistentReadVersion(SpanContext parentSpa
for (auto& tag : tags) {
auto itr = v.tagThrottleInfo.find(tag.first);
if (itr == v.tagThrottleInfo.end()) {
TEST(true); // Removing client throttle
CODE_PROBE(true, "Removing client throttle");
priorityThrottledTags.erase(tag.first);
} else {
TEST(true); // Setting client throttle
CODE_PROBE(true, "Setting client throttle");
auto result = priorityThrottledTags.try_emplace(tag.first, itr->second);
if (!result.second) {
result.first->second.update(itr->second);
@ -6853,7 +6856,7 @@ ACTOR Future<Version> extractReadVersion(Reference<TransactionState> trState,
if (itr->second.expired()) {
priorityThrottledTags.erase(itr);
} else if (itr->second.throttleDuration() > 0) {
TEST(true); // throttling transaction after getting read version
CODE_PROBE(true, "throttling transaction after getting read version");
++trState->cx->transactionReadVersionsThrottled;
throw tag_throttled();
}
@ -6959,12 +6962,12 @@ Future<Version> Transaction::getReadVersion(uint32_t flags) {
}
if (maxThrottleDelay > 0.0 && !canRecheck) { // TODO: allow delaying?
TEST(true); // Throttling tag before GRV request
CODE_PROBE(true, "Throttling tag before GRV request");
++trState->cx->transactionReadVersionsThrottled;
readVersion = tag_throttled();
return readVersion;
} else {
TEST(maxThrottleDelay > 0.0); // Rechecking throttle
CODE_PROBE(maxThrottleDelay > 0.0, "Rechecking throttle");
}
for (auto& tag : trState->options.tags) {
@ -7343,10 +7346,10 @@ ACTOR Future<Standalone<VectorRef<ReadHotRangeWithMetrics>>> getReadHotRanges(Da
wait(waitForAll(fReplies));
if (nLocs == 1) {
TEST(true); // Single-shard read hot range request
CODE_PROBE(true, "Single-shard read hot range request");
return fReplies[0].get().readHotRanges;
} else {
TEST(true); // Multi-shard read hot range request
CODE_PROBE(true, "Multi-shard read hot range request");
Standalone<VectorRef<ReadHotRangeWithMetrics>> results;
for (int i = 0; i < nLocs; i++) {
results.append(results.arena(),
@ -7855,7 +7858,7 @@ ACTOR Future<Standalone<VectorRef<BlobGranuleChunkRef>>> readBlobGranulesActor(
if (!results.empty() && results.back().keyRange.end != chunk.keyRange.begin) {
ASSERT(results.back().keyRange.end > chunk.keyRange.begin);
ASSERT(results.back().keyRange.end <= chunk.keyRange.end);
TEST(true); // Merge while reading granule range
CODE_PROBE(true, "Merge while reading granule range");
while (!results.empty() && results.back().keyRange.begin >= chunk.keyRange.begin) {
// TODO: we can't easily un-depend the arenas for these guys, but that's ok as this
// should be rare
@ -8980,8 +8983,8 @@ ACTOR Future<Void> mergeChangeFeedStream(Reference<DatabaseContext> db,
state std::vector<Future<Void>> onErrors(interfs.size());
state std::vector<MutationAndVersionStream> streams(interfs.size());
TEST(interfs.size() > 10); // Large change feed merge cursor
TEST(interfs.size() > 100); // Very large change feed merge cursor
CODE_PROBE(interfs.size() > 10, "Large change feed merge cursor");
CODE_PROBE(interfs.size() > 100, "Very large change feed merge cursor");
state UID mergeCursorUID = UID();
state std::vector<UID> debugUIDs;
@ -9305,13 +9308,13 @@ ACTOR Future<Void> getChangeFeedStreamActor(Reference<DatabaseContext> db,
interfs.emplace_back(locations[i].locations->getInterface(chosenLocations[i]),
locations[i].range & range);
}
TEST(true); // Change feed merge cursor
CODE_PROBE(true, "Change feed merge cursor");
// TODO (jslocum): validate connectionFileChanged behavior
wait(
mergeChangeFeedStream(db, interfs, results, rangeID, &begin, end, replyBufferSize, canReadPopped) ||
cx->connectionFileChanged());
} else {
TEST(true); // Change feed single cursor
CODE_PROBE(true, "Change feed single cursor");
StorageServerInterface interf = locations[0].locations->getInterface(chosenLocations[0]);
wait(singleChangeFeedStream(
db, interf, range, results, rangeID, &begin, end, replyBufferSize, canReadPopped) ||
@ -9327,7 +9330,7 @@ ACTOR Future<Void> getChangeFeedStreamActor(Reference<DatabaseContext> db,
results->streams.clear();
results->storageData.clear();
if (e.code() == error_code_change_feed_popped) {
TEST(true); // getChangeFeedStreamActor got popped
CODE_PROBE(true, "getChangeFeedStreamActor got popped");
results->mutations.sendError(e);
results->refresh.sendError(e);
} else {

View File

@ -199,7 +199,7 @@ class GetGenerationQuorum {
}
} catch (Error& e) {
if (e.code() == error_code_failed_to_reach_quorum) {
TEST(true); // Failed to reach quorum getting generation
CODE_PROBE(true, "Failed to reach quorum getting generation");
wait(delayJittered(
std::clamp(0.005 * (1 << retries), 0.0, CLIENT_KNOBS->TIMEOUT_RETRY_UPPER_BOUND)));
++retries;

View File

@ -1213,7 +1213,7 @@ public:
// isolation support. But it is not default and is rarely used. So we disallow it until we have thorough test
// coverage for it.)
if (snapshot) {
TEST(true); // getMappedRange not supported for snapshot.
CODE_PROBE(true, "getMappedRange not supported for snapshot.");
throw unsupported_operation();
}
// For now, getMappedRange requires read-your-writes being NOT disabled. But the support of RYW is limited
@ -1222,7 +1222,7 @@ public:
// which returns the written value transparently. In other words, it makes sure not to break RYW semantics without
// actually implementing reading from the writes.
if (ryw->options.readYourWritesDisabled) {
TEST(true); // getMappedRange not supported for read-your-writes disabled.
CODE_PROBE(true, "getMappedRange not supported for read-your-writes disabled.");
throw unsupported_operation();
}
@ -1242,7 +1242,7 @@ public:
++it;
ASSERT(itCopy->value.size());
TEST(itCopy->value.size() > 1); // Multiple watches on the same key triggered by RYOW
CODE_PROBE(itCopy->value.size() > 1, "Multiple watches on the same key triggered by RYOW");
for (int i = 0; i < itCopy->value.size(); i++) {
if (itCopy->value[i]->onChangeTrigger.isSet()) {
@ -1535,11 +1535,11 @@ ACTOR Future<RangeResult> getWorkerInterfaces(Reference<IClusterConnectionRecord
}
Future<Optional<Value>> ReadYourWritesTransaction::get(const Key& key, Snapshot snapshot) {
TEST(true); // ReadYourWritesTransaction::get
CODE_PROBE(true, "ReadYourWritesTransaction::get");
if (getDatabase()->apiVersionAtLeast(630)) {
if (specialKeys.contains(key)) {
TEST(true); // Special keys get
CODE_PROBE(true, "Special keys get");
return getDatabase()->specialKeySpace->get(this, key);
}
} else {
@ -1622,7 +1622,7 @@ Future<RangeResult> ReadYourWritesTransaction::getRange(KeySelector begin,
if (getDatabase()->apiVersionAtLeast(630)) {
if (specialKeys.contains(begin.getKey()) && specialKeys.begin <= end.getKey() &&
end.getKey() <= specialKeys.end) {
TEST(true); // Special key space get range
CODE_PROBE(true, "Special key space get range");
return getDatabase()->specialKeySpace->getRange(this, begin, end, limits, reverse);
}
} else {
@ -1648,7 +1648,7 @@ Future<RangeResult> ReadYourWritesTransaction::getRange(KeySelector begin,
// This optimization prevents nullptr operations from being added to the conflict range
if (limits.isReached()) {
TEST(true); // RYW range read limit 0
CODE_PROBE(true, "RYW range read limit 0");
return RangeResult();
}
@ -1662,7 +1662,7 @@ Future<RangeResult> ReadYourWritesTransaction::getRange(KeySelector begin,
end.removeOrEqual(end.arena());
if (begin.offset >= end.offset && begin.getKey() >= end.getKey()) {
TEST(true); // RYW range inverted
CODE_PROBE(true, "RYW range inverted");
return RangeResult();
}
@ -1692,7 +1692,7 @@ Future<MappedRangeResult> ReadYourWritesTransaction::getMappedRange(KeySelector
if (getDatabase()->apiVersionAtLeast(630)) {
if (specialKeys.contains(begin.getKey()) && specialKeys.begin <= end.getKey() &&
end.getKey() <= specialKeys.end) {
TEST(true); // Special key space get range (getMappedRange)
CODE_PROBE(true, "Special key space get range (getMappedRange)");
throw client_invalid_operation(); // Not support special keys.
}
} else {
@ -1714,7 +1714,7 @@ Future<MappedRangeResult> ReadYourWritesTransaction::getMappedRange(KeySelector
// This optimization prevents nullptr operations from being added to the conflict range
if (limits.isReached()) {
TEST(true); // RYW range read limit 0 (getMappedRange)
CODE_PROBE(true, "RYW range read limit 0 (getMappedRange)");
return MappedRangeResult();
}
@ -1728,7 +1728,7 @@ Future<MappedRangeResult> ReadYourWritesTransaction::getMappedRange(KeySelector
end.removeOrEqual(end.arena());
if (begin.offset >= end.offset && begin.getKey() >= end.getKey()) {
TEST(true); // RYW range inverted (getMappedRange)
CODE_PROBE(true, "RYW range inverted (getMappedRange)");
return MappedRangeResult();
}
@ -1998,7 +1998,7 @@ void ReadYourWritesTransaction::setToken(uint64_t token) {
}
RangeResult ReadYourWritesTransaction::getReadConflictRangeIntersecting(KeyRangeRef kr) {
TEST(true); // Special keys read conflict range
CODE_PROBE(true, "Special keys read conflict range");
ASSERT(readConflictRangeKeysRange.contains(kr));
ASSERT(!tr.trState->options.checkWritesEnabled);
RangeResult result;
@ -2040,7 +2040,7 @@ RangeResult ReadYourWritesTransaction::getReadConflictRangeIntersecting(KeyRange
}
RangeResult ReadYourWritesTransaction::getWriteConflictRangeIntersecting(KeyRangeRef kr) {
TEST(true); // Special keys write conflict range
CODE_PROBE(true, "Special keys write conflict range");
ASSERT(writeConflictRangeKeysRange.contains(kr));
RangeResult result;
@ -2145,7 +2145,7 @@ void ReadYourWritesTransaction::atomicOp(const KeyRef& key, const ValueRef& oper
}
if (operationType == MutationRef::SetVersionstampedKey) {
TEST(options.readYourWritesDisabled); // SetVersionstampedKey without ryw enabled
CODE_PROBE(options.readYourWritesDisabled, "SetVersionstampedKey without ryw enabled");
// this does validation of the key and needs to be performed before the readYourWritesDisabled path
KeyRangeRef range = getVersionstampKeyRange(arena, k, tr.getCachedReadVersion().orDefault(0), getMaxReadKey());
versionStampKeys.push_back(arena, k);

View File

@ -881,12 +881,18 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// Cluster recovery
init ( CLUSTER_RECOVERY_EVENT_NAME_PREFIX, "Master" );
// encrypt key proxy
// Encryption
init( ENABLE_ENCRYPTION, false ); if ( randomize && BUGGIFY ) { ENABLE_ENCRYPTION = deterministicRandom()->coinflip(); }
init( ENCRYPTION_MODE, "AES-256-CTR" );
init( SIM_KMS_MAX_KEYS, 4096 );
init( ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH, 100000 );
init( ENABLE_TLOG_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY) { ENABLE_TLOG_ENCRYPTION = (ENABLE_ENCRYPTION && !PROXY_USE_RESOLVER_PRIVATE_MUTATIONS && deterministicRandom()->coinflip()); }
init( ENABLE_TLOG_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY ) { ENABLE_TLOG_ENCRYPTION = (ENABLE_ENCRYPTION && !PROXY_USE_RESOLVER_PRIVATE_MUTATIONS && deterministicRandom()->coinflip()); }
init( ENABLE_BLOB_GRANULE_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY ) { ENABLE_BLOB_GRANULE_ENCRYPTION = (ENABLE_ENCRYPTION && deterministicRandom()->coinflip()); }
// Compression
init( ENABLE_BLOB_GRANULE_COMPRESSION, false ); if ( randomize && BUGGIFY ) { ENABLE_BLOB_GRANULE_COMPRESSION = deterministicRandom()->coinflip(); }
init( BLOB_GRANULE_COMPRESSION_FILTER, "GZIP" ); if ( randomize && BUGGIFY ) { BLOB_GRANULE_COMPRESSION_FILTER = "NONE"; }
// KMS connector type
init( KMS_CONNECTOR_TYPE, "RESTKmsConnector" );
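
Note: BLOB_GRANULE_COMPRESSION_FILTER is a string knob ("GZIP", randomized to "NONE" under BUGGIFY), so consumers have to map it onto a CompressionFilter value and fall back when zlib was not compiled in. A hedged sketch of that mapping; parseCompressionFilter is illustrative, and the real lookup lives with flow/CompressionUtils:

#include <optional>
#include <string>

enum class CompressionFilter { NONE, GZIP };

// Illustrative parser from the knob string to a filter value. Falls back to
// NONE when zlib support (ZLIB_LIB_SUPPORTED) was not compiled in.
std::optional<CompressionFilter> parseCompressionFilter(const std::string& knob) {
    if (knob == "NONE")
        return CompressionFilter::NONE;
    if (knob == "GZIP") {
#ifdef ZLIB_LIB_SUPPORTED
        return CompressionFilter::GZIP;
#else
        return CompressionFilter::NONE;
#endif
    }
    return std::nullopt; // unrecognized knob value
}

int main() {
    return parseCompressionFilter("GZIP").has_value() ? 0 : 1;
}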

View File

@ -364,12 +364,12 @@ ACTOR Future<RangeResult> SpecialKeySpace::getRangeAggregationActor(SpecialKeySp
// Handle all corner cases like what RYW does
// return if range inverted
if (actualBeginOffset >= actualEndOffset && begin.getKey() >= end.getKey()) {
TEST(true); // inverted range
CODE_PROBE(true, "inverted range");
return RangeResultRef(false, false);
}
// If touches begin or end, return with readToBegin and readThroughEnd flags
if (begin.getKey() == moduleBoundary.end || end.getKey() == moduleBoundary.begin) {
TEST(true); // query touches begin or end
CODE_PROBE(true, "query touches begin or end");
return result;
}
state RangeMap<Key, SpecialKeyRangeReadImpl*, KeyRangeRef>::Ranges ranges =
@ -453,7 +453,7 @@ Future<RangeResult> SpecialKeySpace::getRange(ReadYourWritesTransaction* ryw,
if (!limits.isValid())
return range_limits_invalid();
if (limits.isReached()) {
TEST(true); // read limit 0
CODE_PROBE(true, "read limit 0");
return RangeResult();
}
// make sure orEqual == false
@ -461,7 +461,7 @@ Future<RangeResult> SpecialKeySpace::getRange(ReadYourWritesTransaction* ryw,
end.removeOrEqual(end.arena());
if (begin.offset >= end.offset && begin.getKey() >= end.getKey()) {
TEST(true); // range inverted
CODE_PROBE(true, "range inverted");
return RangeResult();
}

View File

@ -19,6 +19,7 @@
*/
#include "fdbclient/SystemData.h"
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/StorageServerInterface.h"
@ -1370,26 +1371,35 @@ const KeyRange blobGranuleFileKeyRangeFor(UID granuleID) {
return KeyRangeRef(startKey, strinc(startKey));
}
const Value blobGranuleFileValueFor(StringRef const& filename, int64_t offset, int64_t length, int64_t fullFileLength) {
const Value blobGranuleFileValueFor(StringRef const& filename,
int64_t offset,
int64_t length,
int64_t fullFileLength,
Optional<BlobGranuleCipherKeysMeta> cipherKeysMeta) {
BinaryWriter wr(IncludeVersion(ProtocolVersion::withBlobGranule()));
wr << filename;
wr << offset;
wr << length;
wr << fullFileLength;
wr << cipherKeysMeta;
return wr.toValue();
}
std::tuple<Standalone<StringRef>, int64_t, int64_t, int64_t> decodeBlobGranuleFileValue(ValueRef const& value) {
std::tuple<Standalone<StringRef>, int64_t, int64_t, int64_t, Optional<BlobGranuleCipherKeysMeta>>
decodeBlobGranuleFileValue(ValueRef const& value) {
StringRef filename;
int64_t offset;
int64_t length;
int64_t fullFileLength;
Optional<BlobGranuleCipherKeysMeta> cipherKeysMeta;
BinaryReader reader(value, IncludeVersion());
reader >> filename;
reader >> offset;
reader >> length;
reader >> fullFileLength;
return std::tuple(filename, offset, length, fullFileLength);
reader >> cipherKeysMeta;
return std::tuple(filename, offset, length, fullFileLength, cipherKeysMeta);
}
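
Note: decodeBlobGranuleFileValue gains a trailing Optional<BlobGranuleCipherKeysMeta>, the usual way these values evolve: new fields append at the end of the BinaryWriter stream, and the protocol version from IncludeVersion gates whether the tail exists. A self-contained sketch of the append-at-the-end pattern with a one-byte presence flag (the framing is illustrative, not FDB's actual wire format):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <optional>
#include <string>

// Minimal byte-stream writer/reader showing how a trailing Optional field
// can be appended to an existing record layout.
struct Writer {
    std::string buf;
    void writeI64(int64_t v) { buf.append(reinterpret_cast<const char*>(&v), 8); }
    void writeStr(const std::string& s) { writeI64((int64_t)s.size()); buf += s; }
    void writeOptStr(const std::optional<std::string>& s) {
        buf.push_back(s.has_value() ? 1 : 0); // presence flag, then payload
        if (s) writeStr(*s);
    }
};

struct Reader {
    const std::string& buf;
    size_t pos = 0;
    int64_t readI64() { int64_t v; std::memcpy(&v, buf.data() + pos, 8); pos += 8; return v; }
    std::string readStr() { int64_t n = readI64(); std::string s = buf.substr(pos, n); pos += n; return s; }
    std::optional<std::string> readOptStr() {
        // Old records end before the flag; treat a missing tail as absent.
        if (pos >= buf.size() || buf[pos++] == 0) return std::nullopt;
        return readStr();
    }
};

int main() {
    Writer w;
    w.writeStr("snapshot.1234.sst"); // filename
    w.writeI64(0);                   // offset
    w.writeI64(4096);                // length
    w.writeI64(4096);                // fullFileLength
    w.writeOptStr("cipher-meta");    // new trailing optional field
    Reader r{ w.buf };
    assert(r.readStr() == "snapshot.1234.sst");
    r.readI64(); r.readI64(); r.readI64();
    assert(r.readOptStr().value() == "cipher-meta");
    return 0;
}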
const Value blobGranulePurgeValueFor(Version version, KeyRange range, bool force) {
@ -1620,6 +1630,13 @@ const KeyRef tenantMapPrivatePrefix = "\xff\xff/tenantMap/"_sr;
const KeyRef tenantLastIdKey = "\xff/tenantLastId/"_sr;
const KeyRef tenantDataPrefixKey = "\xff/tenantDataPrefix"_sr;
const KeyRangeRef storageQuotaKeys(LiteralStringRef("\xff/storageQuota/"), LiteralStringRef("\xff/storageQuota0"));
const KeyRef storageQuotaPrefix = storageQuotaKeys.begin;
Key storageQuotaKey(StringRef tenantName) {
return tenantName.withPrefix(storageQuotaPrefix);
}
// for tests
void testSSISerdes(StorageServerInterface const& ssi) {
printf("ssi=\nid=%s\nlocality=%s\nisTss=%s\ntssId=%s\nacceptingRequests=%s\naddress=%s\ngetValue=%s\n\n\n",

View File

@ -199,7 +199,7 @@ public:
// many other new tasks get added so that the timed out tasks never get chances to re-run
if (deterministicRandom()->random01() < CLIENT_KNOBS->TASKBUCKET_CHECK_TIMEOUT_CHANCE) {
bool anyTimeouts = wait(requeueTimedOutTasks(tr, taskBucket));
TEST(anyTimeouts); // Found a task that timed out
CODE_PROBE(anyTimeouts, "Found a task that timed out");
}
state std::vector<Future<Optional<Key>>> taskKeyFutures(CLIENT_KNOBS->TASKBUCKET_MAX_PRIORITY + 1);
@ -233,7 +233,7 @@ public:
bool anyTimeouts = wait(requeueTimedOutTasks(tr, taskBucket));
// If there were timeouts, try to get a task since there should now be one in one of the available spaces.
if (anyTimeouts) {
TEST(true); // Try to get one task from timeouts subspace
CODE_PROBE(true, "Try to get one task from timeouts subspace");
Reference<Task> task = wait(getOne(tr, taskBucket));
return task;
}
@ -707,7 +707,7 @@ public:
wait(delay(CLIENT_KNOBS->TASKBUCKET_CHECK_ACTIVE_DELAY));
bool isActiveKey = wait(getActiveKey(tr, taskBucket, startingValue));
if (isActiveKey) {
TEST(true); // checkActive return true
CODE_PROBE(true, "checkActive return true");
return true;
}
break;
@ -717,7 +717,7 @@ public:
}
}
TEST(true); // checkActive return false
CODE_PROBE(true, "checkActive return false");
return false;
}
@ -742,7 +742,7 @@ public:
// Returns True if any tasks were affected.
ACTOR static Future<bool> requeueTimedOutTasks(Reference<ReadYourWritesTransaction> tr,
Reference<TaskBucket> taskBucket) {
TEST(true); // Looks for tasks that have timed out and returns them to be available tasks.
CODE_PROBE(true, "Looks for tasks that have timed out and returns them to be available tasks.");
Version end = wait(tr->getReadVersion());
state KeyRange range(
KeyRangeRef(taskBucket->timeouts.get(0).range().begin, taskBucket->timeouts.get(end).range().end));
@ -849,12 +849,12 @@ public:
// If we're updating the task params then clear the old space and write params to the new space
if (updateParams) {
TEST(true); // Extended a task while updating parameters
CODE_PROBE(true, "Extended a task while updating parameters");
for (auto& p : task->params) {
tr->set(newTimeoutSpace.pack(p.key), p.value);
}
} else {
TEST(true); // Extended a task without updating parameters
CODE_PROBE(true, "Extended a task without updating parameters");
// Otherwise, read and transplant the params from the old to new timeout spaces
RangeResult params = wait(tr->getRange(oldTimeoutSpace.range(), CLIENT_KNOBS->TOO_MANY));
for (auto& kv : params) {
@ -1138,10 +1138,10 @@ public:
bool is_set = wait(isSet(tr, taskFuture));
if (is_set) {
TEST(true); // is_set == true
CODE_PROBE(true, "is_set == true");
wait(performAction(tr, taskBucket, taskFuture, task));
} else {
TEST(true); // is_set == false
CODE_PROBE(true, "is_set == false");
Subspace callbackSpace =
taskFuture->callbacks.get(StringRef(deterministicRandom()->randomUniqueID().toString()));
for (auto& v : task->params) {

View File

@ -567,7 +567,7 @@ void WriteMap::clearNoConflict(KeyRangeRef keys) {
bool end_conflict = it.is_conflict_range();
bool end_unreadable = it.is_unreadable();
TEST(it.is_conflict_range() != lastConflicted); // not last conflicted
CODE_PROBE(it.is_conflict_range() != lastConflicted, "not last conflicted");
it.tree.clear();

View File

@ -120,7 +120,7 @@ inline ValueRef doAppendIfFits(const Optional<ValueRef>& existingValueOptional,
if (!otherOperand.size())
return existingValue;
if (existingValue.size() + otherOperand.size() > CLIENT_KNOBS->VALUE_SIZE_LIMIT) {
TEST(true) // AppendIfFIts resulted in truncation
CODE_PROBE(true, "AppendIfFIts resulted in truncation");
return existingValue;
}

View File

@ -22,11 +22,18 @@
#define FDBCLIENT_BLOBGRANULECOMMON_H
#pragma once
#include <sstream>
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/FDBTypes.h"
#include "flow/BlobCipher.h"
#include "flow/EncryptUtils.h"
#include "flow/IRandom.h"
#include "flow/serialize.h"
#include <sstream>
#define BG_ENCRYPT_COMPRESS_DEBUG false
// file format of actual blob files
// FIXME: use VecSerStrategy::String serialization for this
struct GranuleSnapshot : VectorRef<KeyValueRef> {
@ -48,33 +55,165 @@ struct GranuleDeltas : VectorRef<MutationsAndVersionRef> {
}
};
struct BlobGranuleCipherKeysMeta {
EncryptCipherDomainId textDomainId;
EncryptCipherBaseKeyId textBaseCipherId;
EncryptCipherRandomSalt textSalt;
EncryptCipherDomainId headerDomainId;
EncryptCipherBaseKeyId headerBaseCipherId;
EncryptCipherRandomSalt headerSalt;
std::string ivStr;
BlobGranuleCipherKeysMeta() {}
BlobGranuleCipherKeysMeta(const EncryptCipherDomainId tDomainId,
const EncryptCipherBaseKeyId tBaseCipherId,
const EncryptCipherRandomSalt tSalt,
const EncryptCipherDomainId hDomainId,
const EncryptCipherBaseKeyId hBaseCipherId,
const EncryptCipherRandomSalt hSalt,
const std::string& iv)
: textDomainId(tDomainId), textBaseCipherId(tBaseCipherId), textSalt(tSalt), headerDomainId(hDomainId),
headerBaseCipherId(hBaseCipherId), headerSalt(hSalt), ivStr(iv) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, textDomainId, textBaseCipherId, textSalt, headerDomainId, headerBaseCipherId, headerSalt, ivStr);
}
};
struct BlobGranuleCipherKey {
constexpr static FileIdentifier file_identifier = 7274734;
EncryptCipherDomainId encryptDomainId;
EncryptCipherBaseKeyId baseCipherId;
EncryptCipherRandomSalt salt;
StringRef baseCipher;
static BlobGranuleCipherKey fromBlobCipherKey(Reference<BlobCipherKey> keyRef, Arena& arena) {
BlobGranuleCipherKey cipherKey;
cipherKey.encryptDomainId = keyRef->getDomainId();
cipherKey.baseCipherId = keyRef->getBaseCipherId();
cipherKey.salt = keyRef->getSalt();
cipherKey.baseCipher = makeString(keyRef->getBaseCipherLen(), arena);
memcpy(mutateString(cipherKey.baseCipher), keyRef->rawBaseCipher(), keyRef->getBaseCipherLen());
return cipherKey;
}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, encryptDomainId, baseCipherId, salt, baseCipher);
}
};
struct BlobGranuleCipherKeysCtx {
constexpr static FileIdentifier file_identifier = 1278718;
BlobGranuleCipherKey textCipherKey;
BlobGranuleCipherKey headerCipherKey;
StringRef ivRef;
static BlobGranuleCipherKeysMeta toCipherKeysMeta(const BlobGranuleCipherKeysCtx& ctx) {
return BlobGranuleCipherKeysMeta(ctx.textCipherKey.encryptDomainId,
ctx.textCipherKey.baseCipherId,
ctx.textCipherKey.salt,
ctx.headerCipherKey.encryptDomainId,
ctx.headerCipherKey.baseCipherId,
ctx.headerCipherKey.salt,
ctx.ivRef.toString());
}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, textCipherKey, headerCipherKey, ivRef);
}
};
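
Note: with ENCRYPTION_MODE defaulting to AES-256-CTR (per the ServerKnobs change above), this ctx carries everything a reader needs: text and header cipher keys plus ivRef. A hedged sketch of the raw cipher operation using OpenSSL's EVP API; using EVP directly here is an assumption for illustration, since FDB wraps this machinery in flow/BlobCipher:

#include <openssl/evp.h>
#include <cstdio>
#include <string>
#include <vector>

// Raw AES-256-CTR, standing in for what BlobCipher does with the text/header
// keys and ivRef. Key and IV sizes match the AES_256_KEY_LENGTH /
// AES_256_IV_LENGTH usage elsewhere in this change.
std::vector<unsigned char> aes256CtrEncrypt(const unsigned char key[32],
                                            const unsigned char iv[16],
                                            const std::string& plaintext) {
    std::vector<unsigned char> out(plaintext.size());
    EVP_CIPHER_CTX* ctx = EVP_CIPHER_CTX_new();
    int len = 0;
    EVP_EncryptInit_ex(ctx, EVP_aes_256_ctr(), nullptr, key, iv);
    EVP_EncryptUpdate(ctx, out.data(), &len,
                      reinterpret_cast<const unsigned char*>(plaintext.data()),
                      (int)plaintext.size());
    int total = len;
    EVP_EncryptFinal_ex(ctx, out.data() + total, &len); // no-op in CTR (no padding)
    EVP_CIPHER_CTX_free(ctx);
    return out;
}

int main() {
    unsigned char key[32] = {}; // in FDB these derive from baseCipher + salt
    unsigned char iv[16] = {};  // randomly generated, carried as ivRef
    auto ct = aes256CtrEncrypt(key, iv, "granule bytes");
    // CTR decryption is the same keystream XOR; re-encrypting recovers input.
    auto pt = aes256CtrEncrypt(key, iv, std::string(ct.begin(), ct.end()));
    std::printf("%s\n", std::string(pt.begin(), pt.end()).c_str());
    return 0;
}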
struct BlobGranuleFileEncryptionKeys {
Reference<BlobCipherKey> textCipherKey;
Reference<BlobCipherKey> headerCipherKey;
};
struct BlobGranuleCipherKeysMetaRef {
EncryptCipherDomainId textDomainId;
EncryptCipherBaseKeyId textBaseCipherId;
EncryptCipherRandomSalt textSalt;
EncryptCipherDomainId headerDomainId;
EncryptCipherBaseKeyId headerBaseCipherId;
EncryptCipherRandomSalt headerSalt;
StringRef ivRef;
BlobGranuleCipherKeysMetaRef() {}
BlobGranuleCipherKeysMetaRef(Arena& to,
const EncryptCipherDomainId tDomainId,
const EncryptCipherBaseKeyId tBaseCipherId,
const EncryptCipherRandomSalt tSalt,
const EncryptCipherDomainId hDomainId,
const EncryptCipherBaseKeyId hBaseCipherId,
const EncryptCipherRandomSalt hSalt,
const std::string& ivStr)
: textDomainId(tDomainId), textBaseCipherId(tBaseCipherId), textSalt(tSalt), headerDomainId(hDomainId),
headerBaseCipherId(hBaseCipherId), headerSalt(hSalt), ivRef(StringRef(to, ivStr)) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, textDomainId, textBaseCipherId, textSalt, headerDomainId, headerBaseCipherId, headerSalt, ivRef);
}
};
struct BlobFilePointerRef {
constexpr static FileIdentifier file_identifier = 5253554;
StringRef filename;
int64_t offset;
int64_t length;
int64_t fullFileLength;
Optional<BlobGranuleCipherKeysMetaRef> cipherKeysMetaRef;
BlobFilePointerRef() {}
BlobFilePointerRef(Arena& to, const std::string& filename, int64_t offset, int64_t length, int64_t fullFileLength)
: filename(to, filename), offset(offset), length(length), fullFileLength(fullFileLength) {}
BlobFilePointerRef(Arena& to,
const std::string& filename,
int64_t offset,
int64_t length,
int64_t fullFileLength,
Optional<BlobGranuleCipherKeysMeta> ciphKeysMeta)
: filename(to, filename), offset(offset), length(length), fullFileLength(fullFileLength) {
if (ciphKeysMeta.present()) {
cipherKeysMetaRef = BlobGranuleCipherKeysMetaRef(to,
ciphKeysMeta.get().textDomainId,
ciphKeysMeta.get().textBaseCipherId,
ciphKeysMeta.get().textSalt,
ciphKeysMeta.get().headerDomainId,
ciphKeysMeta.get().headerBaseCipherId,
ciphKeysMeta.get().headerSalt,
ciphKeysMeta.get().ivStr);
}
}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, filename, offset, length, fullFileLength);
serializer(ar, filename, offset, length, fullFileLength, cipherKeysMetaRef);
}
std::string toString() const {
std::stringstream ss;
ss << filename.toString() << ":" << offset << ":" << length << ":" << fullFileLength;
if (cipherKeysMetaRef.present()) {
ss << ":CipherKeysMeta:TextCipher:" << cipherKeysMetaRef.get().textDomainId << ":"
<< cipherKeysMetaRef.get().textBaseCipherId << ":" << cipherKeysMetaRef.get().textSalt
<< ":HeaderCipher:" << cipherKeysMetaRef.get().headerDomainId << ":"
<< cipherKeysMetaRef.get().headerBaseCipherId << ":" << cipherKeysMetaRef.get().headerSalt;
}
return std::move(ss).str();
}
};
// the assumption of this response is that the client will deserialize the files and apply the mutations themselves
// TODO could filter out delta files that don't intersect the key range being requested?
// TODO since client request passes version, we don't need to include the version of each mutation in the response if we
// pruned it there
// the assumption of this response is that the client will deserialize the files
// and apply the mutations themselves
// TODO could filter out delta files that don't intersect the key range being
// requested?
// TODO since client request passes version, we don't need to include the
// version of each mutation in the response if we pruned it there
struct BlobGranuleChunkRef {
constexpr static FileIdentifier file_identifier = 865198;
KeyRangeRef keyRange;
@ -84,10 +223,19 @@ struct BlobGranuleChunkRef {
VectorRef<BlobFilePointerRef> deltaFiles;
GranuleDeltas newDeltas;
Optional<KeyRef> tenantPrefix;
Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, keyRange, includedVersion, snapshotVersion, snapshotFile, deltaFiles, newDeltas, tenantPrefix);
serializer(ar,
keyRange,
includedVersion,
snapshotVersion,
snapshotFile,
deltaFiles,
newDeltas,
tenantPrefix,
cipherKeysCtx);
}
};

View File

@ -24,8 +24,12 @@
// This file contains functions for readers who want to materialize blob granules from the underlying files
#include "fdbclient/BlobGranuleCommon.h"
#include "flow/CompressionUtils.h"
Value serializeChunkedSnapshot(Standalone<GranuleSnapshot> snapshot, int chunks);
Value serializeChunkedSnapshot(Standalone<GranuleSnapshot> snapshot,
int chunks,
Optional<CompressionFilter> compressFilter,
Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx = Optional<BlobGranuleCipherKeysCtx>());
// FIXME: support sorted and chunked delta files

View File

@ -159,5 +159,9 @@ bool schemaMatch(json_spirit::mValue const& schema,
// storage nodes
ACTOR Future<Void> mgmtSnapCreate(Database cx, Standalone<StringRef> snapCmd, UID snapUID);
// Set and get the storage quota per tenant
void setStorageQuota(Transaction& tr, StringRef tenantName, uint64_t quota);
ACTOR Future<Optional<uint64_t>> getStorageQuota(Transaction* tr, StringRef tenantName);
#include "flow/unactorcompiler.h"
#endif

View File

@ -39,4 +39,6 @@ public:
T const& operator*() const { return *impl; }
T* operator->() { return impl.get(); }
T const* operator->() const { return impl.get(); }
T* get() { return impl.get(); }
T const* get() const { return impl.get(); }
};

View File

@ -863,6 +863,11 @@ public:
int SIM_KMS_MAX_KEYS;
int ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH;
bool ENABLE_TLOG_ENCRYPTION;
bool ENABLE_BLOB_GRANULE_ENCRYPTION;
// Compression
bool ENABLE_BLOB_GRANULE_COMPRESSION;
std::string BLOB_GRANULE_COMPRESSION_FILTER;
// Key Management Service (KMS) Connector
std::string KMS_CONNECTOR_TYPE;

View File

@ -25,6 +25,7 @@
// Functions and constants documenting the organization of the reserved keyspace in the database beginning with "\xFF"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbclient/BlobWorkerInterface.h" // TODO move the functions that depend on this out of here and into BlobWorkerInterface.h to remove this depdendency
#include "fdbclient/StorageServerInterface.h"
#include "Tenant.h"
@ -624,8 +625,14 @@ const Key blobGranuleFileKeyFor(UID granuleID, Version fileVersion, uint8_t file
std::tuple<UID, Version, uint8_t> decodeBlobGranuleFileKey(KeyRef const& key);
const KeyRange blobGranuleFileKeyRangeFor(UID granuleID);
const Value blobGranuleFileValueFor(StringRef const& filename, int64_t offset, int64_t length, int64_t fullFileLength);
std::tuple<Standalone<StringRef>, int64_t, int64_t, int64_t> decodeBlobGranuleFileValue(ValueRef const& value);
const Value blobGranuleFileValueFor(
StringRef const& filename,
int64_t offset,
int64_t length,
int64_t fullFileLength,
Optional<BlobGranuleCipherKeysMeta> cipherKeysMeta = Optional<BlobGranuleCipherKeysMeta>());
std::tuple<Standalone<StringRef>, int64_t, int64_t, int64_t, Optional<BlobGranuleCipherKeysMeta>>
decodeBlobGranuleFileValue(ValueRef const& value);
const Value blobGranulePurgeValueFor(Version version, KeyRange range, bool force);
std::tuple<Version, KeyRange, bool> decodeBlobGranulePurgeValue(ValueRef const& value);
@ -679,6 +686,12 @@ extern const KeyRef tenantMapPrivatePrefix;
extern const KeyRef tenantLastIdKey;
extern const KeyRef tenantDataPrefixKey;
// Storage quota per tenant
// "\xff/storageQuota/[[tenantName]]" := "[[quota]]"
extern const KeyRangeRef storageQuotaKeys;
extern const KeyRef storageQuotaPrefix;
Key storageQuotaKey(StringRef tenantName);
#pragma clang diagnostic pop
#endif

View File

@ -19,6 +19,7 @@
*/
#pragma once
#include "flow/IRandom.h"
#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_TENANT_MANAGEMENT_ACTOR_G_H)
#define FDBCLIENT_TENANT_MANAGEMENT_ACTOR_G_H
#include "fdbclient/TenantManagement.actor.g.h"
@ -135,6 +136,17 @@ Future<std::pair<TenantMapEntry, bool>> createTenantTransaction(Transaction tr,
return std::make_pair(newTenant, true);
}
ACTOR template <class Transaction>
Future<int64_t> getNextTenantId(Transaction tr) {
state typename transaction_future_type<Transaction, Optional<Value>>::type lastIdFuture = tr->get(tenantLastIdKey);
Optional<Value> lastIdVal = wait(safeThreadFutureToFuture(lastIdFuture));
int64_t tenantId = lastIdVal.present() ? TenantMapEntry::prefixToId(lastIdVal.get()) + 1 : 0;
if (BUGGIFY) {
tenantId += deterministicRandom()->randomSkewedUInt32(1, 1e9);
}
return tenantId;
}
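
Note: getNextTenantId returns lastId + 1 (or 0 for the first tenant), with BUGGIFY occasionally skewing the id forward so tests exercise sparse id spaces; the special-key-space path further down then allocates a whole batch of tenants from one base id. A toy model of the allocate-then-batch pattern over an in-memory map (MockDb is illustrative, and the uniform skew stands in for randomSkewedUInt32):

#include <cstdint>
#include <map>
#include <optional>
#include <random>
#include <string>
#include <vector>

struct MockDb {
    std::optional<int64_t> lastTenantId;
    std::map<std::string, int64_t> tenants;
};

// Read the last id and hand out lastId + 1, occasionally jumping ahead.
int64_t getNextTenantId(MockDb& db, std::mt19937_64& rng, bool buggify) {
    int64_t id = db.lastTenantId ? *db.lastTenantId + 1 : 0;
    if (buggify)
        id += std::uniform_int_distribution<int64_t>(1, 1'000'000'000)(rng);
    return id;
}

// Mirrors createTenants(): one base id, consecutive ids per tenant, then the
// high-water mark is written back once.
void createTenants(MockDb& db, const std::vector<std::string>& names) {
    std::mt19937_64 rng(7);
    int64_t nextId = getNextTenantId(db, rng, /*buggify=*/false);
    for (const auto& name : names)
        db.tenants[name] = nextId++;
    db.lastTenantId = nextId - 1;
}

int main() {
    MockDb db;
    createTenants(db, { "tenant1", "tenant2" });
    return db.tenants["tenant2"] == 1 ? 0 : 1;
}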
ACTOR template <class DB>
Future<TenantMapEntry> createTenant(Reference<DB> db, TenantName name) {
state Reference<typename DB::TransactionT> tr = db->createTransaction();
@ -144,7 +156,8 @@ Future<TenantMapEntry> createTenant(Reference<DB> db, TenantName name) {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
state typename DB::TransactionT::template FutureT<Optional<Value>> lastIdFuture = tr->get(tenantLastIdKey);
state Future<int64_t> tenantIdFuture = getNextTenantId(tr);
if (firstTry) {
Optional<TenantMapEntry> entry = wait(tryGetTenantTransaction(tr, name));
@ -155,8 +168,7 @@ Future<TenantMapEntry> createTenant(Reference<DB> db, TenantName name) {
firstTry = false;
}
Optional<Value> lastIdVal = wait(safeThreadFutureToFuture(lastIdFuture));
int64_t tenantId = lastIdVal.present() ? TenantMapEntry::prefixToId(lastIdVal.get()) + 1 : 0;
int64_t tenantId = wait(tenantIdFuture);
tr->set(tenantLastIdKey, TenantMapEntry::idToPrefix(tenantId));
state std::pair<TenantMapEntry, bool> newTenant = wait(createTenantTransaction(tr, name, tenantId));
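Two hedged notes on the allocation scheme above: ids only need to be monotonically increasing, not dense (the BUGGIFY skew deliberately leaves gaps so simulation exercises sparse id spaces), and uniqueness follows from optimistic concurrency, since every creator reads and rewrites tenantLastIdKey in the same transaction and therefore conflicts with any concurrent creator and retries. The claim pattern, in this file's own idiom:

	// Sketch: claim the next id and persist the new high-water mark in one transaction.
	state int64_t tenantId = wait(getNextTenantId(tr));
	tr->set(tenantLastIdKey, TenantMapEntry::idToPrefix(tenantId));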

View File

@ -31,6 +31,7 @@
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/SpecialKeySpace.actor.h"
#include "fdbclient/TenantManagement.actor.h"
#include "fdbclient/libb64/encode.h"
#include "flow/Arena.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // This must be the last #include.
@ -76,7 +77,19 @@ private:
for (auto tenant : tenants) {
json_spirit::mObject tenantEntry;
tenantEntry["id"] = tenant.second.id;
if (ryw->getDatabase()->apiVersionAtLeast(720)) {
json_spirit::mObject prefixObject;
std::string encodedPrefix = base64::encoder::from_string(tenant.second.prefix.toString());
// Remove trailing newline
encodedPrefix.resize(encodedPrefix.size() - 1);
prefixObject["base64"] = encodedPrefix;
prefixObject["printable"] = printable(tenant.second.prefix);
tenantEntry["prefix"] = prefixObject;
} else {
// This is not a standard encoding in JSON, and some libraries may not be able to easily decode it
tenantEntry["prefix"] = tenant.second.prefix.toString();
}
std::string tenantEntryString = json_spirit::write_string(json_spirit::mValue(tenantEntry));
ValueRef tenantEntryBytes(results->arena(), tenantEntryString);
results->push_back(results->arena(),
@ -108,16 +121,16 @@ private:
}
ACTOR static Future<Void> createTenants(ReadYourWritesTransaction* ryw, std::vector<TenantNameRef> tenants) {
Optional<Value> lastIdVal = wait(ryw->getTransaction().get(tenantLastIdKey));
int64_t previousId = lastIdVal.present() ? TenantMapEntry::prefixToId(lastIdVal.get()) : -1;
int64_t _nextId = wait(TenantAPI::getNextTenantId(&ryw->getTransaction()));
int64_t nextId = _nextId;
std::vector<Future<Void>> createFutures;
for (auto tenant : tenants) {
createFutures.push_back(
success(TenantAPI::createTenantTransaction(&ryw->getTransaction(), tenant, ++previousId)));
success(TenantAPI::createTenantTransaction(&ryw->getTransaction(), tenant, nextId++)));
}
ryw->getTransaction().set(tenantLastIdKey, TenantMapEntry::idToPrefix(previousId));
ryw->getTransaction().set(tenantLastIdKey, TenantMapEntry::idToPrefix(nextId - 1));
wait(waitForAll(createFutures));
return Void();
}
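For illustration, a tenant entry produced by the API-version-720 branch above would look roughly like this (id and prefix bytes invented for the example):

	{"id": 1, "prefix": {"base64": "FQE=", "printable": "\\x15\\x01"}}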

View File

@ -48,6 +48,14 @@ struct decoder {
delete[] code;
delete[] plaintext;
}
static std::string from_string(std::string s) {
std::stringstream in(s);
std::stringstream out;
decoder dec;
dec.decode(in, out);
return out.str();
}
};
} // namespace base64
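A small round-trip sketch of the decoder helper added above, paired with the encoder-side from_string used earlier in this commit (libb64's encoder appends a trailing newline, which callers strip):

	// Sketch; assumes base64::encoder::from_string as used in the special-key hunk above.
	std::string raw = "\x15\x01"; // example prefix bytes
	std::string enc = base64::encoder::from_string(raw);
	enc.resize(enc.size() - 1); // drop the trailing '\n' the encoder emits
	ASSERT(base64::decoder::from_string(enc) == raw);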

View File

@ -193,14 +193,14 @@ Future<Void> AsyncFileCached::changeFileSize(int64_t size) {
prevLength = size;
if (offsetInPage) {
TEST(true); // Truncating to the middle of a page
CODE_PROBE(true, "Truncating to the middle of a page");
auto p = pages.find(pageOffset);
if (p != pages.end()) {
auto f = p->second->flush();
if (!f.isReady() || f.isError())
actors.push_back(f);
} else {
TEST(true); // Truncating to the middle of a page that isn't in cache
CODE_PROBE(true, "Truncating to the middle of a page that isn't in cache");
}
pageOffset += pageCache->pageSize;
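A note on the pattern repeated through the rest of this commit: TEST carried its description in a trailing comment that external tooling had to scrape, while CODE_PROBE takes the description, and optionally annotations such as the probe::context::sim2 / probe::assert::simOnly seen in later hunks, as real macro arguments. A sketch with an invented placeholder condition:

	bool cond = deterministicRandom()->coinflip(); // placeholder condition for illustration
	CODE_PROBE(cond, "human-readable description of the covered case");
	CODE_PROBE(cond, "simulation-only case", probe::context::sim2, probe::assert::simOnly);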

View File

@ -25,7 +25,7 @@ add_flow_target(STATIC_LIBRARY NAME fdbrpc_sampling
SRCS ${FDBRPC_SRCS}
DISABLE_ACTOR_DIAGNOSTICS ${FDBRPC_SRCS_DISABLE_ACTOR_DIAGNOSTICS})
add_flow_target(LINK_TEST NAME fdbrpclinktest SRCS ${FDBRPC_SRCS} LinkTest.cpp DISABLE_ACTOR_DIAGNOSTICS ${FDBRPC_SRCS_DISABLE_ACTOR_DIAGNOSTICS})
add_flow_target(LINK_TEST NAME fdbrpclinktest SRCS LinkTest.cpp)
target_link_libraries(fdbrpclinktest PRIVATE fdbrpc rapidjson)
target_include_directories(fdbrpclinktest PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/libeio)

View File

@ -69,7 +69,7 @@ TEST_CASE("/flow/buggifiedDelay") {
});
wait(f1 && f2);
if (last == 1) {
TEST(true); // Delays can become ready out of order
CODE_PROBE(true, "Delays can become ready out of order");
return Void();
}
}

View File

@ -615,8 +615,8 @@ ACTOR Future<Void> connectionWriter(Reference<Peer> self, Reference<IConnection>
break;
}
TEST(true); // We didn't write everything, so apparently the write buffer is full. Wait for it to be
// nonfull.
CODE_PROBE(
true, "We didn't write everything, so apparently the write buffer is full. Wait for it to be nonfull");
wait(conn->onWritable());
wait(yield(TaskPriority::WriteSocket));
}
@ -1462,7 +1462,7 @@ ACTOR static Future<Void> connectionIncoming(TransportData* self, Reference<ICon
}
when(Reference<Peer> p = wait(onConnected.getFuture())) { p->onIncomingConnection(p, conn, reader); }
when(wait(delayJittered(FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT))) {
TEST(true); // Incoming connection timed out
CODE_PROBE(true, "Incoming connection timed out");
throw timed_out();
}
}
@ -1703,7 +1703,7 @@ void FlowTransport::addWellKnownEndpoint(Endpoint& endpoint, NetworkMessageRecei
}
static void sendLocal(TransportData* self, ISerializeSource const& what, const Endpoint& destination) {
TEST(true); // "Loopback" delivery
CODE_PROBE(true, "\"Loopback\" delivery");
// SOMEDAY: Would it be better to avoid (de)serialization by doing this check in flow?
Standalone<StringRef> copy;
@ -1742,7 +1742,7 @@ static ReliablePacket* sendPacket(TransportData* self,
// If there isn't an open connection, a public address, or the peer isn't compatible, we can't send
if (!peer || (peer->outgoingConnectionIdle && !destination.getPrimaryAddress().isPublic()) ||
(!peer->compatible && destination.token != Endpoint::wellKnownToken(WLTOKEN_PING_PACKET))) {
TEST(true); // Can't send to private address without a compatible open connection
CODE_PROBE(true, "Can't send to private address without a compatible open connection");
return nullptr;
}

View File

@ -116,7 +116,7 @@ public:
static Future<Void> deleteFile(std::string filename, bool mustBeDurable) {
::deleteFile(filename);
if (mustBeDurable) {
TEST(true); // deleteFile and fsync parent dir
CODE_PROBE(true, "deleteFile and fsync parent dir");
return async_fsync_parent(filename);
} else
return Void();

View File

@ -360,7 +360,7 @@ public:
//(e.g. to simulate power failure)
Future<Void> kill() {
TraceEvent("AsyncFileNonDurable_Kill", id).detail("Filename", filename);
TEST(true); // AsyncFileNonDurable was killed
CODE_PROBE(true, "AsyncFileNonDurable was killed");
return sync(this, false);
}
@ -404,7 +404,7 @@ private:
TraceEvent("AsyncFileNonDurable_KilledFileOperation", self->id)
.detail("In", context)
.detail("Filename", self->filename);
TEST(true); // AsyncFileNonDurable operation killed
CODE_PROBE(true, "AsyncFileNonDurable operation killed");
throw io_error().asInjectedFault();
}
@ -603,13 +603,13 @@ private:
.detail("HasGarbage", garbage)
.detail("Side", side)
.detail("Filename", self->filename);
TEST(true); // AsyncFileNonDurable bad write
CODE_PROBE(true, "AsyncFileNonDurable bad write");
} else {
TraceEvent("AsyncFileNonDurable_DroppedWrite", self->id)
.detail("Offset", offset + writeOffset + pageOffset)
.detail("Length", sectorLength)
.detail("Filename", self->filename);
TEST(true); // AsyncFileNonDurable dropped write
CODE_PROBE(true, "AsyncFileNonDurable dropped write");
}
pageOffset += sectorLength;
@ -689,7 +689,7 @@ private:
wait(self->file->truncate(size));
else {
TraceEvent("AsyncFileNonDurable_DroppedTruncate", self->id).detail("Size", size);
TEST(true); // AsyncFileNonDurable dropped truncate
CODE_PROBE(true, "AsyncFileNonDurable dropped truncate");
}
return Void();
@ -753,7 +753,7 @@ private:
// temporary file and then renamed to the correct location once sync is called. By not calling sync, we
// simulate a failure to fsync the directory storing the file
if (self->hasBeenSynced && writeDurable && deterministicRandom()->random01() < 0.5) {
TEST(true); // AsyncFileNonDurable kill was durable and synced
CODE_PROBE(true, "AsyncFileNonDurable kill was durable and synced");
wait(success(errorOr(self->file->sync())));
}

View File

@ -140,7 +140,7 @@ Future<Void> tssComparison(Req req,
tssData.metrics->recordLatency(req, srcEndTime - startTime, tssEndTime - startTime);
if (!TSS_doCompare(src.get(), tss.get().get())) {
TEST(true); // TSS Mismatch
CODE_PROBE(true, "TSS Mismatch");
state TraceEvent mismatchEvent(
(g_network->isSimulated() && g_simulator.tssMode == ISimulator::TSSMode::EnabledDropMutations)
? SevWarnAlways
@ -150,7 +150,7 @@ Future<Void> tssComparison(Req req,
mismatchEvent.detail("TSSID", tssData.tssId);
if (FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_VERIFY_SS && ssTeam->size() > 1) {
TEST(true); // checking TSS mismatch against rest of storage team
CODE_PROBE(true, "checking TSS mismatch against rest of storage team");
// if there is more than 1 SS in the team, attempt to verify that the other SS servers have the same
// data
@ -195,9 +195,9 @@ Future<Void> tssComparison(Req req,
if (tssData.metrics->shouldRecordDetailedMismatch()) {
TSS_traceMismatch(mismatchEvent, req, src.get(), tss.get().get());
TEST(FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL); // Tracing Full TSS Mismatch
TEST(!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL); // Tracing Partial TSS Mismatch and storing
// the rest in FDB
CODE_PROBE(FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL, "Tracing Full TSS Mismatch");
CODE_PROBE(!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL,
"Tracing Partial TSS Mismatch and storing the rest in FDB");
if (!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL) {
mismatchEvent.disable();
@ -268,7 +268,7 @@ struct RequestData : NonCopyable {
Optional<TSSEndpointData> tssData = model->getTssData(stream->getEndpoint().token.first());
if (tssData.present()) {
TEST(true); // duplicating request to TSS
CODE_PROBE(true, "duplicating request to TSS");
resetReply(request);
// FIXME: optimize to avoid creating new netNotifiedQueue for each message
RequestStream<Request, P> tssRequestStream(tssData.get().endpoint);

View File

@ -47,7 +47,7 @@ Future<REPLY_TYPE(Req)> retryBrokenPromise(RequestStream<Req, P> to, Req request
throw;
resetReply(request);
wait(delayJittered(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
TEST(true); // retryBrokenPromise
CODE_PROBE(true, "retryBrokenPromise");
}
}
}
@ -67,7 +67,7 @@ Future<REPLY_TYPE(Req)> retryBrokenPromise(RequestStream<Req, P> to, Req request
throw;
resetReply(request);
wait(delayJittered(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY, taskID));
TEST(true); // retryBrokenPromise with taskID
CODE_PROBE(true, "retryBrokenPromise with taskID");
}
}
}

View File

@ -75,10 +75,19 @@ bool simulator_should_inject_fault(const char* context, const char* file, int li
uint32_t h1 = line + (p->fault_injection_r >> 32);
if (h1 < p->fault_injection_p1 * std::numeric_limits<uint32_t>::max()) {
TEST(true); // A fault was injected
TEST(error_code == error_code_io_timeout); // An io timeout was injected
TEST(error_code == error_code_io_error); // An io error was injected
TEST(error_code == error_code_platform_error); // A platform error was injected.
CODE_PROBE(true, "A fault was injected", probe::assert::simOnly, probe::context::sim2);
CODE_PROBE(error_code == error_code_io_timeout,
"An io timeout was injected",
probe::assert::simOnly,
probe::context::sim2);
CODE_PROBE(error_code == error_code_io_error,
"An io error was injected",
probe::assert::simOnly,
probe::context::sim2);
CODE_PROBE(error_code == error_code_platform_error,
"A platform error was injected.",
probe::assert::simOnly,
probe::context::sim2);
TraceEvent(SevWarn, "FaultInjected")
.detail("Context", context)
.detail("File", file)
@ -426,7 +435,7 @@ private:
deterministicRandom()->random01() < .00001) {
g_simulator.lastConnectionFailure = now();
double a = deterministicRandom()->random01(), b = deterministicRandom()->random01();
TEST(true); // Simulated connection failure
CODE_PROBE(true, "Simulated connection failure", probe::context::sim2, probe::assert::simOnly);
TraceEvent("ConnectionFailure", dbgid)
.detail("MyAddr", process->address)
.detail("PeerAddr", peerProcess->address)
@ -1178,7 +1187,7 @@ public:
auto f = IAsyncFileSystem::filesystem(self->net2)->deleteFile(filename, false);
ASSERT(f.isReady());
wait(::delay(0.05 * deterministicRandom()->random01()));
TEST(true); // Simulated durable delete
CODE_PROBE(true, "Simulated durable delete", probe::context::sim2, probe::assert::simOnly);
}
wait(g_simulator.onProcess(currentProcess, currentTaskID));
return Void();
@ -1191,7 +1200,7 @@ public:
TraceEvent(SevDebug, "Sim2DeleteFileImplNonDurable")
.detail("Filename", filename)
.detail("Durable", mustBeDurable);
TEST(true); // Simulated non-durable delete
CODE_PROBE(true, "Simulated non-durable delete", probe::context::sim2, probe::assert::simOnly);
return Void();
}
}
@ -1587,10 +1596,20 @@ public:
killProcess_internal(p, KillInstantly);
}
void killProcess_internal(ProcessInfo* machine, KillType kt) {
TEST(true); // Simulated machine was killed with any kill type
TEST(kt == KillInstantly); // Simulated machine was killed instantly
TEST(kt == InjectFaults); // Simulated machine was killed with faults
TEST(kt == FailDisk); // Simulated machine was killed with a failed disk
CODE_PROBE(
true, "Simulated machine was killed with any kill type", probe::context::sim2, probe::assert::simOnly);
CODE_PROBE(kt == KillInstantly,
"Simulated machine was killed instantly",
probe::context::sim2,
probe::assert::simOnly);
CODE_PROBE(kt == InjectFaults,
"Simulated machine was killed with faults",
probe::context::sim2,
probe::assert::simOnly);
CODE_PROBE(kt == FailDisk,
"Simulated machine was killed with a failed disk",
probe::context::sim2,
probe::assert::simOnly);
if (kt == KillInstantly) {
TraceEvent(SevWarn, "FailMachine")
@ -1715,9 +1734,10 @@ public:
KillType* ktFinal) override {
auto ktOrig = kt;
TEST(true); // Trying to killing a machine
TEST(kt == KillInstantly); // Trying to kill instantly
TEST(kt == InjectFaults); // Trying to kill by injecting faults
CODE_PROBE(true, "Trying to killing a machine", probe::context::sim2, probe::assert::simOnly);
CODE_PROBE(kt == KillInstantly, "Trying to kill instantly", probe::context::sim2, probe::assert::simOnly);
CODE_PROBE(
kt == InjectFaults, "Trying to kill by injecting faults", probe::context::sim2, probe::assert::simOnly);
if (speedUpSimulation && !forceKill) {
TraceEvent(SevWarn, "AbortedKill")
@ -1851,11 +1871,17 @@ public:
}
}
TEST(originalKt != kt); // Kill type was changed from requested to reboot.
CODE_PROBE(originalKt != kt,
"Kill type was changed from requested to reboot.",
probe::context::sim2,
probe::assert::simOnly);
// Check if any processes on machine are rebooting
if (processesOnMachine != processesPerMachine && kt >= RebootAndDelete) {
TEST(true); // Attempted reboot, but the target did not have all of its processes running
CODE_PROBE(true,
"Attempted reboot, but the target did not have all of its processes running",
probe::context::sim2,
probe::assert::simOnly);
TraceEvent(SevWarn, "AbortedKill")
.detail("KillType", kt)
.detail("MachineId", machineId)
@ -1870,7 +1896,10 @@ public:
// Check if any processes on machine are rebooting
if (processesOnMachine != processesPerMachine) {
TEST(true); // Attempted reboot and kill, but the target did not have all of its processes running
CODE_PROBE(true,
"Attempted reboot and kill, but the target did not have all of its processes running",
probe::context::sim2,
probe::assert::simOnly);
TraceEvent(SevWarn, "AbortedKill")
.detail("KillType", kt)
.detail("MachineId", machineId)
@ -1920,10 +1949,12 @@ public:
}
}
TEST(kt == RebootAndDelete); // Resulted in a reboot and delete
TEST(kt == Reboot); // Resulted in a reboot
TEST(kt == KillInstantly); // Resulted in an instant kill
TEST(kt == InjectFaults); // Resulted in a kill by injecting faults
CODE_PROBE(
kt == RebootAndDelete, "Resulted in a reboot and delete", probe::context::sim2, probe::assert::simOnly);
CODE_PROBE(kt == Reboot, "Resulted in a reboot", probe::context::sim2, probe::assert::simOnly);
CODE_PROBE(kt == KillInstantly, "Resulted in an instant kill", probe::context::sim2, probe::assert::simOnly);
CODE_PROBE(
kt == InjectFaults, "Resulted in a kill by injecting faults", probe::context::sim2, probe::assert::simOnly);
if (ktFinal)
*ktFinal = kt;
@ -2037,13 +2068,32 @@ public:
.detail("KillTypeMin", ktMin)
.detail("KilledDC", kt == ktMin);
TEST(kt != ktMin); // DataCenter kill was rejected by killMachine
TEST((kt == ktMin) && (kt == RebootAndDelete)); // Datacenter kill Resulted in a reboot and delete
TEST((kt == ktMin) && (kt == Reboot)); // Datacenter kill Resulted in a reboot
TEST((kt == ktMin) && (kt == KillInstantly)); // Datacenter kill Resulted in an instant kill
TEST((kt == ktMin) && (kt == InjectFaults)); // Datacenter kill Resulted in a kill by injecting faults
TEST((kt == ktMin) && (kt != ktOrig)); // Datacenter Kill request was downgraded
TEST((kt == ktMin) && (kt == ktOrig)); // Datacenter kill - Requested kill was done
CODE_PROBE(
kt != ktMin, "DataCenter kill was rejected by killMachine", probe::context::sim2, probe::assert::simOnly);
CODE_PROBE((kt == ktMin) && (kt == RebootAndDelete),
"Datacenter kill Resulted in a reboot and delete",
probe::context::sim2,
probe::assert::simOnly);
CODE_PROBE((kt == ktMin) && (kt == Reboot),
"Datacenter kill Resulted in a reboot",
probe::context::sim2,
probe::assert::simOnly);
CODE_PROBE((kt == ktMin) && (kt == KillInstantly),
"Datacenter kill Resulted in an instant kill",
probe::context::sim2,
probe::assert::simOnly);
CODE_PROBE((kt == ktMin) && (kt == InjectFaults),
"Datacenter kill Resulted in a kill by injecting faults",
probe::context::sim2,
probe::assert::simOnly);
CODE_PROBE((kt == ktMin) && (kt != ktOrig),
"Datacenter Kill request was downgraded",
probe::context::sim2,
probe::assert::simOnly);
CODE_PROBE((kt == ktMin) && (kt == ktOrig),
"Datacenter kill - Requested kill was done",
probe::context::sim2,
probe::assert::simOnly);
if (ktFinal)
*ktFinal = ktMin;
@ -2276,7 +2326,7 @@ class UDPSimSocket : public IUDPSocket, ReferenceCounted<UDPSimSocket> {
NetworkAddress _localAddress;
bool randomDropPacket() {
auto res = deterministicRandom()->random01() < .000001;
TEST(res); // UDP packet drop
CODE_PROBE(res, "UDP packet drop", probe::context::sim2, probe::assert::simOnly);
return res;
}
@ -2485,12 +2535,20 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
ASSERT(kt == ISimulator::RebootProcess || kt == ISimulator::Reboot || kt == ISimulator::RebootAndDelete ||
kt == ISimulator::RebootProcessAndDelete);
TEST(kt == ISimulator::RebootProcess); // Simulated process rebooted
TEST(kt == ISimulator::Reboot); // Simulated machine rebooted
TEST(kt == ISimulator::RebootAndDelete); // Simulated machine rebooted with data and coordination state deletion
TEST(
kt ==
ISimulator::RebootProcessAndDelete); // Simulated process rebooted with data and coordination state deletion
CODE_PROBE(kt == ISimulator::RebootProcess,
"Simulated process rebooted",
probe::assert::simOnly,
probe::context::sim2);
CODE_PROBE(
kt == ISimulator::Reboot, "Simulated machine rebooted", probe::assert::simOnly, probe::context::sim2);
CODE_PROBE(kt == ISimulator::RebootAndDelete,
"Simulated machine rebooted with data and coordination state deletion",
probe::assert::simOnly,
probe::context::sim2);
CODE_PROBE(kt == ISimulator::RebootProcessAndDelete,
"Simulated process rebooted with data and coordination state deletion",
probe::assert::simOnly,
probe::context::sim2);
if (p->rebooting || !p->isReliable()) {
TraceEvent(SevDebug, "DoRebootFailed")

View File

@ -624,7 +624,7 @@ private:
if (!initialCommit)
txnStateStore->set(KeyValueRef(m.param1, m.param2));
confChange = true;
TEST(true); // Recovering at a higher version.
CODE_PROBE(true, "Recovering at a higher version.");
}
void checkSetVersionEpochKey(MutationRef m) {
@ -636,7 +636,7 @@ private:
if (!initialCommit)
txnStateStore->set(KeyValueRef(m.param1, m.param2));
confChange = true;
TEST(true); // Setting version epoch
CODE_PROBE(true, "Setting version epoch");
}
void checkSetWriteRecoverKey(MutationRef m) {
@ -646,7 +646,7 @@ private:
TraceEvent("WriteRecoveryKeySet", dbgid).log();
if (!initialCommit)
txnStateStore->set(KeyValueRef(m.param1, m.param2));
TEST(true); // Snapshot created, setting writeRecoveryKey in txnStateStore
CODE_PROBE(true, "Snapshot created, setting writeRecoveryKey in txnStateStore");
}
void checkSetTenantMapPrefix(MutationRef m) {
@ -680,7 +680,7 @@ private:
writeMutation(privatized);
}
TEST(true); // Tenant added to map
CODE_PROBE(true, "Tenant added to map");
}
}
@ -1068,7 +1068,7 @@ private:
writeMutation(privatized);
}
TEST(true); // Tenant cleared from map
CODE_PROBE(true, "Tenant cleared from map");
}
}

View File

@ -72,7 +72,7 @@ struct VersionedMessage {
if (reader.protocolVersion().hasSpanContext() && SpanContextMessage::isNextIn(reader))
return false;
if (reader.protocolVersion().hasOTELSpanContext() && OTELSpanContextMessage::isNextIn(reader)) {
TEST(true); // Returning false for OTELSpanContextMessage
CODE_PROBE(true, "Returning false for OTELSpanContextMessage");
return false;
}
if (EncryptedMutationMessage::isNextIn(reader)) {

View File

@ -30,6 +30,18 @@
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // has to be last include
// serialize change feed key as UID bytes, to use 16 bytes on disk
Key granuleIDToCFKey(UID granuleID) {
BinaryWriter wr(Unversioned());
wr << granuleID;
return wr.toValue();
}
// parse change feed key back to UID, to be human-readable
UID cfKeyToGranuleID(Key cfKey) {
return BinaryReader::fromStringRef<UID>(cfKey, Unversioned());
}
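A round-trip sketch of the pair above; UID serialization is fixed-width, so the resulting key is always 16 bytes:

	// Sketch: granule id <-> change feed key round trip.
	UID gid = deterministicRandom()->randomUniqueID();
	ASSERT(cfKeyToGranuleID(granuleIDToCFKey(gid)) == gid);
	ASSERT(granuleIDToCFKey(gid).size() == 16);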
// Gets the latest granule history node for range that was persisted
ACTOR Future<Optional<GranuleHistory>> getLatestGranuleHistory(Transaction* tr, KeyRange range) {
state KeyRange historyRange = blobGranuleHistoryKeyRangeFor(range);
@ -62,13 +74,14 @@ ACTOR Future<Void> readGranuleFiles(Transaction* tr, Key* startKey, Key endKey,
int64_t offset;
int64_t length;
int64_t fullFileLength;
Optional<BlobGranuleCipherKeysMeta> cipherKeysMeta;
std::tie(gid, version, fileType) = decodeBlobGranuleFileKey(it.key);
ASSERT(gid == granuleID);
std::tie(filename, offset, length, fullFileLength) = decodeBlobGranuleFileValue(it.value);
std::tie(filename, offset, length, fullFileLength, cipherKeysMeta) = decodeBlobGranuleFileValue(it.value);
BlobFileIndex idx(version, filename.toString(), offset, length, fullFileLength);
BlobFileIndex idx(version, filename.toString(), offset, length, fullFileLength, cipherKeysMeta);
if (fileType == 'S') {
ASSERT(files->snapshotFiles.empty() || files->snapshotFiles.back().version < idx.version);
files->snapshotFiles.push_back(idx);
@ -170,8 +183,12 @@ void GranuleFiles::getFiles(Version beginVersion,
Version lastIncluded = invalidVersion;
if (snapshotF != snapshotFiles.end()) {
chunk.snapshotVersion = snapshotF->version;
chunk.snapshotFile = BlobFilePointerRef(
replyArena, snapshotF->filename, snapshotF->offset, snapshotF->length, snapshotF->fullFileLength);
chunk.snapshotFile = BlobFilePointerRef(replyArena,
snapshotF->filename,
snapshotF->offset,
snapshotF->length,
snapshotF->fullFileLength,
snapshotF->cipherKeysMeta);
lastIncluded = chunk.snapshotVersion;
} else {
chunk.snapshotVersion = invalidVersion;

View File

@ -176,7 +176,7 @@ ACTOR Future<Void> clearAndAwaitMerge(Database cx, KeyRange range) {
if (ranges.size() == 1) {
return Void();
}
TEST(true); // clearAndAwaitMerge doing clear
CODE_PROBE(true, "ClearAndAwaitMerge doing clear");
tr.clear(range);
wait(tr.commit());

View File

@ -459,7 +459,7 @@ ACTOR Future<Standalone<VectorRef<KeyRef>>> splitRange(Reference<BlobManagerData
if (writeHot) {
splitThreshold /= 3;
}
TEST(writeHot); // Change feed write hot split
CODE_PROBE(writeHot, "Change feed write hot split");
if (estimated.bytes > splitThreshold) {
// only split on bytes and write rate
state StorageMetrics splitMetrics;
@ -495,7 +495,7 @@ ACTOR Future<Standalone<VectorRef<KeyRef>>> splitRange(Reference<BlobManagerData
ASSERT(keys.back() == range.end);
return keys;
} else {
TEST(writeHot); // Not splitting write-hot because granules would be too small
CODE_PROBE(writeHot, "Not splitting write-hot because granules would be too small");
if (BM_DEBUG) {
printf("Not splitting range\n");
}
@ -527,7 +527,7 @@ ACTOR Future<Standalone<VectorRef<KeyRef>>> splitRange(Reference<BlobManagerData
ACTOR Future<UID> pickWorkerForAssign(Reference<BlobManagerData> bmData) {
// wait until there are BWs to pick from
while (bmData->workerStats.size() == 0) {
TEST(true); // BM wants to assign range, but no workers available
CODE_PROBE(true, "BM wants to assign range, but no workers available");
if (BM_DEBUG) {
fmt::print("BM {0} waiting for blob workers before assigning granules\n", bmData->epoch);
}
@ -685,7 +685,7 @@ ACTOR Future<Void> doRangeAssignment(Reference<BlobManagerData> bmData,
throw;
}
TEST(true); // BM retrying range assign
CODE_PROBE(true, "BM retrying range assign");
// We use reliable delivery (getReply), so the broken_promise means the worker is dead, and we may need to retry
// somewhere else
@ -749,7 +749,7 @@ static bool handleRangeIsAssign(Reference<BlobManagerData> bmData, RangeAssignme
if (assignment.assign.get().type == AssignRequestType::Continue) {
ASSERT(assignment.worker.present());
if (i.range() != assignment.keyRange || i.cvalue() != assignment.worker.get()) {
TEST(true); // BM assignment out of date
CODE_PROBE(true, "BM assignment out of date");
if (BM_DEBUG) {
fmt::print("Out of date re-assign for ({0}, {1}). Assignment must have changed while "
"checking split.\n Reassign: [{2} - {3}): {4}\n Existing: [{5} - {6}): {7}\n",
@ -880,7 +880,7 @@ ACTOR Future<Void> writeInitialGranuleMapping(Reference<BlobManagerData> bmData,
state int i = 0;
state int transactionChunkSize = BUGGIFY ? deterministicRandom()->randomInt(2, 5) : 1000;
while (i < boundaries.size() - 1) {
TEST(i > 0); // multiple transactions for large granule split
CODE_PROBE(i > 0, "multiple transactions for large granule split");
tr->reset();
state int j = 0;
loop {
@ -1176,7 +1176,7 @@ ACTOR Future<Void> maybeSplitRange(Reference<BlobManagerData> bmData,
// Enforce max split fanout for performance reasons. This mainly happens when a blob worker is behind.
if (newRanges.size() >=
SERVER_KNOBS->BG_MAX_SPLIT_FANOUT + 2) { // +2 because these are boundaries, so N keys would have N+1 boundaries.
TEST(true); // downsampling granule split because fanout too high
CODE_PROBE(true, "downsampling granule split because fanout too high");
Standalone<VectorRef<KeyRef>> coalescedRanges;
coalescedRanges.arena().dependsOn(newRanges.arena());
coalescedRanges.push_back(coalescedRanges.arena(), newRanges.front());
@ -1250,7 +1250,7 @@ ACTOR Future<Void> maybeSplitRange(Reference<BlobManagerData> bmData,
if (!existingState.empty()) {
// Something was previously committed, we must go with that decision.
// Read its boundaries and override our planned split boundaries
TEST(true); // Overriding split ranges with existing ones from DB
CODE_PROBE(true, "Overriding split ranges with existing ones from DB");
RangeResult existingBoundaries =
wait(tr->getRange(KeyRangeRef(granuleRange.begin.withPrefix(blobGranuleMappingKeys.begin),
keyAfter(granuleRange.end).withPrefix(blobGranuleMappingKeys.begin)),
@ -1628,7 +1628,7 @@ ACTOR Future<Void> persistMergeGranulesDone(Reference<BlobManagerData> bmData,
}
}
if (tmpWorkerId == UID()) {
TEST(true); // All workers dead right now
CODE_PROBE(true, "All workers dead right now");
while (bmData->workersById.empty()) {
wait(bmData->recruitingStream.onChange() || bmData->foundBlobWorkers.getFuture());
}
@ -1699,7 +1699,7 @@ ACTOR Future<Void> persistMergeGranulesDone(Reference<BlobManagerData> bmData,
mergeVersion,
tr->getCommittedVersion());
}
TEST(true); // Granule merge complete
CODE_PROBE(true, "Granule merge complete");
return Void();
} catch (Error& e) {
wait(tr->onError(e));
@ -1807,7 +1807,7 @@ static void attemptStartMerge(Reference<BlobManagerData> bmData,
auto reCheckMergeCandidates = bmData->mergeCandidates.intersectingRanges(mergeRange);
for (auto it : reCheckMergeCandidates) {
if (!it->cvalue().canMergeNow()) {
TEST(true); // granule no longer merge candidate after checking metrics, aborting merge
CODE_PROBE(true, " granule no longer merge candidate after checking metrics, aborting merge");
return;
}
}
@ -1819,7 +1819,7 @@ static void attemptStartMerge(Reference<BlobManagerData> bmData,
mergeRange.end.printable(),
toMerge.size());
}
TEST(true); // Doing granule merge!
CODE_PROBE(true, "Doing granule merge");
bmData->activeGranuleMerges.insert(mergeRange, 0);
bmData->clearMergeCandidate(mergeRange, MergeCandidateMerging);
// Now, after setting activeGranuleMerges, we have committed to doing the merge, so any subsequent split eval for
@ -1836,7 +1836,7 @@ ACTOR Future<Void> attemptMerges(Reference<BlobManagerData> bmData,
for (int i = 0; i < candidates.size() - 1; i++) {
ASSERT(std::get<1>(candidates[i]).end == std::get<1>(candidates[i + 1]).begin);
}
TEST(true); // Candidate ranges to merge
CODE_PROBE(true, "Candidate ranges to merge");
wait(bmData->concurrentMergeChecks.take());
state FlowLock::Releaser holdingDVL(bmData->concurrentMergeChecks);
@ -1868,7 +1868,7 @@ ACTOR Future<Void> attemptMerges(Reference<BlobManagerData> bmData,
currentBytes + metrics.bytes > SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES ||
currentKeySumBytes >= CLIENT_KNOBS->VALUE_SIZE_LIMIT / 2) {
ASSERT(currentBytes <= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES);
TEST(currentKeySumBytes >= CLIENT_KNOBS->VALUE_SIZE_LIMIT / 2); // merge early because of key size
CODE_PROBE(currentKeySumBytes >= CLIENT_KNOBS->VALUE_SIZE_LIMIT / 2, "merge early because of key size");
attemptStartMerge(bmData, currentCandidates);
currentCandidates.clear();
currentBytes = 0;
@ -1935,7 +1935,7 @@ ACTOR Future<Void> granuleMergeChecker(Reference<BlobManagerData> bmData) {
mergeChecks.push_back(attemptMerges(bmData, currentCandidates));
}
TEST(mergeChecks.size() > 1); // parallel merge checks
CODE_PROBE(mergeChecks.size() > 1, "parallel merge checks");
wait(waitForAll(mergeChecks));
// if the calculation took longer than the desired interval, still wait a bit
wait(intervalDelay && delay(5.0));
@ -2130,7 +2130,7 @@ ACTOR Future<Void> monitorBlobWorkerStatus(Reference<BlobManagerData> bmData, Bl
}
ignore = true;
} else if (newEval < lastBoundaryEval.cvalue()) {
TEST(true); // BM got out-of-date split request
CODE_PROBE(true, "BM got out-of-date split request");
if (BM_DEBUG) {
fmt::print("BM {0} ignoring status from BW {1} for granule [{2} - {3}) {4} since it "
"already processed [{5} - {6}) {7}.\n",
@ -2206,7 +2206,7 @@ ACTOR Future<Void> monitorBlobWorkerStatus(Reference<BlobManagerData> bmData, Bl
// suddenly gets a burst of writes after a decision to merge is made
if (inProgressMergeVersion != invalidVersion) {
if (rep.blockedVersion < inProgressMergeVersion) {
TEST(true); // merge blocking re-snapshot
CODE_PROBE(true, "merge blocking re-snapshot");
if (BM_DEBUG) {
fmt::print("DBG: BM {0} MERGE @ {1} blocking re-snapshot [{2} - {3}) @ {4}, "
"continuing snapshot\n",
@ -2272,6 +2272,7 @@ ACTOR Future<Void> monitorBlobWorkerStatus(Reference<BlobManagerData> bmData, Bl
if (rep.mergeCandidate && !ignore) {
// mark granule as merge candidate
ASSERT(!rep.doSplit);
CODE_PROBE(true, "Granule merge candidate");
if (BM_DEBUG) {
fmt::print("Manager {0} merge candidate granule [{1} - {2}) {3}\n",
bmData->epoch,
@ -2307,7 +2308,7 @@ ACTOR Future<Void> monitorBlobWorkerStatus(Reference<BlobManagerData> bmData, Bl
// if it is permanent, the failure monitor will eventually trip.
ASSERT(e.code() != error_code_end_of_stream);
if (e.code() == error_code_request_maybe_delivered || e.code() == error_code_connection_failed) {
TEST(true); // BM retrying BW monitoring
CODE_PROBE(true, "BM retrying BW monitoring");
wait(delay(backoff));
backoff = std::min(backoff * SERVER_KNOBS->BLOB_MANAGER_STATUS_EXP_BACKOFF_EXPONENT,
SERVER_KNOBS->BLOB_MANAGER_STATUS_EXP_BACKOFF_MAX);
@ -2445,7 +2446,7 @@ static void addAssignment(KeyRangeMap<std::tuple<UID, int64_t, int64_t>>& map,
if (oldEpoch > newEpoch || (oldEpoch == newEpoch && oldSeqno > newSeqno)) {
newer.push_back(std::pair(old.range(), std::tuple(oldWorker, oldEpoch, oldSeqno)));
if (old.range() != newRange) {
TEST(true); // BM Recovery: BWs disagree on range boundaries
CODE_PROBE(true, "BM Recovery: BWs disagree on range boundaries");
anyConflicts = true;
}
} else {
@ -2455,7 +2456,7 @@ static void addAssignment(KeyRangeMap<std::tuple<UID, int64_t, int64_t>>& map,
ASSERT(oldEpoch != newEpoch || oldSeqno != newSeqno);
}
if (newEpoch == std::numeric_limits<int64_t>::max() && (oldWorker != newId || old.range() != newRange)) {
TEST(true); // BM Recovery: DB disagrees with workers
CODE_PROBE(true, "BM Recovery: DB disagrees with workers");
// new one is from DB (source of truth on boundaries) and existing mapping disagrees on boundary or
// assignment, do explicit revoke and re-assign to converge
anyConflicts = true;
@ -2479,7 +2480,7 @@ static void addAssignment(KeyRangeMap<std::tuple<UID, int64_t, int64_t>>& map,
std::get<0>(old.value()) = UID();
}
if (outOfDate.empty() || outOfDate.back() != std::pair(oldWorker, KeyRange(old.range()))) {
TEST(true); // BM Recovery: Two workers claim ownership of same granule
CODE_PROBE(true, "BM Recovery: Two workers claim ownership of same granule");
outOfDate.push_back(std::pair(oldWorker, old.range()));
}
}
@ -2519,7 +2520,7 @@ ACTOR Future<Void> resumeActiveMerges(Reference<BlobManagerData> bmData) {
RangeResult result = wait(tr->getRange(currentRange, rowLimit));
for (auto& it : result) {
TEST(true); // Blob Manager Recovery found merging granule
CODE_PROBE(true, "Blob Manager Recovery found merging granule");
UID mergeGranuleID = decodeBlobGranuleMergeKey(it.key);
KeyRange mergeRange;
std::vector<UID> parentGranuleIDs;
@ -2586,7 +2587,7 @@ ACTOR Future<Void> recoverBlobManager(Reference<BlobManagerData> bmData) {
state Future<Void> resumeMergesFuture = resumeActiveMerges(bmData);
TEST(true); // BM doing recovery
CODE_PROBE(true, "BM doing recovery");
wait(delay(0));
@ -2667,7 +2668,7 @@ ACTOR Future<Void> recoverBlobManager(Reference<BlobManagerData> bmData) {
bmData->workerStats[workerId].numGranulesAssigned = reply.get().assignments.size();
}
} else {
TEST(true); // BM Recovery: BW didn't respond to assignments request
CODE_PROBE(true, "BM Recovery: BW didn't respond to assignments request");
// SOMEDAY: mark as failed and kill it
if (BM_DEBUG) {
fmt::print(" Worker {}: failed\n", workerId.toString().substr(0, 5));
@ -2771,7 +2772,7 @@ ACTOR Future<Void> recoverBlobManager(Reference<BlobManagerData> bmData) {
}
// revoke assignments that are old and incorrect
TEST(!outOfDateAssignments.empty()); // BM resolved conflicting assignments on recovery
CODE_PROBE(!outOfDateAssignments.empty(), "BM resolved conflicting assignments on recovery");
for (auto& it : outOfDateAssignments) {
if (BM_DEBUG) {
fmt::print("BM {0} revoking out of date assignment [{1} - {2}): {3}:\n",
@ -2841,7 +2842,7 @@ ACTOR Future<Void> chaosRangeMover(Reference<BlobManagerData> bmData) {
// KeyRange isn't hashable and this is only for simulation, so just use toString of range
state std::unordered_set<std::string> alreadyMoved;
ASSERT(g_network->isSimulated());
TEST(true); // BM chaos range mover enabled
CODE_PROBE(true, "BM chaos range mover enabled");
loop {
wait(delay(30.0));
@ -2945,7 +2946,7 @@ ACTOR Future<Void> initializeBlobWorker(Reference<BlobManagerData> self, Recruit
// if it failed in an expected way, add some delay before we try to recruit again
// on this worker
if (newBlobWorker.isError()) {
TEST(true); // BM got error recruiting BW
CODE_PROBE(true, "BM got error recruiting BW");
TraceEvent(SevWarn, "BMRecruitmentError", self->id)
.error(newBlobWorker.getError())
.detail("Epoch", self->epoch);
@ -3049,7 +3050,7 @@ ACTOR Future<Void> blobWorkerRecruiter(
if (e.code() != error_code_timed_out) {
throw;
}
TEST(true); // Blob worker recruitment timed out
CODE_PROBE(true, "Blob worker recruitment timed out");
}
}
}
@ -3737,7 +3738,7 @@ ACTOR Future<Void> doLockChecks(Reference<BlobManagerData> bmData) {
wait(check.getFuture());
wait(delay(0.5)); // don't do this too often if there is a lot of conflict
TEST(true); // BM doing lock checks after getting conflicts
CODE_PROBE(true, "BM doing lock checks after getting conflicts");
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(bmData->db);

View File

@ -18,12 +18,7 @@
* limitations under the License.
*/
#include <limits>
#include <tuple>
#include <utility>
#include <vector>
#include "fmt/format.h"
#include "fdbclient/ClientBooleanParams.h"
#include "fdbclient/BlobGranuleFiles.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/KeyRangeMap.h"
@ -40,18 +35,30 @@
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/Notified.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/BlobGranuleServerCommon.actor.h"
#include "fdbserver/GetEncryptCipherKeys.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/MutationTracking.h"
#include "fdbserver/WaitFailure.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/WaitFailure.h"
#include "flow/Arena.h"
#include "flow/BlobCipher.h"
#include "flow/CompressionUtils.h"
#include "flow/EncryptUtils.h"
#include "flow/Error.h"
#include "flow/IRandom.h"
#include "flow/Trace.h"
#include "flow/flow.h"
#include "flow/IRandom.h"
#include "flow/network.h"
#include "flow/Trace.h"
#include "flow/xxhash.h"
#include "fmt/format.h"
#include <limits>
#include <tuple>
#include <utility>
#include <vector>
#include "flow/actorcompiler.h" // has to be last include
@ -61,8 +68,8 @@
/*
* The Blob Worker is a stateless role assigned a set of granules by the Blob Manager.
* It is responsible for managing the change feeds for those granules, and for consuming the mutations from those change
* feeds and writing them out as files to blob storage.
* It is responsible for managing the change feeds for those granules, and for consuming the mutations from
* those change feeds and writing them out as files to blob storage.
*/
struct GranuleStartState {
@ -182,6 +189,7 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
Reference<BlobConnectionProvider> bstore;
KeyRangeMap<GranuleRangeMetadata> granuleMetadata;
BGTenantMap tenantData;
Reference<AsyncVar<ServerDBInfo> const> dbInfo;
// contains the history of completed granules before the existing ones. Maps to the latest one, and has
// back-pointers to earlier granules
@ -199,8 +207,8 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
int changeFeedStreamReplyBufferSize = SERVER_KNOBS->BG_DELTA_FILE_TARGET_BYTES / 2;
BlobWorkerData(UID id, Reference<AsyncVar<ServerDBInfo> const> dbInfo, Database db)
: id(id), db(db), stats(id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL), tenantData(BGTenantMap(dbInfo)),
BlobWorkerData(UID id, Reference<AsyncVar<ServerDBInfo> const> dbInf, Database db)
: id(id), db(db), stats(id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL), tenantData(BGTenantMap(dbInf)), dbInfo(dbInf),
initialSnapshotLock(SERVER_KNOBS->BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM) {}
bool managerEpochOk(int64_t epoch) {
@ -225,8 +233,23 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
}
};
namespace {
bool isBlobFileEncryptionSupported() {
bool supported = SERVER_KNOBS->ENABLE_BLOB_GRANULE_ENCRYPTION && SERVER_KNOBS->BG_RANGE_SOURCE == "tenant";
ASSERT((supported && SERVER_KNOBS->ENABLE_ENCRYPTION) || !supported);
return supported;
}
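The ASSERT above encodes an implication ("blob file encryption supported implies cluster-wide encryption is enabled"); an equivalent and arguably more direct formulation:

	// supported => ENABLE_ENCRYPTION
	ASSERT(!supported || SERVER_KNOBS->ENABLE_ENCRYPTION);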
Optional<CompressionFilter> getBlobFileCompressFilter() {
Optional<CompressionFilter> compFilter;
if (SERVER_KNOBS->ENABLE_BLOB_GRANULE_COMPRESSION) {
compFilter = CompressionUtils::fromFilterString(SERVER_KNOBS->BLOB_GRANULE_COMPRESSION_FILTER);
}
return compFilter;
}
// throws if the lock cannot be acquired
static void acquireGranuleLock(int64_t epoch, int64_t seqno, int64_t prevOwnerEpoch, int64_t prevOwnerSeqno) {
void acquireGranuleLock(int64_t epoch, int64_t seqno, int64_t prevOwnerEpoch, int64_t prevOwnerSeqno) {
// succeeds iff our lock (E, S) >= (Eprev, Sprev)
if (epoch < prevOwnerEpoch || (epoch == prevOwnerEpoch && seqno < prevOwnerSeqno)) {
if (BW_DEBUG) {
@ -240,7 +263,7 @@ static void acquireGranuleLock(int64_t epoch, int64_t seqno, int64_t prevOwnerEp
}
}
static void checkGranuleLock(int64_t epoch, int64_t seqno, int64_t ownerEpoch, int64_t ownerSeqno) {
void checkGranuleLock(int64_t epoch, int64_t seqno, int64_t ownerEpoch, int64_t ownerSeqno) {
// sanity check - lock value should never go backwards because of acquireGranuleLock
ASSERT(epoch <= ownerEpoch);
ASSERT(epoch < ownerEpoch || (epoch == ownerEpoch && seqno <= ownerSeqno));
@ -257,6 +280,112 @@ static void checkGranuleLock(int64_t epoch, int64_t seqno, int64_t ownerEpoch, i
throw granule_assignment_conflict();
}
}
} // namespace
// The actors below assist in fetching/looking up the desired encryption keys. An encryption key (EK) lookup
// proceeds as follows:
// 1. Look up the process-local in-memory cache `BlobCipherKeyCache` to check if the desired EK is 'present' and
//    'valid'. Given FDB supports 'revocable' and 'non-revocable' EKs, a cached EK can also be 'invalid'.
// 2. A local cache miss is followed by an RPC call to the EncryptKeyProxy process (EKP), which maintains an
//    in-memory cache of KMS BaseCipher details with a KMS-defined TTL where applicable. The lookup can either be
//    serviced by the EKP or lead to an invocation of the desired KMS endpoint.
//
// In most cases, the EK lookup should be satisfied by the process-local in-memory cache and/or the EKP in-memory
// cache, unless the cluster and/or a process crashes/restarts.
ACTOR Future<BlobGranuleCipherKeysCtx> getLatestGranuleCipherKeys(Reference<BlobWorkerData> bwData,
KeyRange keyRange,
Arena* arena) {
state BlobGranuleCipherKeysCtx cipherKeysCtx;
state Reference<GranuleTenantData> tenantData = bwData->tenantData.getDataForGranule(keyRange);
ASSERT(tenantData.isValid());
std::unordered_map<EncryptCipherDomainId, EncryptCipherDomainName> domains;
domains.emplace(tenantData->entry.id, StringRef(*arena, tenantData->name));
std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>> domainKeyMap =
wait(getLatestEncryptCipherKeys(bwData->dbInfo, domains));
auto domainKeyItr = domainKeyMap.find(tenantData->entry.id);
ASSERT(domainKeyItr != domainKeyMap.end());
cipherKeysCtx.textCipherKey = BlobGranuleCipherKey::fromBlobCipherKey(domainKeyItr->second, *arena);
TextAndHeaderCipherKeys systemCipherKeys = wait(getLatestSystemEncryptCipherKeys(bwData->dbInfo));
cipherKeysCtx.headerCipherKey = BlobGranuleCipherKey::fromBlobCipherKey(systemCipherKeys.cipherHeaderKey, *arena);
cipherKeysCtx.ivRef = makeString(AES_256_IV_LENGTH, *arena);
generateRandomData(mutateString(cipherKeysCtx.ivRef), AES_256_IV_LENGTH);
if (BG_ENCRYPT_COMPRESS_DEBUG) {
TraceEvent(SevDebug, "GetLatestGranuleCipherKey")
.detail("TextDomainId", cipherKeysCtx.textCipherKey.encryptDomainId)
.detail("TextBaseCipherId", cipherKeysCtx.textCipherKey.baseCipherId)
.detail("TextSalt", cipherKeysCtx.textCipherKey.salt)
.detail("HeaderDomainId", cipherKeysCtx.textCipherKey.encryptDomainId)
.detail("HeaderBaseCipherId", cipherKeysCtx.textCipherKey.baseCipherId)
.detail("HeaderSalt", cipherKeysCtx.textCipherKey.salt)
.detail("IVChksum", XXH3_64bits(cipherKeysCtx.ivRef.begin(), cipherKeysCtx.ivRef.size()));
}
return cipherKeysCtx;
}
ACTOR Future<BlobGranuleCipherKey> lookupCipherKey(Reference<BlobWorkerData> bwData,
BlobCipherDetails cipherDetails,
Arena* arena) {
std::unordered_set<BlobCipherDetails> cipherDetailsSet;
cipherDetailsSet.emplace(cipherDetails);
state std::unordered_map<BlobCipherDetails, Reference<BlobCipherKey>> cipherKeyMap =
wait(getEncryptCipherKeys(bwData->dbInfo, cipherDetailsSet));
ASSERT(cipherKeyMap.size() == 1);
auto cipherKeyMapItr = cipherKeyMap.find(cipherDetails);
if (cipherKeyMapItr == cipherKeyMap.end()) {
TraceEvent(SevError, "CipherKeyLookup_Failure")
.detail("EncryptDomainId", cipherDetails.encryptDomainId)
.detail("BaseCipherId", cipherDetails.baseCipherId)
.detail("Salt", cipherDetails.salt);
throw encrypt_keys_fetch_failed();
}
return BlobGranuleCipherKey::fromBlobCipherKey(cipherKeyMapItr->second, *arena);
}
ACTOR Future<BlobGranuleCipherKeysCtx> getGranuleCipherKeys(Reference<BlobWorkerData> bwData,
BlobGranuleCipherKeysMetaRef cipherKeysMetaRef,
Arena* arena) {
state BlobGranuleCipherKeysCtx cipherKeysCtx;
// Fetch 'textCipher' key
state BlobCipherDetails textCipherDetails(
cipherKeysMetaRef.textDomainId, cipherKeysMetaRef.textBaseCipherId, cipherKeysMetaRef.textSalt);
BlobGranuleCipherKey textCipherKey = wait(lookupCipherKey(bwData, textCipherDetails, arena));
cipherKeysCtx.textCipherKey = textCipherKey;
// Fetch 'headerCipher' key
state BlobCipherDetails headerCipherDetails(
cipherKeysMetaRef.headerDomainId, cipherKeysMetaRef.headerBaseCipherId, cipherKeysMetaRef.headerSalt);
BlobGranuleCipherKey headerCipherKey = wait(lookupCipherKey(bwData, headerCipherDetails, arena));
cipherKeysCtx.headerCipherKey = headerCipherKey;
// Populate 'Initialization Vector'
ASSERT_EQ(cipherKeysMetaRef.ivRef.size(), AES_256_IV_LENGTH);
cipherKeysCtx.ivRef = StringRef(*arena, cipherKeysMetaRef.ivRef);
if (BG_ENCRYPT_COMPRESS_DEBUG) {
TraceEvent("GetGranuleCipherKey")
.detail("TextDomainId", cipherKeysCtx.textCipherKey.encryptDomainId)
.detail("TextBaseCipherId", cipherKeysCtx.textCipherKey.baseCipherId)
.detail("TextSalt", cipherKeysCtx.textCipherKey.salt)
.detail("HeaderDomainId", cipherKeysCtx.textCipherKey.encryptDomainId)
.detail("HeaderBaseCipherId", cipherKeysCtx.textCipherKey.baseCipherId)
.detail("HeaderSalt", cipherKeysCtx.textCipherKey.salt)
.detail("IVChksum", XXH3_64bits(cipherKeysCtx.ivRef.begin(), cipherKeysCtx.ivRef.size()));
}
return cipherKeysCtx;
}
ACTOR Future<Void> readAndCheckGranuleLock(Reference<ReadYourWritesTransaction> tr,
KeyRange granuleRange,
@ -410,7 +539,7 @@ ACTOR Future<Void> updateGranuleSplitState(Transaction* tr,
// tr->clear(singleKeyRange(oldGranuleLockKey));
tr->clear(currentRange);
TEST(true); // Granule split cleanup on last delta file persisted
CODE_PROBE(true, "Granule split cleanup on last delta file persisted");
} else {
tr->atomicOp(myStateKey, blobGranuleSplitValueFor(newState), MutationRef::SetVersionstampedValue);
if (newState == BlobGranuleSplitState::Assigned && currentState == BlobGranuleSplitState::Initialized &&
@ -425,10 +554,10 @@ ACTOR Future<Void> updateGranuleSplitState(Transaction* tr,
wait(updateChangeFeed(
tr, KeyRef(granuleIDToCFKey(parentGranuleID)), ChangeFeedStatus::CHANGE_FEED_STOP));
}
TEST(true); // Granule split stopping change feed
CODE_PROBE(true, "Granule split stopping change feed");
}
} else if (BW_DEBUG) {
TEST(true); // Out of order granule split state updates ignored
CODE_PROBE(true, "Out of order granule split state updates ignored");
fmt::print("Ignoring granule {0} split state from {1} {2} -> {3}\n",
currentGranuleID.toString(),
parentGranuleID.toString(),
@ -549,13 +678,13 @@ ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData,
// commit a transaction, we can and want to safely delete the file we wrote. Otherwise, we may have updated FDB
// with file and cannot safely delete it.
if (numIterations > 0) {
TEST(true); // Granule potentially leaving orphaned delta file
CODE_PROBE(true, "Granule potentially leaving orphaned delta file");
throw e;
}
if (BW_DEBUG) {
fmt::print("deleting delta file {0} after error {1}\n", fname, e.name());
}
TEST(true); // Granule cleaning up delta file after error
CODE_PROBE(true, "Granule cleaning up delta file after error");
++bwData->stats.s3DeleteReqs;
bwData->addActor.send(writeBStore->deleteFile(fname));
throw e;
@ -614,7 +743,18 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
}
}
state Value serialized = serializeChunkedSnapshot(snapshot, SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_CHUNKS);
state Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx;
state Optional<BlobGranuleCipherKeysMeta> cipherKeysMeta;
state Arena arena;
if (isBlobFileEncryptionSupported()) {
BlobGranuleCipherKeysCtx ciphKeysCtx = wait(getLatestGranuleCipherKeys(bwData, keyRange, &arena));
cipherKeysCtx = ciphKeysCtx;
cipherKeysMeta = BlobGranuleCipherKeysCtx::toCipherKeysMeta(cipherKeysCtx.get());
}
Optional<CompressionFilter> compressFilter = getBlobFileCompressFilter();
state Value serialized =
serializeChunkedSnapshot(snapshot, SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_CHUNKS, compressFilter, cipherKeysCtx);
state size_t serializedSize = serialized.size();
// free snapshot to reduce memory
@ -650,7 +790,8 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
numIterations++;
Key snapshotFileKey = blobGranuleFileKeyFor(granuleID, version, 'S');
// TODO change once we support file multiplexing
Key snapshotFileValue = blobGranuleFileValueFor(fname, 0, serializedSize, serializedSize);
Key snapshotFileValue =
blobGranuleFileValueFor(fname, 0, serializedSize, serializedSize, cipherKeysMeta);
tr->set(snapshotFileKey, snapshotFileValue);
// create granule history at version if this is a new granule with the initial dump from FDB
if (createGranuleHistory) {
@ -670,13 +811,13 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
// commit a transaction, we can and want to safely delete the file we wrote. Otherwise, we may have updated FDB
// with file and cannot safely delete it.
if (numIterations > 0) {
TEST(true); // Granule potentially leaving orphaned snapshot file
CODE_PROBE(true, "Granule potentially leaving orphaned snapshot file");
throw e;
}
if (BW_DEBUG) {
fmt::print("deleting snapshot file {0} after error {1}\n", fname, e.name());
}
TEST(true); // Granule deleting snapshot file after error
CODE_PROBE(true, "Granule deleting snapshot file after error");
++bwData->stats.s3DeleteReqs;
bwData->addActor.send(writeBStore->deleteFile(fname));
throw e;
@ -695,7 +836,7 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
}
// FIXME: change when we implement multiplexing
return BlobFileIndex(version, fname, 0, serializedSize, serializedSize);
return BlobFileIndex(version, fname, 0, serializedSize, serializedSize, cipherKeysMeta);
}
ACTOR Future<BlobFileIndex> dumpInitialSnapshotFromFDB(Reference<BlobWorkerData> bwData,
@ -766,7 +907,7 @@ ACTOR Future<BlobFileIndex> dumpInitialSnapshotFromFDB(Reference<BlobWorkerData>
wait(tr->onError(e));
}
retries++;
TEST(true); // Granule initial snapshot failed
CODE_PROBE(true, "Granule initial snapshot failed");
// FIXME: why can't we suppress the error event?
TraceEvent(retries < 10 ? SevDebug : SevWarn, "BlobGranuleInitialSnapshotRetry", bwData->id)
.error(err)
@ -803,14 +944,14 @@ ACTOR Future<BlobFileIndex> compactFromBlob(Reference<BlobWorkerData> bwData,
state Arena filenameArena;
state std::vector<Future<RangeResult>> chunksToRead;
state int64_t compactBytesRead = 0;
for (auto& files : fileSet) {
ASSERT(!files.snapshotFiles.empty());
ASSERT(!files.deltaFiles.empty());
for (auto& f : fileSet) {
ASSERT(!f.snapshotFiles.empty());
ASSERT(!f.deltaFiles.empty());
state BlobGranuleChunkRef chunk;
state GranuleFiles files = f;
state Version snapshotVersion = files.snapshotFiles.back().version;
BlobFileIndex snapshotF = files.snapshotFiles.back();
state BlobFileIndex snapshotF = files.snapshotFiles.back();
if (snapshotVersion >= version) {
fmt::print("Chunk snapshot version [{0} - {1}) @ {2} >= compact version {3}\n",
@ -821,8 +962,21 @@ ACTOR Future<BlobFileIndex> compactFromBlob(Reference<BlobWorkerData> bwData,
}
ASSERT(snapshotVersion < version);
chunk.snapshotFile = BlobFilePointerRef(
filenameArena, snapshotF.filename, snapshotF.offset, snapshotF.length, snapshotF.fullFileLength);
chunk.snapshotFile = BlobFilePointerRef(filenameArena,
snapshotF.filename,
snapshotF.offset,
snapshotF.length,
snapshotF.fullFileLength,
snapshotF.cipherKeysMeta);
// TODO: optimization - batch 'encryption-key' lookup given the GranuleFile set is known
if (chunk.snapshotFile.get().cipherKeysMetaRef.present()) {
ASSERT(isBlobFileEncryptionSupported());
BlobGranuleCipherKeysCtx cipherKeysCtx =
wait(getGranuleCipherKeys(bwData, chunk.snapshotFile.get().cipherKeysMetaRef.get(), &filenameArena));
chunk.cipherKeysCtx = cipherKeysCtx;
}
compactBytesRead += snapshotF.length;
int deltaIdx = files.deltaFiles.size() - 1;
while (deltaIdx >= 0 && files.deltaFiles[deltaIdx].version > snapshotVersion) {
@ -975,8 +1129,7 @@ ACTOR Future<BlobFileIndex> checkSplitAndReSnapshot(Reference<BlobWorkerData> bw
if (e.code() == error_code_operation_cancelled) {
throw e;
}
TEST(true); // Blob worker re-sending split evaluation to manager after not error/not hearing
// back
CODE_PROBE(true, "Blob worker re-sending split evaluation to manager after not error/not hearing back");
// if we got broken promise while waiting, the old stream was killed, so we don't need to wait
// on change, just retry
if (e.code() == error_code_broken_promise) {
@ -1047,11 +1200,11 @@ ACTOR Future<Void> granuleCheckMergeCandidate(Reference<BlobWorkerData> bwData,
if (currentMetrics.bytes >= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES / 2 ||
currentMetrics.bytesPerKSecond >= SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC) {
wait(delayJittered(SERVER_KNOBS->BG_MERGE_CANDIDATE_THRESHOLD_SECONDS / 2.0));
TEST(true); // wait and check later to see if granule got smaller or colder
CODE_PROBE(true, "wait and check later to see if granule got smaller or colder");
continue;
}
TEST(true); // Blob Worker identified merge candidate granule
CODE_PROBE(true, "Blob Worker identified merge candidate granule");
// if we are a merge candidate, send a message to the BM. Once successful, this actor is complete
while (!bwData->statusStreamInitialized) {
@ -1072,7 +1225,7 @@ ACTOR Future<Void> granuleCheckMergeCandidate(Reference<BlobWorkerData> bwData,
}
if (now() >= sendTimeGiveUp) {
TEST(true); // Blob worker could not send merge candidate in time, re-checking status
CODE_PROBE(true, "Blob worker could not send merge candidate in time, re-checking status");
break;
}
@ -1093,13 +1246,13 @@ ACTOR Future<Void> granuleCheckMergeCandidate(Reference<BlobWorkerData> bwData,
wait(bwData->currentManagerStatusStream.onChange());
wait(delay(0));
}
TEST(true); // Blob worker re-sending merge candidate to new manager
CODE_PROBE(true, "Blob worker re-sending merge candidate to new manager");
} catch (Error& e) {
if (e.code() == error_code_operation_cancelled) {
throw e;
}
TEST(true); // Blob worker re-sending merge candidate to manager after not error/not hearing back
CODE_PROBE(true, "Blob worker re-sending merge candidate to manager after not error/not hearing back");
// if we got broken promise while waiting, the old stream was killed, so we don't need to wait
// on change, just retry
@ -1113,7 +1266,8 @@ ACTOR Future<Void> granuleCheckMergeCandidate(Reference<BlobWorkerData> bwData,
}
}
static void handleCompletedDeltaFile(Reference<BlobWorkerData> bwData,
namespace {
void handleCompletedDeltaFile(Reference<BlobWorkerData> bwData,
Reference<GranuleMetadata> metadata,
BlobFileIndex completedDeltaFile,
Key cfKey,
@ -1155,7 +1309,7 @@ static void handleCompletedDeltaFile(Reference<BlobWorkerData> bwData,
}
// if we get an i/o error updating files, or a rollback, reassign the granule to ourselves and start fresh
static bool granuleCanRetry(const Error& e) {
bool granuleCanRetry(const Error& e) {
switch (e.code()) {
case error_code_io_error:
case error_code_io_timeout:
@ -1170,6 +1324,7 @@ static bool granuleCanRetry(const Error& e) {
return false;
};
}
} // namespace
struct InFlightFile {
Future<BlobFileIndex> future;
@ -1181,7 +1336,8 @@ struct InFlightFile {
: future(future), version(version), bytes(bytes), snapshot(snapshot) {}
};
static Version doGranuleRollback(Reference<GranuleMetadata> metadata,
namespace {
Version doGranuleRollback(Reference<GranuleMetadata> metadata,
Version mutationVersion,
Version rollbackVersion,
std::deque<InFlightFile>& inFlightFiles,
@ -1199,7 +1355,7 @@ static Version doGranuleRollback(Reference<GranuleMetadata> metadata,
for (auto& f : inFlightFiles) {
if (f.snapshot) {
if (f.version > rollbackVersion) {
TEST(true); // Granule rollback cancelling snapshot file
CODE_PROBE(true, "Granule rollback cancelling snapshot file");
if (BW_DEBUG) {
fmt::print("[{0} - {1}) rollback cancelling snapshot file @ {2}\n",
metadata->keyRange.begin.printable(),
@ -1220,7 +1376,7 @@ static Version doGranuleRollback(Reference<GranuleMetadata> metadata,
metadata->bytesInNewDeltaFiles -= f.bytes;
}
toPop++;
TEST(true); // Granule rollback cancelling delta file
CODE_PROBE(true, "Granule rollback cancelling delta file");
if (BW_DEBUG) {
fmt::print("[{0} - {1}) rollback cancelling delta file @ {2}\n",
metadata->keyRange.begin.printable(),
@ -1275,7 +1431,7 @@ static Version doGranuleRollback(Reference<GranuleMetadata> metadata,
} else {
// No pending delta files to discard, just in-memory mutations
TEST(true); // Granule rollback discarding in memory mutations
CODE_PROBE(true, "Granule rollback discarding in memory mutations");
// FIXME: could binary search?
int mIdx = metadata->currentDeltas.size() - 1;
@ -1337,6 +1493,7 @@ static Version doGranuleRollback(Reference<GranuleMetadata> metadata,
return cfRollbackVersion;
}
} // namespace
ACTOR Future<Void> waitOnCFVersion(Reference<GranuleMetadata> metadata, Version waitVersion) {
loop {
@ -1479,7 +1636,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
}
if (!startState.doSnapshot) {
TEST(true); // Granule moved without split
CODE_PROBE(true, "Granule moved without split");
startVersion = startState.previousDurableVersion;
ASSERT(!metadata->files.snapshotFiles.empty());
metadata->pendingSnapshotVersion = metadata->files.snapshotFiles.back().version;
@ -1640,7 +1797,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// popped up to V+1 is ok. Or in other words, if the last delta is @ V, we only missed data
// at V+1 onward if popVersion >= V+2
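// Worked example (editor's note): with bufferedDeltaVersion = 100, a
// popVersion of 101 passes the check below (100 < 100 is false), but a
// popVersion of 102 trips it (100 < 101), since data at version 101 may
// already have been popped before it was buffered here.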
if (metadata->bufferedDeltaVersion < metadata->activeCFData.get()->popVersion - 1) {
TEST(true); // Blob Worker detected popped
CODE_PROBE(true, "Blob Worker detected popped");
TraceEvent("BlobWorkerChangeFeedPopped", bwData->id)
.detail("Granule", metadata->keyRange)
.detail("GranuleID", startState.granuleID)
@ -1738,7 +1895,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
if (metadata->pendingDeltaVersion <= rollbackVersion &&
(metadata->currentDeltas.empty() ||
metadata->currentDeltas.back().version <= rollbackVersion)) {
TEST(true); // Granule ignoring rollback
CODE_PROBE(true, "Granule ignoring rollback");
if (BW_DEBUG) {
fmt::print("Granule [{0} - {1}) on BW {2} skipping rollback {3} -> {4} "
@ -1755,7 +1912,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// rollbackInProgress when we restart the stream.
rollbacksCompleted.push_back(std::pair(rollbackVersion, deltas.version));
} else {
TEST(true); // Granule processing rollback
CODE_PROBE(true, "Granule processing rollback");
if (BW_DEBUG) {
fmt::print("[{0} - {1}) on BW {2} ROLLBACK @ {3} -> {4}\n",
metadata->keyRange.begin.printable(),
@ -1786,7 +1943,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// change feed
ASSERT(cfRollbackVersion >= startState.previousDurableVersion);
ASSERT(cfRollbackVersion >= metadata->durableDeltaVersion.get());
TEST(true); // rollback crossed change feed boundaries
CODE_PROBE(true, "rollback crossed change feed boundaries");
readOldChangeFeed = true;
oldChangeFeedDataComplete.reset();
}
@ -1832,7 +1989,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
}
} else if (!rollbacksInProgress.empty() && rollbacksInProgress.front().first < deltas.version &&
rollbacksInProgress.front().second > deltas.version) {
TEST(true); // Granule skipping mutations b/c prior rollback
CODE_PROBE(true, "Granule skipping mutations b/c prior rollback");
if (BW_DEBUG) {
fmt::print("Skipping mutations @ {} b/c prior rollback\n", deltas.version);
}
@ -1875,7 +2032,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// The force flush contract is that a version cannot be put in forceFlushVersions unless the change feed
// is already whenAtLeast that version
bool forceFlush = !forceFlushVersions.empty() && forceFlushVersions.back() > metadata->pendingDeltaVersion;
TEST(forceFlush); // Force flushing granule
CODE_PROBE(forceFlush, "Force flushing granule");
if (metadata->bufferedDeltaBytes >= SERVER_KNOBS->BG_DELTA_FILE_TARGET_BYTES || forceFlush) {
TraceEvent(SevDebug, "BlobGranuleDeltaFile", bwData->id)
.detail("Granule", metadata->keyRange)
@ -1914,7 +2071,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// write/read a bunch of empty blob files
ASSERT(forceFlush);
ASSERT(!forceFlushVersions.empty());
TEST(true); // Force flushing empty delta file!
CODE_PROBE(true, "Force flushing empty delta file!");
}
if (BW_DEBUG) {
@ -2042,7 +2199,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
idx++;
}
while (waitIdx > 0) {
TEST(true); // Granule blocking on previous snapshot
CODE_PROBE(true, "Granule blocking on previous snapshot");
// TODO don't duplicate code
BlobFileIndex completedFile = wait(inFlightFiles.front().future);
if (inFlightFiles.front().snapshot) {
@ -2083,7 +2240,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// queue too many files in parallel, and slow down change feed consuming to let file writing
// catch up
TEST(true); // Granule processing long tail of old change feed
CODE_PROBE(true, "Granule processing long tail of old change feed");
if (inFlightFiles.size() > 10 && inFlightFiles.front().version <= metadata->knownCommittedVersion) {
if (BW_DEBUG) {
fmt::print("[{0} - {1}) Waiting on delta file b/c old change feed\n",
@ -2137,7 +2294,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
++bwData->stats.granuleUpdateErrors;
if (granuleCanRetry(e)) {
TEST(true); // Granule close and re-open on error
CODE_PROBE(true, "Granule close and re-open on error");
TraceEvent("GranuleFileUpdaterRetriableError", bwData->id)
.error(e)
.detail("Granule", metadata->keyRange)
@ -2313,7 +2470,7 @@ ACTOR Future<Void> blobGranuleLoadHistory(Reference<BlobWorkerData> bwData,
next.version);
}
} else {
TEST(true); // duplicate parent in granule history (split then merge)
CODE_PROBE(true, "duplicate parent in granule history (split then merge)");
if (BW_HISTORY_DEBUG) {
fmt::print("HL {0} {1}) [{2} - {3}) @ {4}: duplicate parent [{5} - "
"{6}) @ {7}\n",
@ -2533,8 +2690,9 @@ struct sort_result_chunks {
}
};
static int64_t nextHistoryQueryId = 0;
static std::vector<std::pair<KeyRange, Future<GranuleFiles>>> loadHistoryChunks(Reference<BlobWorkerData> bwData,
namespace {
int64_t nextHistoryQueryId = 0;
std::vector<std::pair<KeyRange, Future<GranuleFiles>>> loadHistoryChunks(Reference<BlobWorkerData> bwData,
Version expectedEndVersion,
KeyRange keyRange,
Version readVersion) {
@ -2660,7 +2818,7 @@ static std::vector<std::pair<KeyRange, Future<GranuleFiles>>> loadHistoryChunks(
ASSERT(!resultChunks.empty());
if (resultChunks.size() >= 2) {
TEST(true); // Multiple history chunks for time travel query
CODE_PROBE(true, "Multiple history chunks for time travel query");
std::sort(resultChunks.begin(), resultChunks.end(), sort_result_chunks());
// Assert contiguous
for (int i = 0; i < resultChunks.size() - 1; i++) {
@ -2698,7 +2856,6 @@ static std::vector<std::pair<KeyRange, Future<GranuleFiles>>> loadHistoryChunks(
// TODO might want to separate this out for valid values for range assignments vs read requests. Assignment
// conflict isn't valid for read requests but is for assignments
namespace {
bool canReplyWith(Error e) {
switch (e.code()) {
case error_code_blob_granule_transaction_too_old:
@ -2735,7 +2892,7 @@ ACTOR Future<Void> waitForVersion(Reference<GranuleMetadata> metadata, Version v
metadata->durableDeltaVersion.get() == metadata->pendingDeltaVersion) &&
(v <= metadata->durableSnapshotVersion.get() ||
metadata->durableSnapshotVersion.get() == metadata->pendingSnapshotVersion)) {
TEST(true); // Granule read not waiting
CODE_PROBE(true, "Granule read not waiting");
return Void();
}
@ -2752,7 +2909,7 @@ ACTOR Future<Void> waitForVersion(Reference<GranuleMetadata> metadata, Version v
// If there are mutations needed by the query that are no longer buffered but
// have not yet been persisted to a delta file, wait for them
if (pendingDeltaV > metadata->durableDeltaVersion.get() && v > metadata->durableDeltaVersion.get()) {
TEST(true); // Granule read waiting for pending delta
CODE_PROBE(true, "Granule read waiting for pending delta");
wait(metadata->durableDeltaVersion.whenAtLeast(pendingDeltaV));
ASSERT(metadata->durableDeltaVersion.get() >= pendingDeltaV);
}
@ -2760,7 +2917,7 @@ ACTOR Future<Void> waitForVersion(Reference<GranuleMetadata> metadata, Version v
// This isn't strictly needed, but if we're in the process of re-snapshotting, we'd likely rather
// return that snapshot file than the previous snapshot file and all its delta files.
if (pendingSnapshotV > metadata->durableSnapshotVersion.get() && v > metadata->durableSnapshotVersion.get()) {
TEST(true); // Granule read waiting for pending snapshot
CODE_PROBE(true, "Granule read waiting for pending snapshot");
wait(metadata->durableSnapshotVersion.whenAtLeast(pendingSnapshotV));
ASSERT(metadata->durableSnapshotVersion.get() >= pendingSnapshotV);
}
@ -2770,7 +2927,7 @@ ACTOR Future<Void> waitForVersion(Reference<GranuleMetadata> metadata, Version v
// file instead of in memory mutations, so we wait for that delta file to complete
while (v > metadata->durableDeltaVersion.get() && metadata->pendingDeltaVersion > pendingDeltaV) {
TEST(true); // Granule mutations flushed while waiting for files to complete
CODE_PROBE(true, "Granule mutations flushed while waiting for files to complete");
Version waitVersion = std::min(v, metadata->pendingDeltaVersion);
pendingDeltaV = metadata->pendingDeltaVersion;
wait(metadata->durableDeltaVersion.whenAtLeast(waitVersion));
@ -2793,6 +2950,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
}
state Optional<Key> tenantPrefix;
state Arena arena;
if (req.tenantInfo.name.present()) {
ASSERT(req.tenantInfo.tenantId != TenantInfo::INVALID_TENANT);
Optional<TenantMapEntry> tenantEntry = bwData->tenantData.getTenantById(req.tenantInfo.tenantId);
@ -2800,7 +2958,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
ASSERT(tenantEntry.get().id == req.tenantInfo.tenantId);
tenantPrefix = tenantEntry.get().prefix;
} else {
TEST(true); // Blob worker unknown tenant
CODE_PROBE(true, "Blob worker unknown tenant");
// FIXME - better way. Wait on retry here, or just have better model for tenant metadata?
// Just throw wrong_shard_server and make the client retry and assume we load it later
TraceEvent(SevDebug, "BlobWorkerRequestUnknownTenant", bwData->id)
@ -2869,6 +3027,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
continue;
}
state Reference<GranuleMetadata> metadata = m;
state Version granuleBeginVersion = req.beginVersion;
choose {
when(wait(metadata->readable.getFuture())) {}
@ -2880,10 +3039,10 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
throw wrong_shard_server();
}
state std::vector<std::pair<KeyRange, GranuleFiles>> chunks;
state std::vector<std::pair<KeyRange, GranuleFiles>> rangeGranulePair;
if (req.readVersion < metadata->historyVersion) {
TEST(true); // Granule Time Travel Read
CODE_PROBE(true, "Granule Time Travel Read");
// this is a time travel query, find previous granule
if (metadata->historyLoaded.canBeSet()) {
choose {
@ -2898,19 +3057,19 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
for (chunkIdx = 0; chunkIdx < finalChunks.size(); chunkIdx++) {
choose {
when(GranuleFiles f = wait(finalChunks[chunkIdx].second)) {
chunks.push_back(std::pair(finalChunks[chunkIdx].first, f));
rangeGranulePair.push_back(std::pair(finalChunks[chunkIdx].first, f));
}
when(wait(metadata->cancelled.getFuture())) { throw wrong_shard_server(); }
}
if (chunks.back().second.snapshotFiles.empty()) {
if (rangeGranulePair.back().second.snapshotFiles.empty()) {
// a snapshot file must have been purged
throw blob_granule_transaction_too_old();
}
ASSERT(!chunks.back().second.deltaFiles.empty());
ASSERT(chunks.back().second.deltaFiles.back().version > req.readVersion);
if (chunks.back().second.snapshotFiles.front().version > req.readVersion) {
ASSERT(!rangeGranulePair.back().second.deltaFiles.empty());
ASSERT(rangeGranulePair.back().second.deltaFiles.back().version > req.readVersion);
if (rangeGranulePair.back().second.snapshotFiles.front().version > req.readVersion) {
// a snapshot file must have been purged
throw blob_granule_transaction_too_old();
}
@ -2922,7 +3081,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
throw blob_granule_transaction_too_old();
}
TEST(true); // Granule Active Read
CODE_PROBE(true, "Granule Active Read");
// this is an active granule query
loop {
if (!metadata->activeCFData.get().isValid() || !metadata->cancelled.canBeSet()) {
@ -2945,13 +3104,13 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
// We can get change feed cancelled from whenAtLeast. This means the change feed may
// retry, or may be cancelled. Wait a bit and try again to see
if (e.code() == error_code_change_feed_popped) {
TEST(true); // Change feed popped while read waiting
CODE_PROBE(true, "Change feed popped while read waiting");
throw wrong_shard_server();
}
if (e.code() != error_code_change_feed_cancelled) {
throw e;
}
TEST(true); // Change feed switched while read waiting
CODE_PROBE(true, "Change feed switched while read waiting");
// wait 1ms and try again
wait(delay(0.001));
}
@ -2962,7 +3121,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
req.readVersion);
}
}
chunks.push_back(std::pair(metadata->keyRange, metadata->files));
rangeGranulePair.push_back(std::pair(metadata->keyRange, metadata->files));
}
if (!metadata->cancelled.canBeSet()) {
@ -2978,31 +3137,40 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
// granule is up to date, do read
ASSERT(metadata->cancelled.canBeSet());
for (auto& c : chunks) {
for (auto& item : rangeGranulePair) {
Version granuleBeginVersion = req.beginVersion;
// Right now we force a collapse if the version range crosses granule boundaries, for simplicity
if (granuleBeginVersion > 0 && granuleBeginVersion <= c.second.snapshotFiles.front().version) {
TEST(true); // collapsed begin version request because of boundaries
if (granuleBeginVersion > 0 && granuleBeginVersion <= item.second.snapshotFiles.front().version) {
CODE_PROBE(true, "collapsed begin version request because of boundaries");
didCollapse = true;
granuleBeginVersion = 0;
}
BlobGranuleChunkRef chunk;
state BlobGranuleChunkRef chunk;
// TODO change with early reply
chunk.includedVersion = req.readVersion;
chunk.keyRange = KeyRangeRef(StringRef(rep.arena, c.first.begin), StringRef(rep.arena, c.first.end));
chunk.keyRange =
KeyRangeRef(StringRef(rep.arena, item.first.begin), StringRef(rep.arena, item.first.end));
if (tenantPrefix.present()) {
chunk.tenantPrefix = Optional<StringRef>(tenantPrefix.get());
}
int64_t deltaBytes = 0;
c.second.getFiles(
item.second.getFiles(
granuleBeginVersion, req.readVersion, req.canCollapseBegin, chunk, rep.arena, deltaBytes);
bwData->stats.readReqDeltaBytesReturned += deltaBytes;
if (granuleBeginVersion > 0 && chunk.snapshotFile.present()) {
TEST(true); // collapsed begin version request for efficiency
CODE_PROBE(true, "collapsed begin version request for efficiency");
didCollapse = true;
}
// TODO: optimization - batch 'encryption-key' lookup given the GranuleFile set is known
state Future<BlobGranuleCipherKeysCtx> cipherKeysCtx;
if (chunk.snapshotFile.present() && chunk.snapshotFile.get().cipherKeysMetaRef.present()) {
ASSERT(isBlobFileEncryptionSupported());
cipherKeysCtx =
getGranuleCipherKeys(bwData, chunk.snapshotFile.get().cipherKeysMetaRef.get(), &rep.arena);
}
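// (Editor's note) The cipher-key lookup above is deliberately started as a
// future and not awaited until after the in-memory mutations are appended
// below, so the potentially remote key fetch overlaps with assembling the
// rest of the chunk.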
// new deltas (if version is larger than version of last delta file)
// FIXME: do trivial key bounds here if key range is not fully contained in request key
// range
@ -3023,11 +3191,11 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
rep.arena.dependsOn(metadata->currentDeltas.arena());
MutationsAndVersionRef* mutationIt = metadata->currentDeltas.begin();
if (granuleBeginVersion > metadata->currentDeltas.back().version) {
TEST(true); // beginVersion pruning all in-memory mutations
CODE_PROBE(true, "beginVersion pruning all in-memory mutations");
mutationIt = metadata->currentDeltas.end();
} else if (granuleBeginVersion > metadata->currentDeltas.front().version) {
// binary search for beginVersion
TEST(true); // beginVersion pruning some in-memory mutations
CODE_PROBE(true, "beginVersion pruning some in-memory mutations");
mutationIt = std::lower_bound(metadata->currentDeltas.begin(),
metadata->currentDeltas.end(),
MutationsAndVersionRef(granuleBeginVersion, 0),
@ -3037,7 +3205,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
// add mutations to response
while (mutationIt != metadata->currentDeltas.end()) {
if (mutationIt->version > req.readVersion) {
TEST(true); // readVersion pruning some in-memory mutations
CODE_PROBE(true, "readVersion pruning some in-memory mutations");
break;
}
chunk.newDeltas.push_back_deep(rep.arena, *mutationIt);
@ -3045,6 +3213,11 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
}
}
if (chunk.snapshotFile.present() && chunk.snapshotFile.get().cipherKeysMetaRef.present()) {
BlobGranuleCipherKeysCtx ctx = wait(cipherKeysCtx);
chunk.cipherKeysCtx = std::move(ctx);
}
rep.chunks.push_back(rep.arena, chunk);
bwData->stats.readReqTotalFilesReturned += chunk.deltaFiles.size() + int(chunk.snapshotFile.present());
@ -3087,7 +3260,7 @@ ACTOR Future<Void> handleBlobGranuleFileRequest(Reference<BlobWorkerData> bwData
when(wait(doBlobGranuleFileRequest(bwData, req))) {}
when(wait(delay(SERVER_KNOBS->BLOB_WORKER_REQUEST_TIMEOUT))) {
if (!req.reply.isSet()) {
TEST(true); // Blob Worker request timeout hit
CODE_PROBE(true, "Blob Worker request timeout hit");
if (BW_DEBUG) {
fmt::print("BW {0} request [{1} - {2}) @ {3} timed out, sending WSS\n",
bwData->id.toString().substr(0, 5),
@ -3151,7 +3324,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
state bool hasPrevOwner = prevLockValue.present();
state bool createChangeFeed = false;
if (hasPrevOwner) {
TEST(true); // Granule open found previous owner
CODE_PROBE(true, "Granule open found previous owner");
std::tuple<int64_t, int64_t, UID> prevOwner = decodeBlobGranuleLockValue(prevLockValue.get());
info.granuleID = std::get<2>(prevOwner);
@ -3160,7 +3333,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
// if it's the first snapshot of a new granule, history won't be present
if (info.history.present()) {
if (info.granuleID != info.history.get().value.granuleID) {
TEST(true); // Blob Worker re-opening granule after merge+resplit
CODE_PROBE(true, "Blob Worker re-opening granule after merge+resplit");
// The only case this can happen is when a granule was merged into a larger granule,
// then split back out to the same one. Validate that this is a new granule that was
// split previously. Just check lock based on epoch, since seqno is intentionally
@ -3237,7 +3410,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
// ret.previousChangeFeedId, and the previous durable version will come from the previous
// granules
if (info.history.present() && info.history.get().value.parentVersions.size() > 0) {
TEST(true); // Granule open found parent
CODE_PROBE(true, "Granule open found parent");
if (info.history.get().value.parentVersions.size() == 1) { // split
state KeyRangeRef parentRange(info.history.get().value.parentBoundaries[0],
info.history.get().value.parentBoundaries[1]);
@ -3262,12 +3435,12 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
}
if (granuleSplitState.first == BlobGranuleSplitState::Assigned) {
TEST(true); // Granule open found granule in assign state
CODE_PROBE(true, "Granule open found granule in assign state");
// was already assigned, use change feed start version
ASSERT(granuleSplitState.second > 0);
info.changeFeedStartVersion = granuleSplitState.second;
} else if (granuleSplitState.first == BlobGranuleSplitState::Initialized) {
TEST(true); // Granule open found granule in initialized state
CODE_PROBE(true, "Granule open found granule in initialized state");
wait(updateGranuleSplitState(&tr,
info.splitParentGranule.get().first,
info.splitParentGranule.get().second,
@ -3276,7 +3449,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
// change feed was created as part of this transaction, changeFeedStartVersion
// will be set later
} else {
TEST(true); // Granule open found granule in done state
CODE_PROBE(true, "Granule open found granule in done state");
// this sub-granule is done splitting, no need for split logic.
info.splitParentGranule.reset();
}
@ -3295,7 +3468,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
: info.blobFilesToSnapshot[0].deltaFiles.back().version;
}
} else if (info.doSnapshot) {
TEST(true); // merge needs to snapshot at start
CODE_PROBE(true, "merge needs to snapshot at start");
state std::vector<Future<GranuleFiles>> parentGranulesToSnapshot;
ASSERT(info.previousDurableVersion == invalidVersion);
// need first snapshot to be at history version so this granule can serve the full range
@ -3359,7 +3532,7 @@ ACTOR Future<Reference<BlobConnectionProvider>> loadBStoreForTenant(Reference<Bl
wait(delay(0));
return data->bstore;
} else {
TEST(true); // bstore for unknown tenant
CODE_PROBE(true, "bstore for unknown tenant");
// Assume not loaded yet, just wait a bit. Could do sophisticated mechanism but will redo tenant
// loading to be versioned anyway. 10 retries means it's likely not a transient race with
// loading tenants, and instead a persistent issue.
@ -3389,7 +3562,8 @@ ACTOR Future<Void> start(Reference<BlobWorkerData> bwData, GranuleRangeMetadata*
return Void();
}
static GranuleRangeMetadata constructActiveBlobRange(Reference<BlobWorkerData> bwData,
namespace {
GranuleRangeMetadata constructActiveBlobRange(Reference<BlobWorkerData> bwData,
KeyRange keyRange,
int64_t epoch,
int64_t seqno) {
@ -3405,12 +3579,12 @@ static GranuleRangeMetadata constructActiveBlobRange(Reference<BlobWorkerData> b
return GranuleRangeMetadata(epoch, seqno, newMetadata);
}
static GranuleRangeMetadata constructInactiveBlobRange(int64_t epoch, int64_t seqno) {
GranuleRangeMetadata constructInactiveBlobRange(int64_t epoch, int64_t seqno) {
return GranuleRangeMetadata(epoch, seqno, Reference<GranuleMetadata>());
}
// ignore stale assignments and make repeating the same one idempotent
static bool newerRangeAssignment(GranuleRangeMetadata oldMetadata, int64_t epoch, int64_t seqno) {
bool newerRangeAssignment(GranuleRangeMetadata oldMetadata, int64_t epoch, int64_t seqno) {
return epoch > oldMetadata.lastEpoch || (epoch == oldMetadata.lastEpoch && seqno > oldMetadata.lastSeqno);
}
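// (Editor's note) Assignments are ordered lexicographically by (epoch, seqno):
// for example (5, 12) beats (5, 9), and (6, 0) beats any (5, *). An equal
// pair is not "newer", which makes re-delivery of the same assignment a no-op.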
@ -3435,7 +3609,7 @@ static bool newerRangeAssignment(GranuleRangeMetadata oldMetadata, int64_t epoch
// state.
// Not an actor because we need to guarantee it changes the mapping synchronously as part of the request
static bool changeBlobRange(Reference<BlobWorkerData> bwData,
bool changeBlobRange(Reference<BlobWorkerData> bwData,
KeyRange keyRange,
int64_t epoch,
int64_t seqno,
@ -3548,7 +3722,7 @@ static bool changeBlobRange(Reference<BlobWorkerData> bwData,
return newerRanges.size() == 0;
}
static bool resumeBlobRange(Reference<BlobWorkerData> bwData, KeyRange keyRange, int64_t epoch, int64_t seqno) {
bool resumeBlobRange(Reference<BlobWorkerData> bwData, KeyRange keyRange, int64_t epoch, int64_t seqno) {
auto existingRange = bwData->granuleMetadata.rangeContaining(keyRange.begin);
// if range boundaries don't match, or this (epoch, seqno) is old or the granule is inactive, ignore
if (keyRange.begin != existingRange.begin() || keyRange.end != existingRange.end() ||
@ -3585,6 +3759,7 @@ static bool resumeBlobRange(Reference<BlobWorkerData> bwData, KeyRange keyRange,
// else we already processed this continue, do nothing
return true;
}
} // namespace
// the contract of handleRangeAssign and handleRangeRevoke is that they change the mapping before doing any
// waiting. This ensures GetGranuleAssignment returns an up-to-date set of ranges
@ -3736,7 +3911,7 @@ ACTOR Future<Void> monitorRemoval(Reference<BlobWorkerData> bwData) {
Optional<Value> val = wait(tr.get(blobWorkerListKey));
if (!val.present()) {
TEST(true); // Blob worker found out BM killed it from reading DB
CODE_PROBE(true, "Blob worker found out BM killed it from reading DB");
return Void();
}
@ -3823,8 +3998,8 @@ ACTOR Future<Void> monitorTenants(Reference<BlobWorkerData> bwData) {
}
}
static void handleGetGranuleAssignmentsRequest(Reference<BlobWorkerData> self,
const GetGranuleAssignmentsRequest& req) {
namespace {
void handleGetGranuleAssignmentsRequest(Reference<BlobWorkerData> self, const GetGranuleAssignmentsRequest& req) {
GetGranuleAssignmentsReply reply;
auto allRanges = self->granuleMetadata.intersectingRanges(normalKeys);
for (auto& it : allRanges) {
@ -3845,6 +4020,7 @@ static void handleGetGranuleAssignmentsRequest(Reference<BlobWorkerData> self,
}
req.reply.send(reply);
}
} // namespace
ACTOR Future<Void> handleFlushGranuleReq(Reference<BlobWorkerData> self, FlushGranuleRequest req) {
++self->stats.flushGranuleReqs;


@ -23,6 +23,13 @@ file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/workloads)
add_flow_target(EXECUTABLE NAME fdbserver SRCS ${FDBSERVER_SRCS})
find_package(ZLIB)
if(ZLIB_FOUND)
add_compile_definitions(ZLIB_LIB_SUPPORTED)
else()
message(STATUS "ZLIB package not found")
endif()
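(Editor's note) The new CMake logic makes zlib optional: the ZLIB_LIB_SUPPORTED compile definition is only added when find_package(ZLIB) succeeds, and the build merely warns otherwise. A hedged sketch of how consuming C++ code can guard on that definition (compressionSupported is an illustrative name, not from this diff):

#ifdef ZLIB_LIB_SUPPORTED
#include <zlib.h>
#endif

// Returns true when this binary was built with zlib available; callers are
// expected to fall back to uncompressed data otherwise.
bool compressionSupported() {
#ifdef ZLIB_LIB_SUPPORTED
    return true;
#else
    return false;
#endif
}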
target_include_directories(fdbserver PRIVATE
${CMAKE_SOURCE_DIR}/bindings/c
${CMAKE_BINARY_DIR}/bindings/c


@ -316,7 +316,7 @@ ACTOR Future<Void> clusterWatchDatabase(ClusterControllerData* cluster,
wait(spinDelay);
TEST(true); // clusterWatchDatabase() master failed
CODE_PROBE(true, "clusterWatchDatabase() master failed");
TraceEvent(SevWarn, "DetectedFailedRecovery", cluster->id).detail("OldMaster", iMaster.id());
} catch (Error& e) {
state Error err = e;
@ -328,13 +328,14 @@ ACTOR Future<Void> clusterWatchDatabase(ClusterControllerData* cluster,
wait(cleanupRecoveryActorCollection(recoveryData, true /* exThrown */));
ASSERT(addActor.isEmpty());
TEST(err.code() == error_code_tlog_failed); // Terminated due to tLog failure
TEST(err.code() == error_code_commit_proxy_failed); // Terminated due to commit proxy failure
TEST(err.code() == error_code_grv_proxy_failed); // Terminated due to GRV proxy failure
TEST(err.code() == error_code_resolver_failed); // Terminated due to resolver failure
TEST(err.code() == error_code_backup_worker_failed); // Terminated due to backup worker failure
TEST(err.code() == error_code_operation_failed); // Terminated due to failed operation
TEST(err.code() == error_code_restart_cluster_controller); // Terminated due to cluster-controller restart.
CODE_PROBE(err.code() == error_code_tlog_failed, "Terminated due to tLog failure");
CODE_PROBE(err.code() == error_code_commit_proxy_failed, "Terminated due to commit proxy failure");
CODE_PROBE(err.code() == error_code_grv_proxy_failed, "Terminated due to GRV proxy failure");
CODE_PROBE(err.code() == error_code_resolver_failed, "Terminated due to resolver failure");
CODE_PROBE(err.code() == error_code_backup_worker_failed, "Terminated due to backup worker failure");
CODE_PROBE(err.code() == error_code_operation_failed, "Terminated due to failed operation");
CODE_PROBE(err.code() == error_code_restart_cluster_controller,
"Terminated due to cluster-controller restart.");
if (cluster->shouldCommitSuicide || err.code() == error_code_coordinators_changed) {
TraceEvent("ClusterControllerTerminate", cluster->id).errorUnsuppressed(err);
@ -622,7 +623,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
}
WorkerDetails newEKPWorker;
if (SERVER_KNOBS->ENABLE_ENCRYPTION || g_network->isSimulated()) {
if (SERVER_KNOBS->ENABLE_ENCRYPTION) {
newEKPWorker = findNewProcessForSingleton(self, ProcessClass::EncryptKeyProxy, id_used);
}
@ -636,7 +637,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
}
ProcessClass::Fitness bestFitnessForEKP;
if (SERVER_KNOBS->ENABLE_ENCRYPTION || g_network->isSimulated()) {
if (SERVER_KNOBS->ENABLE_ENCRYPTION) {
bestFitnessForEKP = findBestFitnessForSingleton(self, newEKPWorker, ProcessClass::EncryptKeyProxy);
}
@ -661,7 +662,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
}
bool ekpHealthy = true;
if (SERVER_KNOBS->ENABLE_ENCRYPTION || g_network->isSimulated()) {
if (SERVER_KNOBS->ENABLE_ENCRYPTION) {
ekpHealthy = isHealthySingleton<EncryptKeyProxyInterface>(
self, newEKPWorker, ekpSingleton, bestFitnessForEKP, self->recruitingEncryptKeyProxyID);
}
@ -685,7 +686,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
}
Optional<Standalone<StringRef>> currEKPProcessId, newEKPProcessId;
if (SERVER_KNOBS->ENABLE_ENCRYPTION || g_network->isSimulated()) {
if (SERVER_KNOBS->ENABLE_ENCRYPTION) {
currEKPProcessId = ekpSingleton.interface.get().locality.processId();
newEKPProcessId = newEKPWorker.interf.locality.processId();
}
@ -697,7 +698,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
newPids.emplace_back(newBMProcessId);
}
if (SERVER_KNOBS->ENABLE_ENCRYPTION || g_network->isSimulated()) {
if (SERVER_KNOBS->ENABLE_ENCRYPTION) {
currPids.emplace_back(currEKPProcessId);
newPids.emplace_back(newEKPProcessId);
}
@ -712,7 +713,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
}
// if the knob is disabled, the EKP coloc counts should have no effect on the coloc counts check below
if (!SERVER_KNOBS->ENABLE_ENCRYPTION && !g_network->isSimulated()) {
if (!SERVER_KNOBS->ENABLE_ENCRYPTION) {
ASSERT(currColocMap[currEKPProcessId] == 0);
ASSERT(newColocMap[newEKPProcessId] == 0);
}
@ -1244,7 +1245,7 @@ ACTOR Future<Void> registerWorker(RegisterWorkerRequest req,
}
checkOutstandingRequests(self);
} else {
TEST(true); // Received an old worker registration request.
CODE_PROBE(true, "Received an old worker registration request.");
}
// For each singleton
@ -1271,7 +1272,7 @@ ACTOR Future<Void> registerWorker(RegisterWorkerRequest req,
self, w, currSingleton, registeringSingleton, self->recruitingBlobManagerID);
}
if ((SERVER_KNOBS->ENABLE_ENCRYPTION || g_network->isSimulated()) && req.encryptKeyProxyInterf.present()) {
if (SERVER_KNOBS->ENABLE_ENCRYPTION && req.encryptKeyProxyInterf.present()) {
auto currSingleton = EncryptKeyProxySingleton(self->db.serverInfo->get().encryptKeyProxy);
auto registeringSingleton = EncryptKeyProxySingleton(req.encryptKeyProxyInterf);
haltRegisteringOrCurrentSingleton<EncryptKeyProxyInterface>(
@ -2525,7 +2526,7 @@ ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
state Future<ErrorOr<Void>> error = errorOr(actorCollection(self.addActor.getFuture()));
// EncryptKeyProxy is necessary for TLog recovery, recruit it as the first process
if (SERVER_KNOBS->ENABLE_ENCRYPTION || g_network->isSimulated()) {
if (SERVER_KNOBS->ENABLE_ENCRYPTION) {
self.addActor.send(monitorEncryptKeyProxy(&self));
}
self.addActor.send(clusterWatchDatabase(


@ -58,7 +58,7 @@ ACTOR Future<Void> recoveryTerminateOnConflict(UID dbgid,
when(wait(onConflict)) {
if (!fullyRecovered.isSet()) {
TraceEvent("RecoveryTerminated", dbgid).detail("Reason", "Conflict");
TEST(true); // Coordinated state conflict, recovery terminating
CODE_PROBE(true, "Coordinated state conflict, recovery terminating");
throw worker_removed();
}
return Void();
@ -110,7 +110,7 @@ ACTOR Future<Void> recruitNewMaster(ClusterControllerData* cluster,
return Void();
} else {
TEST(true); // clusterWatchDatabase() !newMaster.present()
CODE_PROBE(true, "clusterWatchDatabase() !newMaster.present()");
wait(delay(SERVER_KNOBS->MASTER_SPIN_DELAY));
}
}
@ -118,7 +118,7 @@ ACTOR Future<Void> recruitNewMaster(ClusterControllerData* cluster,
ACTOR Future<Void> clusterRecruitFromConfiguration(ClusterControllerData* self, Reference<RecruitWorkersInfo> req) {
// At the moment this doesn't really need to be an actor (it always completes immediately)
TEST(true); // ClusterController RecruitTLogsRequest
CODE_PROBE(true, "ClusterController RecruitTLogsRequest");
loop {
try {
req->rep = self->findWorkersForConfiguration(req->req);
@ -150,7 +150,7 @@ ACTOR Future<RecruitRemoteFromConfigurationReply> clusterRecruitRemoteFromConfig
ClusterControllerData* self,
Reference<RecruitRemoteWorkersInfo> req) {
// At the moment this doesn't really need to be an actor (it always completes immediately)
TEST(true); // ClusterController RecruitTLogsRequest Remote
CODE_PROBE(true, "ClusterController RecruitTLogsRequest Remote");
loop {
try {
auto rep = self->findRemoteWorkersForConfiguration(req->req);
@ -355,7 +355,7 @@ ACTOR Future<Void> newSeedServers(Reference<ClusterRecoveryData> self,
!newServer.isError(error_code_request_maybe_delivered))
throw newServer.getError();
TEST(true); // initial storage recruitment loop failed to get new server
CODE_PROBE(true, "initial storage recruitment loop failed to get new server");
wait(delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY));
} else {
if (!dcId_tags.count(recruits.storageServers[idx].locality.dcId())) {
@ -736,7 +736,7 @@ ACTOR Future<Void> updateLogsValue(Reference<ClusterRecoveryData> self, Database
}
if (!found) {
TEST(true); // old master attempted to change logsKey
CODE_PROBE(true, "old master attempted to change logsKey");
return Void();
}
@ -815,7 +815,7 @@ ACTOR Future<Void> updateRegistration(Reference<ClusterRecoveryData> self, Refer
std::vector<UID>()));
} else {
// The cluster should enter the accepting commits phase soon, and then we will register again
TEST(true); // cstate is updated but we aren't accepting commits yet
CODE_PROBE(true, "cstate is updated but we aren't accepting commits yet");
}
}
}
@ -1357,7 +1357,7 @@ ACTOR Future<Void> recoverFrom(Reference<ClusterRecoveryData> self,
}
when(Standalone<CommitTransactionRef> _req = wait(provisional)) {
state Standalone<CommitTransactionRef> req = _req; // mutable
TEST(true); // Emergency transaction processing during recovery
CODE_PROBE(true, "Emergency transaction processing during recovery");
TraceEvent("EmergencyTransaction", self->dbgid).log();
for (auto m = req.mutations.begin(); m != req.mutations.end(); ++m)
TraceEvent("EmergencyTransactionMutation", self->dbgid)
@ -1559,7 +1559,7 @@ ACTOR Future<Void> clusterRecoveryCore(Reference<ClusterRecoveryData> self) {
.detail("SnapRecoveryFlag", snapRecoveryFlag.present() ? snapRecoveryFlag.get().toString() : "N/A")
.detail("LastEpochEnd", self->lastEpochEnd);
if (snapRecoveryFlag.present()) {
TEST(true); // Recovering from snapshot, writing to snapShotEndVersionKey
CODE_PROBE(true, "Recovering from snapshot, writing to snapShotEndVersionKey");
BinaryWriter bw(Unversioned());
tr.set(recoveryCommitRequest.arena, snapshotEndVersionKey, (bw << self->lastEpochEnd).toValue());
// Pause the backups that got restored in this snapshot to avoid data corruption
@ -1659,7 +1659,7 @@ ACTOR Future<Void> clusterRecoveryCore(Reference<ClusterRecoveryData> self) {
// unless we want to change TLogs
wait((success(recoveryCommit) && sendInitialCommitToResolvers(self)));
if (recoveryCommit.isReady() && recoveryCommit.get().isError()) {
TEST(true); // Cluster recovery failed because the initial commit failed
CODE_PROBE(true, "Cluster recovery failed because the initial commit failed");
throw cluster_recovery_failed();
}


@ -789,7 +789,7 @@ ACTOR Future<Void> preresolutionProcessing(CommitBatchContext* self) {
}
// Pre-resolution the commits
TEST(pProxyCommitData->latestLocalCommitBatchResolving.get() < localBatchNumber - 1); // Wait for local batch
CODE_PROBE(pProxyCommitData->latestLocalCommitBatchResolving.get() < localBatchNumber - 1, "Wait for local batch");
wait(pProxyCommitData->latestLocalCommitBatchResolving.whenAtLeast(localBatchNumber - 1));
pProxyCommitData->stats.computeLatency.addMeasurement(now() - timeStart);
double queuingDelay = g_network->now() - timeStart;
@ -798,7 +798,7 @@ ACTOR Future<Void> preresolutionProcessing(CommitBatchContext* self) {
(g_network->isSimulated() && BUGGIFY_WITH_PROB(0.01))) &&
SERVER_KNOBS->PROXY_REJECT_BATCH_QUEUED_TOO_LONG && canReject(trs)) {
// Disabled for the recovery transaction; otherwise, recovery can't finish and keeps doing more recoveries.
TEST(true); // Reject transactions in the batch
CODE_PROBE(true, "Reject transactions in the batch");
TraceEvent(SevWarnAlways, "ProxyReject", pProxyCommitData->dbgid)
.suppressFor(0.1)
.detail("QDelay", queuingDelay)
@ -1152,7 +1152,7 @@ void writeMutation(CommitBatchContext* self, int64_t tenantId, const MutationRef
bool isRawAccess = tenantId == TenantInfo::INVALID_TENANT && !isSystemKey(mutation.param1) &&
!(mutation.type == MutationRef::ClearRange && isSystemKey(mutation.param2)) &&
self->pProxyCommitData->db->get().client.tenantMode == TenantMode::REQUIRED;
TEST(isRawAccess); // Raw access to tenant key space
CODE_PROBE(isRawAccess, "Raw access to tenant key space");
self->toCommit.writeTypedMessage(mutation);
} else {
Arena arena;
@ -1259,7 +1259,7 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
trCost->get().clearIdxCosts.pop_front();
}
} else {
TEST(true); // A clear range extends past a shard boundary
CODE_PROBE(true, "A clear range extends past a shard boundary");
std::set<Tag> allSources;
for (auto r : ranges) {
r.value().populateTags();
@ -1347,7 +1347,7 @@ ACTOR Future<Void> postResolution(CommitBatchContext* self) {
state Span span("MP:postResolution"_loc, self->span.context);
bool queuedCommits = pProxyCommitData->latestLocalCommitBatchLogging.get() < localBatchNumber - 1;
TEST(queuedCommits); // Queuing post-resolution commit processing
CODE_PROBE(queuedCommits, "Queuing post-resolution commit processing");
wait(pProxyCommitData->latestLocalCommitBatchLogging.whenAtLeast(localBatchNumber - 1));
state double postResolutionQueuing = now();
pProxyCommitData->stats.postResolutionDist->sampleSeconds(postResolutionQueuing - postResolutionStart);
@ -1424,7 +1424,7 @@ ACTOR Future<Void> postResolution(CommitBatchContext* self) {
self->commitVersion - SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS) {
// This should be *extremely* rare in the real world, but knob buggification should make it happen in
// simulation
TEST(true); // Semi-committed pipeline limited by MVCC window
CODE_PROBE(true, "Semi-committed pipeline limited by MVCC window");
//TraceEvent("ProxyWaitingForCommitted", pProxyCommitData->dbgid).detail("CommittedVersion", pProxyCommitData->committedVersion.get()).detail("NeedToCommit", commitVersion);
waitVersionSpan = Span("MP:overMaxReadTransactionLifeVersions"_loc, span.context);
choose {
@ -1617,7 +1617,8 @@ ACTOR Future<Void> reply(CommitBatchContext* self) {
// client may get a commit version that the master is not aware of, and next GRV request may get a version less than
// self->committedVersion.
TEST(pProxyCommitData->committedVersion.get() > self->commitVersion); // later version was reported committed first
CODE_PROBE(pProxyCommitData->committedVersion.get() > self->commitVersion,
"later version was reported committed first");
if (self->commitVersion >= pProxyCommitData->committedVersion.get()) {
state Optional<std::set<Tag>> writtenTags;
@ -2603,7 +2604,7 @@ ACTOR Future<Void> commitProxyServer(CommitProxyInterface proxy,
e.code() != error_code_failed_to_progress) {
throw;
}
TEST(e.code() == error_code_failed_to_progress); // Commit proxy failed to progress
CODE_PROBE(e.code() == error_code_failed_to_progress, "Commit proxy failed to progress");
}
return Void();
}


@ -206,7 +206,7 @@ class ConfigNodeImpl {
// Handle a very rare case where a ConfigNode loses data between
// responding with a committed version and responding to the
// subsequent get changes request.
TEST(true); // ConfigNode data loss occurred on a minority of coordinators
CODE_PROBE(true, "ConfigNode data loss occurred on a minority of coordinators");
req.reply.sendError(process_behind()); // Reuse the process_behind error
return Void();
}
@ -230,7 +230,8 @@ class ConfigNodeImpl {
state ConfigGeneration generation = wait(getGeneration(self));
++generation.liveVersion;
if (req.lastSeenLiveVersion.present()) {
TEST(req.lastSeenLiveVersion.get() >= generation.liveVersion); // Node is lagging behind some other node
CODE_PROBE(req.lastSeenLiveVersion.get() >= generation.liveVersion,
"Node is lagging behind some other node");
generation.liveVersion = std::max(generation.liveVersion, req.lastSeenLiveVersion.get() + 1);
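// Worked example (editor's note): if this node's liveVersion advances to 7
// but the client already saw liveVersion 9 elsewhere, max(7, 9 + 1) = 10
// keeps the generation handed out here strictly ahead of anything the
// client has observed.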
}
self->kvStore->set(KeyValueRef(currentGenerationKey, BinaryWriter::toValue(generation, IncludeVersion())));


@ -79,13 +79,13 @@ struct CoordinatedStateImpl {
CoordinatedStateImpl(ServerCoordinators const& c)
: coordinators(c), stage(0), conflictGen(0), doomed(false), ac(false), initial(false) {}
uint64_t getConflict() { return conflictGen; }
uint64_t getConflict() const { return conflictGen; }
bool isDoomed(GenerationRegReadReply const& rep) {
return rep.gen > gen // setExclusive is doomed, because there was a write at least started at a higher
bool isDoomed(GenerationRegReadReply const& rep) const {
return rep.gen > gen;
// setExclusive is doomed, because there was a write at least started at a higher
// generation, which means a read completed at that higher generation
// || rep.rgen > gen // setExclusive isn't absolutely doomed, but it may/probably will fail
;
}
ACTOR static Future<Value> read(CoordinatedStateImpl* self) {
@ -216,7 +216,7 @@ struct CoordinatedStateImpl {
};
CoordinatedState::CoordinatedState(ServerCoordinators const& coord)
: impl(std::make_unique<CoordinatedStateImpl>(coord)) {}
: impl(PImpl<CoordinatedStateImpl>::create(coord)) {}
CoordinatedState::~CoordinatedState() = default;
Future<Value> CoordinatedState::read() {
return CoordinatedStateImpl::read(impl.get());
@ -227,7 +227,7 @@ Future<Void> CoordinatedState::onConflict() {
Future<Void> CoordinatedState::setExclusive(Value v) {
return CoordinatedStateImpl::setExclusive(impl.get(), v);
}
uint64_t CoordinatedState::getConflict() {
uint64_t CoordinatedState::getConflict() const {
return impl->getConflict();
}
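(Editor's note) Several pimpl members in this file move from std::unique_ptr<Impl> to a PImpl<Impl> wrapper built via PImpl<T>::create(...). The wrapper's definition is not shown in this diff; as a minimal sketch only, under the assumption that it is a create-only owning handle (DemoPImpl and its members are invented for illustration), it could look like:

#include <memory>
#include <utility>

// Hypothetical pimpl wrapper: owns a heap-allocated T and is constructible
// only through create(), keeping the wrapped type out of the header.
template <class T>
class DemoPImpl {
    std::unique_ptr<T> impl;
    explicit DemoPImpl(std::unique_ptr<T> p) : impl(std::move(p)) {}

public:
    template <class... Args>
    static DemoPImpl create(Args&&... args) {
        return DemoPImpl(std::make_unique<T>(std::forward<Args>(args)...));
    }
    T* get() const { return impl.get(); }
    T* operator->() const { return impl.get(); }
};

This would match how the call sites above use it: impl.get() where a raw pointer is needed, and impl-> for member calls.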
@ -273,7 +273,7 @@ struct MovableCoordinatedStateImpl {
// SOMEDAY: If moveState.mode == MovingFrom, read (without locking) old state and assert that it corresponds
// with our state and is ReallyTo(coordinators)
if (moveState.mode == MovableValue::MaybeTo) {
TEST(true); // Maybe moveto state
CODE_PROBE(true, "Maybe moveto state");
ASSERT(moveState.other.present());
wait(self->moveTo(
self, &self->cs, ClusterConnectionString(moveState.other.get().toString()), moveState.value));
@ -322,7 +322,7 @@ struct MovableCoordinatedStateImpl {
Value oldQuorumState = wait(cs.read());
if (oldQuorumState != self->lastCSValue.get()) {
TEST(true); // Quorum change aborted by concurrent write to old coordination state
CODE_PROBE(true, "Quorum change aborted by concurrent write to old coordination state");
TraceEvent("QuorumChangeAbortedByConcurrency").log();
throw coordinated_state_conflict();
}
@ -354,7 +354,7 @@ struct MovableCoordinatedStateImpl {
MovableCoordinatedState& MovableCoordinatedState::operator=(MovableCoordinatedState&&) = default;
MovableCoordinatedState::MovableCoordinatedState(class ServerCoordinators const& coord)
: impl(std::make_unique<MovableCoordinatedStateImpl>(coord)) {}
: impl(PImpl<MovableCoordinatedStateImpl>::create(coord)) {}
MovableCoordinatedState::~MovableCoordinatedState() = default;
Future<Value> MovableCoordinatedState::read() {
return MovableCoordinatedStateImpl::read(impl.get());


@ -942,8 +942,9 @@ public:
: SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY);
}
} else {
TEST(true); // A removed server is still associated with a team in
// ShardsAffectedByTeamFailure
CODE_PROBE(true,
"A removed server is still associated with a team in "
"ShardsAffectedByTeamFailure");
}
}
}
@ -1253,7 +1254,7 @@ public:
server->updateLastKnown(newInterface.first, newInterface.second);
if (localityChanged && !isTss) {
TEST(true); // Server locality changed
CODE_PROBE(true, "Server locality changed");
// The locality change of a server will affect machine teams related to the server if
// the server's machine locality is changed
@ -1320,7 +1321,7 @@ public:
}
}
if (addedNewBadTeam && self->badTeamRemover.isReady()) {
TEST(true); // Server locality change created bad teams
CODE_PROBE(true, "Server locality change created bad teams");
self->doBuildTeams = true;
self->badTeamRemover = removeBadTeams(self);
self->addActor.send(self->badTeamRemover);
@ -1724,7 +1725,7 @@ public:
// in the serverTeams vector in the machine team.
--teamIndex;
self->addTeam(team->getServers(), IsInitialTeam::True, IsRedundantTeam::True);
TEST(true); // Removed machine team
CODE_PROBE(true, "Removed machine team");
}
self->doBuildTeams = true;
@ -1808,7 +1809,7 @@ public:
bool foundTeam = self->removeTeam(st);
ASSERT(foundTeam);
self->addTeam(st->getServers(), IsInitialTeam::True, IsRedundantTeam::True);
TEST(true); // Marked team as a bad team
CODE_PROBE(true, "Marked team as a bad team");
self->doBuildTeams = true;
@ -2052,7 +2053,7 @@ public:
if (self->wigglingId.present()) {
state UID id = self->wigglingId.get();
if (self->pauseWiggle->get()) {
TEST(true); // paused because cluster is unhealthy
CODE_PROBE(true, "paused because cluster is unhealthy");
moveFinishFuture = Never();
self->includeStorageServersForWiggle();
self->storageWiggler->setWiggleState(StorageWiggler::PAUSE);
@ -2068,7 +2069,7 @@ public:
} else {
choose {
when(wait(self->waitUntilHealthy())) {
TEST(true); // start wiggling
CODE_PROBE(true, "start wiggling");
wait(self->storageWiggler->startWiggle());
auto fv = self->excludeStorageServersForWiggle(id);
moveFinishFuture = fv;
@ -2431,10 +2432,10 @@ public:
// SS and/or TSS recruitment failed at this point, update tssState
if (recruitTss && tssState->tssRecruitFailed()) {
tssState->markComplete();
TEST(true); // TSS recruitment failed for some reason
CODE_PROBE(true, "TSS recruitment failed for some reason");
}
if (!recruitTss && tssState->ssRecruitFailed()) {
TEST(true); // SS with pair TSS recruitment failed for some reason
CODE_PROBE(true, "SS with pair TSS recruitment failed for some reason");
}
self->recruitingStream.set(self->recruitingStream.get() - 1);
@ -2575,7 +2576,7 @@ public:
.detail("Addr", candidateSSAddr.toString())
.detail("Locality", candidateWorker.worker.locality.toString());
TEST(true); // Starting TSS recruitment
CODE_PROBE(true, "Starting TSS recruitment");
self->isTssRecruiting = true;
tssState = makeReference<TSSPairState>(candidateWorker.worker.locality);
@ -2585,7 +2586,7 @@ public:
checkTss = self->initialFailureReactionDelay;
} else {
if (tssState->active && tssState->inDataZone(candidateWorker.worker.locality)) {
TEST(true); // TSS recruits pair in same dc/datahall
CODE_PROBE(true, "TSS recruits pair in same dc/datahall");
self->isTssRecruiting = false;
TraceEvent("TSS_Recruit", self->distributorId)
.detail("Stage", "PairSS")
@ -2596,8 +2597,9 @@ public:
// successfully started recruitment of pair, reset tss recruitment state
tssState = makeReference<TSSPairState>();
} else {
TEST(tssState->active); // TSS recruitment skipped potential pair because it's in a
// different dc/datahall
CODE_PROBE(
tssState->active,
"TSS recruitment skipped potential pair because it's in a different dc/datahall");
self->addActor.send(initializeStorage(
self, candidateWorker, ddEnabledState, false, makeReference<TSSPairState>()));
}
@ -2617,8 +2619,9 @@ public:
int tssToKill = std::min((int)self->tss_info_by_pair.size(),
std::max(-tssToRecruit, self->zeroHealthyTeams->get() ? 1 : 0));
if (cancelTss) {
TEST(tssToRecruit < 0); // tss recruitment cancelled due to too many TSS
TEST(self->zeroHealthyTeams->get()); // tss recruitment cancelled due to zero healthy teams
CODE_PROBE(tssToRecruit < 0, "tss recruitment cancelled due to too many TSS");
CODE_PROBE(self->zeroHealthyTeams->get(),
           "tss recruitment cancelled due to zero healthy teams");
TraceEvent(SevWarn, "TSS_RecruitCancelled", self->distributorId)
.detail("Reason", tssToRecruit <= 0 ? "TooMany" : "ZeroHealthyTeams");
@ -2637,8 +2640,8 @@ public:
if (self->shouldHandleServer(tssi) && self->server_and_tss_info.count(tssId)) {
Promise<Void> killPromise = itr->second->killTss;
if (killPromise.canBeSet()) {
TEST(tssToRecruit < 0); // Killing TSS due to too many TSS
TEST(self->zeroHealthyTeams->get()); // Killing TSS due to zero healthy teams
CODE_PROBE(tssToRecruit < 0, "Killing TSS due to too many TSS");
CODE_PROBE(self->zeroHealthyTeams->get(), "Killing TSS due to zero healthy teams");
TraceEvent(SevWarn, "TSS_DDKill", self->distributorId)
.detail("TSSID", tssId)
.detail("Reason",
@ -2672,7 +2675,7 @@ public:
if (e.code() != error_code_timed_out) {
throw;
}
TEST(true); // Storage recruitment timed out
CODE_PROBE(true, "Storage recruitment timed out");
}
}
}
@ -2992,14 +2995,14 @@ public:
loop choose {
when(UID removedServer = waitNext(self->removedServers.getFuture())) {
TEST(true); // Storage server removed from database
CODE_PROBE(true, "Storage server removed from database");
self->removeServer(removedServer);
serverRemoved.send(Void());
self->restartRecruiting.trigger();
}
when(UID removedTSS = waitNext(self->removedTSS.getFuture())) {
TEST(true); // TSS removed from database
CODE_PROBE(true, "TSS removed from database");
self->removeTSS(removedTSS);
serverRemoved.send(Void());
@ -4808,7 +4811,7 @@ Reference<TCMachineInfo> DDTeamCollection::checkAndCreateMachine(Reference<TCSer
Reference<TCMachineInfo> machineInfo;
if (machine_info.find(machine_id) == machine_info.end()) {
// uid is the first storage server process on the machine
TEST(true); // First storage server in process on the machine
CODE_PROBE(true, "First storage server in process on the machine");
// For each machine, store the first server's localityEntry into machineInfo for later use.
LocalityEntry localityEntry = machineLocalityMap.add(locality, &server->getId());
machineInfo = makeReference<TCMachineInfo>(server, localityEntry);


@ -250,7 +250,7 @@ class DDTxnProcessorImpl {
// If keyServers is too large to read in a single transaction, then we will have to break this process up into
// multiple transactions. In that case, each iteration should begin where the previous left off
while (beginKey < allKeys.end) {
TEST(beginKey > allKeys.begin); // Multi-transactional getInitialDataDistribution
CODE_PROBE(beginKey > allKeys.begin, "Multi-transactional getInitialDataDistribution");
loop {
succeeded = false;
try {


@ -310,6 +310,8 @@ struct DataDistributor : NonCopyable, ReferenceCounted<DataDistributor> {
// Optional components that can be set after ::init(). They're optional in tests, but required for DD to be
// fully functional.
DDTeamCollection* teamCollection;
Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure;
PromiseStream<RelocateShard> relocationProducer, relocationConsumer; // consumer is a yield stream from producer
DataDistributor(Reference<AsyncVar<ServerDBInfo> const> const& db, UID id)
: dbInfo(db), ddId(id), txnProcessor(nullptr), initialDDEventHolder(makeReference<EventCacheHolder>("InitialDD")),
@ -433,6 +435,88 @@ struct DataDistributor : NonCopyable, ReferenceCounted<DataDistributor> {
}
return Void();
}
// Resume inflight relocations from the previous DD
// TODO: add a test to verify the inflight relocation correctness and measure the memory usage with 4 million shards
ACTOR static Future<Void> resumeRelocations(Reference<DataDistributor> self) {
ASSERT(self->shardsAffectedByTeamFailure); // has to be allocated
state int shard = 0;
for (; shard < self->initData->shards.size() - 1; shard++) {
const DDShardInfo& iShard = self->initData->shards[shard];
KeyRangeRef keys = KeyRangeRef(iShard.key, self->initData->shards[shard + 1].key);
self->shardsAffectedByTeamFailure->defineShard(keys);
std::vector<ShardsAffectedByTeamFailure::Team> teams;
teams.push_back(ShardsAffectedByTeamFailure::Team(iShard.primarySrc, true));
if (self->configuration.usableRegions > 1) {
teams.push_back(ShardsAffectedByTeamFailure::Team(iShard.remoteSrc, false));
}
if (g_network->isSimulated()) {
TraceEvent("DDInitShard")
.detail("Keys", keys)
.detail("PrimarySrc", describe(iShard.primarySrc))
.detail("RemoteSrc", describe(iShard.remoteSrc))
.detail("PrimaryDest", describe(iShard.primaryDest))
.detail("RemoteDest", describe(iShard.remoteDest))
.detail("SrcID", iShard.srcId)
.detail("DestID", iShard.destId);
}
self->shardsAffectedByTeamFailure->moveShard(keys, teams);
if (iShard.hasDest && iShard.destId == anonymousShardId) {
// This shard is already in flight. Ideally we should use dest in ShardsAffectedByTeamFailure and
// generate a dataDistributionRelocator directly in DataDistributionQueue to track it, but it's
// easier to just (with low priority) schedule it for movement.
bool unhealthy = iShard.primarySrc.size() != self->configuration.storageTeamSize;
if (!unhealthy && self->configuration.usableRegions > 1) {
unhealthy = iShard.remoteSrc.size() != self->configuration.storageTeamSize;
}
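// (Editor's note) A shard counts as unhealthy when either region's source
// team is below the configured replication factor; that bumps its
// relocation priority from PRIORITY_RECOVER_MOVE to PRIORITY_TEAM_UNHEALTHY
// in the send below.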
self->relocationProducer.send(RelocateShard(keys,
unhealthy ? SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY
: SERVER_KNOBS->PRIORITY_RECOVER_MOVE,
RelocateReason::OTHER));
}
wait(yield(TaskPriority::DataDistribution));
}
state KeyRangeMap<std::shared_ptr<DataMove>>::iterator it = self->initData->dataMoveMap.ranges().begin();
for (; it != self->initData->dataMoveMap.ranges().end(); ++it) {
const DataMoveMetaData& meta = it.value()->meta;
if (it.value()->isCancelled() || (it.value()->valid && !CLIENT_KNOBS->SHARD_ENCODE_LOCATION_METADATA)) {
RelocateShard rs(meta.range, SERVER_KNOBS->PRIORITY_RECOVER_MOVE, RelocateReason::OTHER);
rs.dataMoveId = meta.id;
rs.cancelled = true;
self->relocationProducer.send(rs);
TraceEvent("DDInitScheduledCancelDataMove", self->ddId).detail("DataMove", meta.toString());
} else if (it.value()->valid) {
TraceEvent(SevDebug, "DDInitFoundDataMove", self->ddId).detail("DataMove", meta.toString());
ASSERT(meta.range == it.range());
// TODO: Persist priority in DataMoveMetaData.
RelocateShard rs(meta.range, SERVER_KNOBS->PRIORITY_RECOVER_MOVE, RelocateReason::OTHER);
rs.dataMoveId = meta.id;
rs.dataMove = it.value();
std::vector<ShardsAffectedByTeamFailure::Team> teams;
teams.push_back(ShardsAffectedByTeamFailure::Team(rs.dataMove->primaryDest, true));
if (!rs.dataMove->remoteDest.empty()) {
teams.push_back(ShardsAffectedByTeamFailure::Team(rs.dataMove->remoteDest, false));
}
// Since a DataMove could cover more than one keyrange, e.g., during merge, we need to define
// the target shard and restart the shard tracker.
self->shardsAffectedByTeamFailure->restartShardTracker.send(rs.keys);
self->shardsAffectedByTeamFailure->defineShard(rs.keys);
// When restoring a DataMove, the destination team is determined, and hence we need to register
// the data move now, so that team failures can be captured.
self->shardsAffectedByTeamFailure->moveShard(rs.keys, teams);
self->relocationProducer.send(rs);
wait(yield(TaskPriority::DataDistribution));
}
}
return Void();
}
};
// Runs the data distribution algorithm for FDB, including the DD Queue, DD tracker, and DD team collection
@ -473,8 +557,6 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
// When/If this assertion fails, Evan owes Ben a pat on the back for his foresight
ASSERT(self->configuration.storageTeamSize > 0);
state PromiseStream<RelocateShard> output;
state PromiseStream<RelocateShard> input;
state PromiseStream<Promise<int64_t>> getAverageShardBytes;
state PromiseStream<Promise<int>> getUnhealthyRelocationCount;
state PromiseStream<GetMetricsRequest> getShardMetrics;
@ -482,82 +564,8 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
state Reference<AsyncVar<bool>> processingUnhealthy(new AsyncVar<bool>(false));
state Reference<AsyncVar<bool>> processingWiggle(new AsyncVar<bool>(false));
state Promise<Void> readyToStart;
state Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure(new ShardsAffectedByTeamFailure);
state int shard = 0;
for (; shard < self->initData->shards.size() - 1; shard++) {
const DDShardInfo& iShard = self->initData->shards[shard];
KeyRangeRef keys = KeyRangeRef(iShard.key, self->initData->shards[shard + 1].key);
shardsAffectedByTeamFailure->defineShard(keys);
std::vector<ShardsAffectedByTeamFailure::Team> teams;
teams.push_back(ShardsAffectedByTeamFailure::Team(iShard.primarySrc, true));
if (self->configuration.usableRegions > 1) {
teams.push_back(ShardsAffectedByTeamFailure::Team(iShard.remoteSrc, false));
}
if (g_network->isSimulated()) {
TraceEvent("DDInitShard")
.detail("Keys", keys)
.detail("PrimarySrc", describe(iShard.primarySrc))
.detail("RemoteSrc", describe(iShard.remoteSrc))
.detail("PrimaryDest", describe(iShard.primaryDest))
.detail("RemoteDest", describe(iShard.remoteDest))
.detail("SrcID", iShard.srcId)
.detail("DestID", iShard.destId);
}
shardsAffectedByTeamFailure->moveShard(keys, teams);
if (iShard.hasDest && iShard.destId == anonymousShardId) {
// This shard is already in flight. Ideally we should use dest in ShardsAffectedByTeamFailure and
// generate a dataDistributionRelocator directly in DataDistributionQueue to track it, but it's
// easier to just (with low priority) schedule it for movement.
bool unhealthy = iShard.primarySrc.size() != self->configuration.storageTeamSize;
if (!unhealthy && self->configuration.usableRegions > 1) {
unhealthy = iShard.remoteSrc.size() != self->configuration.storageTeamSize;
}
output.send(RelocateShard(keys,
unhealthy ? SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY
: SERVER_KNOBS->PRIORITY_RECOVER_MOVE,
RelocateReason::OTHER));
}
wait(yield(TaskPriority::DataDistribution));
}
state KeyRangeMap<std::shared_ptr<DataMove>>::iterator it = self->initData->dataMoveMap.ranges().begin();
for (; it != self->initData->dataMoveMap.ranges().end(); ++it) {
const DataMoveMetaData& meta = it.value()->meta;
if (it.value()->isCancelled() || (it.value()->valid && !CLIENT_KNOBS->SHARD_ENCODE_LOCATION_METADATA)) {
RelocateShard rs(meta.range, SERVER_KNOBS->PRIORITY_RECOVER_MOVE, RelocateReason::OTHER);
rs.dataMoveId = meta.id;
rs.cancelled = true;
output.send(rs);
TraceEvent("DDInitScheduledCancelDataMove", self->ddId).detail("DataMove", meta.toString());
} else if (it.value()->valid) {
TraceEvent(SevDebug, "DDInitFoundDataMove", self->ddId).detail("DataMove", meta.toString());
ASSERT(meta.range == it.range());
// TODO: Persist priority in DataMoveMetaData.
RelocateShard rs(meta.range, SERVER_KNOBS->PRIORITY_RECOVER_MOVE, RelocateReason::OTHER);
rs.dataMoveId = meta.id;
rs.dataMove = it.value();
std::vector<ShardsAffectedByTeamFailure::Team> teams;
teams.push_back(ShardsAffectedByTeamFailure::Team(rs.dataMove->primaryDest, true));
if (!rs.dataMove->remoteDest.empty()) {
teams.push_back(ShardsAffectedByTeamFailure::Team(rs.dataMove->remoteDest, false));
}
// Since a DataMove could cover more than one keyrange, e.g., during merge, we need to define
// the target shard and restart the shard tracker.
shardsAffectedByTeamFailure->restartShardTracker.send(rs.keys);
shardsAffectedByTeamFailure->defineShard(rs.keys);
// When restoring a DataMove, the destination team is determined, and hence we need to register
// the data move now, so that team failures can be captured.
shardsAffectedByTeamFailure->moveShard(rs.keys, teams);
output.send(rs);
wait(yield(TaskPriority::DataDistribution));
}
}
self->shardsAffectedByTeamFailure = makeReference<ShardsAffectedByTeamFailure>();
wait(DataDistributor::resumeRelocations(self));
std::vector<TeamCollectionInterface> tcis;
@ -586,8 +594,8 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
actors.push_back(pollMoveKeysLock(cx, self->lock, ddEnabledState));
actors.push_back(reportErrorsExcept(dataDistributionTracker(self->initData,
cx,
output,
shardsAffectedByTeamFailure,
self->relocationProducer,
self->shardsAffectedByTeamFailure,
getShardMetrics,
getTopKShardMetrics.getFuture(),
getShardMetricsList,
@ -601,14 +609,14 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
self->ddId,
&normalDDQueueErrors()));
actors.push_back(reportErrorsExcept(dataDistributionQueue(cx,
output,
input.getFuture(),
self->relocationProducer,
self->relocationConsumer.getFuture(),
getShardMetrics,
getTopKShardMetrics,
processingUnhealthy,
processingWiggle,
tcis,
shardsAffectedByTeamFailure,
self->shardsAffectedByTeamFailure,
self->lock,
getAverageShardBytes,
getUnhealthyRelocationCount.getFuture(),
@ -625,8 +633,8 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
cx,
self->ddId,
self->lock,
output,
shardsAffectedByTeamFailure,
self->relocationProducer,
self->shardsAffectedByTeamFailure,
self->configuration,
self->primaryDcId,
self->configuration.usableRegions > 1 ? self->remoteDcIds : std::vector<Optional<Key>>(),
@ -646,8 +654,8 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
makeReference<DDTeamCollection>(cx,
self->ddId,
self->lock,
output,
shardsAffectedByTeamFailure,
self->relocationProducer,
self->shardsAffectedByTeamFailure,
self->configuration,
self->remoteDcIds,
Optional<std::vector<Optional<Key>>>(),
@ -678,7 +686,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
&normalDDQueueErrors()));
actors.push_back(DDTeamCollection::printSnapshotTeamsInfo(primaryTeamCollection));
actors.push_back(yieldPromiseStream(output.getFuture(), input));
actors.push_back(yieldPromiseStream(self->relocationProducer.getFuture(), self->relocationConsumer));
wait(waitForAll(actors));
return Void();
@ -873,7 +881,7 @@ ACTOR Future<std::map<NetworkAddress, std::pair<WorkerInterface, std::string>>>
configuration.storageTeamSize - 1) -
storageFailures;
if (*storageFaultTolerance < 0) {
TEST(true); // Too many failed storage servers to complete snapshot
CODE_PROBE(true, "Too many failed storage servers to complete snapshot");
throw snap_storage_failed();
}
// tlogs
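From here on the commit also mechanically rewrites coverage probes: the old TEST(cond) macro carried its description in a trailing comment, while CODE_PROBE(cond, "description") takes the description as a real argument. A representative before/after pair from the hunk above (the macro internals live in Flow's coverage machinery and are not shown here):

TEST(true); // Too many failed storage servers to complete snapshot
CODE_PROBE(true, "Too many failed storage servers to complete snapshot");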
@ -1319,14 +1327,14 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
when(DistributorSnapRequest snapReq = waitNext(di.distributorSnapReq.getFuture())) {
auto& snapUID = snapReq.snapUID;
if (ddSnapReqResultMap.count(snapUID)) {
TEST(true); // Data distributor received a duplicate finished snap request
CODE_PROBE(true, "Data distributor received a duplicate finished snap request");
auto result = ddSnapReqResultMap[snapUID];
result.isError() ? snapReq.reply.sendError(result.getError()) : snapReq.reply.send(result.get());
TraceEvent("RetryFinishedDistributorSnapRequest")
.detail("SnapUID", snapUID)
.detail("Result", result.isError() ? result.getError().code() : 0);
} else if (ddSnapReqMap.count(snapReq.snapUID)) {
TEST(true); // Data distributor received a duplicate ongoing snap request
CODE_PROBE(true, "Data distributor received a duplicate ongoing snap request");
TraceEvent("RetryOngoingDistributorSnapRequest").detail("SnapUID", snapUID);
ASSERT(snapReq.snapPayload == ddSnapReqMap[snapUID].snapPayload);
ddSnapReqMap[snapUID] = snapReq;
@ -1361,6 +1369,8 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
return Void();
}
namespace data_distribution_test {
static Future<ErrorOr<Void>> goodTestFuture(double duration) {
return tag(delay(duration), ErrorOr<Void>(Void()));
}
@ -1369,29 +1379,41 @@ static Future<ErrorOr<Void>> badTestFuture(double duration, Error e) {
return tag(delay(duration), ErrorOr<Void>(e));
}
} // namespace data_distribution_test
TEST_CASE("/DataDistribution/WaitForMost") {
state std::vector<Future<ErrorOr<Void>>> futures;
{
futures = { goodTestFuture(1), goodTestFuture(2), goodTestFuture(3) };
futures = { data_distribution_test::goodTestFuture(1),
data_distribution_test::goodTestFuture(2),
data_distribution_test::goodTestFuture(3) };
wait(waitForMost(futures, 1, operation_failed(), 0.0)); // Don't wait for slowest future
ASSERT(!futures[2].isReady());
}
{
futures = { goodTestFuture(1), goodTestFuture(2), goodTestFuture(3) };
futures = { data_distribution_test::goodTestFuture(1),
data_distribution_test::goodTestFuture(2),
data_distribution_test::goodTestFuture(3) };
wait(waitForMost(futures, 0, operation_failed(), 0.0)); // Wait for all futures
ASSERT(futures[2].isReady());
}
{
futures = { goodTestFuture(1), goodTestFuture(2), goodTestFuture(3) };
futures = { data_distribution_test::goodTestFuture(1),
data_distribution_test::goodTestFuture(2),
data_distribution_test::goodTestFuture(3) };
wait(waitForMost(futures, 1, operation_failed(), 1.0)); // Wait for slowest future
ASSERT(futures[2].isReady());
}
{
futures = { goodTestFuture(1), goodTestFuture(2), badTestFuture(1, success()) };
futures = { data_distribution_test::goodTestFuture(1),
data_distribution_test::goodTestFuture(2),
data_distribution_test::badTestFuture(1, success()) };
wait(waitForMost(futures, 1, operation_failed(), 1.0)); // Error ignored
}
{
futures = { goodTestFuture(1), goodTestFuture(2), badTestFuture(1, success()) };
futures = { data_distribution_test::goodTestFuture(1),
data_distribution_test::goodTestFuture(2),
data_distribution_test::badTestFuture(1, success()) };
try {
wait(waitForMost(futures, 0, operation_failed(), 1.0));
ASSERT(false);
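Read together, the cases suggest the waitForMost contract: waitForMost(futures, faultTolerance, err, waitMultiplier) returns once all but faultTolerance futures are ready, waits only a multiple of the elapsed time for stragglers, ignores errors in tolerated futures, and throws err when too many fail. A hedged usage sketch under that reading, with work() as a hypothetical stand-in for a future-producing call:

std::vector<Future<ErrorOr<Void>>> fs = { work(1.0), work(2.0), work(3.0) }; // work() is hypothetical
wait(waitForMost(fs, /*faultTolerance=*/1, operation_failed(), /*waitMultiplier=*/0.0));
// Returned without waiting on fs[2], the slowest future.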

View File

@ -41,20 +41,100 @@
typedef Reference<IDataDistributionTeam> ITeamRef;
typedef std::pair<ITeamRef, ITeamRef> SrcDestTeamPair;
// TODO: add a guard to guarantee that no two purposes map to the same priority?
// FIXME: Always use DataMovementReason to invoke these functions.
inline bool isDiskRebalancePriority(int priority) {
return priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM ||
priority == SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM;
}
inline bool isDataMovementForDiskBalancing(DataMovementReason reason) {
return reason == DataMovementReason::REBALANCE_UNDERUTILIZED_TEAM ||
reason == DataMovementReason::REBALANCE_OVERUTILIZED_TEAM;
}
inline bool isDataMovementForReadBalancing(DataMovementReason reason) {
return reason == DataMovementReason::REBALANCE_READ_OVERUTIL_TEAM ||
reason == DataMovementReason::REBALANCE_READ_UNDERUTIL_TEAM;
}
inline bool isMountainChopperPriority(int priority) {
return priority == SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM ||
priority == SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM;
}
inline bool isDataMovementForMountainChopper(DataMovementReason reason) {
return reason == DataMovementReason::REBALANCE_OVERUTILIZED_TEAM ||
reason == DataMovementReason::REBALANCE_READ_OVERUTIL_TEAM;
}
inline bool isValleyFillerPriority(int priority) {
return priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM ||
priority == SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM;
}
inline bool isDataMovementForValleyFiller(DataMovementReason reason) {
return reason == DataMovementReason::REBALANCE_UNDERUTILIZED_TEAM ||
reason == DataMovementReason::REBALANCE_READ_UNDERUTIL_TEAM;
}
int dataMovementPriority(DataMovementReason reason) {
int priority;
switch (reason) {
case DataMovementReason::RECOVER_MOVE:
priority = SERVER_KNOBS->PRIORITY_RECOVER_MOVE;
break;
case DataMovementReason::REBALANCE_UNDERUTILIZED_TEAM:
priority = SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM;
break;
case DataMovementReason::REBALANCE_OVERUTILIZED_TEAM:
priority = SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM;
break;
case DataMovementReason::REBALANCE_READ_OVERUTIL_TEAM:
priority = SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM;
break;
case DataMovementReason::REBALANCE_READ_UNDERUTIL_TEAM:
priority = SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM;
break;
case DataMovementReason::PERPETUAL_STORAGE_WIGGLE:
priority = SERVER_KNOBS->PRIORITY_PERPETUAL_STORAGE_WIGGLE;
break;
case DataMovementReason::TEAM_HEALTHY:
priority = SERVER_KNOBS->PRIORITY_TEAM_HEALTHY;
break;
case DataMovementReason::TEAM_CONTAINS_UNDESIRED_SERVER:
priority = SERVER_KNOBS->PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER;
break;
case DataMovementReason::TEAM_REDUNDANT:
priority = SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT;
break;
case DataMovementReason::MERGE_SHARD:
priority = SERVER_KNOBS->PRIORITY_MERGE_SHARD;
break;
case DataMovementReason::POPULATE_REGION:
priority = SERVER_KNOBS->PRIORITY_POPULATE_REGION;
break;
case DataMovementReason::TEAM_UNHEALTHY:
priority = SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY;
break;
case DataMovementReason::TEAM_2_LEFT:
priority = SERVER_KNOBS->PRIORITY_TEAM_2_LEFT;
break;
case DataMovementReason::TEAM_1_LEFT:
priority = SERVER_KNOBS->PRIORITY_TEAM_1_LEFT;
break;
case DataMovementReason::TEAM_FAILED:
priority = SERVER_KNOBS->PRIORITY_TEAM_FAILED;
break;
case DataMovementReason::TEAM_0_LEFT:
priority = SERVER_KNOBS->PRIORITY_TEAM_0_LEFT;
break;
case DataMovementReason::SPLIT_SHARD:
priority = SERVER_KNOBS->PRIORITY_SPLIT_SHARD;
break;
}
return priority;
}
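With these definitions, BgDDLoadRebalance (below) can take a DataMovementReason and derive its knob-configured priority once, and the reason-based predicates replace the priority-based ones. A small usage sketch under the definitions above:

DataMovementReason reason = DataMovementReason::REBALANCE_READ_OVERUTIL_TEAM;
int ddPriority = dataMovementPriority(reason);
// ddPriority == SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM
ASSERT(isDataMovementForMountainChopper(reason)); // read-overutilized is a mountain-chopper move
ASSERT(isDataMovementForReadBalancing(reason));   // ...and a read-balancing move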
struct RelocateData {
KeyRange keys;
int priority;
@ -1349,7 +1429,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self,
}
if (anyDestOverloaded) {
TEST(true); // Destination overloaded throttled move
CODE_PROBE(true, "Destination overloaded throttled move");
destOverloadedCount++;
TraceEvent(destOverloadedCount > 50 ? SevInfo : SevDebug, "DestSSBusy", distributorId)
.suppressFor(1.0)
@ -1361,7 +1441,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self,
.detail("Servers", destServersString(bestTeams));
wait(delay(SERVER_KNOBS->DEST_OVERLOADED_DELAY, TaskPriority::DataDistributionLaunch));
} else {
TEST(true); // did not find a healthy destination team on the first attempt
CODE_PROBE(true, "did not find a healthy destination team on the first attempt");
stuckCount++;
TraceEvent(stuckCount > 50 ? SevWarnAlways : SevWarn, "BestTeamStuck", distributorId)
.suppressFor(1.0)
@ -1594,7 +1674,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self,
throw error;
}
} else {
TEST(true); // move to removed server
CODE_PROBE(true, "move to removed server");
healthyDestinations.addDataInFlightToTeam(-metrics.bytes);
auto readLoad = metrics.bytesReadPerKSecond;
auto& destinationRef = healthyDestinations;
@ -1842,16 +1922,16 @@ ACTOR Future<SrcDestTeamPair> getSrcDestTeams(DDQueueData* self,
return {};
}
ACTOR Future<Void> BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, int ddPriority) {
ACTOR Future<Void> BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, DataMovementReason reason) {
state int resetCount = SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT;
state Transaction tr(self->cx);
state double lastRead = 0;
state bool skipCurrentLoop = false;
state Future<Void> delayF = Never();
state const bool readRebalance = !isDiskRebalancePriority(ddPriority);
state const bool readRebalance = isDataMovementForReadBalancing(reason);
state const char* eventName =
isMountainChopperPriority(ddPriority) ? "BgDDMountainChopper_New" : "BgDDValleyFiller_New";
isDataMovementForMountainChopper(reason) ? "BgDDMountainChopper_New" : "BgDDValleyFiller_New";
state int ddPriority = dataMovementPriority(reason);
loop {
state bool moved = false;
state Reference<IDataDistributionTeam> sourceTeam;
@ -1899,7 +1979,7 @@ ACTOR Future<Void> BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex,
traceEvent.detail("QueuedRelocations", self->priority_relocations[ddPriority]);
if (self->priority_relocations[ddPriority] < SERVER_KNOBS->DD_REBALANCE_PARALLELISM) {
if (isMountainChopperPriority(ddPriority)) {
if (isDataMovementForMountainChopper(reason)) {
srcReq = GetTeamRequest(WantNewServers::True,
WantTrueBest::True,
PreferLowerDiskUtil::False,
@ -2197,10 +2277,8 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
// balancingFutures.push_back(BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM));
// balancingFutures.push_back(BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM));
if (SERVER_KNOBS->READ_SAMPLING_ENABLED) {
balancingFutures.push_back(
BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM));
balancingFutures.push_back(
BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM));
balancingFutures.push_back(BgDDLoadRebalance(&self, i, DataMovementReason::REBALANCE_READ_OVERUTIL_TEAM));
balancingFutures.push_back(BgDDLoadRebalance(&self, i, DataMovementReason::REBALANCE_READ_UNDERUTIL_TEAM));
}
balancingFutures.push_back(BgDDMountainChopper(&self, i));
balancingFutures.push_back(BgDDValleyFiller(&self, i));

View File

@ -110,7 +110,7 @@ struct DataDistributionTracker {
DataDistributionTracker* operator()() {
if (trackerCancelled) {
TEST(true); // Trying to access DataDistributionTracker after tracker has been cancelled
CODE_PROBE(true, "Trying to access DataDistributionTracker after tracker has been cancelled");
throw dd_tracker_cancelled();
}
return &tracker;
@ -482,7 +482,7 @@ ACTOR Future<Void> shardSplitter(DataDistributionTracker* self,
state BandwidthStatus bandwidthStatus = getBandwidthStatus(metrics);
// Split
TEST(true); // shard to be split
CODE_PROBE(true, "shard to be split");
StorageMetrics splitMetrics;
splitMetrics.bytes = shardBounds.max.bytes / 2;
@ -559,7 +559,7 @@ Future<Void> shardMerger(DataDistributionTracker* self,
auto prevIter = self->shards.rangeContaining(keys.begin);
auto nextIter = self->shards.rangeContaining(keys.begin);
TEST(true); // shard to be merged
CODE_PROBE(true, "shard to be merged");
ASSERT(keys.begin > allKeys.begin);
// This will merge shards both before and after "this" shard in keyspace.
@ -604,7 +604,7 @@ Future<Void> shardMerger(DataDistributionTracker* self,
// on the previous shard changing "size".
if (!newMetrics.present() || shardCount + newMetrics.get().shardCount >= CLIENT_KNOBS->SHARD_COUNT_LIMIT) {
if (shardsMerged == 1) {
TEST(true); // shardMerger cannot merge anything
CODE_PROBE(true, "shardMerger cannot merge anything");
return brokenPromiseToReady(prevIter->value().stats->onChange());
}
@ -797,7 +797,7 @@ void restartShardTrackers(DataDistributionTracker* self, KeyRangeRef keys, Optio
.detail("Keys", keys)
.detail("Size", startingMetrics.get().metrics.bytes)
.detail("Merges", startingMetrics.get().merges);*/
TEST(true); // shardTracker started with trackedBytes already set
CODE_PROBE(true, "shardTracker started with trackedBytes already set");
shardMetrics->set(startingMetrics);
}
@ -903,7 +903,7 @@ ACTOR Future<Void> fetchTopKShardMetrics(DataDistributionTracker* self, GetTopKM
choose {
when(wait(fetchTopKShardMetrics_impl(self, req))) {}
when(wait(delay(SERVER_KNOBS->DD_SHARD_METRICS_TIMEOUT))) {
TEST(true); // TopK DD_SHARD_METRICS_TIMEOUT
CODE_PROBE(true, "TopK DD_SHARD_METRICS_TIMEOUT");
req.reply.send(GetTopKMetricsReply());
}
}
@ -942,7 +942,7 @@ ACTOR Future<Void> fetchShardMetrics(DataDistributionTracker* self, GetMetricsRe
choose {
when(wait(fetchShardMetrics_impl(self, req))) {}
when(wait(delay(SERVER_KNOBS->DD_SHARD_METRICS_TIMEOUT, TaskPriority::DataDistribution))) {
TEST(true); // DD_SHARD_METRICS_TIMEOUT
CODE_PROBE(true, "DD_SHARD_METRICS_TIMEOUT");
StorageMetrics largeMetrics;
largeMetrics.bytes = getMaxShardSize(self->dbSizeEstimate->get());
req.reply.send(largeMetrics);
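Both metrics fetchers guard the reply with a timeout: if the fetch cannot finish within DD_SHARD_METRICS_TIMEOUT, fetchShardMetrics replies with a deliberately oversized estimate so callers err on the side of treating the shard as large rather than blocking. The pattern, condensed from the hunk above:

choose {
	when(wait(fetchShardMetrics_impl(self, req))) {} // normal path replies inside the impl
	when(wait(delay(SERVER_KNOBS->DD_SHARD_METRICS_TIMEOUT, TaskPriority::DataDistribution))) {
		CODE_PROBE(true, "DD_SHARD_METRICS_TIMEOUT");
		StorageMetrics largeMetrics;
		largeMetrics.bytes = getMaxShardSize(self->dbSizeEstimate->get());
		req.reply.send(largeMetrics); // conservative fallback
	}
}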

View File

@ -379,7 +379,7 @@ public:
pageFloor(std::max(self->files[1].size - desiredMaxFileSize, self->fileShrinkBytes));
if ((maxShrink > SERVER_KNOBS->DISK_QUEUE_MAX_TRUNCATE_BYTES) ||
(frivolouslyTruncate && deterministicRandom()->random01() < 0.3)) {
TEST(true); // Replacing DiskQueue file
CODE_PROBE(true, "Replacing DiskQueue file");
TraceEvent("DiskQueueReplaceFile", self->dbgid)
.detail("Filename", self->files[1].f->getFilename())
.detail("OldFileSize", self->files[1].size)
@ -389,7 +389,7 @@ public:
waitfor.push_back(self->files[1].f->truncate(self->fileExtensionBytes));
self->files[1].size = self->fileExtensionBytes;
} else {
TEST(true); // Truncating DiskQueue file
CODE_PROBE(true, "Truncating DiskQueue file");
const int64_t startingSize = self->files[1].size;
self->files[1].size -= std::min(maxShrink, self->files[1].size);
self->files[1].size = std::max(self->files[1].size, self->fileExtensionBytes);
@ -460,12 +460,12 @@ public:
wait(ready);
TEST(pageData.size() > sizeof(Page)); // push more than one page of data
CODE_PROBE(pageData.size() > sizeof(Page), "push more than one page of data");
Future<Void> pushed = wait(self->push(pageData, &syncFiles));
pushing.send(Void());
ASSERT(syncFiles.size() >= 1 && syncFiles.size() <= 2);
TEST(2 == syncFiles.size()); // push spans both files
CODE_PROBE(2 == syncFiles.size(), "push spans both files");
wait(pushed);
delete pageMem;
@ -491,8 +491,8 @@ public:
committed.send(Void());
} catch (Error& e) {
delete pageMem;
TEST(true); // push error
TEST(2 == syncFiles.size()); // push spanning both files error
CODE_PROBE(true, "push error");
CODE_PROBE(2 == syncFiles.size(), "push spanning both files error");
TraceEvent(SevError, "RDQPushAndCommitError", dbgid)
.errorUnsuppressed(e)
.detail("InitialFilename0", filename);
@ -805,7 +805,7 @@ public:
Standalone<StringRef> result = self->readingBuffer.pop_front(sizeof(Page));
return result;
} catch (Error& e) {
TEST(true); // Read next page error
CODE_PROBE(true, "Read next page error");
TraceEvent(SevError, "RDQReadNextPageError", self->dbgid)
.errorUnsuppressed(e)
.detail("File0Name", self->files[0].dbgFilename);
@ -840,8 +840,8 @@ public:
state std::vector<Future<Void>> commits;
state bool swap = file == 0;
TEST(file == 0); // truncate before last read page on file 0
TEST(file == 1 && pos != self->files[1].size); // truncate before last read page on file 1
CODE_PROBE(file == 0, "truncate before last read page on file 0");
CODE_PROBE(file == 1 && pos != self->files[1].size, "truncate before last read page on file 1");
self->readingFile = 2;
self->readingBuffer.clear();
@ -890,10 +890,10 @@ public:
ASSERT(recovered);
uint8_t const* begin = contents.begin();
uint8_t const* end = contents.end();
TEST(contents.size() && pushedPageCount()); // More than one push between commits
CODE_PROBE(contents.size() && pushedPageCount(), "More than one push between commits");
bool pushAtEndOfPage = contents.size() >= 4 && pushedPageCount() && backPage().remainingCapacity() < 4;
TEST(pushAtEndOfPage); // Push right at the end of a page, possibly splitting size
CODE_PROBE(pushAtEndOfPage, "Push right at the end of a page, possibly splitting size");
while (begin != end) {
if (!pushedPageCount() || !backPage().remainingCapacity())
addEmptyPage();
@ -1391,7 +1391,7 @@ private:
int f;
int64_t p;
bool poppedNotDurable = self->lastPoppedSeq / sizeof(Page) != self->poppedSeq / sizeof(Page);
TEST(poppedNotDurable); // DiskQueue: Recovery popped position not fully durable
CODE_PROBE(poppedNotDurable, "DiskQueue: Recovery popped position not fully durable");
self->findPhysicalLocation(self->lastPoppedSeq, &f, &p, "lastPoppedSeq");
wait(self->rawQueue->setPoppedPage(f, p, pageFloor(self->lastPoppedSeq)));
@ -1408,8 +1408,8 @@ private:
self->recovered = true;
ASSERT(self->poppedSeq <= self->endLocation());
TEST(result.size() == 0); // End of queue at border between reads
TEST(result.size() != 0); // Partial read at end of queue
CODE_PROBE(result.size() == 0, "End of queue at border between reads");
CODE_PROBE(result.size() != 0, "Partial read at end of queue");
// The next read location isn't necessarily the end of the last commit, but this is sufficient for helping us
// check an ASSERTion
@ -1628,8 +1628,9 @@ public:
// totally finished
pop(popLocation);
commitFuture = commitFuture && queue->commit();
} else
TEST(true); // No uncommitted data was popped
} else {
CODE_PROBE(true, "No uncommitted data was popped");
}
return commitFuture;
}

View File

@ -166,7 +166,7 @@ class GlobalTagThrottlerImpl {
// wait(tr.watch(tagThrottleSignalKey));
wait(delay(5.0));
TraceEvent("GlobalTagThrottler_ChangeSignaled");
TEST(true); // Global tag throttler detected quota changes
CODE_PROBE(true, "Global tag throttler detected quota changes");
break;
} catch (Error& e) {
TraceEvent("GlobalTagThrottlerMonitoringChangesError", self->id).error(e);

View File

@ -661,14 +661,14 @@ ACTOR Future<Void> sendGrvReplies(Future<GetReadVersionReply> replyFuture,
if (tagItr != priorityThrottledTags.end()) {
if (tagItr->second.expiration > now()) {
if (tagItr->second.tpsRate == std::numeric_limits<double>::max()) {
TEST(true); // Auto TPS rate is unlimited
CODE_PROBE(true, "Auto TPS rate is unlimited");
} else {
TEST(true); // GRV proxy returning tag throttle
CODE_PROBE(true, "GRV proxy returning tag throttle");
reply.tagThrottleInfo[tag.first] = tagItr->second;
}
} else {
// This isn't required, but we might as well
TEST(true); // GRV proxy expiring tag throttle
CODE_PROBE(true, "GRV proxy expiring tag throttle");
priorityThrottledTags.erase(tagItr);
}
}

View File

@ -104,9 +104,9 @@ public:
TraceEvent("KVSMemSwitchingToLargeTransactionMode", id)
.detail("TransactionSize", transactionSize)
.detail("DataSize", committedDataSize);
TEST(true); // KeyValueStoreMemory switching to large transaction mode
TEST(committedDataSize >
1e3); // KeyValueStoreMemory switching to large transaction mode with committed data
CODE_PROBE(true, "KeyValueStoreMemory switching to large transaction mode");
CODE_PROBE(committedDataSize > 1e3,
"KeyValueStoreMemory switching to large transaction mode with committed data");
}
int64_t bytesWritten = commit_queue(queue, true);
@ -506,6 +506,12 @@ private:
OpHeader* h,
bool* isZeroFilled,
int* zeroFillSize) {
// Metadata op types to be excluded from encryption.
static std::unordered_set<OpType> metaOps = { OpSnapshotEnd, OpSnapshotAbort, OpCommit, OpRollback };
if (metaOps.count((OpType)h->op) == 0) {
// It is not supported to open an encrypted store as unencrypted, or vice-versa.
ASSERT_EQ(h->op == OpEncrypted, self->enableEncryption);
}
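The added guard refuses to replay a log whose encryption mode disagrees with the store's configuration, exempting the metadata ops that are never encrypted. A sketch of the decode step it protects, with readHeader as a hypothetical helper standing in for the surrounding actor code (field names op, len1, len2 come from the diff):

OpHeader h = readHeader(log); // hypothetical; the real code reads via self->log->readNext()
if (metaOps.count((OpType)h.op) == 0) {
	// Opening an encrypted store as unencrypted, or vice versa, is unsupported.
	ASSERT_EQ(h.op == OpEncrypted, self->enableEncryption);
}
int remainingBytes = h.len1 + h.len2 + 1; // both payload lengths plus one trailing byte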
state int remainingBytes = h->len1 + h->len2 + 1;
if (h->op == OpEncrypted) {
// encryption header, plus the real (encrypted) op type
@ -568,7 +574,7 @@ private:
Standalone<StringRef> data = wait(self->log->readNext(sizeof(OpHeader)));
if (data.size() != sizeof(OpHeader)) {
if (data.size()) {
TEST(true); // zero fill partial header in KeyValueStoreMemory
CODE_PROBE(true, "zero fill partial header in KeyValueStoreMemory");
memset(&h, 0, sizeof(OpHeader));
memcpy(&h, data.begin(), data.size());
zeroFillSize = sizeof(OpHeader) - data.size() + h.len1 + h.len2 + 1;
@ -699,7 +705,7 @@ private:
ASSERT(false);
}
TEST(true); // Fixing a partial commit at the end of the KeyValueStoreMemory log
CODE_PROBE(true, "Fixing a partial commit at the end of the KeyValueStoreMemory log");
for (int i = 0; i < zeroFillSize; i++)
self->log->push(StringRef((const uint8_t*)"", 1));
}

View File

@ -741,7 +741,8 @@ ACTOR Future<Void> rocksDBMetricLogger(UID id,
std::shared_ptr<PerfContextMetrics> perfContextMetrics,
rocksdb::DB* db,
std::shared_ptr<ReadIteratorPool> readIterPool,
Counters* counters) {
Counters* counters,
CF cf) {
state std::vector<std::tuple<const char*, uint32_t, uint64_t>> tickerStats = {
{ "StallMicros", rocksdb::STALL_MICROS, 0 },
{ "BytesRead", rocksdb::BYTES_READ, 0 },
@ -779,7 +780,7 @@ ACTOR Future<Void> rocksDBMetricLogger(UID id,
{ "CountIterSkippedKeys", rocksdb::NUMBER_ITER_SKIP, 0 },
};
state std::vector<std::pair<const char*, std::string>> propertyStats = {
state std::vector<std::pair<const char*, std::string>> intPropertyStats = {
{ "NumImmutableMemtables", rocksdb::DB::Properties::kNumImmutableMemTable },
{ "NumImmutableMemtablesFlushed", rocksdb::DB::Properties::kNumImmutableMemTableFlushed },
{ "IsMemtableFlushPending", rocksdb::DB::Properties::kMemTableFlushPending },
@ -807,6 +808,14 @@ ACTOR Future<Void> rocksDBMetricLogger(UID id,
{ "LiveSstFilesSize", rocksdb::DB::Properties::kLiveSstFilesSize },
};
state std::vector<std::pair<const char*, std::string>> strPropertyStats = {
{ "LevelStats", rocksdb::DB::Properties::kLevelStats },
};
state std::vector<std::pair<const char*, std::string>> levelStrPropertyStats = {
{ "CompressionRatioAtLevel", rocksdb::DB::Properties::kCompressionRatioAtLevelPrefix },
};
state std::unordered_map<std::string, uint64_t> readIteratorPoolStats = {
{ "NumReadIteratorsCreated", 0 },
{ "NumTimesReadIteratorsReused", 0 },
@ -816,21 +825,40 @@ ACTOR Future<Void> rocksDBMetricLogger(UID id,
wait(delay(SERVER_KNOBS->ROCKSDB_METRICS_DELAY));
TraceEvent e("RocksDBMetrics", id);
uint64_t stat;
for (auto& t : tickerStats) {
auto& [name, ticker, cum] = t;
for (auto& [name, ticker, cum] : tickerStats) {
stat = statistics->getTickerCount(ticker);
e.detail(name, stat - cum);
cum = stat;
}
for (auto& p : propertyStats) {
auto& [name, property] = p;
for (const auto& [name, property] : intPropertyStats) {
stat = 0;
// GetAggregatedIntProperty gets the aggregated int property from all column families.
ASSERT(db->GetAggregatedIntProperty(property, &stat));
e.detail(name, stat);
}
std::string propValue;
for (const auto& [name, property] : strPropertyStats) {
propValue = "";
ASSERT(db->GetProperty(cf, property, &propValue));
e.detail(name, propValue);
}
rocksdb::ColumnFamilyMetaData cf_meta_data;
db->GetColumnFamilyMetaData(cf, &cf_meta_data);
int numLevels = static_cast<int>(cf_meta_data.levels.size());
std::string levelProp;
for (const auto& [name, property] : levelStrPropertyStats) {
levelProp = "";
for (int level = 0; level < numLevels; level++) {
propValue = "";
ASSERT(db->GetProperty(cf, property + std::to_string(level), &propValue));
levelProp += std::to_string(level) + ":" + propValue + (level == numLevels - 1 ? "" : ",");
}
e.detail(name, levelProp);
}
stat = readIterPool->numReadIteratorsCreated();
e.detail("NumReadIteratorsCreated", stat - readIteratorPoolStats["NumReadIteratorsCreated"]);
readIteratorPoolStats["NumReadIteratorsCreated"] = stat;
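The metrics logger now also reads string-valued properties, both whole-column-family (kLevelStats) and per-level (compression ratio), against the column family handle added to the signature. A standalone sketch of the per-level query using the real RocksDB API (the accumulated output shape is illustrative):

std::string value;
for (int level = 0; level < numLevels; level++) {
	// kCompressionRatioAtLevelPrefix expects the level number appended.
	db->GetProperty(cf, rocksdb::DB::Properties::kCompressionRatioAtLevelPrefix + std::to_string(level), &value);
	// Accumulated above as e.g. "0:1.0,1:2.1,2:2.4" in the trace event (ratios invented for illustration).
}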
@ -1009,13 +1037,13 @@ struct RocksDBKeyValueStore : IKeyValueStore {
// The current thread and main thread are same when the code runs in simulation.
// blockUntilReady() is getting the thread into deadlock state, so directly calling
// the metricsLogger.
a.metrics =
rocksDBMetricLogger(id, options.statistics, perfContextMetrics, db, readIterPool, &a.counters) &&
a.metrics = rocksDBMetricLogger(
id, options.statistics, perfContextMetrics, db, readIterPool, &a.counters, cf) &&
flowLockLogger(id, a.readLock, a.fetchLock) && refreshReadIteratorPool(readIterPool);
} else {
onMainThread([&] {
a.metrics = rocksDBMetricLogger(
id, options.statistics, perfContextMetrics, db, readIterPool, &a.counters) &&
id, options.statistics, perfContextMetrics, db, readIterPool, &a.counters, cf) &&
flowLockLogger(id, a.readLock, a.fetchLock) && refreshReadIteratorPool(readIterPool);
return Future<bool>(true);
}).blockUntilReady();

View File

@ -117,7 +117,7 @@ struct PageChecksumCodec {
crc32Sum.part1 = 0;
crc32Sum.part2 = crc32c_append(0xfdbeefdb, static_cast<uint8_t*>(data), dataLen);
if (crc32Sum == *pSumInPage) {
TEST(true); // Read CRC32 checksum
CODE_PROBE(true, "Read CRC32 checksum");
return true;
}
}
@ -133,7 +133,7 @@ struct PageChecksumCodec {
xxHash3Sum.part1 = static_cast<uint32_t>((xxHash3 >> 32) & 0x00ffffff);
xxHash3Sum.part2 = static_cast<uint32_t>(xxHash3 & 0xffffffff);
if (xxHash3Sum == *pSumInPage) {
TEST(true); // Read xxHash3 checksum
CODE_PROBE(true, "Read xxHash3 checksum");
return true;
}
}
@ -144,7 +144,7 @@ struct PageChecksumCodec {
hashLittle2Sum.part2 = 0x5ca1ab1e;
hashlittle2(pData, dataLen, &hashLittle2Sum.part1, &hashLittle2Sum.part2);
if (hashLittle2Sum == *pSumInPage) {
TEST(true); // Read HashLittle2 checksum
CODE_PROBE(true, "Read HashLittle2 checksum");
return true;
}
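The read path above verifies a page against each checksum format FDB has historically written, CRC32, xxHash3, and the legacy hashLittle2, so files written by older versions remain readable while the write path uses a single current format. A condensed sketch of that cascade; the matchesX helpers are hypothetical stand-ins for the inline computations above, and the real code gates some branches on header bits:

bool verifyPage(const void* data, int dataLen, const SumType* sumInPage) {
	return matchesCrc32(data, dataLen, sumInPage)        // crc32c seeded with 0xfdbeefdb
	    || matchesXxHash3(data, dataLen, sumInPage)      // newer write format
	    || matchesHashLittle2(data, dataLen, sumInPage); // sqlite-era legacy format
}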
@ -357,7 +357,7 @@ struct SQLiteDB : NonCopyable {
lineStart = lineEnd;
}
}
TEST(true); // BTree integrity checked
CODE_PROBE(true, "BTree integrity checked");
}
if (e)
sqlite3_free(e);
@ -1423,7 +1423,7 @@ void SQLiteDB::open(bool writable) {
renameFile(walpath, walpath + "-old-" + deterministicRandom()->randomUniqueID().toString());
ASSERT_WE_THINK(false); //< This code should not be hit in FoundationDB at the moment, because worker looks
// for databases to open by listing .fdb files, not .fdb-wal files
// TEST(true); // Replace a partially constructed or destructed DB
// CODE_PROBE(true, "Replace a partially constructed or destructed DB");
}
if (dbFile.isError() && walFile.isError() && writable &&
@ -1942,8 +1942,8 @@ private:
}
if (canDelete && (!canVacuum || deterministicRandom()->random01() < lazyDeleteBatchProbability)) {
TEST(canVacuum); // SQLite lazy deletion when vacuuming is active
TEST(!canVacuum); // SQLite lazy deletion when vacuuming is inactive
CODE_PROBE(canVacuum, "SQLite lazy deletion when vacuuming is active");
CODE_PROBE(!canVacuum, "SQLite lazy deletion when vacuuming is inactive");
int pagesToDelete = std::max(
1,
@ -1955,10 +1955,10 @@ private:
lazyDeleteTime += now() - begin;
} else {
ASSERT(canVacuum);
TEST(canDelete); // SQLite vacuuming when lazy delete is active
TEST(!canDelete); // SQLite vacuuming when lazy delete is inactive
TEST(SERVER_KNOBS->SPRING_CLEANING_VACUUMS_PER_LAZY_DELETE_PAGE !=
0); // SQLite vacuuming with nonzero vacuums_per_lazy_delete_page
CODE_PROBE(canDelete, "SQLite vacuuming when lazy delete is active");
CODE_PROBE(!canDelete, "SQLite vacuuming when lazy delete is inactive");
CODE_PROBE(SERVER_KNOBS->SPRING_CLEANING_VACUUMS_PER_LAZY_DELETE_PAGE != 0,
"SQLite vacuuming with nonzero vacuums_per_lazy_delete_page");
vacuumFinished = conn.vacuum();
if (!vacuumFinished) {
@ -1973,10 +1973,10 @@ private:
freeListPages = conn.freePages();
TEST(workPerformed.lazyDeletePages > 0); // Pages lazily deleted
TEST(workPerformed.vacuumedPages > 0); // Pages vacuumed
TEST(vacuumTime > 0); // Time spent vacuuming
TEST(lazyDeleteTime > 0); // Time spent lazy deleting
CODE_PROBE(workPerformed.lazyDeletePages > 0, "Pages lazily deleted");
CODE_PROBE(workPerformed.vacuumedPages > 0, "Pages vacuumed");
CODE_PROBE(vacuumTime > 0, "Time spent vacuuming");
CODE_PROBE(lazyDeleteTime > 0, "Time spent lazy deleting");
++springCleaningStats.springCleaningCount;
springCleaningStats.lazyDeletePages += workPerformed.lazyDeletePages;


View File

choose {
when(wait(nomineeChange.onTrigger())) {}
when(wait(badCandidateTimeout.isValid() ? badCandidateTimeout : Never())) {
TEST(true); // Bad candidate timeout
CODE_PROBE(true, "Bad candidate timeout");
TraceEvent("LeaderBadCandidateTimeout", myInfo.changeID).log();
break;
}

View File

@ -65,7 +65,7 @@ class LocalConfigurationImpl {
configClassToKnobToValue[configPath.back()] = {};
}
} else {
TEST(true); // Invalid configuration path
CODE_PROBE(true, "Invalid configuration path");
if (!g_network->isSimulated()) {
fprintf(stderr, "WARNING: Invalid configuration path: `%s'\n", paramString.c_str());
}
@ -88,7 +88,7 @@ class LocalConfigurationImpl {
knobCollection.setKnob(knobName.toString(), knobValue);
} catch (Error& e) {
if (e.code() == error_code_invalid_option_value) {
TEST(true); // invalid knob in configuration database
CODE_PROBE(true, "invalid knob in configuration database");
TraceEvent(SevWarnAlways, "InvalidKnobOptionValue")
.detail("KnobName", knobName)
.detail("KnobValue", knobValue.toString());
@ -126,10 +126,10 @@ class LocalConfigurationImpl {
this->overrides[stringToKeyRef(knobName)] = knobValue;
} catch (Error& e) {
if (e.code() == error_code_invalid_option) {
TEST(true); // Attempted to manually set invalid knob option
CODE_PROBE(true, "Attempted to manually set invalid knob option");
TraceEvent(SevWarnAlways, "UnrecognizedKnobOption").detail("Knob", printable(knobName));
} else if (e.code() == error_code_invalid_option_value) {
TEST(true); // Invalid manually set knob value
CODE_PROBE(true, "Invalid manually set knob value");
TraceEvent(SevWarnAlways, "InvalidKnobValue")
.detail("Knob", printable(knobName))
.detail("Value", printable(knobValueString));
@ -198,7 +198,7 @@ class LocalConfigurationImpl {
state ConfigKnobOverrides storedConfigPath =
BinaryReader::fromStringRef<ConfigKnobOverrides>(storedConfigPathValue.get(), IncludeVersion());
if (!storedConfigPath.hasSameConfigPath(self->configKnobOverrides)) {
TEST(true); // All local information is outdated
CODE_PROBE(true, "All local information is outdated");
wait(clearKVStore(self));
wait(saveConfigPath(self));
self->updateInMemoryState(lastSeenVersion);

View File

@ -592,7 +592,7 @@ Future<Void> logRouterPeekMessages(PromiseType replyPromise,
}
if (sequenceData.isSet()) {
if (sequenceData.getFuture().get().first != reply.end) {
TEST(true); // tlog peek second attempt ended at a different version
CODE_PROBE(true, "tlog peek second attempt ended at a different version");
replyPromise.sendError(operation_obsolete());
return Void();
}

View File

@ -290,7 +290,7 @@ void LogPushData::addTxsTag() {
}
void LogPushData::addTransactionInfo(SpanContext const& context) {
TEST(!spanContext.isValid()); // addTransactionInfo with invalid SpanContext
CODE_PROBE(!spanContext.isValid(), "addTransactionInfo with invalid SpanContext");
spanContext = context;
writtenLocations.clear();
}
@ -352,7 +352,7 @@ bool LogPushData::writeTransactionInfo(int location, uint32_t subseq) {
return false;
}
TEST(true); // Wrote SpanContextMessage to a transaction log
CODE_PROBE(true, "Wrote SpanContextMessage to a transaction log");
writtenLocations.insert(location);
BinaryWriter& wr = messagesWriter[location];
@ -375,10 +375,10 @@ bool LogPushData::writeTransactionInfo(int location, uint32_t subseq) {
// parent->child.
SpanContextMessage contextMessage;
if (spanContext.isSampled()) {
TEST(true); // Converting OTELSpanContextMessage to traced SpanContextMessage
CODE_PROBE(true, "Converting OTELSpanContextMessage to traced SpanContextMessage");
contextMessage = SpanContextMessage(UID(spanContext.traceID.first(), spanContext.traceID.second()));
} else {
TEST(true); // Converting OTELSpanContextMessage to untraced SpanContextMessage
CODE_PROBE(true, "Converting OTELSpanContextMessage to untraced SpanContextMessage");
contextMessage = SpanContextMessage(UID(0, 0));
}
wr << contextMessage;
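The conversion above folds an OpenTelemetry span context into the legacy SpanContextMessage when a peer cannot decode the newer format: only sampled spans keep their trace id, everything else is marked untraced. The downgrade rule, condensed:

SpanContextMessage legacy =
    spanContext.isSampled()
        ? SpanContextMessage(UID(spanContext.traceID.first(), spanContext.traceID.second()))
        : SpanContextMessage(UID(0, 0)); // untraced marker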

View File

@ -101,7 +101,7 @@ public:
TraceEvent(SevWarnAlways, "DiskQueueAdapterReset")
.detail("Version", self->cursor->popped())
.detail("PeekTypeSwitch", self->peekTypeSwitches % 3);
TEST(true); // disk adapter reset
CODE_PROBE(true, "disk adapter reset");
if (self->cursor->popped() != 0) {
self->recoveryLoc = self->cursor->popped();
} else {

View File

@ -317,7 +317,7 @@ ACTOR Future<Void> serverPeekParallelGetMore(ILogSystem::ServerPeekCursor* self,
//
// A cursor for a log router can be delayed indefinitely during a network partition, so only fail
// simulation tests sufficiently far after we finish simulating network partitions.
TEST(e.code() == error_code_timed_out); // peek cursor timed out
CODE_PROBE(e.code() == error_code_timed_out, "peek cursor timed out");
if (now() >= FLOW_KNOBS->SIM_SPEEDUP_AFTER_SECONDS + SERVER_KNOBS->PEEK_TRACKER_EXPIRATION_TIME) {
ASSERT_WE_THINK(e.code() == error_code_operation_obsolete ||
SERVER_KNOBS->PEEK_TRACKER_EXPIRATION_TIME < 10);
@ -653,7 +653,7 @@ void ILogSystem::MergedPeekCursor::updateMessage(bool usePolicy) {
c->advanceTo(messageVersion);
if (start <= messageVersion && messageVersion < c->version()) {
advancedPast = true;
TEST(true); // Merge peek cursor advanced past desired sequence
CODE_PROBE(true, "Merge peek cursor advanced past desired sequence");
}
}
@ -965,7 +965,7 @@ void ILogSystem::SetPeekCursor::updateMessage(int logIdx, bool usePolicy) {
c->advanceTo(messageVersion);
if (start <= messageVersion && messageVersion < c->version()) {
advancedPast = true;
TEST(true); // Merge peek cursor with logIdx advanced past desired sequence
CODE_PROBE(true, "Merge peek cursor with logIdx advanced past desired sequence");
}
}
}

View File

@ -217,7 +217,7 @@ ACTOR Future<MoveKeysLock> takeMoveKeysLock(Database cx, UID ddId) {
return lock;
} catch (Error& e) {
wait(tr.onError(e));
TEST(true); // takeMoveKeysLock retry
CODE_PROBE(true, "takeMoveKeysLock retry");
}
}
}
@ -239,7 +239,7 @@ ACTOR static Future<Void> checkMoveKeysLock(Transaction* tr,
Optional<Value> readVal = wait(tr->get(moveKeysLockWriteKey));
UID lastWrite = readVal.present() ? BinaryReader::fromStringRef<UID>(readVal.get(), Unversioned()) : UID();
if (lastWrite != lock.prevWrite) {
TEST(true); // checkMoveKeysLock: Conflict with previous owner
CODE_PROBE(true, "checkMoveKeysLock: Conflict with previous owner");
throw movekeys_conflict();
}
@ -272,7 +272,7 @@ ACTOR static Future<Void> checkMoveKeysLock(Transaction* tr,
return Void();
} else {
TEST(true); // checkMoveKeysLock: Conflict with new owner
CODE_PROBE(true, "checkMoveKeysLock: Conflict with new owner");
throw movekeys_conflict();
}
}
@ -591,7 +591,7 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
// This process can be split up into multiple transactions if there are too many existing overlapping shards
// In that case, each iteration of this loop will have begin set to the end of the last processed shard
while (begin < keys.end) {
TEST(begin > keys.begin); // Multi-transactional startMoveKeys
CODE_PROBE(begin > keys.begin, "Multi-transactional startMoveKeys");
batches++;
// RYW to optimize re-reading the same key ranges
@ -631,7 +631,7 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
// Attempt to move onto a server that isn't in serverList (removed or never added to the
// database). This can happen (why?) and is handled by the data distribution algorithm.
// FIXME: Answer why this can happen?
TEST(true); // start move keys moving to a removed server
CODE_PROBE(true, "start move keys moving to a removed server");
throw move_to_removed_server();
}
}
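startMoveKeys splits a large range move across transactions when it overlaps too many existing shards; begin advances to the end of the last shard processed, and the probe fires whenever a later iteration runs. The loop's shape, sketched and simplified from the hunks above:

state Key begin = keys.begin;
while (begin < keys.end) {
	CODE_PROBE(begin > keys.begin, "Multi-transactional startMoveKeys");
	// ... assign as many overlapping shards as fit in one transaction ...
	// begin = end of the last shard handled this iteration, so the next
	// transaction resumes exactly where this one stopped.
}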
@ -825,7 +825,7 @@ ACTOR Future<Void> checkFetchingState(Database cx,
for (int s = 0; s < serverListValues.size(); s++) {
if (!serverListValues[s].present()) {
// FIXME: Is this the right behavior? dataMovementComplete will never be sent!
TEST(true); // check fetching state moved to removed server
CODE_PROBE(true, "check fetching state moved to removed server");
throw move_to_removed_server();
}
auto si = decodeServerListValue(serverListValues[s].get());
@ -897,7 +897,7 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
// This process can be split up into multiple transactions if there are too many existing overlapping shards
// In that case, each iteration of this loop will have begin set to the end of the last processed shard
while (begin < keys.end) {
TEST(begin > keys.begin); // Multi-transactional finishMoveKeys
CODE_PROBE(begin > keys.begin, "Multi-transactional finishMoveKeys");
state Transaction tr(occ);
@ -994,7 +994,8 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
} else if (alreadyMoved) {
dest.clear();
src.clear();
TEST(true); // FinishMoveKeys first key in iteration sub-range has already been processed
CODE_PROBE(true,
"FinishMoveKeys first key in iteration sub-range has already been processed");
}
}
@ -1029,8 +1030,9 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
}
}
if (!dest.size()) {
TEST(true); // A previous finishMoveKeys for this range committed just as it was cancelled to
// start this one?
CODE_PROBE(true,
"A previous finishMoveKeys for this range committed just as it was cancelled to "
"start this one?");
TraceEvent("FinishMoveKeysNothingToDo", relocationIntervalId)
.detail("KeyBegin", keys.begin)
.detail("KeyEnd", keys.end)
@ -1394,7 +1396,6 @@ ACTOR static Future<Void> startMoveShards(Database occ,
physicalShardMap[ssId].emplace_back(rangeIntersectKeys, srcId);
}
const UID checkpontId = deterministicRandom()->randomUniqueID();
for (const UID& ssId : src) {
dataMove.src.insert(ssId);
// TODO(psm): Create checkpoint for the range.
@ -2021,8 +2022,9 @@ ACTOR Future<Void> removeStorageServer(Database cx,
state bool canRemove = wait(canRemoveStorageServer(tr, serverID));
if (!canRemove) {
TEST(true); // The caller had a transaction in flight that assigned keys to the server. Wait for it to
// reverse its mistake.
CODE_PROBE(true,
"The caller had a transaction in flight that assigned keys to the server. Wait for it to "
"reverse its mistake.");
TraceEvent(SevWarn, "NoCanRemove").detail("Count", noCanRemoveCount++).detail("ServerID", serverID);
wait(delayJittered(SERVER_KNOBS->REMOVE_RETRY_DELAY, TaskPriority::DataDistributionLaunch));
tr->reset();
@ -2039,7 +2041,7 @@ ACTOR Future<Void> removeStorageServer(Database cx,
if (!fListKey.get().present()) {
if (retry) {
TEST(true); // Storage server already removed after retrying transaction
CODE_PROBE(true, "Storage server already removed after retrying transaction");
return Void();
}
TraceEvent(SevError, "RemoveInvalidServer").detail("ServerID", serverID);

View File

@ -99,7 +99,7 @@ TraceEvent debugTagsAndMessageEnabled(const char* context, Version version, Stri
SpanContextMessage scm;
br >> scm;
} else if (OTELSpanContextMessage::startsOTELSpanContextMessage(mutationType)) {
TEST(true); // MutationTracking reading OTELSpanContextMessage
CODE_PROBE(true, "MutationTracking reading OTELSpanContextMessage");
BinaryReader br(mutationData, AssumeVersion(rdr.protocolVersion()));
OTELSpanContextMessage scm;
br >> scm;

View File

@ -182,7 +182,7 @@ private:
Standalone<StringRef> h = wait(self->queue->readNext(sizeof(uint32_t)));
if (h.size() != sizeof(uint32_t)) {
if (h.size()) {
TEST(true); // Zero fill within size field
CODE_PROBE(true, "Zero fill within size field");
int payloadSize = 0;
memcpy(&payloadSize, h.begin(), h.size());
zeroFillSize = sizeof(uint32_t) - h.size(); // zero fill the size itself
@ -196,7 +196,7 @@ private:
Standalone<StringRef> e = wait(self->queue->readNext(payloadSize + 1));
if (e.size() != payloadSize + 1) {
TEST(true); // Zero fill within payload
CODE_PROBE(true, "Zero fill within payload");
zeroFillSize = payloadSize + 1 - e.size();
break;
}
@ -210,7 +210,7 @@ private:
}
}
if (zeroFillSize) {
TEST(true); // Fixing a partial commit at the end of the tlog queue
CODE_PROBE(true, "Fixing a partial commit at the end of the tlog queue");
for (int i = 0; i < zeroFillSize; i++)
self->queue->push(StringRef((const uint8_t*)"", 1));
}
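The TLog queue frames each record as a uint32 length, the payload, and one trailing validity byte; when recovery reads a torn frame, it pushes zero bytes until the frame is complete, so the next pass parses a well-formed but invalid record and stops cleanly at the tear point. The same pattern recurs in each TLog version below. The padding arithmetic, extracted into a hypothetical helper for clarity (the real logic is inlined in the recovery loop above):

int tornFramePadding(int headerBytesRead, int payloadSize, int payloadBytesRead) {
	if (headerBytesRead < (int)sizeof(uint32_t)) // torn inside the length field
		return int(sizeof(uint32_t)) - headerBytesRead + payloadSize + 1;
	return payloadSize + 1 - payloadBytesRead;   // torn inside the payload or validity byte
}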
@ -507,9 +507,9 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
ACTOR Future<Void> tLogLock(TLogData* self, ReplyPromise<TLogLockResult> reply, Reference<LogData> logData) {
state Version stopVersion = logData->version.get();
TEST(true); // TLog stopped by recovering master
TEST(logData->stopped); // LogData already stopped
TEST(!logData->stopped); // LogData not yet stopped
CODE_PROBE(true, "TLog stopped by recovering master");
CODE_PROBE(logData->stopped, "LogData already stopped");
CODE_PROBE(!logData->stopped, "LogData not yet stopped");
TraceEvent("TLogStop", logData->logId)
.detail("Ver", stopVersion)
@ -611,7 +611,7 @@ ACTOR Future<Void> updatePersistentData(TLogData* self, Reference<LogData> logDa
// Now that the changes we made to persistentData are durable, erase the data we moved from memory and the queue,
// increase bytesDurable accordingly, and update persistentDataDurableVersion.
TEST(anyData); // TLog moved data to persistentData
CODE_PROBE(anyData, "TLog moved data to persistentData");
logData->persistentDataDurableVersion = newPersistentDataVersion;
for (tag = logData->tag_data.begin(); tag != logData->tag_data.end(); ++tag) {
@ -834,7 +834,7 @@ void commitMessages(Reference<LogData> self,
// Fill up the rest of this block
int bytes = (uint8_t*)r.getLengthPtr() - messages.begin();
if (bytes) {
TEST(true); // Splitting commit messages across multiple blocks
CODE_PROBE(true, "Splitting commit messages across multiple blocks");
messages1 = StringRef(block.end(), bytes);
block.append(block.arena(), messages.begin(), bytes);
self->messageBlocks.emplace_back(version, block);
@ -1047,7 +1047,7 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
}
if (sequenceData.isSet()) {
if (sequenceData.getFuture().get() != rep.end) {
TEST(true); // tlog peek second attempt ended at a different version
CODE_PROBE(true, "tlog peek second attempt ended at a different version");
replyPromise.sendError(operation_obsolete());
return Void();
}
@ -1120,7 +1120,7 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
auto& sequenceData = trackerData.sequence_version[sequence + 1];
if (sequenceData.isSet()) {
if (sequenceData.getFuture().get() != reply.end) {
TEST(true); // tlog peek second attempt ended at a different version (2)
CODE_PROBE(true, "tlog peek second attempt ended at a different version (2)");
replyPromise.sendError(operation_obsolete());
return Void();
}
@ -1467,7 +1467,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self, LocalityData locality)
if (!fFormat.get().present()) {
RangeResult v = wait(self->persistentData->readRange(KeyRangeRef(StringRef(), LiteralStringRef("\xff")), 1));
if (!v.size()) {
TEST(true); // The DB is completely empty, so it was never initialized. Delete it.
CODE_PROBE(true, "The DB is completely empty, so it was never initialized. Delete it.");
throw worker_removed();
} else {
// This should never happen
@ -1553,7 +1553,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self, LocalityData locality)
try {
loop {
if (allRemoved.isReady()) {
TEST(true); // all tlogs removed during queue recovery
CODE_PROBE(true, "all tlogs removed during queue recovery");
throw worker_removed();
}
choose {
@ -1586,7 +1586,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self, LocalityData locality)
logData->queueCommittedVersion.set(qe.version);
while (self->bytesInput - self->bytesDurable >= recoverMemoryLimit) {
TEST(true); // Flush excess data during TLog queue recovery
CODE_PROBE(true, "Flush excess data during TLog queue recovery");
TraceEvent("FlushLargeQueueDuringRecovery", self->dbgid)
.detail("BytesInput", self->bytesInput)
.detail("BytesDurable", self->bytesDurable)
@ -1610,7 +1610,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self, LocalityData locality)
}
TraceEvent("TLogRestorePersistentStateDone", self->dbgid).detail("Took", now() - startt);
TEST(now() - startt >= 1.0); // TLog recovery took more than 1 second
CODE_PROBE(now() - startt >= 1.0, "TLog recovery took more than 1 second");
for (auto it : self->id_data) {
if (it.second->queueCommittedVersion.get() == 0) {

View File

@ -148,7 +148,7 @@ private:
Standalone<StringRef> h = wait(self->queue->readNext(sizeof(uint32_t)));
if (h.size() != sizeof(uint32_t)) {
if (h.size()) {
TEST(true); // Zero fill within size field
CODE_PROBE(true, "Zero fill within size field");
int payloadSize = 0;
memcpy(&payloadSize, h.begin(), h.size());
zeroFillSize = sizeof(uint32_t) - h.size(); // zero fill the size itself
@ -162,7 +162,7 @@ private:
Standalone<StringRef> e = wait(self->queue->readNext(payloadSize + 1));
if (e.size() != payloadSize + 1) {
TEST(true); // Zero fill within payload
CODE_PROBE(true, "Zero fill within payload");
zeroFillSize = payloadSize + 1 - e.size();
break;
}
@ -176,7 +176,7 @@ private:
}
}
if (zeroFillSize) {
TEST(true); // Fixing a partial commit at the end of the tlog queue
CODE_PROBE(true, "Fixing a partial commit at the end of the tlog queue");
for (int i = 0; i < zeroFillSize; i++)
self->queue->push(StringRef((const uint8_t*)"", 1));
}
@ -653,9 +653,9 @@ void TLogQueue::updateVersionSizes(const TLogQueueEntry& result, TLogData* tLog)
ACTOR Future<Void> tLogLock(TLogData* self, ReplyPromise<TLogLockResult> reply, Reference<LogData> logData) {
state Version stopVersion = logData->version.get();
TEST(true); // TLog stopped by recovering master
TEST(logData->stopped); // logData already stopped
TEST(!logData->stopped); // logData not yet stopped
CODE_PROBE(true, "TLog stopped by recovering master");
CODE_PROBE(logData->stopped, "logData already stopped");
CODE_PROBE(!logData->stopped, "logData not yet stopped");
TraceEvent("TLogStop", logData->logId)
.detail("Ver", stopVersion)
@ -769,7 +769,7 @@ ACTOR Future<Void> updatePersistentData(TLogData* self, Reference<LogData> logDa
// Now that the changes we made to persistentData are durable, erase the data we moved from memory and the queue,
// increase bytesDurable accordingly, and update persistentDataDurableVersion.
TEST(anyData); // TLog moved data to persistentData
CODE_PROBE(anyData, "TLog moved data to persistentData");
logData->persistentDataDurableVersion = newPersistentDataVersion;
for (tagLocality = 0; tagLocality < logData->tag_data.size(); tagLocality++) {
@ -1341,7 +1341,7 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
}
if (sequenceData.isSet()) {
if (sequenceData.getFuture().get().first != rep.end) {
TEST(true); // tlog peek second attempt ended at a different version
CODE_PROBE(true, "tlog peek second attempt ended at a different version");
replyPromise.sendError(operation_obsolete());
return Void();
}
@ -1439,7 +1439,7 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
if (sequenceData.isSet()) {
trackerData.duplicatePeeks++;
if (sequenceData.getFuture().get().first != reply.end) {
TEST(true); // tlog peek second attempt ended at a different version (2)
CODE_PROBE(true, "tlog peek second attempt ended at a different version (2)");
replyPromise.sendError(operation_obsolete());
return Void();
}
@ -1546,7 +1546,7 @@ ACTOR Future<Void> doQueueCommit(TLogData* self,
.detail("LogId", logData->logId)
.detail("Version", it->version.get())
.detail("QueueVer", it->queueCommittedVersion.get());
TEST(true); // A TLog was replaced before having a chance to commit its queue
CODE_PROBE(true, "A TLog was replaced before having a chance to commit its queue");
it->queueCommittedVersion.set(it->version.get());
}
return Void();
@ -2007,7 +2007,7 @@ ACTOR Future<Void> serveTLogInterface(TLogData* self,
when(TLogCommitRequest req = waitNext(tli.commit.getFuture())) {
//TraceEvent("TLogCommitReq", logData->logId).detail("Ver", req.version).detail("PrevVer", req.prevVersion).detail("LogVer", logData->version.get());
ASSERT(logData->isPrimary);
TEST(logData->stopped); // TLogCommitRequest while stopped
CODE_PROBE(logData->stopped, "TLogCommitRequest while stopped");
if (!logData->stopped)
logData->addActor.send(tLogCommit(self, req, logData, warningCollectorInput));
else
@ -2333,7 +2333,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
if (!fFormat.get().present()) {
RangeResult v = wait(self->persistentData->readRange(KeyRangeRef(StringRef(), LiteralStringRef("\xff")), 1));
if (!v.size()) {
TEST(true); // The DB is completely empty, so it was never initialized. Delete it.
CODE_PROBE(true, "The DB is completely empty, so it was never initialized. Delete it.");
throw worker_removed();
} else {
// This should never happen
@ -2473,7 +2473,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
try {
loop {
if (allRemoved.isReady()) {
TEST(true); // all tlogs removed during queue recovery
CODE_PROBE(true, "all tlogs removed during queue recovery");
throw worker_removed();
}
choose {
@ -2503,7 +2503,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
logData->queueCommittedVersion.set(qe.version);
while (self->bytesInput - self->bytesDurable >= recoverMemoryLimit) {
TEST(true); // Flush excess data during TLog queue recovery
CODE_PROBE(true, "Flush excess data during TLog queue recovery");
TraceEvent("FlushLargeQueueDuringRecovery", self->dbgid)
.detail("BytesInput", self->bytesInput)
.detail("BytesDurable", self->bytesDurable)
@ -2527,7 +2527,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
}
TraceEvent("TLogRestorePersistentStateDone", self->dbgid).detail("Took", now() - startt);
TEST(now() - startt >= 1.0); // TLog recovery took more than 1 second
CODE_PROBE(now() - startt >= 1.0, "TLog recovery took more than 1 second");
for (auto it : self->id_data) {
if (it.second->queueCommittedVersion.get() == 0) {

View File

@ -156,7 +156,7 @@ private:
Standalone<StringRef> h = wait(self->queue->readNext(sizeof(uint32_t)));
if (h.size() != sizeof(uint32_t)) {
if (h.size()) {
TEST(true); // Zero fill within size field
CODE_PROBE(true, "Zero fill within size field");
int payloadSize = 0;
memcpy(&payloadSize, h.begin(), h.size());
zeroFillSize = sizeof(uint32_t) - h.size(); // zero fill the size itself
@ -170,7 +170,7 @@ private:
Standalone<StringRef> e = wait(self->queue->readNext(payloadSize + 1));
if (e.size() != payloadSize + 1) {
TEST(true); // Zero fill within payload
CODE_PROBE(true, "Zero fill within payload");
zeroFillSize = payloadSize + 1 - e.size();
break;
}
@ -186,7 +186,7 @@ private:
}
}
if (zeroFillSize) {
TEST(true); // Fixing a partial commit at the end of the tlog queue
CODE_PROBE(true, "Fixing a partial commit at the end of the tlog queue");
for (int i = 0; i < zeroFillSize; i++)
self->queue->push(StringRef((const uint8_t*)"", 1));
}
@ -756,9 +756,9 @@ void TLogQueue::updateVersionSizes(const TLogQueueEntry& result,
ACTOR Future<Void> tLogLock(TLogData* self, ReplyPromise<TLogLockResult> reply, Reference<LogData> logData) {
state Version stopVersion = logData->version.get();
TEST(true); // TLog stopped by recovering master
TEST(logData->stopped); // logData already stopped
TEST(!logData->stopped); // logData not yet stopped
CODE_PROBE(true, "TLog stopped by recovering master");
CODE_PROBE(logData->stopped, "logData already stopped");
CODE_PROBE(!logData->stopped, "logData not yet stopped");
TraceEvent("TLogStop", logData->logId)
.detail("Ver", stopVersion)
@ -1042,7 +1042,7 @@ ACTOR Future<Void> updatePersistentData(TLogData* self, Reference<LogData> logDa
// Now that the changes we made to persistentData are durable, erase the data we moved from memory and the queue,
// increase bytesDurable accordingly, and update persistentDataDurableVersion.
TEST(anyData); // TLog moved data to persistentData
CODE_PROBE(anyData, "TLog moved data to persistentData");
logData->persistentDataDurableVersion = newPersistentDataVersion;
for (tagLocality = 0; tagLocality < logData->tag_data.size(); tagLocality++) {
@@ -1680,7 +1680,7 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
}
if (sequenceData.isSet()) {
if (sequenceData.getFuture().get().first != rep.end) {
TEST(true); // tlog peek second attempt ended at a different version
CODE_PROBE(true, "tlog peek second attempt ended at a different version");
replyPromise.sendError(operation_obsolete());
return Void();
}
@@ -1868,7 +1868,7 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
if (sequenceData.isSet()) {
trackerData.duplicatePeeks++;
if (sequenceData.getFuture().get().first != reply.end) {
TEST(true); // tlog peek second attempt ended at a different version (2)
CODE_PROBE(true, "tlog peek second attempt ended at a different version (2)");
replyPromise.sendError(operation_obsolete());
return Void();
}
@@ -1930,7 +1930,7 @@ ACTOR Future<Void> watchDegraded(TLogData* self) {
wait(lowPriorityDelay(SERVER_KNOBS->TLOG_DEGRADED_DURATION));
TraceEvent(SevWarnAlways, "TLogDegraded", self->dbgid).log();
TEST(true); // TLog degraded
CODE_PROBE(true, "TLog degraded");
self->degraded->set(true);
return Void();
}
@@ -1988,7 +1988,7 @@ ACTOR Future<Void> doQueueCommit(TLogData* self,
.detail("LogId", logData->logId)
.detail("Version", it->version.get())
.detail("QueueVer", it->queueCommittedVersion.get());
TEST(true); // A TLog was replaced before having a chance to commit its queue
CODE_PROBE(true, "A TLog was replaced before having a chance to commit its queue");
it->queueCommittedVersion.set(it->version.get());
}
return Void();
@@ -2452,7 +2452,7 @@ ACTOR Future<Void> serveTLogInterface(TLogData* self,
when(TLogCommitRequest req = waitNext(tli.commit.getFuture())) {
//TraceEvent("TLogCommitReq", logData->logId).detail("Ver", req.version).detail("PrevVer", req.prevVersion).detail("LogVer", logData->version.get());
ASSERT(logData->isPrimary);
TEST(logData->stopped); // TLogCommitRequest while stopped
CODE_PROBE(logData->stopped, "TLogCommitRequest while stopped");
if (!logData->stopped)
logData->addActor.send(tLogCommit(self, req, logData, warningCollectorInput));
else
@@ -2801,7 +2801,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
if (!fFormat.get().present()) {
RangeResult v = wait(self->persistentData->readRange(KeyRangeRef(StringRef(), LiteralStringRef("\xff")), 1));
if (!v.size()) {
TEST(true); // The DB is completely empty, so it was never initialized. Delete it.
CODE_PROBE(true, "The DB is completely empty, so it was never initialized. Delete it.");
throw worker_removed();
} else {
// This should never happen
@@ -2949,7 +2949,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
throw end_of_stream();
loop {
if (allRemoved.isReady()) {
TEST(true); // all tlogs removed during queue recovery
CODE_PROBE(true, "all tlogs removed during queue recovery");
throw worker_removed();
}
choose {
@@ -2980,7 +2980,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
logData->queueCommittedVersion.set(qe.version);
while (self->bytesInput - self->bytesDurable >= recoverMemoryLimit) {
TEST(true); // Flush excess data during TLog queue recovery
CODE_PROBE(true, "Flush excess data during TLog queue recovery");
TraceEvent("FlushLargeQueueDuringRecovery", self->dbgid)
.detail("LogId", logData->logId)
.detail("BytesInput", self->bytesInput)
@@ -3010,7 +3010,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
}
TraceEvent("TLogRestorePersistentStateDone", self->dbgid).detail("Took", now() - startt);
TEST(now() - startt >= 1.0); // TLog recovery took more than 1 second
CODE_PROBE(now() - startt >= 1.0, "TLog recovery took more than 1 second");
for (auto it : self->id_data) {
if (it.second->queueCommittedVersion.get() == 0) {

View File

@@ -449,7 +449,7 @@ class PaxosConfigConsumerImpl {
if (e.code() == error_code_version_already_compacted || e.code() == error_code_timed_out ||
e.code() == error_code_failed_to_reach_quorum || e.code() == error_code_version_already_compacted ||
e.code() == error_code_process_behind) {
TEST(true); // PaxosConfigConsumer get version_already_compacted error
CODE_PROBE(true, "PaxosConfigConsumer get version_already_compacted error");
if (e.code() == error_code_failed_to_reach_quorum) {
try {
wait(self->getCommittedVersionQuorum.complete());

View File

@@ -325,7 +325,7 @@ public:
reply.throttledTags = self.tagThrottler->getClientRates();
bool returningTagsToProxy =
reply.throttledTags.present() && reply.throttledTags.get().size() > 0;
TEST(returningTagsToProxy); // Returning tag throttles to a proxy
CODE_PROBE(returningTagsToProxy, "Returning tag throttles to a proxy");
}
reply.healthMetrics.update(self.healthMetrics, true, req.detailed);

View File

@@ -39,7 +39,7 @@ void ResolutionBalancer::setChangesInReply(UID requestingProxy, GetCommitVersion
rep.resolverChangesVersion = resolverChangesVersion;
resolverNeedingChanges.erase(requestingProxy);
TEST(!rep.resolverChanges.empty()); // resolution balancing moves keyranges
CODE_PROBE(!rep.resolverChanges.empty(), "resolution balancing moves keyranges");
if (resolverNeedingChanges.empty())
resolverChanges.set(Standalone<VectorRef<ResolverMoveRef>>());
}

View File

@@ -350,7 +350,7 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self, ResolveTransactionBatc
applyMetadataMutations(spanContext, *resolverData, req.transactions[t].mutations);
}
TEST(self->forceRecovery); // Resolver detects forced recovery
CODE_PROBE(self->forceRecovery, "Resolver detects forced recovery");
}
self->resolvedStateTransactions += req.txnStateTransactions.size();
@@ -362,7 +362,7 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self, ResolveTransactionBatc
ASSERT(req.version >= firstUnseenVersion);
ASSERT(firstUnseenVersion >= self->debugMinRecentStateVersion);
TEST(firstUnseenVersion == req.version); // Resolver first unseen version is current version
CODE_PROBE(firstUnseenVersion == req.version, "Resolver first unseen version is current version");
// If shardChanged at or before this commit version, the proxy may have computed
// the wrong set of groups. Then we need to broadcast to all groups below.
@@ -400,13 +400,14 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self, ResolveTransactionBatc
}
}
TEST(oldestProxyVersion == req.version); // The proxy that sent this request has the oldest current version
TEST(oldestProxyVersion !=
req.version); // The proxy that sent this request does not have the oldest current version
CODE_PROBE(oldestProxyVersion == req.version,
"The proxy that sent this request has the oldest current version");
CODE_PROBE(oldestProxyVersion != req.version,
"The proxy that sent this request does not have the oldest current version");
bool anyPopped = false;
if (firstUnseenVersion <= oldestProxyVersion && self->proxyInfoMap.size() == self->commitProxyCount + 1) {
TEST(true); // Deleting old state transactions
CODE_PROBE(true, "Deleting old state transactions");
int64_t erasedBytes = self->recentStateTransactionsInfo.eraseUpTo(oldestProxyVersion);
self->debugMinRecentStateVersion = oldestProxyVersion + 1;
anyPopped = erasedBytes > 0;
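The two probes above record whether the requesting proxy happens to be the laggard; the pruning that follows is safe precisely because every commit proxy has advanced past oldestProxyVersion. A simplified model of the rule, with std::map standing in for the resolver's actual structures:

#include <algorithm>
#include <cstdint>
#include <limits>
#include <map>
#include <string>

using Version = int64_t;

// Oldest current version across all known proxies.
Version oldestProxyVersion(const std::map<std::string, Version>& proxyVersions) {
    Version oldest = std::numeric_limits<Version>::max();
    for (const auto& [proxy, version] : proxyVersions)
        oldest = std::min(oldest, version);
    return oldest;
}

// Once no proxy can ask about versions at or below upTo, the recent
// state-transaction history up to that point can be erased.
void eraseUpTo(std::map<Version, size_t>& recentStateTransactionSizes, Version upTo) {
    recentStateTransactionSizes.erase(recentStateTransactionSizes.begin(),
                                      recentStateTransactionSizes.upper_bound(upTo));
}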
@@ -445,7 +446,7 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self, ResolveTransactionBatc
if (req.debugID.present())
g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "Resolver.resolveBatch.After");
} else {
TEST(true); // Duplicate resolve batch request
CODE_PROBE(true, "Duplicate resolve batch request");
//TraceEvent("DupResolveBatchReq", self->dbgid).detail("From", proxyAddress);
}
@@ -456,13 +457,13 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self, ResolveTransactionBatc
if (batchItr != proxyInfoItr->second.outstandingBatches.end()) {
req.reply.send(batchItr->second);
} else {
TEST(true); // No outstanding batches for version on proxy
CODE_PROBE(true, "No outstanding batches for version on proxy");
req.reply.send(Never());
}
} else {
ASSERT_WE_THINK(false); // The first non-duplicate request with this proxyAddress, including this one, should
// have inserted this item in the map!
// TEST(true); // No prior proxy requests
// CODE_PROBE(true, "No prior proxy requests");
req.reply.send(Never());
}

View File

@@ -48,7 +48,7 @@ Optional<double> RkTagThrottleCollection::RkTagThrottleData::updateAndGetClientR
ASSERT_GE(rate, 0);
return rate;
} else {
TEST(true); // Get throttle rate for expired throttle
CODE_PROBE(true, "Get throttle rate for expired throttle");
rateSet = false;
return Optional<double>();
}
@@ -92,14 +92,14 @@ Optional<double> RkTagThrottleCollection::autoThrottleTag(UID id,
bool present = (itr != autoThrottledTags.end());
if (!present) {
if (autoThrottledTags.size() >= SERVER_KNOBS->MAX_AUTO_THROTTLED_TRANSACTION_TAGS) {
TEST(true); // Reached auto-throttle limit
CODE_PROBE(true, "Reached auto-throttle limit");
return Optional<double>();
}
itr = autoThrottledTags.try_emplace(tag).first;
initializeTag(tag);
} else if (itr->second.limits.expiration <= now()) {
TEST(true); // Re-throttling expired tag that hasn't been cleaned up
CODE_PROBE(true, "Re-throttling expired tag that hasn't been cleaned up");
present = false;
itr->second = RkTagThrottleData();
}
@@ -113,7 +113,7 @@ Optional<double> RkTagThrottleCollection::autoThrottleTag(UID id,
return Optional<double>();
}
} else if (now() <= throttle.lastUpdated + SERVER_KNOBS->AUTO_TAG_THROTTLE_UPDATE_FREQUENCY) {
TEST(true); // Tag auto-throttled too quickly
CODE_PROBE(true, "Tag auto-throttled too quickly");
return Optional<double>();
} else {
tpsRate = computeTargetTpsRate(fractionalBusyness,
@@ -121,7 +121,7 @@ Optional<double> RkTagThrottleCollection::autoThrottleTag(UID id,
tagData[tag].requestRate.smoothRate());
if (throttle.limits.expiration > now() && tpsRate.get() >= throttle.limits.tpsRate) {
TEST(true); // Tag auto-throttle rate increase attempt while active
CODE_PROBE(true, "Tag auto-throttle rate increase attempt while active");
return Optional<double>();
}
@@ -176,14 +176,14 @@ void RkTagThrottleCollection::manualThrottleTag(UID id,
result.first->second.limits.expiration = expiration;
if (!oldLimits.present()) {
TEST(true); // Transaction tag manually throttled
CODE_PROBE(true, "Transaction tag manually throttled");
TraceEvent("RatekeeperAddingManualThrottle", id)
.detail("Tag", tag)
.detail("Rate", tpsRate)
.detail("Priority", transactionPriorityToString(priority))
.detail("SecondsToExpiration", expiration - now());
} else if (oldLimits.get().tpsRate != tpsRate || oldLimits.get().expiration != expiration) {
TEST(true); // Manual transaction tag throttle updated
CODE_PROBE(true, "Manual transaction tag throttle updated");
TraceEvent("RatekeeperUpdatingManualThrottle", id)
.detail("Tag", tag)
.detail("Rate", tpsRate)
@@ -225,14 +225,14 @@ PrioritizedTransactionTagMap<ClientTagThrottleLimits> RkTagThrottleCollection::g
if (priorityItr != manualItr->second.end()) {
Optional<double> priorityClientRate = priorityItr->second.updateAndGetClientRate(requestRate);
if (!priorityClientRate.present()) {
TEST(true); // Manual priority throttle expired
CODE_PROBE(true, "Manual priority throttle expired");
priorityItr = manualItr->second.erase(priorityItr);
} else {
if (!manualClientRate.present() || manualClientRate.get().tpsRate > priorityClientRate.get()) {
manualClientRate = ClientTagThrottleLimits(priorityClientRate.get(),
priorityItr->second.limits.expiration);
} else {
TEST(true); // Manual throttle overridden by higher priority
CODE_PROBE(true, "Manual throttle overridden by higher priority");
}
++priorityItr;
@@ -241,13 +241,13 @@ PrioritizedTransactionTagMap<ClientTagThrottleLimits> RkTagThrottleCollection::g
if (manualClientRate.present()) {
tagPresent = true;
TEST(true); // Using manual throttle
CODE_PROBE(true, "Using manual throttle");
clientRates[*priority][tagItr->first] = manualClientRate.get();
}
}
if (manualItr->second.empty()) {
TEST(true); // All manual throttles expired
CODE_PROBE(true, "All manual throttles expired");
manualThrottledTags.erase(manualItr);
break;
}
@@ -261,7 +261,7 @@ PrioritizedTransactionTagMap<ClientTagThrottleLimits> RkTagThrottleCollection::g
double rampStartTime = autoItr->second.lastReduced + SERVER_KNOBS->AUTO_TAG_THROTTLE_DURATION -
SERVER_KNOBS->AUTO_TAG_THROTTLE_RAMP_UP_TIME;
if (now() >= rampStartTime && adjustedRate != std::numeric_limits<double>::max()) {
TEST(true); // Tag auto-throttle ramping up
CODE_PROBE(true, "Tag auto-throttle ramping up");
double targetBusyness = SERVER_KNOBS->AUTO_THROTTLE_TARGET_TAG_BUSYNESS;
if (targetBusyness == 0) {
@@ -280,14 +280,14 @@ PrioritizedTransactionTagMap<ClientTagThrottleLimits> RkTagThrottleCollection::g
if (!result.second && result.first->second.tpsRate > adjustedRate) {
result.first->second = ClientTagThrottleLimits(adjustedRate, autoItr->second.limits.expiration);
} else {
TEST(true); // Auto throttle overridden by manual throttle
CODE_PROBE(true, "Auto throttle overridden by manual throttle");
}
clientRates[TransactionPriority::BATCH][tagItr->first] =
ClientTagThrottleLimits(0, autoItr->second.limits.expiration);
}
} else {
ASSERT(autoItr->second.limits.expiration <= now());
TEST(true); // Auto throttle expired
CODE_PROBE(true, "Auto throttle expired");
if (BUGGIFY) { // Temporarily extend the window between expiration and cleanup
tagPresent = true;
} else {
@@ -297,7 +297,7 @@ PrioritizedTransactionTagMap<ClientTagThrottleLimits> RkTagThrottleCollection::g
}
if (!tagPresent) {
TEST(true); // All tag throttles expired
CODE_PROBE(true, "All tag throttles expired");
tagItr = tagData.erase(tagItr);
} else {
++tagItr;
@@ -309,7 +309,7 @@ PrioritizedTransactionTagMap<ClientTagThrottleLimits> RkTagThrottleCollection::g
void RkTagThrottleCollection::addRequests(TransactionTag const& tag, int requests) {
if (requests > 0) {
TEST(true); // Requests reported for throttled tag
CODE_PROBE(true, "Requests reported for throttled tag");
auto tagItr = tagData.try_emplace(tag);
tagItr.first->second.requestRate.addDelta(requests);
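Most probes in this file mark lazy-expiration paths: a throttle carries an absolute expiration time, readers treat an expired entry as absent, and the entry is only erased on the next scan. The pattern reduced to a sketch (the struct is a stand-in, not Ratekeeper's real type):

#include <optional>

struct ThrottleLimits {
    double tpsRate;
    double expiration; // absolute time, in seconds
};

std::optional<double> updateAndGetClientRate(const ThrottleLimits& limits, double now) {
    if (limits.expiration > now)
        return limits.tpsRate; // still active
    return std::nullopt;       // expired; the caller erases the entry lazily
}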

View File

@@ -109,7 +109,7 @@ class SimpleConfigConsumerImpl {
} catch (Error& e) {
++self->failedChangeRequest;
if (e.code() == error_code_version_already_compacted) {
TEST(true); // SimpleConfigConsumer get version_already_compacted error
CODE_PROBE(true, "SimpleConfigConsumer get version_already_compacted error");
wait(getSnapshotAndChanges(self, broadcaster));
} else {
throw e;

View File

@@ -46,6 +46,7 @@
#include "flow/network.h"
#include "flow/TypeTraits.h"
#include "flow/FaultInjection.h"
#include "flow/CodeProbeUtils.h"
#include "flow/actorcompiler.h" // This must be the last #include.
#undef max
@@ -260,6 +261,9 @@ class TestConfig {
if (attrib == "disableRemoteKVS") {
disableRemoteKVS = strcmp(value.c_str(), "true") == 0;
}
if (attrib == "disableEncryption") {
disableEncryption = strcmp(value.c_str(), "true") == 0;
}
if (attrib == "restartInfoLocation") {
isFirstTestInRestart = true;
}
@@ -297,6 +301,8 @@ public:
bool disableHostname = false;
// remote key value store is a child process spawned by the SS process to run the storage engine
bool disableRemoteKVS = false;
// 7.2 cannot be downgraded to 7.1 or below after enabling encryption-at-rest.
bool disableEncryption = false;
// Storage Engine Types: Verify match with SimulationConfig::generateNormalConfig
// 0 = "ssd"
// 1 = "memory"
@@ -358,6 +364,7 @@ public:
.add("disableTss", &disableTss)
.add("disableHostname", &disableHostname)
.add("disableRemoteKVS", &disableRemoteKVS)
.add("disableEncryption", &disableEncryption)
.add("simpleConfig", &simpleConfig)
.add("generateFearless", &generateFearless)
.add("datacenters", &datacenters)
@@ -839,9 +846,9 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
.detail("Folder", myFolders[i]);
}
TEST(bootCount >= 1); // Simulated machine rebooted
TEST(bootCount >= 2); // Simulated machine rebooted twice
TEST(bootCount >= 3); // Simulated machine rebooted three times
CODE_PROBE(bootCount >= 1, "Simulated machine rebooted");
CODE_PROBE(bootCount >= 2, "Simulated machine rebooted twice");
CODE_PROBE(bootCount >= 3, "Simulated machine rebooted three times");
++bootCount;
TraceEvent("SimulatedMachineStart", randomId)
@@ -961,7 +968,7 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
for (int i = 1; i < ips.size(); i++)
killType = std::max(processes[i].get(), killType);
TEST(true); // Simulated machine has been rebooted
CODE_PROBE(true, "Simulated machine has been rebooted");
state bool swap = killType == ISimulator::Reboot && BUGGIFY_WITH_PROB(0.75) &&
g_simulator.canSwapToMachine(localities.zoneId());
@@ -989,7 +996,7 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
avail.pop_back();
if (myFolders != toRebootFrom) {
TEST(true); // Simulated machine swapped data folders
CODE_PROBE(true, "Simulated machine swapped data folders");
TraceEvent("SimulatedMachineFolderSwap", randomId)
.detail("OldFolder0", myFolders[0])
.detail("NewFolder0", toRebootFrom[0])
@@ -1014,7 +1021,7 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
}
}
TEST(true); // Simulated machine rebooted with data loss
CODE_PROBE(true, "Simulated machine rebooted with data loss");
}
// this machine is rebooting = false;
@@ -1061,7 +1068,7 @@ ACTOR Future<Void> restartSimulatedSystem(std::vector<Future<Void>>* systemActor
// Randomly change data center id names to test that localities
// can be modified on cluster restart
bool renameZoneIds = testConfig.randomlyRenameZoneId ? deterministicRandom()->random01() < 0.1 : false;
TEST(renameZoneIds); // Zone ID names altered in restart test
CODE_PROBE(renameZoneIds, "Zone ID names altered in restart test");
// allows multiple ipAddr entries
ini.SetMultiKey();
@@ -1091,10 +1098,15 @@ ACTOR Future<Void> restartSimulatedSystem(std::vector<Future<Void>>* systemActor
INetworkConnections::net()->parseMockDNSFromString(mockDNSStr);
}
}
auto& g_knobs = IKnobCollection::getMutableGlobalKnobCollection();
if (testConfig.disableRemoteKVS) {
IKnobCollection::getMutableGlobalKnobCollection().setKnob("remote_kv_store",
KnobValueRef::create(bool{ false }));
TraceEvent(SevDebug, "DisaableRemoteKVS").log();
g_knobs.setKnob("remote_kv_store", KnobValueRef::create(bool{ false }));
TraceEvent(SevDebug, "DisableRemoteKVS");
}
if (testConfig.disableEncryption) {
g_knobs.setKnob("enable_encryption", KnobValueRef::create(bool{ false }));
g_knobs.setKnob("enable_tlog_encryption", KnobValueRef::create(bool{ false }));
TraceEvent(SevDebug, "DisableEncryption");
}
*pConnString = conn;
*pTesterCount = testerCount;
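The hunk above pairs with the new TestConfig::disableEncryption flag: since 7.2 cannot be downgraded to 7.1 or below after enabling encryption-at-rest, restart tests need a way to keep encryption off. A reduced sketch of the override pattern, with a string map standing in for the real IKnobCollection:

#include <map>
#include <string>

struct KnobCollection {
    std::map<std::string, std::string> knobs;
    void setKnob(const std::string& name, const std::string& value) { knobs[name] = value; }
};

void applyTestOverrides(KnobCollection& g_knobs, bool disableRemoteKVS, bool disableEncryption) {
    if (disableRemoteKVS)
        g_knobs.setKnob("remote_kv_store", "false");
    if (disableEncryption) {
        // Both encryption knobs are cleared together, as the change above does.
        g_knobs.setKnob("enable_encryption", "false");
        g_knobs.setKnob("enable_tlog_encryption", "false");
    }
}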
@@ -1386,27 +1398,27 @@ void SimulationConfig::setStorageEngine(const TestConfig& testConfig) {
switch (storage_engine_type) {
case 0: {
TEST(true); // Simulated cluster using ssd storage engine
CODE_PROBE(true, "Simulated cluster using ssd storage engine");
set_config("ssd");
break;
}
case 1: {
TEST(true); // Simulated cluster using default memory storage engine
CODE_PROBE(true, "Simulated cluster using default memory storage engine");
set_config("memory");
break;
}
case 2: {
TEST(true); // Simulated cluster using radix-tree storage engine
CODE_PROBE(true, "Simulated cluster using radix-tree storage engine");
set_config("memory-radixtree-beta");
break;
}
case 3: {
TEST(true); // Simulated cluster using redwood storage engine
CODE_PROBE(true, "Simulated cluster using redwood storage engine");
set_config("ssd-redwood-1-experimental");
break;
}
case 4: {
TEST(true); // Simulated cluster using RocksDB storage engine
CODE_PROBE(true, "Simulated cluster using RocksDB storage engine");
set_config("ssd-rocksdb-v1");
// Tests using the RocksDB engine are necessarily non-deterministic because of RocksDB
// background threads.
@@ -1416,7 +1428,7 @@ void SimulationConfig::setStorageEngine(const TestConfig& testConfig) {
break;
}
case 5: {
TEST(true); // Simulated cluster using Sharded RocksDB storage engine
CODE_PROBE(true, "Simulated cluster using Sharded RocksDB storage engine");
set_config("ssd-sharded-rocksdb");
// Tests using the RocksDB engine are necessarily non-deterministic because of RocksDB
// background threads.
@@ -1442,7 +1454,7 @@ void SimulationConfig::setReplicationType(const TestConfig& testConfig) {
} else {
switch (replication_type) {
case 0: {
TEST(true); // Simulated cluster using custom redundancy mode
CODE_PROBE(true, "Simulated cluster using custom redundancy mode");
int storage_servers = deterministicRandom()->randomInt(1, generateFearless ? 4 : 5);
// FIXME: log replicas must be more than storage replicas because otherwise better master exists will not
// recognize it needs to change dcs
@@ -1461,21 +1473,21 @@ void SimulationConfig::setReplicationType(const TestConfig& testConfig) {
break;
}
case 1: {
TEST(true); // Simulated cluster running in single redundancy mode
CODE_PROBE(true, "Simulated cluster running in single redundancy mode");
set_config("single");
break;
}
case 2: {
TEST(true); // Simulated cluster running in double redundancy mode
CODE_PROBE(true, "Simulated cluster running in double redundancy mode");
set_config("double");
break;
}
case 3: {
if (datacenters <= 2 || generateFearless) {
TEST(true); // Simulated cluster running in triple redundancy mode
CODE_PROBE(true, "Simulated cluster running in triple redundancy mode");
set_config("triple");
} else if (datacenters == 3) {
TEST(true); // Simulated cluster running in 3 data-hall mode
CODE_PROBE(true, "Simulated cluster running in 3 data-hall mode");
set_config("three_data_hall");
} else {
ASSERT(false);
@@ -1526,17 +1538,17 @@ void SimulationConfig::setRegions(const TestConfig& testConfig) {
int satellite_replication_type = deterministicRandom()->randomInt(0, 3);
switch (satellite_replication_type) {
case 0: {
TEST(true); // Simulated cluster using no satellite redundancy mode (>4 datacenters)
CODE_PROBE(true, "Simulated cluster using no satellite redundancy mode (>4 datacenters)");
break;
}
case 1: {
TEST(true); // Simulated cluster using two satellite fast redundancy mode
CODE_PROBE(true, "Simulated cluster using two satellite fast redundancy mode");
primaryObj["satellite_redundancy_mode"] = "two_satellite_fast";
remoteObj["satellite_redundancy_mode"] = "two_satellite_fast";
break;
}
case 2: {
TEST(true); // Simulated cluster using two satellite safe redundancy mode
CODE_PROBE(true, "Simulated cluster using two satellite safe redundancy mode");
primaryObj["satellite_redundancy_mode"] = "two_satellite_safe";
remoteObj["satellite_redundancy_mode"] = "two_satellite_safe";
break;
@@ -1549,27 +1561,27 @@ void SimulationConfig::setRegions(const TestConfig& testConfig) {
switch (satellite_replication_type) {
case 0: {
// FIXME: implement
TEST(true); // Simulated cluster using custom satellite redundancy mode
CODE_PROBE(true, "Simulated cluster using custom satellite redundancy mode");
break;
}
case 1: {
TEST(true); // Simulated cluster using no satellite redundancy mode (<4 datacenters)
CODE_PROBE(true, "Simulated cluster using no satellite redundancy mode (<4 datacenters)");
break;
}
case 2: {
TEST(true); // Simulated cluster using single satellite redundancy mode
CODE_PROBE(true, "Simulated cluster using single satellite redundancy mode");
primaryObj["satellite_redundancy_mode"] = "one_satellite_single";
remoteObj["satellite_redundancy_mode"] = "one_satellite_single";
break;
}
case 3: {
TEST(true); // Simulated cluster using double satellite redundancy mode
CODE_PROBE(true, "Simulated cluster using double satellite redundancy mode");
primaryObj["satellite_redundancy_mode"] = "one_satellite_double";
remoteObj["satellite_redundancy_mode"] = "one_satellite_double";
break;
}
case 4: {
TEST(true); // Simulated cluster using triple satellite redundancy mode
CODE_PROBE(true, "Simulated cluster using triple satellite redundancy mode");
primaryObj["satellite_redundancy_mode"] = "one_satellite_triple";
remoteObj["satellite_redundancy_mode"] = "one_satellite_triple";
break;
@@ -1589,10 +1601,10 @@ void SimulationConfig::setRegions(const TestConfig& testConfig) {
if (testConfig.minimumRegions <= 1 &&
(deterministicRandom()->random01() < 0.25 ||
SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS < SERVER_KNOBS->VERSIONS_PER_SECOND)) {
TEST(true); // Simulated cluster using one region
CODE_PROBE(true, "Simulated cluster using one region");
needsRemote = false;
} else {
TEST(true); // Simulated cluster using two regions
CODE_PROBE(true, "Simulated cluster using two regions");
db.usableRegions = 2;
}
@@ -1600,25 +1612,25 @@ void SimulationConfig::setRegions(const TestConfig& testConfig) {
switch (remote_replication_type) {
case 0: {
// FIXME: implement
TEST(true); // Simulated cluster using custom remote redundancy mode
CODE_PROBE(true, "Simulated cluster using custom remote redundancy mode");
break;
}
case 1: {
TEST(true); // Simulated cluster using default remote redundancy mode
CODE_PROBE(true, "Simulated cluster using default remote redundancy mode");
break;
}
case 2: {
TEST(true); // Simulated cluster using single remote redundancy mode
CODE_PROBE(true, "Simulated cluster using single remote redundancy mode");
set_config("remote_single");
break;
}
case 3: {
TEST(true); // Simulated cluster using double remote redundancy mode
CODE_PROBE(true, "Simulated cluster using double remote redundancy mode");
set_config("remote_double");
break;
}
case 4: {
TEST(true); // Simulated cluster using triple remote redundancy mode
CODE_PROBE(true, "Simulated cluster using triple remote redundancy mode");
set_config("remote_triple");
break;
}
@@ -1860,10 +1872,15 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
if (testConfig.configureLocked) {
startingConfigString += " locked";
}
auto& g_knobs = IKnobCollection::getMutableGlobalKnobCollection();
if (testConfig.disableRemoteKVS) {
IKnobCollection::getMutableGlobalKnobCollection().setKnob("remote_kv_store",
KnobValueRef::create(bool{ false }));
TraceEvent(SevDebug, "DisaableRemoteKVS").log();
g_knobs.setKnob("remote_kv_store", KnobValueRef::create(bool{ false }));
TraceEvent(SevDebug, "DisableRemoteKVS");
}
if (testConfig.disableEncryption) {
g_knobs.setKnob("enable_encryption", KnobValueRef::create(bool{ false }));
g_knobs.setKnob("enable_tlog_encryption", KnobValueRef::create(bool{ false }));
TraceEvent(SevDebug, "DisableEncryption");
}
auto configDBType = testConfig.getConfigDBType();
for (auto kv : startingConfigJSON) {
@@ -1967,18 +1984,18 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
bool sslOnly = sslEnabled && deterministicRandom()->coinflip();
bool isTLS = sslEnabled && sslOnly;
g_simulator.listenersPerProcess = sslEnabled && !sslOnly ? 2 : 1;
TEST(sslEnabled); // SSL enabled
TEST(!sslEnabled); // SSL disabled
CODE_PROBE(sslEnabled, "SSL enabled");
CODE_PROBE(!sslEnabled, "SSL disabled");
// Use IPv6 25% of the time
bool useIPv6 = deterministicRandom()->random01() < 0.25;
TEST(useIPv6); // Use IPv6
TEST(!useIPv6); // Use IPv4
CODE_PROBE(useIPv6, "Use IPv6");
CODE_PROBE(!useIPv6, "Use IPv4");
// Use hostname 25% of the time, unless it is disabled
bool useHostname = !testConfig.disableHostname && deterministicRandom()->random01() < 0.25;
TEST(useHostname); // Use hostname
TEST(!useHostname); // Use IP address
CODE_PROBE(useHostname, "Use hostname");
CODE_PROBE(!useHostname, "Use IP address");
NetworkAddressFromHostname fromHostname =
useHostname ? NetworkAddressFromHostname::True : NetworkAddressFromHostname::False;
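The paired probes above (SSL on/off, IPv6/IPv4, hostname/IP) exist so that, across many simulation runs, both branches of each random choice get exercised. A stripped-down version of the sampling, with std::mt19937 standing in for deterministicRandom():

#include <random>

struct NetConfig {
    bool sslOnly;
    int listenersPerProcess;
    bool useIPv6;
    bool useHostname;
};

NetConfig randomNetConfig(std::mt19937& rng, bool sslEnabled, bool disableHostname) {
    std::uniform_real_distribution<double> u(0.0, 1.0);
    NetConfig c;
    c.sslOnly = sslEnabled && u(rng) < 0.5;            // coinflip when SSL is on
    c.listenersPerProcess = (sslEnabled && !c.sslOnly) ? 2 : 1;
    c.useIPv6 = u(rng) < 0.25;                         // IPv6 25% of the time
    c.useHostname = !disableHostname && u(rng) < 0.25; // hostnames 25% of the time
    return c;
}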
@@ -2414,7 +2431,7 @@ ACTOR void setupAndRun(std::string dataFolder,
wait(g_simulator.onProcess(testSystem, TaskPriority::DefaultYield));
Sim2FileSystem::newFileSystem();
FlowTransport::createInstance(true, 1, WLTOKEN_RESERVED_COUNT, &allowList);
TEST(true); // Simulation start
CODE_PROBE(true, "Simulation start");
state Optional<TenantName> defaultTenant;
state Standalone<VectorRef<TenantNameRef>> tenantsToCreate;
@@ -2491,6 +2508,8 @@ ACTOR void setupAndRun(std::string dataFolder,
TraceEvent(SevError, "SetupAndRunError").error(e);
}
TraceEvent("TracingMissingCodeProbes").log();
probe::traceMissedProbes(probe::ExecutionContext::Simulation);
TraceEvent("SimulatedSystemDestruct").log();
g_simulator.stop();
destructed = true;

View File

@@ -817,7 +817,7 @@ ACTOR static Future<JsonBuilderObject> processStatusFetcher(
roles.addRole("blob_manager", db->get().blobManager.get());
}
if ((SERVER_KNOBS->ENABLE_ENCRYPTION || g_network->isSimulated()) && db->get().encryptKeyProxy.present()) {
if (SERVER_KNOBS->ENABLE_ENCRYPTION && db->get().encryptKeyProxy.present()) {
roles.addRole("encrypt_key_proxy", db->get().encryptKeyProxy.get());
}

View File

@@ -282,7 +282,7 @@ public:
void checkChangeCounter(uint64_t oldCacheRangeChangeCounter, KeyRef const& key) {
if (oldCacheRangeChangeCounter != cacheRangeChangeCounter &&
cachedRangeMap[key]->changeCounter > oldCacheRangeChangeCounter) {
TEST(true); // CacheRange change during getValueQ
CODE_PROBE(true, "CacheRange change during getValueQ");
// TODO: should we throw the cold_cache_server() error here instead?
throw wrong_shard_server();
}
@@ -293,7 +293,7 @@ public:
auto sh = cachedRangeMap.intersectingRanges(keys);
for (auto i = sh.begin(); i != sh.end(); ++i)
if (i->value()->changeCounter > oldCacheRangeChangeCounter) {
TEST(true); // CacheRange change during range operation
CODE_PROBE(true, "CacheRange change during range operation");
// TODO: should we throw the cold_cache_server() error here instead?
throw wrong_shard_server();
}
@@ -472,7 +472,6 @@ ACTOR Future<Void> getValueQ(StorageCacheData* data, GetValueRequest req) {
try {
++data->counters.getValueQueries;
++data->counters.allQueries;
//++data->readQueueSizeMetric;
// TODO later
// data->maxQueryQueue = std::max<int>( data->maxQueryQueue, data->counters.allQueries.getValue() -
// data->counters.finishedQueries.getValue());
@@ -544,7 +543,6 @@ ACTOR Future<Void> getValueQ(StorageCacheData* data, GetValueRequest req) {
}
++data->counters.finishedQueries;
//--data->readQueueSizeMetric;
// if(data->latencyBandConfig.present()) {
// int maxReadBytes =
// data->latencyBandConfig.get().readConfig.maxReadBytes.orDefault(std::numeric_limits<int>::max());
@@ -665,7 +663,7 @@ Key findKey(StorageCacheData* data, KeySelectorRef sel, Version version, KeyRang
// If we get only one result in the reverse direction as a result of the data being too large, we could get stuck in
// a loop
if (more && !forward && rep.data.size() == 1) {
TEST(true); // Reverse key selector returned only one result in range read
CODE_PROBE(true, "Reverse key selector returned only one result in range read");
maxBytes = std::numeric_limits<int>::max();
GetKeyValuesReply rep2 =
readRange(data, version, KeyRangeRef(range.begin, keyAfter(sel.getKey())), -2, &maxBytes);
@@ -688,7 +686,7 @@ Key findKey(StorageCacheData* data, KeySelectorRef sel, Version version, KeyRang
*pOffset = -*pOffset;
if (more) {
TEST(true); // Key selector read range had more results
CODE_PROBE(true, "Key selector read range had more results");
ASSERT(rep.data.size());
Key returnKey = forward ? keyAfter(rep.data.back().key) : rep.data.back().key;
@@ -728,7 +726,6 @@ ACTOR Future<Void> getKeyValues(StorageCacheData* data, GetKeyValuesRequest req)
++data->counters.getRangeQueries;
++data->counters.allQueries;
// printf("\nSCGetKeyValues\n");
//++data->readQueueSizeMetric;
// data->maxQueryQueue = std::max<int>( data->maxQueryQueue, data->counters.allQueries.getValue() -
// data->counters.finishedQueries.getValue());
@@ -781,7 +778,7 @@ ACTOR Future<Void> getKeyValues(StorageCacheData* data, GetKeyValuesRequest req)
// cachedKeyRange is the end the last actual key returned must be from this cachedKeyRange. A begin offset of 1
// is also OK because then either begin is past end or equal to end (so the result is definitely empty)
if ((offset1 && offset1 != 1) || (offset2 && offset2 != 1)) {
TEST(true); // wrong_cache_server due to offset
CODE_PROBE(true, "wrong_cache_server due to offset");
// We could detect when offset1 takes us off the beginning of the database or offset2 takes us off the end,
// and return a clipped range rather than an error (since that is what the NativeAPI.getRange will do anyway
// via its "slow path"), but we would have to add some flags to the response to encode whether we went off
@@ -943,7 +940,7 @@ bool expandMutation(MutationRef& m, StorageCacheData::VersionedData const& data,
if (it != data.atLatest().end() && it->isValue() && it.key() == m.param1)
oldVal = it->getValue();
else if (it != data.atLatest().end() && it->isClearTo() && it->getEndKey() > m.param1) {
TEST(true); // Atomic op right after a clear.
CODE_PROBE(true, "Atomic op right after a clear.");
}
switch (m.type) {
@@ -1073,7 +1070,7 @@ void splitMutation(StorageCacheData* data, KeyRangeMap<T>& map, MutationRef cons
}
void rollback(StorageCacheData* data, Version rollbackVersion, Version nextVersion) {
TEST(true); // call to cacheRange rollback
CODE_PROBE(true, "call to cacheRange rollback");
// FIXME: enable when debugKeyRange is active
// debugKeyRange("Rollback", rollbackVersion, allKeys);
@@ -1279,7 +1276,7 @@ ACTOR Future<Void> fetchKeys(StorageCacheData* data, AddingCacheRange* cacheRang
lastAvailable = std::max(lastAvailable, r->value());
if (lastAvailable != invalidVersion && lastAvailable >= data->oldestVersion.get()) {
TEST(true); // wait for oldest version
CODE_PROBE(true, "wait for oldest version");
wait(data->oldestVersion.whenAtLeast(lastAvailable + 1));
}
@@ -1318,7 +1315,7 @@ ACTOR Future<Void> fetchKeys(StorageCacheData* data, AddingCacheRange* cacheRang
loop {
try {
TEST(true); // Fetching keys for transferred cacheRange
CODE_PROBE(true, "Fetching keys for transferred cacheRange");
state RangeResult this_block =
wait(tryFetchRange(data->cx,
@@ -1382,7 +1379,7 @@ ACTOR Future<Void> fetchKeys(StorageCacheData* data, AddingCacheRange* cacheRang
.suppressFor(1.0)
.detail("FKID", interval.pairID);
if (e.code() == error_code_transaction_too_old) {
TEST(true); // A storage server has forgotten the history data we are fetching
CODE_PROBE(true, "A storage server has forgotten the history data we are fetching");
Version lastFV = fetchVersion;
fetchVersion = data->version.get();
isTooOld = false;
@@ -1409,8 +1406,9 @@ ACTOR Future<Void> fetchKeys(StorageCacheData* data, AddingCacheRange* cacheRang
.detail("E", data->version.get());
}
} else if (e.code() == error_code_future_version || e.code() == error_code_process_behind) {
TEST(true); // fetchKeys got future_version or process_behind, so there must be a huge storage lag
// somewhere. Keep trying.
CODE_PROBE(true,
"fetchKeys got future_version or process_behind, so there must be a huge storage lag "
"somewhere. Keep trying.");
} else {
throw;
}
@@ -1470,7 +1468,7 @@ ACTOR Future<Void> fetchKeys(StorageCacheData* data, AddingCacheRange* cacheRang
}
int startSize = batch->changes.size();
TEST(startSize); // Adding fetch data to a batch which already has changes
CODE_PROBE(startSize, "Adding fetch data to a batch which already has changes");
batch->changes.resize(batch->changes.size() + cacheRange->updates.size());
// FIXME: pass the deque back rather than copy the data
@@ -1633,7 +1631,7 @@ void cacheWarmup(StorageCacheData* data, const KeyRangeRef& keys, bool nowAssign
else {
ASSERT(ranges[i].value->adding);
data->addCacheRange(CacheRangeInfo::newAdding(data, ranges[i]));
TEST(true); // cacheWarmup reFetchKeys
CODE_PROBE(true, "cacheWarmup reFetchKeys");
}
}
@@ -1772,7 +1770,7 @@ private:
br >> rollbackVersion;
if (rollbackVersion < fromVersion && rollbackVersion > data->oldestVersion.get()) {
TEST(true); // CacheRangeApplyPrivateData cacheRange rollback
CODE_PROBE(true, "CacheRangeApplyPrivateData cacheRange rollback");
TraceEvent(SevWarn, "Rollback", data->thisServerID)
.detail("FromVersion", fromVersion)
.detail("ToVersion", rollbackVersion)
@@ -1962,8 +1960,10 @@ ACTOR Future<Void> pullAsyncData(StorageCacheData* data) {
}
if (data->cacheRangeChangeCounter == changeCounter)
break;
// TEST(true); // A fetchKeys completed while we were doing this, so eager might be outdated. Read
// it again.
// CODE_PROBE(
// true,
// "A fetchKeys completed while we were doing this, so eager might be outdated. Read it
// again.");
}
}
@@ -2014,7 +2014,7 @@ ACTOR Future<Void> pullAsyncData(StorageCacheData* data) {
SpanContextMessage scm;
reader >> scm;
} else if (reader.protocolVersion().hasOTELSpanContext() && OTELSpanContextMessage::isNextIn(reader)) {
TEST(true); // StorageCache reading OTELSpanContextMessage
CODE_PROBE(true, "StorageCache reading OTELSpanContextMessage");
OTELSpanContextMessage oscm;
reader >> oscm;
} else {

View File

@@ -157,7 +157,7 @@ private:
Standalone<StringRef> h = wait(self->queue->readNext(sizeof(uint32_t)));
if (h.size() != sizeof(uint32_t)) {
if (h.size()) {
TEST(true); // Zero fill within size field
CODE_PROBE(true, "Zero fill within size field");
int payloadSize = 0;
memcpy(&payloadSize, h.begin(), h.size());
zeroFillSize = sizeof(uint32_t) - h.size(); // zero fill the size itself
@@ -171,7 +171,7 @@ private:
Standalone<StringRef> e = wait(self->queue->readNext(payloadSize + 1));
if (e.size() != payloadSize + 1) {
TEST(true); // Zero fill within payload
CODE_PROBE(true, "Zero fill within payload");
zeroFillSize = payloadSize + 1 - e.size();
break;
}
@@ -187,7 +187,7 @@ private:
}
}
if (zeroFillSize) {
TEST(true); // Fixing a partial commit at the end of the tlog queue
CODE_PROBE(true, "Fixing a partial commit at the end of the tlog queue");
for (int i = 0; i < zeroFillSize; i++)
self->queue->push(StringRef((const uint8_t*)"", 1));
}
@@ -805,9 +805,9 @@ void TLogQueue::updateVersionSizes(const TLogQueueEntry& result,
ACTOR Future<Void> tLogLock(TLogData* self, ReplyPromise<TLogLockResult> reply, Reference<LogData> logData) {
state Version stopVersion = logData->version.get();
TEST(true); // TLog stopped by recovering cluster-controller
TEST(logData->stopped); // logData already stopped
TEST(!logData->stopped); // logData not yet stopped
CODE_PROBE(true, "TLog stopped by recovering cluster-controller");
CODE_PROBE(logData->stopped, "logData already stopped");
CODE_PROBE(!logData->stopped, "logData not yet stopped");
TraceEvent("TLogStop", logData->logId)
.detail("Ver", stopVersion)
@@ -1097,7 +1097,7 @@ ACTOR Future<Void> updatePersistentData(TLogData* self, Reference<LogData> logDa
// Now that the changes we made to persistentData are durable, erase the data we moved from memory and the queue,
// increase bytesDurable accordingly, and update persistentDataDurableVersion.
TEST(anyData); // TLog moved data to persistentData
CODE_PROBE(anyData, "TLog moved data to persistentData");
logData->persistentDataDurableVersion = newPersistentDataVersion;
for (tagLocality = 0; tagLocality < logData->tag_data.size(); tagLocality++) {
for (tagId = 0; tagId < logData->tag_data[tagLocality].size(); tagId++) {
@@ -1248,7 +1248,7 @@ ACTOR Future<Void> processPopRequests(TLogData* self, Reference<LogData> logData
TraceEvent("PlayIgnoredPop", logData->logId).detail("Tag", tag.toString()).detail("Version", version);
ignoredPops.push_back(tLogPopCore(self, tag, version, logData));
if (++ignoredPopsPlayed % SERVER_KNOBS->TLOG_POP_BATCH_SIZE == 0) {
TEST(true); // Yielding while processing pop requests
CODE_PROBE(true, "Yielding while processing pop requests");
wait(yield());
}
}
@@ -1836,7 +1836,7 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
}
if (sequenceData.isSet()) {
if (sequenceData.getFuture().get().first != rep.end) {
TEST(true); // tlog peek second attempt ended at a different version
CODE_PROBE(true, "tlog peek second attempt ended at a different version");
replyPromise.sendError(operation_obsolete());
return Void();
}
@@ -2069,7 +2069,7 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
if (sequenceData.isSet()) {
trackerData.duplicatePeeks++;
if (sequenceData.getFuture().get().first != reply.end) {
TEST(true); // tlog peek second attempt ended at a different version (2)
CODE_PROBE(true, "tlog peek second attempt ended at a different version (2)");
replyPromise.sendError(operation_obsolete());
return Void();
}
@@ -2177,7 +2177,7 @@ ACTOR Future<Void> doQueueCommit(TLogData* self,
.detail("LogId", logData->logId)
.detail("Version", it->version.get())
.detail("QueueVer", it->queueCommittedVersion.get());
TEST(true); // A TLog was replaced before having a chance to commit its queue
CODE_PROBE(true, "A TLog was replaced before having a chance to commit its queue");
it->queueCommittedVersion.set(it->version.get());
}
return Void();
@@ -2655,7 +2655,7 @@ ACTOR Future<Void> serveTLogInterface(TLogData* self,
when(TLogCommitRequest req = waitNext(tli.commit.getFuture())) {
//TraceEvent("TLogCommitReq", logData->logId).detail("Ver", req.version).detail("PrevVer", req.prevVersion).detail("LogVer", logData->version.get());
ASSERT(logData->isPrimary);
TEST(logData->stopped); // TLogCommitRequest while stopped
CODE_PROBE(logData->stopped, "TLogCommitRequest while stopped");
if (!logData->stopped)
logData->addActor.send(tLogCommit(self, req, logData, warningCollectorInput));
else
@@ -3026,7 +3026,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
if (!fFormat.get().present()) {
RangeResult v = wait(self->persistentData->readRange(KeyRangeRef(StringRef(), LiteralStringRef("\xff")), 1));
if (!v.size()) {
TEST(true); // The DB is completely empty, so it was never initialized. Delete it.
CODE_PROBE(true, "The DB is completely empty, so it was never initialized. Delete it.");
throw worker_removed();
} else {
// This should never happen
@@ -3183,7 +3183,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
throw end_of_stream();
loop {
if (allRemoved.isReady()) {
TEST(true); // all tlogs removed during queue recovery
CODE_PROBE(true, "all tlogs removed during queue recovery");
throw worker_removed();
}
choose {
@@ -3214,7 +3214,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
logData->queueCommittedVersion.set(qe.version);
while (self->bytesInput - self->bytesDurable >= recoverMemoryLimit) {
TEST(true); // Flush excess data during TLog queue recovery
CODE_PROBE(true, "Flush excess data during TLog queue recovery");
TraceEvent("FlushLargeQueueDuringRecovery", self->dbgid)
.detail("LogId", logData->logId)
.detail("BytesInput", self->bytesInput)
@@ -3244,7 +3244,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
}
TraceEvent("TLogRestorePersistentStateDone", self->dbgid).detail("Took", now() - startt);
TEST(now() - startt >= 1.0); // TLog recovery took more than 1 second
CODE_PROBE(now() - startt >= 1.0, "TLog recovery took more than 1 second");
for (auto it : self->id_data) {
if (it.second->queueCommittedVersion.get() == 0) {

View File

@@ -2167,7 +2167,7 @@ ACTOR Future<Void> TagPartitionedLogSystem::epochEnd(Reference<AsyncVar<Referenc
}
}
TEST(true); // Master recovery from pre-existing database
CODE_PROBE(true, "Master recovery from pre-existing database");
// trackRejoins listens for rejoin requests from the tLogs that we are recovering from, to learn their
// TLogInterfaces
@@ -2228,7 +2228,7 @@ ACTOR Future<Void> TagPartitionedLogSystem::epochEnd(Reference<AsyncVar<Referenc
}
if (!lockedLocalities.count(log->locality)) {
TraceEvent("EpochEndLockExtra").detail("Locality", log->locality);
TEST(true); // locking old generations for version information
CODE_PROBE(true, "locking old generations for version information");
lockedLocalities.insert(log->locality);
LogLockInfo lockResult;
lockResult.epochEnd = old.epochEnd;
@@ -2312,7 +2312,7 @@ ACTOR Future<Void> TagPartitionedLogSystem::epochEnd(Reference<AsyncVar<Referenc
changes.push_back(TagPartitionedLogSystem::getDurableVersionChanged(lockResults[log], logFailed[log]));
}
if (maxEnd > 0 && (!lastEnd.present() || maxEnd < lastEnd.get())) {
TEST(lastEnd.present()); // Restarting recovery at an earlier point
CODE_PROBE(lastEnd.present(), "Restarting recovery at an earlier point");
auto logSystem = makeReference<TagPartitionedLogSystem>(dbgid, locality, prevState.recoveryCount);

View File

@@ -54,20 +54,20 @@ class TagThrottlerImpl {
if (autoThrottlingEnabled.get().present() &&
autoThrottlingEnabled.get().get() == LiteralStringRef("0")) {
TEST(true); // Auto-throttling disabled
CODE_PROBE(true, "Auto-throttling disabled");
if (self->autoThrottlingEnabled) {
TraceEvent("AutoTagThrottlingDisabled", self->id).log();
}
self->autoThrottlingEnabled = false;
} else if (autoThrottlingEnabled.get().present() &&
autoThrottlingEnabled.get().get() == LiteralStringRef("1")) {
TEST(true); // Auto-throttling enabled
CODE_PROBE(true, "Auto-throttling enabled");
if (!self->autoThrottlingEnabled) {
TraceEvent("AutoTagThrottlingEnabled", self->id).log();
}
self->autoThrottlingEnabled = true;
} else {
TEST(true); // Auto-throttling unspecified
CODE_PROBE(true, "Auto-throttling unspecified");
if (autoThrottlingEnabled.get().present()) {
TraceEvent(SevWarnAlways, "InvalidAutoTagThrottlingValue", self->id)
.detail("Value", autoThrottlingEnabled.get().get());
@@ -90,7 +90,7 @@ class TagThrottlerImpl {
if (tagValue.expirationTime == 0 ||
tagValue.expirationTime > now() + tagValue.initialDuration) {
TEST(true); // Converting tag throttle duration to absolute time
CODE_PROBE(true, "Converting tag throttle duration to absolute time");
tagValue.expirationTime = now() + tagValue.initialDuration;
BinaryWriter wr(IncludeVersion(ProtocolVersion::withTagThrottleValueReason()));
wr << tagValue;
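The probe above covers a small normalization step: throttle values can arrive carrying a relative initialDuration, and the ratekeeper rewrites them with an absolute expirationTime so that every later check is a plain comparison against now(). In isolation the rule looks like this (the struct is a stand-in for the real TagThrottleValue):

struct ThrottleValue {
    double initialDuration; // relative, seconds
    double expirationTime;  // absolute, seconds; 0 means not yet converted
};

// Returns true if the value was rewritten and should be persisted again.
bool convertToAbsolute(ThrottleValue& v, double now) {
    if (v.expirationTime == 0 || v.expirationTime > now + v.initialDuration) {
        v.expirationTime = now + v.initialDuration;
        return true;
    }
    return false;
}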
@@ -128,7 +128,7 @@ class TagThrottlerImpl {
wait(watchFuture);
TraceEvent("RatekeeperThrottleSignaled", self->id).log();
TEST(true); // Tag throttle changes detected
CODE_PROBE(true, "Tag throttle changes detected");
break;
} catch (Error& e) {
TraceEvent("RatekeeperMonitorThrottlingChangesError", self->id).error(e);
@@ -142,7 +142,7 @@ class TagThrottlerImpl {
// NOTE: before the comparison with MIN_TAG_COST, the busiest tag rate also compares with MIN_TAG_PAGES_RATE
// currently MIN_TAG_PAGES_RATE > MIN_TAG_COST in our default knobs.
if (busyness > SERVER_KNOBS->AUTO_THROTTLE_TARGET_TAG_BUSYNESS && rate > SERVER_KNOBS->MIN_TAG_COST) {
TEST(true); // Transaction tag auto-throttled
CODE_PROBE(true, "Transaction tag auto-throttled");
Optional<double> clientRate = throttledTags.autoThrottleTag(id, tag, busyness);
// TODO: Increment tag throttle counts here?
if (clientRate.present()) {

View File

@@ -99,7 +99,7 @@ public:
void addRequest(Optional<TagSet> const& tags, int64_t bytes) {
if (tags.present()) {
TEST(true); // Tracking transaction tag in counter
CODE_PROBE(true, "Tracking transaction tag in counter");
double cost = costFunction(bytes);
for (auto& tag : tags.get()) {
int64_t& count = intervalCounts[TransactionTag(tag, tags.get().getArena())];
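The probe above fires whenever a read carries transaction tags, and the surrounding bookkeeping is simple: convert bytes to a cost and charge it to every tag on the request. A self-contained sketch with an illustrative cost function (the real one is knob-driven):

#include <cstdint>
#include <map>
#include <string>
#include <vector>

int64_t costFunction(int64_t bytes) {
    return 1 + bytes / 4096; // illustrative: one unit per page touched
}

void addRequest(std::map<std::string, int64_t>& intervalCounts,
                const std::vector<std::string>& tags,
                int64_t bytes) {
    const int64_t cost = costFunction(bytes);
    for (const auto& tag : tags)
        intervalCounts[tag] += cost; // the full cost is charged to each tag
}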

View File

@@ -7819,10 +7819,10 @@ public:
if (rowLimit > 0) {
f = cur.seekGTE(keys.begin);
if (f.isReady()) {
TEST(true); // Cached forward range read seek
CODE_PROBE(true, "Cached forward range read seek");
f.get();
} else {
TEST(true); // Uncached forward range read seek
CODE_PROBE(true, "Uncached forward range read seek");
wait(store(lock, self->m_concurrentReads.lock()));
wait(f);
}
@@ -7875,10 +7875,10 @@
} else {
f = cur.seekLT(keys.end);
if (f.isReady()) {
TEST(true); // Cached reverse range read seek
CODE_PROBE(true, "Cached reverse range read seek");
f.get();
} else {
TEST(true); // Uncached reverse range read seek
CODE_PROBE(true, "Uncached reverse range read seek");
wait(store(lock, self->m_concurrentReads.lock()));
wait(f);
}
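The four probes above separate page-cache hits from misses: a seek whose future is already ready completes inline, while a miss first acquires the concurrent-read lock so disk reads are throttled. The same fast-path shape sketched with std::future; Flow's Future and FlowLock behave differently, so this is only an analogy:

#include <chrono>
#include <future>
#include <mutex>

int readWithFastPath(std::future<int>& seek, std::mutex& concurrentReads) {
    if (seek.wait_for(std::chrono::seconds(0)) == std::future_status::ready)
        return seek.get(); // cached: skip the lock entirely
    std::lock_guard<std::mutex> guard(concurrentReads); // uncached: throttle I/O
    return seek.get(); // blocks until the read completes
}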

View File

@@ -30,7 +30,7 @@ ACTOR Future<Void> waitFailureServer(FutureStream<ReplyPromise<Void>> waitFailur
ReplyPromise<Void> P = waitNext(waitFailure);
queue.push_back(P);
if (queue.size() > SERVER_KNOBS->MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS) {
TEST(true); // wait server queue full
CODE_PROBE(true, "wait server queue full");
queue.front().send(Void());
queue.pop_front();
}
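Context for the probe above: waitFailure requests are deliberately left unanswered, since the client treats a broken reply as evidence the server died. The queue therefore only bounds memory; on overflow the oldest waiter gets an immediate reply and, on this reading, simply registers again. A sketch with a callback standing in for ReplyPromise<Void>:

#include <cstddef>
#include <deque>
#include <functional>

struct BoundedWaitFailureQueue {
    std::deque<std::function<void()>> queue; // each entry sends one reply
    size_t maxOutstanding;

    void onRequest(std::function<void()> reply) {
        queue.push_back(std::move(reply));
        if (queue.size() > maxOutstanding) {
            queue.front()(); // answer the oldest waiter instead of dropping it
            queue.pop_front();
        }
    }
};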

View File

@@ -108,7 +108,7 @@ enum {
OPT_CONNFILE, OPT_SEEDCONNFILE, OPT_SEEDCONNSTRING, OPT_ROLE, OPT_LISTEN, OPT_PUBLICADDR, OPT_DATAFOLDER, OPT_LOGFOLDER, OPT_PARENTPID, OPT_TRACER, OPT_NEWCONSOLE,
OPT_NOBOX, OPT_TESTFILE, OPT_RESTARTING, OPT_RESTORING, OPT_RANDOMSEED, OPT_KEY, OPT_MEMLIMIT, OPT_VMEMLIMIT, OPT_STORAGEMEMLIMIT, OPT_CACHEMEMLIMIT, OPT_MACHINEID,
OPT_DCID, OPT_MACHINE_CLASS, OPT_BUGGIFY, OPT_VERSION, OPT_BUILD_FLAGS, OPT_CRASHONERROR, OPT_HELP, OPT_NETWORKIMPL, OPT_NOBUFSTDOUT, OPT_BUFSTDOUTERR,
OPT_TRACECLOCK, OPT_NUMTESTERS, OPT_DEVHELP, OPT_ROLLSIZE, OPT_MAXLOGS, OPT_MAXLOGSSIZE, OPT_KNOB, OPT_UNITTESTPARAM, OPT_TESTSERVERS, OPT_TEST_ON_SERVERS, OPT_METRICSCONNFILE,
OPT_TRACECLOCK, OPT_NUMTESTERS, OPT_DEVHELP, OPT_PRINT_CODE_PROBES, OPT_ROLLSIZE, OPT_MAXLOGS, OPT_MAXLOGSSIZE, OPT_KNOB, OPT_UNITTESTPARAM, OPT_TESTSERVERS, OPT_TEST_ON_SERVERS, OPT_METRICSCONNFILE,
OPT_METRICSPREFIX, OPT_LOGGROUP, OPT_LOCALITY, OPT_IO_TRUST_SECONDS, OPT_IO_TRUST_WARN_ONLY, OPT_FILESYSTEM, OPT_PROFILER_RSS_SIZE, OPT_KVFILE,
OPT_TRACE_FORMAT, OPT_WHITELIST_BINPATH, OPT_BLOB_CREDENTIAL_FILE, OPT_CONFIG_PATH, OPT_USE_TEST_CONFIG_DB, OPT_FAULT_INJECTION, OPT_PROFILER, OPT_PRINT_SIMTIME,
OPT_FLOW_PROCESS_NAME, OPT_FLOW_PROCESS_ENDPOINT, OPT_IP_TRUSTED_MASK, OPT_KMS_CONN_DISCOVERY_URL_FILE, OPT_KMS_CONN_VALIDATION_TOKEN_DETAILS, OPT_KMS_CONN_GET_ENCRYPTION_KEYS_ENDPOINT
@@ -183,6 +183,7 @@ CSimpleOpt::SOption g_rgOptions[] = {
{ OPT_HELP, "-h", SO_NONE },
{ OPT_HELP, "--help", SO_NONE },
{ OPT_DEVHELP, "--dev-help", SO_NONE },
{ OPT_PRINT_CODE_PROBES, "--code-probes", SO_REQ_SEP },
{ OPT_KNOB, "--knob-", SO_REQ_SEP },
{ OPT_UNITTESTPARAM, "--test-", SO_REQ_SEP },
{ OPT_LOCALITY, "--locality-", SO_REQ_SEP },
@@ -1144,6 +1145,10 @@ private:
printUsage(argv[0], true);
flushAndExit(FDB_EXIT_SUCCESS);
break;
case OPT_PRINT_CODE_PROBES:
probe::ICodeProbe::printProbesJSON({ std::string(args.OptionArg()) });
flushAndExit(FDB_EXIT_SUCCESS);
break;
case OPT_KNOB: {
Optional<std::string> knobName = extractPrefixedArgument("--knob", args.OptionSyntax());
if (!knobName.present()) {
@@ -2121,6 +2126,14 @@ int main(int argc, char* argv[]) {
}
}
}
g_knobs.setKnob("enable_encryption",
KnobValue::create(ini.GetBoolValue("META", "enableEncryption", false)));
g_knobs.setKnob("enable_tlog_encryption",
KnobValue::create(ini.GetBoolValue("META", "enableTLogEncryption", false)));
g_knobs.setKnob("enable_blob_granule_encryption",
KnobValue::create(ini.GetBoolValue("META", "enableBlobGranuleEncryption", false)));
g_knobs.setKnob("enable_blob_granule_compression",
KnobValue::create(ini.GetBoolValue("META", "enableBlobGranuleEncryption", false)));
}
setupAndRun(dataFolder, opts.testFile, opts.restarting, (isRestoring >= 1), opts.whitelistBinPaths);
g_simulator.run();
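The new --code-probes option works because every CODE_PROBE registers itself during static initialization, which is also what lets probe::traceMissedProbes() report probes that never fired in simulation. A toy registry showing why enumeration needs no workload at all; names are illustrative, not flow/CodeProbe internals:

#include <cstdio>
#include <vector>

struct RegisteredProbe;

std::vector<const RegisteredProbe*>& probeRegistry() {
    static std::vector<const RegisteredProbe*> all;
    return all;
}

struct RegisteredProbe {
    const char* comment;
    const char* file;
    int line;
    RegisteredProbe(const char* c, const char* f, int l) : comment(c), file(f), line(l) {
        probeRegistry().push_back(this); // runs before main()
    }
};

void printProbesJSON() {
    for (const RegisteredProbe* p : probeRegistry())
        std::printf("{ \"Comment\": \"%s\", \"File\": \"%s\", \"Line\": %d }\n",
                    p->comment, p->file, p->line);
}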

View File

@@ -26,14 +26,14 @@
#pragma once
#include "flow/flow.h"
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/BlobConnectionProvider.h"
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/Tenant.h"
#include "fdbserver/ServerDBInfo.h"
#include "flow/actorcompiler.h" // has to be last include
#include "flow/flow.h"
struct GranuleHistory {
KeyRange range;
@@ -53,18 +53,28 @@ struct BlobFileIndex {
int64_t offset;
int64_t length;
int64_t fullFileLength;
Optional<BlobGranuleCipherKeysMeta> cipherKeysMeta;
BlobFileIndex() {}
BlobFileIndex(Version version, std::string filename, int64_t offset, int64_t length, int64_t fullFileLength)
: version(version), filename(filename), offset(offset), length(length), fullFileLength(fullFileLength) {}
BlobFileIndex(Version version,
std::string filename,
int64_t offset,
int64_t length,
int64_t fullFileLength,
Optional<BlobGranuleCipherKeysMeta> ciphKeysMeta)
: version(version), filename(filename), offset(offset), length(length), fullFileLength(fullFileLength),
cipherKeysMeta(ciphKeysMeta) {}
// compare on version
bool operator<(const BlobFileIndex& r) const { return version < r.version; }
};
// FIXME: initialize these to smaller default sizes to save a bit of memory, particularly snapshotFiles
// Stores the files that comprise a blob granule
// FIXME: initialize these to smaller default sizes to save a bit of memory,
// particularly snapshotFiles Stores the files that comprise a blob granule
struct GranuleFiles {
std::vector<BlobFileIndex> snapshotFiles;
std::vector<BlobFileIndex> deltaFiles;
@@ -78,16 +88,10 @@ struct GranuleFiles {
};
// serialize change feed key as UID bytes, to use 16 bytes on disk
static Key granuleIDToCFKey(UID granuleID) {
BinaryWriter wr(Unversioned());
wr << granuleID;
return wr.toValue();
}
Key granuleIDToCFKey(UID granuleID);
// parse change feed key back to UID, to be human-readable
static UID cfKeyToGranuleID(Key cfKey) {
return BinaryReader::fromStringRef<UID>(cfKey, Unversioned());
}
UID cfKeyToGranuleID(Key cfKey);
class Transaction;
ACTOR Future<Optional<GranuleHistory>> getLatestGranuleHistory(Transaction* tr, KeyRange range);

Some files were not shown because too many files have changed in this diff.