Merge branch 'main' into feature-metacluster

This commit is contained in:
A.J. Beamon 2022-07-27 08:55:10 -07:00
commit 7c6b3fb0b8
111 changed files with 3732 additions and 842 deletions

View File

@ -765,6 +765,71 @@ JNIEXPORT jdouble JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1getM
return (jdouble)fdb_database_get_main_thread_busyness(database);
}
// JNI entry point for Database.purgeBlobGranules: queues a purge of blob granules
// over [beginKeyBytes, endKeyBytes) at purgeVersion and returns an FDBFuture*
// (cast to jlong) that the Java side wraps; the future resolves to the key to
// watch for purge completion.
JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1purgeBlobGranules(JNIEnv* jenv,
jobject,
jlong dbPtr,
jbyteArray beginKeyBytes,
jbyteArray endKeyBytes,
jlong purgeVersion,
jboolean force) {
// Null database pointer or key arrays: raise a Java exception and bail out.
if (!dbPtr || !beginKeyBytes || !endKeyBytes) {
throwParamNotNull(jenv);
return 0;
}
FDBDatabase* database = (FDBDatabase*)dbPtr;
// Pin the begin-key array. A null result means the JVM could not hand out a
// native view; only throw if the JVM has not already raised an exception.
uint8_t* beginKeyArr = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL);
if (!beginKeyArr) {
if (!jenv->ExceptionOccurred())
throwRuntimeEx(jenv, "Error getting handle to native resources");
return 0;
}
uint8_t* endKeyArr = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL);
if (!endKeyArr) {
// The begin key was already pinned; release it before bailing out.
// JNI_ABORT: the buffer was not modified, so nothing is copied back.
jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT);
if (!jenv->ExceptionOccurred())
throwRuntimeEx(jenv, "Error getting handle to native resources");
return 0;
}
FDBFuture* f = fdb_database_purge_blob_granules(database,
beginKeyArr,
jenv->GetArrayLength(beginKeyBytes),
endKeyArr,
jenv->GetArrayLength(endKeyBytes),
purgeVersion,
(fdb_bool_t)force);
// Unpin both arrays before returning; assumes the native call does not retain
// the key pointers past this call — consistent with the sibling JNI bindings
// in this file, which release immediately after the fdb_* call.
jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT);
jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKeyArr, JNI_ABORT);
return (jlong)f;
}
// JNI entry point for Database.waitPurgeGranulesComplete: returns an FDBFuture*
// (cast to jlong) that becomes ready once the purge identified by purgeKeyBytes
// has finished.
JNIEXPORT jlong JNICALL
Java_com_apple_foundationdb_FDBDatabase_Database_1waitPurgeGranulesComplete(JNIEnv* jenv,
                                                                            jobject,
                                                                            jlong dbPtr,
                                                                            jbyteArray purgeKeyBytes) {
	// Validate handles before touching native state.
	if (!dbPtr || !purgeKeyBytes) {
		throwParamNotNull(jenv);
		return 0;
	}
	FDBDatabase* db = (FDBDatabase*)dbPtr;
	// Pin the purge-key array; only throw if the JVM has not already raised.
	uint8_t* purgeKey = (uint8_t*)jenv->GetByteArrayElements(purgeKeyBytes, JNI_NULL);
	if (purgeKey == nullptr) {
		if (!jenv->ExceptionOccurred())
			throwRuntimeEx(jenv, "Error getting handle to native resources");
		return 0;
	}
	jsize purgeKeyLength = jenv->GetArrayLength(purgeKeyBytes);
	FDBFuture* future = fdb_database_wait_purge_granules_complete(db, purgeKey, purgeKeyLength);
	// JNI_ABORT: buffer unmodified, nothing to copy back.
	jenv->ReleaseByteArrayElements(purgeKeyBytes, (jbyte*)purgeKey, JNI_ABORT);
	return (jlong)future;
}
JNIEXPORT jboolean JNICALL Java_com_apple_foundationdb_FDB_Error_1predicate(JNIEnv* jenv,
jobject,
jint predicate,

View File

@ -161,6 +161,24 @@ public interface Database extends AutoCloseable, TransactionContext {
*/
double getMainThreadBusyness();
/**
 * Queues a purge of blob granules for the specified key range, at the specified version.
 *
 * @param beginKey start of the key range
 * @param endKey end of the key range
 * @param purgeVersion version to purge at
 * @param force if true delete all data, if not keep data &gt;= purgeVersion
 * @param e the {@link Executor} to use for asynchronous callbacks
 * @return a future containing the key to watch for purge completion
 */
CompletableFuture<byte[]> purgeBlobGranules(byte[] beginKey, byte[] endKey, long purgeVersion, boolean force, Executor e);
/**
 * Wait for a previous call to purgeBlobGranules to complete.
 *
 * @param purgeKey the key to watch, as returned by purgeBlobGranules
 * @param e the {@link Executor} to use for asynchronous callbacks
 * @return a future that completes once the purge has finished
 */
CompletableFuture<Void> waitPurgeGranulesComplete(byte[] purgeKey, Executor e);
/**
* Runs a read-only transactional function against this {@code Database} with retry logic.
* {@link Function#apply(Object) apply(ReadTransaction)} will be called on the

View File

@ -200,6 +200,26 @@ class FDBDatabase extends NativeObjectWrapper implements Database, OptionConsume
}
}
@Override
public CompletableFuture<byte[]> purgeBlobGranules(byte[] beginKey, byte[] endKey, long purgeVersion, boolean force, Executor executor) {
	// Hold the pointer read lock so the native database handle cannot be
	// disposed while the JNI call is in flight.
	pointerReadLock.lock();
	try {
		long futurePtr = Database_purgeBlobGranules(getPtr(), beginKey, endKey, purgeVersion, force);
		return new FutureKey(futurePtr, executor, eventKeeper);
	} finally {
		pointerReadLock.unlock();
	}
}
@Override
public CompletableFuture<Void> waitPurgeGranulesComplete(byte[] purgeKey, Executor executor) {
	// Guard the native pointer for the duration of the JNI call.
	pointerReadLock.lock();
	try {
		long futurePtr = Database_waitPurgeGranulesComplete(getPtr(), purgeKey);
		return new FutureVoid(futurePtr, executor);
	} finally {
		pointerReadLock.unlock();
	}
}
@Override
public Executor getExecutor() {
return executor;
@ -215,4 +235,6 @@ class FDBDatabase extends NativeObjectWrapper implements Database, OptionConsume
private native void Database_dispose(long cPtr);
private native void Database_setOption(long cPtr, int code, byte[] value) throws FDBException;
private native double Database_getMainThreadBusyness(long cPtr);
private native long Database_purgeBlobGranules(long cPtr, byte[] beginKey, byte[] endKey, long purgeVersion, boolean force);
private native long Database_waitPurgeGranulesComplete(long cPtr, byte[] purgeKey);
}

View File

@ -666,14 +666,6 @@ def tenants(logger):
assert len(lines) == 4
assert lines[3].strip() == 'tenant group: tenant_group1'
output = run_fdbcli_command('configuretenant tenant tenant_group=tenant_group1 tenant_group=tenant_group2')
assert output == 'The configuration for tenant `tenant\' has been updated'
output = run_fdbcli_command('gettenant tenant')
lines = output.split('\n')
assert len(lines) == 4
assert lines[3].strip() == 'tenant group: tenant_group2'
output = run_fdbcli_command('configuretenant tenant unset tenant_group')
assert output == 'The configuration for tenant `tenant\' has been updated'

View File

@ -68,6 +68,7 @@ if(WIN32)
add_definitions(-DBOOST_USE_WINDOWS_H)
add_definitions(-DWIN32_LEAN_AND_MEAN)
add_definitions(-D_ITERATOR_DEBUG_LEVEL=0)
add_definitions(-DNOGDI) # WinGDI.h defines macro ERROR
endif()
if (USE_CCACHE)

View File

@ -8,6 +8,8 @@ add_subdirectory(rapidxml)
add_subdirectory(sqlite)
add_subdirectory(SimpleOpt)
add_subdirectory(fmt-8.1.1)
add_subdirectory(md5)
add_subdirectory(libb64)
if(NOT WIN32)
add_subdirectory(linenoise)
add_subdirectory(debug_determinism)

View File

@ -1,2 +1,8 @@
# Static crc32 library built from assembly, C, and C++ sources.
add_library(crc32 STATIC crc32.S crc32_wrapper.c crc32c.cpp)
if (CLANG)
# This is necessary for clang since the compiler reports that crc32_align is
# defined but not used. With -Werror, crc32 will not compile.
# TODO: Remove this when the upstream issue is repaired.
target_compile_options(crc32 PUBLIC -Wno-unused-function)
endif ()
# Consumers pick up the public headers from include/.
target_include_directories(crc32 PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")

72
contrib/ddsketch_calc.py Normal file
View File

@ -0,0 +1,72 @@
#!/usr/bin/env python3
#
# ddsketch_calc.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
import math as m
# Implements a DDSketch class as described in:
# https://arxiv.org/pdf/1908.10693.pdf
# Cubic interpolation is used to quickly approximate log and inverse log;
# the coefficients A, B, C and correctingFactor are the interpolation
# constants, originally from the DataDog sketches-java implementation
# (CubicallyInterpolatedMapping.java):
# https://github.com/DataDog/sketches-java/
class DDSketch(object):
    """DDSketch bucket mapping with a fixed relative-error guarantee."""

    # Cubic-interpolation coefficients for fastlog/reverseLog.
    A = 6.0 / 35.0
    B = -3.0 / 5.0
    C = 10.0 / 7.0
    EPS = 1e-18
    correctingFactor = 1.00988652862227438516
    offset = 0
    multiplier = 0
    gamma = 0

    def __init__(self, errorGuarantee):
        self.gamma = (1 + errorGuarantee) / (1 - errorGuarantee)
        self.multiplier = (self.correctingFactor * m.log(2)) / m.log(self.gamma)
        self.offset = self.getIndex(1.0 / self.EPS)

    def fastlog(self, value):
        """Cubic-interpolated approximation of log2(value)."""
        mantissa, exponent = np.frexp(value)
        x = mantissa * 2 - 1
        return ((self.A * x + self.B) * x + self.C) * x + exponent - 1

    def reverseLog(self, index):
        """Inverse of fastlog: recover the value whose fastlog is `index`."""
        exponent = m.floor(index)
        d0 = self.B * self.B - 3 * self.A * self.C
        d1 = 2 * self.B * self.B * self.B - 9 * self.A * self.B * self.C - 27 * self.A * self.A * (index - exponent)
        p = np.cbrt((d1 - np.sqrt(d1 * d1 - 4 * d0 * d0 * d0)) / 2)
        significandPlusOne = -(self.B + p + d0 / p) / (3 * self.A) + 1
        return np.ldexp(significandPlusOne / 2, exponent + 1)

    def getIndex(self, sample):
        """Bucket index for a sample value."""
        return m.ceil(self.fastlog(sample) * self.multiplier) + self.offset

    def getValue(self, idx):
        """Representative value for a bucket index."""
        return self.reverseLog((idx - self.offset) / self.multiplier) * 2.0 / (1 + self.gamma)

View File

@ -0,0 +1,70 @@
#!/usr/bin/env python3
#
# ddsketch_compare.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import json
import numpy as np
# kullback-leibler divergence (or relative entropy) between two equal-length
# distributions; index pairs where either entry is zero are skipped
def relative_entropy(p, q):
    total = 0.0
    for i in range(len(p)):
        pi, qi = p[i], q[i]
        if pi != 0.0 and qi != 0.0:
            total += pi * np.log2(pi / qi)
    return total
# jensen-shannon divergence (or symmetric relative entropy) between two
# (unnormalized) equal-length bucket-count sequences
def relative_entropy_symmetric(dd1, dd2):
    # normalize both inputs into probability distributions
    total1 = sum(dd1)
    total2 = sum(dd2)
    p = [count / total1 for count in dd1]
    q = [count / total2 for count in dd2]
    # mixture distribution m = (p + q) / 2
    mid = [0.5 * (p[i] + q[i]) for i in range(len(p))]
    return 0.5 * relative_entropy(p, mid) + 0.5 * relative_entropy(q, mid)
# setup cmdline args
parser = argparse.ArgumentParser(description="Compares two DDSketch distributions")
parser.add_argument('--txn1', help='Transaction type for first file', required=True, type=str)
parser.add_argument('--txn2', help='Transaction type for second file', required=True, type=str)
parser.add_argument('--file1', help='Path to first ddsketch json', required=True, type=str)
parser.add_argument('--file2', help='Path to second ddsketch json', required=True, type=str)
parser.add_argument("--op", help='Operation name', type=str)
args = parser.parse_args()

# Load both sketches; context managers close the handles instead of leaking them.
with open(args.file1) as f1:
    data1 = json.load(f1)
with open(args.file2) as f2:
    data2 = json.load(f2)

# Bucket-for-bucket comparison is only meaningful when both sketches were
# built with the same error guarantee (same bucket boundaries).
if data1[args.txn1][args.op]["errorGuarantee"] != data2[args.txn2][args.op]["errorGuarantee"]:
    print("ERROR: The sketches have different error guarantees and cannot be compared!")
    # Exit nonzero so callers/scripts can detect the failure (exit() returned 0).
    raise SystemExit(1)

b1 = data1[args.txn1][args.op]["buckets"]
b2 = data2[args.txn2][args.op]["buckets"]
# 'divergence' rather than 're', which shadows the stdlib regex module name.
divergence = relative_entropy_symmetric(b1, b2)
print("The similarity is: ", round(divergence, 8))
print("1 means least alike, 0 means most alike")

View File

@ -0,0 +1,45 @@
#!/usr/bin/env python3
#
# ddsketch_conversion.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import ddsketch_calc as dd
# Build the CLI: an optional error guarantee plus a value and/or bucket index
# to convert.
parser = argparse.ArgumentParser(description="Converts values to DDSketch buckets")
parser.add_argument('-e', '--error_guarantee', help='Error guarantee (default is 0.005)', required=False, type=float)
parser.add_argument('-v', '--value', help="Value", required=False, type=int)
parser.add_argument('-b', '--bucket', help='Bucket index', required=False, type=int)
args = parser.parse_args()

# Fall back to the documented default when no guarantee was supplied.
error = args.error_guarantee if args.error_guarantee is not None else 0.005
sketch = dd.DDSketch(error)

# value -> bucket index
if args.value is not None:
    print("Bucket index for ", args.value)
    print(sketch.getIndex(args.value))
# bucket index -> representative value
if args.bucket is not None:
    print("Value for bucket ", args.bucket)
    print(sketch.getValue(args.bucket))

67
contrib/export_graph.py Normal file
View File

@ -0,0 +1,67 @@
#!/usr/bin/env python3
#
# export_graph.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import json
import matplotlib.pyplot as plt
import argparse
import ddsketch_calc as dd
# setup cmdline args
parser = argparse.ArgumentParser(description="Graphs DDSketch distribution")
parser.add_argument('-t', '--txn', help='Transaction type (ex: g8ui)', required=True, type=str)
parser.add_argument('--file', help='Path to ddsketch json', required=True, type=str)
parser.add_argument('--title', help='Title for the graph', required=False, type=str)
parser.add_argument('--savefig', help='Will save the plot to a file if set', type=str)
parser.add_argument('--op', help='Which OP to plot (casing matters)', type=str)
args = parser.parse_args()

# Opening JSON file (context manager closes the handle after loading).
with open(args.file) as f:
    data = json.load(f)

# parse json and init sketch
# BUG FIX: argparse stores '-t'/'--txn' under args.txn (the first long option
# name), so the original 'args.t' raised AttributeError at runtime.
buckets = data[args.txn][args.op]["buckets"]
error = data[args.txn][args.op]["errorGuarantee"]
sketch = dd.DDSketch(error)

# trim the tails of the distribution: keep only the span between the first
# and last non-empty buckets, and map bucket indices back to latency values
ls = [i for i, e in enumerate(buckets) if e != 0]
actual_data = buckets[ls[0]:ls[-1] + 1]
indices = range(ls[0], ls[-1] + 1)
actual_indices = [sketch.getValue(i) for i in indices]

# configure the x-axis to make more sense
fig, ax = plt.subplots()
ax.ticklabel_format(useOffset=False, style='plain')
plt.plot(actual_indices, actual_data)
plt.xlabel("Latency (in us)")
plt.ylabel("Frequency count")
plt_title = "Title"
if args.title is not None:
    plt_title = args.title
plt.title(plt_title)
plt.xlim([actual_indices[0], actual_indices[-1]])

# save to file when requested, otherwise display interactively
if args.savefig is not None:
    plt.savefig(args.savefig, format='png')
else:
    plt.show()

View File

@ -0,0 +1,2 @@
# libb64: public-domain base64 encode/decode library
# (see http://sourceforge.net/projects/libb64).
add_library(libb64 STATIC cdecode.c cencode.c)
# Consumers include the headers as "libb64/...".
target_include_directories(libb64 PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")

View File

@ -5,7 +5,7 @@ This is part of the libb64 project, and has been placed in the public domain.
For details, see http://sourceforge.net/projects/libb64
*/
#include "fdbclient/libb64/cdecode.h"
#include "libb64/cdecode.h"
int base64_decode_value(char value_in) {
static const char decoding[] = { 62, -1, -1, -1, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -2, -1,

View File

@ -5,7 +5,7 @@ This is part of the libb64 project, and has been placed in the public domain.
For details, see http://sourceforge.net/projects/libb64
*/
#include "fdbclient/libb64/cencode.h"
#include "libb64/cencode.h"
const int CHARS_PER_LINE = 72;

View File

@ -9,7 +9,7 @@ For details, see http://sourceforge.net/projects/libb64
#define BASE64_DECODE_H
#include <iostream>
#include "fdbclient/libb64/encode.h"
#include "libb64/encode.h"
namespace base64 {
extern "C" {

View File

@ -0,0 +1,2 @@
# Standalone MD5 implementation; consumers include it as "md5/md5.h"
# (e.g. S3BlobStore).
add_library(md5 STATIC md5.c)
target_include_directories(md5 PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")

View File

@ -194,7 +194,7 @@ class BaseInfo(object):
if protocol_version >= PROTOCOL_VERSION_6_3:
self.dc_id = bb.get_bytes_with_length()
if protocol_version >= PROTOCOL_VERSION_7_1:
if bb.get_bytes(1):
if bb.get_bool():
self.tenant = bb.get_bytes_with_length()
class GetVersionInfo(BaseInfo):

View File

@ -523,7 +523,8 @@
"duplicate_mutation_streams",
"duplicate_mutation_fetch_timeout",
"primary_dc_missing",
"fetch_primary_dc_timeout"
"fetch_primary_dc_timeout",
"fetch_storage_wiggler_stats_timeout"
]
},
"issues":[

View File

@ -36,11 +36,21 @@
namespace fdb_cli {
// TODO: API version
const KeyRangeRef tenantMapSpecialKeyRange(LiteralStringRef("\xff\xff/management/tenant/map/"),
LiteralStringRef("\xff\xff/management/tenant/map0"));
const KeyRangeRef tenantConfigSpecialKeyRange(LiteralStringRef("\xff\xff/management/tenant/configure/"),
LiteralStringRef("\xff\xff/management/tenant/configure0"));
const KeyRangeRef tenantMapSpecialKeyRange720("\xff\xff/management/tenant/map/"_sr,
"\xff\xff/management/tenant/map0"_sr);
const KeyRangeRef tenantConfigSpecialKeyRange("\xff\xff/management/tenant/configure/"_sr,
"\xff\xff/management/tenant/configure0"_sr);
const KeyRangeRef tenantMapSpecialKeyRange710("\xff\xff/management/tenant_map/"_sr,
"\xff\xff/management/tenant_map0"_sr);
// Returns the tenant-map special-key range matching the client's API version:
// the 720+ "management/tenant/map" range for API version >= 720, otherwise the
// older "management/tenant_map" range.
KeyRangeRef const& tenantMapSpecialKeyRange(int apiVersion) {
	return apiVersion >= 720 ? tenantMapSpecialKeyRange720 : tenantMapSpecialKeyRange710;
}
Optional<std::map<Standalone<StringRef>, Optional<Value>>>
parseTenantConfiguration(std::vector<StringRef> const& tokens, int startIndex, bool allowUnset) {
@ -68,10 +78,16 @@ parseTenantConfiguration(std::vector<StringRef> const& tokens, int startIndex, b
value = token;
}
if (configParams.count(param)) {
fmt::print(
stderr, "ERROR: configuration parameter `{}' specified more than once.\n", param.toString().c_str());
return {};
}
if (tokencmp(param, "tenant_group")) {
configParams[param] = value;
} else {
fmt::print(stderr, "ERROR: unrecognized configuration parameter `{}'\n", param.toString().c_str());
fmt::print(stderr, "ERROR: unrecognized configuration parameter `{}'.\n", param.toString().c_str());
return {};
}
}
@ -96,13 +112,13 @@ void applyConfigurationToSpecialKeys(Reference<ITransaction> tr,
}
// createtenant command
ACTOR Future<bool> createTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens) {
ACTOR Future<bool> createTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens, int apiVersion) {
if (tokens.size() < 2 || tokens.size() > 3) {
printUsage(tokens[0]);
return false;
}
state Key tenantNameKey = tenantMapSpecialKeyRange.begin.withSuffix(tokens[1]);
state Key tenantNameKey = tenantMapSpecialKeyRange(apiVersion).begin.withSuffix(tokens[1]);
state Reference<ITransaction> tr = db->createTransaction();
state bool doneExistenceCheck = false;
@ -113,6 +129,11 @@ ACTOR Future<bool> createTenantCommandActor(Reference<IDatabase> db, std::vector
return false;
}
if (apiVersion < 720 && !configuration.get().empty()) {
fmt::print(stderr, "ERROR: tenants do not accept configuration options before API version 720.\n");
return false;
}
loop {
try {
tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
@ -164,13 +185,13 @@ CommandFactory createTenantFactory(
"that will require this tenant to be placed on the same cluster as other tenants in the same group."));
// deletetenant command
ACTOR Future<bool> deleteTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens) {
ACTOR Future<bool> deleteTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens, int apiVersion) {
if (tokens.size() != 2) {
printUsage(tokens[0]);
return false;
}
state Key tenantNameKey = tenantMapSpecialKeyRange.begin.withSuffix(tokens[1]);
state Key tenantNameKey = tenantMapSpecialKeyRange(apiVersion).begin.withSuffix(tokens[1]);
state Reference<ITransaction> tr = db->createTransaction();
state bool doneExistenceCheck = false;
@ -220,7 +241,7 @@ CommandFactory deleteTenantFactory(
"Deletes a tenant from the cluster. Deletion will be allowed only if the specified tenant contains no data."));
// listtenants command
ACTOR Future<bool> listTenantsCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens) {
ACTOR Future<bool> listTenantsCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens, int apiVersion) {
if (tokens.size() > 4) {
printUsage(tokens[0]);
return false;
@ -248,8 +269,8 @@ ACTOR Future<bool> listTenantsCommandActor(Reference<IDatabase> db, std::vector<
}
}
state Key beginTenantKey = tenantMapSpecialKeyRange.begin.withSuffix(beginTenant);
state Key endTenantKey = tenantMapSpecialKeyRange.begin.withSuffix(endTenant);
state Key beginTenantKey = tenantMapSpecialKeyRange(apiVersion).begin.withSuffix(beginTenant);
state Key endTenantKey = tenantMapSpecialKeyRange(apiVersion).begin.withSuffix(endTenant);
state Reference<ITransaction> tr = db->createTransaction();
loop {
@ -269,7 +290,7 @@ ACTOR Future<bool> listTenantsCommandActor(Reference<IDatabase> db, std::vector<
tr->getRange(firstGreaterOrEqual(beginTenantKey), firstGreaterOrEqual(endTenantKey), limit);
RangeResult tenants = wait(safeThreadFutureToFuture(kvsFuture));
for (auto tenant : tenants) {
tenantNames.push_back(tenant.key.removePrefix(tenantMapSpecialKeyRange.begin));
tenantNames.push_back(tenant.key.removePrefix(tenantMapSpecialKeyRange(apiVersion).begin));
}
}
@ -314,7 +335,7 @@ ACTOR Future<bool> getTenantCommandActor(Reference<IDatabase> db, std::vector<St
}
state bool useJson = tokens.size() == 3;
state Key tenantNameKey = tenantMapSpecialKeyRange.begin.withSuffix(tokens[1]);
state Key tenantNameKey = tenantMapSpecialKeyRange(apiVersion).begin.withSuffix(tokens[1]);
state Reference<ITransaction> tr = db->createTransaction();
loop {

View File

@ -1909,14 +1909,14 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
}
if (tokencmp(tokens[0], "createtenant")) {
bool _result = wait(makeInterruptable(createTenantCommandActor(db, tokens)));
bool _result = wait(makeInterruptable(createTenantCommandActor(db, tokens, opt.apiVersion)));
if (!_result)
is_error = true;
continue;
}
if (tokencmp(tokens[0], "deletetenant")) {
bool _result = wait(makeInterruptable(deleteTenantCommandActor(db, tokens)));
bool _result = wait(makeInterruptable(deleteTenantCommandActor(db, tokens, opt.apiVersion)));
if (!_result)
is_error = true;
else if (tenantName.present() && tokens[1] == tenantName.get()) {
@ -1928,7 +1928,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
}
if (tokencmp(tokens[0], "listtenants")) {
bool _result = wait(makeInterruptable(listTenantsCommandActor(db, tokens)));
bool _result = wait(makeInterruptable(listTenantsCommandActor(db, tokens, opt.apiVersion)));
if (!_result)
is_error = true;
continue;
@ -1942,6 +1942,12 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
}
if (tokencmp(tokens[0], "configuretenant")) {
if (opt.apiVersion < 720) {
fmt::print(stderr, "ERROR: tenants cannot be configured before API version 720.\n");
is_error = true;
continue;
}
bool _result = wait(makeInterruptable(configureTenantCommandActor(db, tokens)));
if (!_result)
is_error = true;
@ -1949,6 +1955,12 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
}
if (tokencmp(tokens[0], "renametenant")) {
if (opt.apiVersion < 720) {
fmt::print(stderr, "ERROR: tenants cannot be renamed before API version 720.\n");
is_error = true;
continue;
}
bool _result = wait(makeInterruptable(renameTenantCommandActor(db, tokens)));
if (!_result)
is_error = true;

View File

@ -166,11 +166,11 @@ ACTOR Future<bool> consistencyCheckCommandActor(Reference<ITransaction> tr,
// coordinators command
ACTOR Future<bool> coordinatorsCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
// createtenant command
ACTOR Future<bool> createTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
ACTOR Future<bool> createTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens, int apiVersion);
// datadistribution command
ACTOR Future<bool> dataDistributionCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
// deletetenant command
ACTOR Future<bool> deleteTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
ACTOR Future<bool> deleteTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens, int apiVersion);
// exclude command
ACTOR Future<bool> excludeCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens, Future<Void> warn);
// expensive_data_check command
@ -196,7 +196,7 @@ ACTOR Future<bool> killCommandActor(Reference<IDatabase> db,
std::vector<StringRef> tokens,
std::map<Key, std::pair<Value, ClientLeaderRegInterface>>* address_interface);
// listtenants command
ACTOR Future<bool> listTenantsCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
ACTOR Future<bool> listTenantsCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens, int apiVersion);
// lock/unlock command
ACTOR Future<bool> lockCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
ACTOR Future<bool> unlockDatabaseActor(Reference<IDatabase> db, UID uid);

File diff suppressed because it is too large Load Diff

View File

@ -1,8 +1,5 @@
fdb_find_sources(FDBCLIENT_SRCS)
list(APPEND FDBCLIENT_SRCS
sha1/SHA1.cpp
libb64/cdecode.c
libb64/cencode.c)
list(APPEND FDBCLIENT_SRCS sha1/SHA1.cpp)
message(STATUS "FDB version is ${FDB_VERSION}")
message(STATUS "FDB package name is ${FDB_PACKAGE_NAME}")

View File

@ -209,12 +209,7 @@ void ClientKnobs::initialize(Randomize randomize) {
init( IS_ACCEPTABLE_DELAY, 1.5 );
init( HTTP_READ_SIZE, 128*1024 );
init( HTTP_SEND_SIZE, 32*1024 );
init( HTTP_VERBOSE_LEVEL, 0 );
init( HTTP_REQUEST_ID_HEADER, "" );
init( HTTP_REQUEST_AWS_V4_HEADER, true );
init( HTTP_RESPONSE_SKIP_VERIFY_CHECKSUM_FOR_PARTIAL_CONTENT, false );
init( BLOBSTORE_ENCRYPTION_TYPE, "" );
init( BLOBSTORE_CONNECT_TRIES, 10 );
init( BLOBSTORE_CONNECT_TIMEOUT, 10 );

View File

@ -3236,13 +3236,26 @@ TenantInfo TransactionState::getTenantInfo() {
} else if (!t.present()) {
return TenantInfo();
} else if (cx->clientInfo->get().tenantMode == TenantMode::DISABLED && t.present()) {
throw tenants_disabled();
// If we are running provisional proxies, we allow a tenant request to go through since we don't know the tenant
// mode. Such a transaction would not be allowed to commit without enabling provisional commits because either
// the commit proxies will be provisional or the read version will be too old.
if (!cx->clientInfo->get().grvProxies.empty() && !cx->clientInfo->get().grvProxies[0].provisional) {
throw tenants_disabled();
} else {
ASSERT(!useProvisionalProxies);
}
}
ASSERT(tenantId != TenantInfo::INVALID_TENANT);
return TenantInfo(t.get(), tenantId);
}
// Returns the tenant used in this transaction. If the tenant is unset and raw access isn't specified, then the default
// tenant from DatabaseContext is applied to this transaction (note: the default tenant is typically unset, but in
// simulation could be something different).
//
// This function should not be called in the transaction constructor or in the setOption function to allow a user the
// opportunity to set raw access.
Optional<TenantName> const& TransactionState::tenant() {
if (tenantSet) {
return tenant_;
@ -3255,6 +3268,9 @@ Optional<TenantName> const& TransactionState::tenant() {
}
}
// Returns true if the tenant has been set, but does not cause default tenant resolution. This is useful in setOption
// (where we do not want to call tenant()) if we want to enforce that an option not be set on a Tenant transaction (e.g.
// for raw access).
bool TransactionState::hasTenant() const {
return tenantSet && tenant_.present();
}
@ -6572,6 +6588,11 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optional<Strin
case FDBTransactionOptions::USE_PROVISIONAL_PROXIES:
validateOptionValueNotPresent(value);
if (trState->hasTenant()) {
Error e = invalid_option();
TraceEvent(SevWarn, "TenantTransactionUseProvisionalProxies").error(e).detail("Tenant", trState->tenant());
throw e;
}
trState->options.getReadVersionFlags |= GetReadVersionRequest::FLAG_USE_PROVISIONAL_PROXIES;
trState->useProvisionalProxies = UseProvisionalProxies::True;
break;
@ -9390,11 +9411,20 @@ Future<Void> DatabaseContext::getChangeFeedStream(Reference<ChangeFeedData> resu
Reference<DatabaseContext>::addRef(this), results, rangeID, begin, end, range, replyBufferSize, canReadPopped);
}
ACTOR Future<std::vector<OverlappingChangeFeedEntry>> singleLocationOverlappingChangeFeeds(
Database cx,
Reference<LocationInfo> location,
KeyRangeRef range,
Version minVersion) {
// Returns the highest feed metadata version recorded for any range that
// intersects `range`, or invalidVersion if no recorded range intersects.
Version OverlappingChangeFeedsInfo::getFeedMetadataVersion(const KeyRangeRef& range) const {
	Version maxVersion = invalidVersion;
	for (const auto& [feedRange, metadataVersion] : feedMetadataVersions) {
		if (feedRange.intersects(range) && metadataVersion > maxVersion) {
			maxVersion = metadataVersion;
		}
	}
	return maxVersion;
}
ACTOR Future<OverlappingChangeFeedsReply> singleLocationOverlappingChangeFeeds(Database cx,
Reference<LocationInfo> location,
KeyRangeRef range,
Version minVersion) {
state OverlappingChangeFeedsRequest req;
req.range = range;
req.minVersion = minVersion;
@ -9406,16 +9436,16 @@ ACTOR Future<std::vector<OverlappingChangeFeedEntry>> singleLocationOverlappingC
TaskPriority::DefaultPromiseEndpoint,
AtMostOnce::False,
cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr));
return rep.rangeIds;
return rep;
}
bool compareChangeFeedResult(const OverlappingChangeFeedEntry& i, const OverlappingChangeFeedEntry& j) {
return i.rangeId < j.rangeId;
return i.feedId < j.feedId;
}
ACTOR Future<std::vector<OverlappingChangeFeedEntry>> getOverlappingChangeFeedsActor(Reference<DatabaseContext> db,
KeyRangeRef range,
Version minVersion) {
ACTOR Future<OverlappingChangeFeedsInfo> getOverlappingChangeFeedsActor(Reference<DatabaseContext> db,
KeyRangeRef range,
Version minVersion) {
state Database cx(db);
state Span span("NAPI:GetOverlappingChangeFeeds"_loc);
@ -9441,19 +9471,33 @@ ACTOR Future<std::vector<OverlappingChangeFeedEntry>> getOverlappingChangeFeedsA
throw all_alternatives_failed();
}
state std::vector<Future<std::vector<OverlappingChangeFeedEntry>>> allOverlappingRequests;
state std::vector<Future<OverlappingChangeFeedsReply>> allOverlappingRequests;
for (auto& it : locations) {
allOverlappingRequests.push_back(
singleLocationOverlappingChangeFeeds(cx, it.locations, it.range & range, minVersion));
}
wait(waitForAll(allOverlappingRequests));
std::vector<OverlappingChangeFeedEntry> result;
for (auto& it : allOverlappingRequests) {
result.insert(result.end(), it.get().begin(), it.get().end());
OverlappingChangeFeedsInfo result;
std::unordered_map<KeyRef, OverlappingChangeFeedEntry> latestFeedMetadata;
for (int i = 0; i < locations.size(); i++) {
result.arena.dependsOn(allOverlappingRequests[i].get().arena);
result.arena.dependsOn(locations[i].range.arena());
result.feedMetadataVersions.push_back(
{ locations[i].range, allOverlappingRequests[i].get().feedMetadataVersion });
for (auto& it : allOverlappingRequests[i].get().feeds) {
auto res = latestFeedMetadata.insert({ it.feedId, it });
if (!res.second) {
CODE_PROBE(true, "deduping fetched overlapping feed by higher metadata version");
if (res.first->second.feedMetadataVersion < it.feedMetadataVersion) {
res.first->second = it;
}
}
}
}
for (auto& it : latestFeedMetadata) {
result.feeds.push_back(result.arena, it.second);
}
std::sort(result.begin(), result.end(), compareChangeFeedResult);
result.resize(std::unique(result.begin(), result.end()) - result.begin());
return result;
} catch (Error& e) {
if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) {
@ -9466,8 +9510,7 @@ ACTOR Future<std::vector<OverlappingChangeFeedEntry>> getOverlappingChangeFeedsA
}
}
Future<std::vector<OverlappingChangeFeedEntry>> DatabaseContext::getOverlappingChangeFeeds(KeyRangeRef range,
Version minVersion) {
Future<OverlappingChangeFeedsInfo> DatabaseContext::getOverlappingChangeFeeds(KeyRangeRef range, Version minVersion) {
return getOverlappingChangeFeedsActor(Reference<DatabaseContext>::addRef(this), range, minVersion);
}
@ -9591,7 +9634,7 @@ ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
state bool loadedTenantPrefix = false;
// FIXME: implement force
if (!force) {
if (force) {
throw unsupported_operation();
}

View File

@ -20,7 +20,7 @@
#include "fdbclient/RESTClient.h"
#include "fdbclient/HTTP.h"
#include "fdbrpc/HTTP.h"
#include "flow/IRateControl.h"
#include "fdbclient/RESTUtils.h"
#include "flow/Arena.h"

View File

@ -20,8 +20,8 @@
#include "fdbclient/S3BlobStore.h"
#include "fdbclient/md5/md5.h"
#include "fdbclient/libb64/encode.h"
#include "md5/md5.h"
#include "libb64/encode.h"
#include "fdbclient/sha1/SHA1.h"
#include <time.h>
#include <iomanip>

View File

@ -701,8 +701,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( FETCH_BLOCK_BYTES, 2e6 );
init( FETCH_KEYS_PARALLELISM_BYTES, 4e6 ); if( randomize && BUGGIFY ) FETCH_KEYS_PARALLELISM_BYTES = 3e6;
init( FETCH_KEYS_PARALLELISM, 2 );
init( FETCH_KEYS_PARALLELISM_FULL, 10 );
init( FETCH_KEYS_LOWER_PRIORITY, 0 );
init( FETCH_CHANGEFEED_PARALLELISM, 2 );
init( FETCH_CHANGEFEED_PARALLELISM, 4 );
init( SERVE_FETCH_CHECKPOINT_PARALLELISM, 4 );
init( BUGGIFY_BLOCK_BYTES, 10000 );
init( STORAGE_RECOVERY_VERSION_LAG_LIMIT, 2 * MAX_READ_TRANSACTION_LIFE_VERSIONS );
@ -907,11 +908,13 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// BlobGranuleVerify* simulation tests use "blobRangeKeys", BlobGranuleCorrectness* use "tenant", default in real clusters is "tenant"
init( BG_RANGE_SOURCE, "tenant" );
// BlobGranuleVerify* simulation tests use "knobs", BlobGranuleCorrectness* use "tenant", default in real clusters is "knobs"
bool buggifyMediumGranules = simulationMediumShards || (randomize && BUGGIFY);
init( BG_METADATA_SOURCE, "knobs" );
init( BG_SNAPSHOT_FILE_TARGET_BYTES, 10000000 ); if( buggifySmallShards ) BG_SNAPSHOT_FILE_TARGET_BYTES = 100000; else if (simulationMediumShards || (randomize && BUGGIFY) ) BG_SNAPSHOT_FILE_TARGET_BYTES = 1000000;
init( BG_SNAPSHOT_FILE_TARGET_CHUNKS, 100 ); if ( randomize && BUGGIFY ) BG_SNAPSHOT_FILE_TARGET_CHUNKS = 1 << deterministicRandom()->randomInt(0, 8);
init( BG_SNAPSHOT_FILE_TARGET_BYTES, 10000000 ); if( buggifySmallShards ) BG_SNAPSHOT_FILE_TARGET_BYTES = 100000; else if (buggifyMediumGranules) BG_SNAPSHOT_FILE_TARGET_BYTES = 1000000;
init( BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES, 64*1024 ); if ( randomize && BUGGIFY ) BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES = BG_SNAPSHOT_FILE_TARGET_BYTES / (1 << deterministicRandom()->randomInt(0, 8));
init( BG_DELTA_BYTES_BEFORE_COMPACT, BG_SNAPSHOT_FILE_TARGET_BYTES/2 );
init( BG_DELTA_FILE_TARGET_BYTES, BG_DELTA_BYTES_BEFORE_COMPACT/10 );
init( BG_DELTA_FILE_TARGET_CHUNK_BYTES, 64*1024 ); if ( randomize && BUGGIFY ) BG_DELTA_FILE_TARGET_CHUNK_BYTES = BG_DELTA_FILE_TARGET_BYTES / (1 << deterministicRandom()->randomInt(0, 7));
init( BG_MAX_SPLIT_FANOUT, 10 ); if( randomize && BUGGIFY ) BG_MAX_SPLIT_FANOUT = deterministicRandom()->randomInt(5, 15);
init( BG_MAX_MERGE_FANIN, 10 ); if( randomize && BUGGIFY ) BG_MAX_MERGE_FANIN = deterministicRandom()->randomInt(2, 15);
init( BG_HOT_SNAPSHOT_VERSIONS, 5000000 );

View File

@ -21,7 +21,7 @@
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/Tenant.h"
#include "fdbclient/libb64/encode.h"
#include "libb64/encode.h"
#include "flow/UnitTest.h"
Key TenantMapEntry::idToPrefix(int64_t id) {
@ -118,6 +118,17 @@ std::string TenantMapEntry::toJson(int apiVersion) const {
tenantEntry["tenant_group"] = tenantGroupObject;
}
if (tenantGroup.present()) {
json_spirit::mObject tenantGroupObject;
std::string encodedTenantGroup = base64::encoder::from_string(tenantGroup.get().toString());
// Remove trailing newline
encodedTenantGroup.resize(encodedTenantGroup.size() - 1);
tenantGroupObject["base64"] = encodedTenantGroup;
tenantGroupObject["printable"] = printable(tenantGroup.get());
tenantEntry["tenant_group"] = tenantGroupObject;
}
return json_spirit::write_string(json_spirit::mValue(tenantEntry));
}

View File

@ -36,8 +36,8 @@
#include "flow/Net2Packet.h"
#include "flow/IRateControl.h"
#include "fdbclient/S3BlobStore.h"
#include "fdbclient/md5/md5.h"
#include "fdbclient/libb64/encode.h"
#include "md5/md5.h"
#include "libb64/encode.h"
#include "flow/actorcompiler.h" // This must be the last #include.
ACTOR template <typename T>

View File

@ -46,6 +46,7 @@ struct GranuleSnapshot : VectorRef<KeyValueRef> {
}
};
// Deltas in version order
struct GranuleDeltas : VectorRef<MutationsAndVersionRef> {
constexpr static FileIdentifier file_identifier = 8563013;

View File

@ -27,11 +27,15 @@
#include "flow/CompressionUtils.h"
Value serializeChunkedSnapshot(Standalone<GranuleSnapshot> snapshot,
int chunks,
int chunkSize,
Optional<CompressionFilter> compressFilter,
Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx = Optional<BlobGranuleCipherKeysCtx>());
Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx = {});
// FIXME: support sorted and chunked delta files
Value serializeChunkedDeltaFile(Standalone<GranuleDeltas> deltas,
const KeyRangeRef& fileRange,
int chunkSize,
Optional<CompressionFilter> compressFilter,
Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx = {});
ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<BlobGranuleChunkRef>>& files,
const KeyRangeRef& keyRange,

View File

@ -220,12 +220,7 @@ public:
int64_t CSI_SIZE_LIMIT;
double CSI_STATUS_DELAY;
int HTTP_SEND_SIZE;
int HTTP_READ_SIZE;
int HTTP_VERBOSE_LEVEL;
std::string HTTP_REQUEST_ID_HEADER;
bool HTTP_REQUEST_AWS_V4_HEADER; // setting this knob to true will enable AWS V4 style header.
bool HTTP_RESPONSE_SKIP_VERIFY_CHECKSUM_FOR_PARTIAL_CONTENT; // skip verify md5 checksum for 206 response
std::string BLOBSTORE_ENCRYPTION_TYPE;
int BLOBSTORE_CONNECT_TRIES;
int BLOBSTORE_CONNECT_TIMEOUT;

View File

@ -207,6 +207,16 @@ struct KeyRangeLocationInfo {
: tenantEntry(tenantEntry), range(range), locations(locations) {}
};
// Result of fetching the change feeds overlapping a key range: the deduplicated feed
// entries plus, per queried sub-range, the metadata version the data was read at.
struct OverlappingChangeFeedsInfo {
// Owns the memory backing 'feeds' and 'feedMetadataVersions' (both hold *Ref types).
Arena arena;
VectorRef<OverlappingChangeFeedEntry> feeds;
// Metadata version observed for each queried sub-range.
// would prefer to use key range map but it complicates copy/move constructors
std::vector<std::pair<KeyRangeRef, Version>> feedMetadataVersions;
// for a feed that wasn't present, returns the metadata version it would have been fetched at.
// NOTE(review): presumably derived from the entries in feedMetadataVersions that cover
// feedRange — implementation not visible here, confirm against NativeAPI.actor.cpp.
Version getFeedMetadataVersion(const KeyRangeRef& feedRange) const;
};
class DatabaseContext : public ReferenceCounted<DatabaseContext>, public FastAllocated<DatabaseContext>, NonCopyable {
public:
static DatabaseContext* allocateOnForeignThread() {
@ -361,7 +371,7 @@ public:
int replyBufferSize = -1,
bool canReadPopped = true);
Future<std::vector<OverlappingChangeFeedEntry>> getOverlappingChangeFeeds(KeyRangeRef ranges, Version minVersion);
Future<OverlappingChangeFeedsInfo> getOverlappingChangeFeeds(KeyRangeRef ranges, Version minVersion);
Future<Void> popChangeFeedMutations(Key rangeID, Version version);
Future<Key> purgeBlobGranules(KeyRange keyRange,

View File

@ -25,7 +25,7 @@
#pragma once
#include "fdbclient/JSONDoc.h"
#include "fdbclient/HTTP.h"
#include "fdbrpc/HTTP.h"
#include "fdbclient/RESTUtils.h"
#include "flow/Arena.h"
#include "flow/FastRef.h"

View File

@ -26,7 +26,7 @@
#include "flow/Net2Packet.h"
#include "fdbclient/Knobs.h"
#include "flow/IRateControl.h"
#include "fdbclient/HTTP.h"
#include "fdbrpc/HTTP.h"
#include "fdbclient/JSONDoc.h"
// Representation of all the things you need to connect to a blob store instance with some credentials.

View File

@ -659,6 +659,7 @@ public:
int FETCH_BLOCK_BYTES;
int FETCH_KEYS_PARALLELISM_BYTES;
int FETCH_KEYS_PARALLELISM;
int FETCH_KEYS_PARALLELISM_FULL;
int FETCH_KEYS_LOWER_PRIORITY;
int FETCH_CHANGEFEED_PARALLELISM;
int SERVE_FETCH_CHECKPOINT_PARALLELISM;
@ -887,8 +888,9 @@ public:
std::string BG_METADATA_SOURCE;
int BG_SNAPSHOT_FILE_TARGET_BYTES;
int BG_SNAPSHOT_FILE_TARGET_CHUNKS;
int BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES;
int BG_DELTA_FILE_TARGET_BYTES;
int BG_DELTA_FILE_TARGET_CHUNK_BYTES;
int BG_DELTA_BYTES_BEFORE_COMPACT;
int BG_MAX_SPLIT_FANOUT;
int BG_MAX_MERGE_FANIN;

View File

@ -970,39 +970,51 @@ struct FetchCheckpointKeyValuesRequest {
};
struct OverlappingChangeFeedEntry {
Key rangeId;
KeyRange range;
KeyRef feedId;
KeyRangeRef range;
Version emptyVersion;
Version stopVersion;
Version feedMetadataVersion;
bool operator==(const OverlappingChangeFeedEntry& r) const {
return rangeId == r.rangeId && range == r.range && emptyVersion == r.emptyVersion &&
stopVersion == r.stopVersion;
return feedId == r.feedId && range == r.range && emptyVersion == r.emptyVersion &&
stopVersion == r.stopVersion && feedMetadataVersion == r.feedMetadataVersion;
}
OverlappingChangeFeedEntry() {}
OverlappingChangeFeedEntry(Key const& rangeId, KeyRange const& range, Version emptyVersion, Version stopVersion)
: rangeId(rangeId), range(range), emptyVersion(emptyVersion), stopVersion(stopVersion) {}
OverlappingChangeFeedEntry(KeyRef const& feedId,
KeyRangeRef const& range,
Version emptyVersion,
Version stopVersion,
Version feedMetadataVersion)
: feedId(feedId), range(range), emptyVersion(emptyVersion), stopVersion(stopVersion),
feedMetadataVersion(feedMetadataVersion) {}
OverlappingChangeFeedEntry(Arena& arena, const OverlappingChangeFeedEntry& rhs)
: feedId(arena, rhs.feedId), range(arena, rhs.range), emptyVersion(rhs.emptyVersion),
stopVersion(rhs.stopVersion), feedMetadataVersion(rhs.feedMetadataVersion) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, rangeId, range, emptyVersion, stopVersion);
serializer(ar, feedId, range, emptyVersion, stopVersion, feedMetadataVersion);
}
};
struct OverlappingChangeFeedsReply {
constexpr static FileIdentifier file_identifier = 11815134;
std::vector<OverlappingChangeFeedEntry> rangeIds;
VectorRef<OverlappingChangeFeedEntry> feeds;
bool cached;
Arena arena;
Version feedMetadataVersion;
OverlappingChangeFeedsReply() : cached(false) {}
explicit OverlappingChangeFeedsReply(std::vector<OverlappingChangeFeedEntry> const& rangeIds)
: rangeIds(rangeIds), cached(false) {}
OverlappingChangeFeedsReply() : cached(false), feedMetadataVersion(invalidVersion) {}
explicit OverlappingChangeFeedsReply(VectorRef<OverlappingChangeFeedEntry> const& feeds,
Version feedMetadataVersion)
: feeds(feeds), cached(false), feedMetadataVersion(feedMetadataVersion) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, rangeIds, arena);
serializer(ar, feeds, arena, feedMetadataVersion);
}
};

View File

@ -80,13 +80,9 @@ struct TenantMapEntry {
bool matchesConfiguration(TenantMapEntry const& other) const;
void configure(Standalone<StringRef> parameter, Optional<Value> value);
Value encode() const { return ObjectWriter::toValue(*this, IncludeVersion(ProtocolVersion::withTenants())); }
Value encode() const { return ObjectWriter::toValue(*this, IncludeVersion()); }
static TenantMapEntry decode(ValueRef const& value) {
TenantMapEntry entry;
ObjectReader reader(value.begin(), IncludeVersion());
reader.deserialize(entry);
return entry;
return ObjectReader::fromStringRef<TenantMapEntry>(value, IncludeVersion());
}
template <class Ar>
@ -109,7 +105,7 @@ struct TenantGroupEntry {
TenantGroupEntry() = default;
TenantGroupEntry(Optional<ClusterName> assignedCluster) : assignedCluster(assignedCluster) {}
Value encode() { return ObjectWriter::toValue(*this, IncludeVersion(ProtocolVersion::withTenants())); }
Value encode() { return ObjectWriter::toValue(*this, IncludeVersion()); }
static TenantGroupEntry decode(ValueRef const& value) {
TenantGroupEntry entry;
ObjectReader reader(value.begin(), IncludeVersion());
@ -153,14 +149,12 @@ struct TenantMetadataSpecification {
KeyBackedObjectMap<TenantGroupName, TenantGroupEntry, decltype(IncludeVersion()), NullCodec> tenantGroupMap;
TenantMetadataSpecification(KeyRef subspace)
: tenantMap(subspace.withSuffix("tenant/map/"_sr), IncludeVersion(ProtocolVersion::withTenants())),
: tenantMap(subspace.withSuffix("tenant/map/"_sr), IncludeVersion()),
lastTenantId(subspace.withSuffix("tenant/lastId"_sr)),
tenantTombstones(subspace.withSuffix("tenant/tombstones/"_sr)),
tombstoneCleanupData(subspace.withSuffix("tenant/tombstoneCleanup"_sr),
IncludeVersion(ProtocolVersion::withTenants())),
tombstoneCleanupData(subspace.withSuffix("tenant/tombstoneCleanup"_sr), IncludeVersion()),
tenantGroupTenantIndex(subspace.withSuffix("tenant/tenantGroup/tenantIndex/"_sr)),
tenantGroupMap(subspace.withSuffix("tenant/tenantGroup/map/"_sr),
IncludeVersion(ProtocolVersion::withTenants())) {}
tenantGroupMap(subspace.withSuffix("tenant/tenantGroup/map/"_sr), IncludeVersion()) {}
};
struct TenantMetadata {

View File

@ -61,10 +61,10 @@ if(${COROUTINE_IMPL} STREQUAL libcoro)
endif()
target_include_directories(fdbrpc PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_BINARY_DIR}/include" PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/libeio)
target_link_libraries(fdbrpc PUBLIC flow PRIVATE rapidjson)
target_link_libraries(fdbrpc PUBLIC flow libb64 md5 PRIVATE rapidjson)
target_include_directories(fdbrpc_sampling PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_BINARY_DIR}/include" PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/libeio)
target_link_libraries(fdbrpc_sampling PUBLIC flow_sampling PRIVATE rapidjson)
target_link_libraries(fdbrpc_sampling PUBLIC flow_sampling libb64 md5 PRIVATE rapidjson)
if(${COROUTINE_IMPL} STREQUAL libcoro)
target_link_libraries(fdbrpc PUBLIC coro)

View File

@ -18,12 +18,11 @@
* limitations under the License.
*/
#include "fdbclient/HTTP.h"
#include "fdbrpc/HTTP.h"
#include "fdbclient/md5/md5.h"
#include "fdbclient/ClientKnobs.h"
#include "fdbclient/libb64/encode.h"
#include "fdbclient/Knobs.h"
#include "md5/md5.h"
#include "libb64/encode.h"
#include "flow/Knobs.h"
#include <cctype>
#include "flow/actorcompiler.h" // has to be last include
@ -153,7 +152,7 @@ ACTOR Future<size_t> read_delimited_into_string(Reference<IConnection> conn,
// Next search will start at the current end of the buffer - delim size + 1
if (sPos >= lookBack)
sPos -= lookBack;
wait(success(read_into_string(conn, buf, CLIENT_KNOBS->HTTP_READ_SIZE)));
wait(success(read_into_string(conn, buf, FLOW_KNOBS->HTTP_READ_SIZE)));
}
}
@ -161,7 +160,7 @@ ACTOR Future<size_t> read_delimited_into_string(Reference<IConnection> conn,
ACTOR Future<Void> read_fixed_into_string(Reference<IConnection> conn, int len, std::string* buf, size_t pos) {
state int stop_size = pos + len;
while (buf->size() < stop_size)
wait(success(read_into_string(conn, buf, CLIENT_KNOBS->HTTP_READ_SIZE)));
wait(success(read_into_string(conn, buf, FLOW_KNOBS->HTTP_READ_SIZE)));
return Void();
}
@ -329,7 +328,7 @@ ACTOR Future<Void> read_http_response(Reference<HTTP::Response> r, Reference<ICo
// If there is actual response content, check the MD5 sum against the Content-MD5 response header
if (r->content.size() > 0) {
if (r->code == 206 && CLIENT_KNOBS->HTTP_RESPONSE_SKIP_VERIFY_CHECKSUM_FOR_PARTIAL_CONTENT) {
if (r->code == 206 && FLOW_KNOBS->HTTP_RESPONSE_SKIP_VERIFY_CHECKSUM_FOR_PARTIAL_CONTENT) {
return Void();
}
@ -368,7 +367,7 @@ ACTOR Future<Reference<HTTP::Response>> doRequest(Reference<IConnection> conn,
// There is no standard http request id header field, so either a global default can be set via a knob
// or it can be set per-request with the requestIDHeader argument (which overrides the default)
if (requestIDHeader.empty()) {
requestIDHeader = CLIENT_KNOBS->HTTP_REQUEST_ID_HEADER;
requestIDHeader = FLOW_KNOBS->HTTP_REQUEST_ID_HEADER;
}
state bool earlyResponse = false;
@ -400,13 +399,13 @@ ACTOR Future<Reference<HTTP::Response>> doRequest(Reference<IConnection> conn,
// Prepend headers to content packer buffer chain
pContent->prependWriteBuffer(pFirst, pLast);
if (CLIENT_KNOBS->HTTP_VERBOSE_LEVEL > 1)
if (FLOW_KNOBS->HTTP_VERBOSE_LEVEL > 1)
printf("[%s] HTTP starting %s %s ContentLen:%d\n",
conn->getDebugID().toString().c_str(),
verb.c_str(),
resource.c_str(),
contentLen);
if (CLIENT_KNOBS->HTTP_VERBOSE_LEVEL > 2) {
if (FLOW_KNOBS->HTTP_VERBOSE_LEVEL > 2) {
for (auto h : headers)
printf("Request Header: %s: %s\n", h.first.c_str(), h.second.c_str());
}
@ -427,7 +426,7 @@ ACTOR Future<Reference<HTTP::Response>> doRequest(Reference<IConnection> conn,
break;
}
state int trySend = CLIENT_KNOBS->HTTP_SEND_SIZE;
state int trySend = FLOW_KNOBS->HTTP_SEND_SIZE;
wait(sendRate->getAllowance(trySend));
int len = conn->write(pContent->getUnsent(), trySend);
if (pSent != nullptr)
@ -481,7 +480,7 @@ ACTOR Future<Reference<HTTP::Response>> doRequest(Reference<IConnection> conn,
}
}
if (CLIENT_KNOBS->HTTP_VERBOSE_LEVEL > 0) {
if (FLOW_KNOBS->HTTP_VERBOSE_LEVEL > 0) {
printf("[%s] HTTP %scode=%d early=%d, time=%fs %s %s contentLen=%d [%d out, response content len %d]\n",
conn->getDebugID().toString().c_str(),
(err.present() ? format("*ERROR*=%s ", err.get().name()).c_str() : ""),
@ -494,7 +493,7 @@ ACTOR Future<Reference<HTTP::Response>> doRequest(Reference<IConnection> conn,
total_sent,
(int)r->contentLen);
}
if (CLIENT_KNOBS->HTTP_VERBOSE_LEVEL > 2) {
if (FLOW_KNOBS->HTTP_VERBOSE_LEVEL > 2) {
printf("[%s] HTTP RESPONSE: %s %s\n%s\n",
conn->getDebugID().toString().c_str(),
verb.c_str(),
@ -510,7 +509,7 @@ ACTOR Future<Reference<HTTP::Response>> doRequest(Reference<IConnection> conn,
} catch (Error& e) {
double elapsed = timer() - send_start;
// A bad_request_id error would have already been logged in verbose mode before err is thrown above.
if (CLIENT_KNOBS->HTTP_VERBOSE_LEVEL > 0 && e.code() != error_code_http_bad_request_id) {
if (FLOW_KNOBS->HTTP_VERBOSE_LEVEL > 0 && e.code() != error_code_http_bad_request_id) {
printf("[%s] HTTP *ERROR*=%s early=%d, time=%fs %s %s contentLen=%d [%d out]\n",
conn->getDebugID().toString().c_str(),
e.name(),

View File

@ -48,6 +48,7 @@ public:
int write(SendBuffer const* buffer, int limit) override;
NetworkAddress getPeerAddress() const override;
UID getDebugID() const override;
boost::asio::ip::tcp::socket& getSocket() override { return socket; }
static Future<std::vector<NetworkAddress>> resolveTCPEndpoint(const std::string& host,
const std::string& service,
DNSCache* dnsCache);

View File

@ -23,6 +23,7 @@
#include "flow/ProtocolVersion.h"
#include <algorithm>
#include <string>
#include <limits>
#pragma once
#include "flow/flow.h"
@ -488,6 +489,8 @@ public:
bool setDiffProtocol; // true if a process with a different protocol version has been started
bool allowStorageMigrationTypeChange = false;
double injectTargetedSSRestartTime = std::numeric_limits<double>::max();
double injectSSDelayTime = std::numeric_limits<double>::max();
flowGlobalType global(int id) const final { return getCurrentProcess()->global(id); };
void setGlobal(size_t id, flowGlobalType v) final { getCurrentProcess()->setGlobal(id, v); };

View File

@ -324,6 +324,8 @@ struct Sim2Conn final : IConnection, ReferenceCounted<Sim2Conn> {
NetworkAddress getPeerAddress() const override { return peerEndpoint; }
UID getDebugID() const override { return dbgid; }
boost::asio::ip::tcp::socket& getSocket() override { throw operation_failed(); }
bool opened, closedByCaller, stableConnection;
private:
@ -948,8 +950,9 @@ public:
TaskPriority getCurrentTask() const override { return currentTaskID; }
void setCurrentTask(TaskPriority taskID) override { currentTaskID = taskID; }
// Sets the taskID/priority of the current task, without yielding
Future<Reference<IConnection>> connect(NetworkAddress toAddr, const std::string& host) override {
ASSERT(host.empty());
Future<Reference<IConnection>> connect(NetworkAddress toAddr,
boost::asio::ip::tcp::socket* existingSocket = nullptr) override {
ASSERT(existingSocket == nullptr);
if (!addressMap.count(toAddr)) {
return waitForProcessAndConnect(toAddr, this);
}
@ -975,7 +978,7 @@ public:
return onConnect(::delay(0.5 * deterministicRandom()->random01()), myc);
}
Future<Reference<IConnection>> connectExternal(NetworkAddress toAddr, const std::string& host) override {
Future<Reference<IConnection>> connectExternal(NetworkAddress toAddr) override {
return SimExternalConnection::connect(toAddr);
}

View File

@ -143,30 +143,34 @@ bool compareFDBAndBlob(RangeResult fdb,
}
}
printf("Chunks:\n");
for (auto& chunk : blob.second) {
printf("[%s - %s)\n", chunk.keyRange.begin.printable().c_str(), chunk.keyRange.end.printable().c_str());
printf(" SnapshotFile:\n %s\n",
chunk.snapshotFile.present() ? chunk.snapshotFile.get().toString().c_str() : "<none>");
printf(" DeltaFiles:\n");
for (auto& df : chunk.deltaFiles) {
printf(" %s\n", df.toString().c_str());
}
printf(" Deltas: (%d)", chunk.newDeltas.size());
if (chunk.newDeltas.size() > 0) {
fmt::print(" with version [{0} - {1}]",
chunk.newDeltas[0].version,
chunk.newDeltas[chunk.newDeltas.size() - 1].version);
}
fmt::print(" IncludedVersion: {}\n", chunk.includedVersion);
}
printf("\n");
printGranuleChunks(blob.second);
}
}
return correct;
}
void printGranuleChunks(const Standalone<VectorRef<BlobGranuleChunkRef>>& chunks) {
printf("Chunks:\n");
for (auto& chunk : chunks) {
printf("[%s - %s)\n", chunk.keyRange.begin.printable().c_str(), chunk.keyRange.end.printable().c_str());
printf(" SnapshotFile:\n %s\n",
chunk.snapshotFile.present() ? chunk.snapshotFile.get().toString().c_str() : "<none>");
printf(" DeltaFiles:\n");
for (auto& df : chunk.deltaFiles) {
printf(" %s\n", df.toString().c_str());
}
printf(" Deltas: (%d)", chunk.newDeltas.size());
if (chunk.newDeltas.size() > 0) {
fmt::print(" with version [{0} - {1}]",
chunk.newDeltas[0].version,
chunk.newDeltas[chunk.newDeltas.size() - 1].version);
}
fmt::print(" IncludedVersion: {}\n", chunk.includedVersion);
}
printf("\n");
}
ACTOR Future<Void> clearAndAwaitMerge(Database cx, KeyRange range) {
// clear key range and check whether it is merged or not, repeatedly
state Transaction tr(cx);

View File

@ -52,6 +52,7 @@
*/
#define BM_DEBUG false
#define BM_PURGE_DEBUG false
void handleClientBlobRange(KeyRangeMap<bool>* knownBlobRanges,
Arena& ar,
@ -1652,7 +1653,9 @@ ACTOR Future<Void> persistMergeGranulesDone(Reference<BlobManagerData> bmData,
state Key lockKey = blobGranuleLockKeyFor(parentRange);
state Future<Optional<Value>> oldLockFuture = tr->get(lockKey);
wait(updateChangeFeed(tr,
// This has to be
// TODO: fix this better! (privatize change feed key clear)
wait(updateChangeFeed(&tr->getTransaction(),
granuleIDToCFKey(parentGranuleIDs[parentIdx]),
ChangeFeedStatus::CHANGE_FEED_DESTROY,
parentRange));
@ -3171,8 +3174,8 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
Key historyKey,
Version purgeVersion,
KeyRange granuleRange) {
if (BM_DEBUG) {
fmt::print("Fully deleting granule {0}: init\n", granuleId.toString());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Fully deleting granule {1}: init\n", self->epoch, granuleId.toString());
}
// if granule is still splitting and files are needed for new sub-granules to re-snapshot, we can only partially
@ -3198,8 +3201,11 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
filesToDelete.emplace_back(fname);
}
if (BM_DEBUG) {
fmt::print("Fully deleting granule {0}: deleting {1} files\n", granuleId.toString(), filesToDelete.size());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Fully deleting granule {1}: deleting {2} files\n",
self->epoch,
granuleId.toString(),
filesToDelete.size());
for (auto filename : filesToDelete) {
fmt::print(" - {}\n", filename.c_str());
}
@ -3212,8 +3218,9 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
wait(waitForAll(deletions));
// delete metadata in FDB (history entry and file keys)
if (BM_DEBUG) {
fmt::print("Fully deleting granule {0}: deleting history and file keys\n", granuleId.toString());
if (BM_PURGE_DEBUG) {
fmt::print(
"BM {0} Fully deleting granule {1}: deleting history and file keys\n", self->epoch, granuleId.toString());
}
state Transaction tr(self->db);
@ -3232,8 +3239,8 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
}
}
if (BM_DEBUG) {
fmt::print("Fully deleting granule {0}: success\n", granuleId.toString());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Fully deleting granule {1}: success\n", self->epoch, granuleId.toString());
}
TraceEvent("GranuleFullPurge", self->id)
@ -3245,6 +3252,8 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
++self->stats.granulesFullyPurged;
self->stats.filesPurged += filesToDelete.size();
CODE_PROBE(true, "full granule purged");
return Void();
}
@ -3260,8 +3269,8 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self,
UID granuleId,
Version purgeVersion,
KeyRange granuleRange) {
if (BM_DEBUG) {
fmt::print("Partially deleting granule {0}: init\n", granuleId.toString());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Partially deleting granule {1}: init\n", self->epoch, granuleId.toString());
}
state Reference<BlobConnectionProvider> bstore = wait(getBStoreForGranule(self, granuleRange));
@ -3310,8 +3319,11 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self,
filesToDelete.emplace_back(fname);
}
if (BM_DEBUG) {
fmt::print("Partially deleting granule {0}: deleting {1} files\n", granuleId.toString(), filesToDelete.size());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Partially deleting granule {1}: deleting {2} files\n",
self->epoch,
granuleId.toString(),
filesToDelete.size());
for (auto filename : filesToDelete) {
fmt::print(" - {0}\n", filename);
}
@ -3328,8 +3340,8 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self,
wait(waitForAll(deletions));
// delete metadata in FDB (deleted file keys)
if (BM_DEBUG) {
fmt::print("Partially deleting granule {0}: deleting file keys\n", granuleId.toString());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Partially deleting granule {1}: deleting file keys\n", self->epoch, granuleId.toString());
}
state Transaction tr(self->db);
@ -3348,8 +3360,8 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self,
}
}
if (BM_DEBUG) {
fmt::print("Partially deleting granule {0}: success\n", granuleId.toString());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Partially deleting granule {1}: success\n", self->epoch, granuleId.toString());
}
TraceEvent("GranulePartialPurge", self->id)
.detail("Epoch", self->epoch)
@ -3360,6 +3372,8 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self,
++self->stats.granulesPartiallyPurged;
self->stats.filesPurged += filesToDelete.size();
CODE_PROBE(true, " partial granule purged");
return Void();
}
@ -3372,8 +3386,9 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self,
* processing this purge intent.
*/
ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range, Version purgeVersion, bool force) {
if (BM_DEBUG) {
fmt::print("purgeRange starting for range [{0} - {1}) @ purgeVersion={2}, force={3}\n",
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} purgeRange starting for range [{1} - {2}) @ purgeVersion={3}, force={4}\n",
self->epoch,
range.begin.printable(),
range.end.printable(),
purgeVersion,
@ -3395,8 +3410,7 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
// track which granules we have already added to traversal
// note: (startKey, startVersion) uniquely identifies a granule
state std::unordered_set<std::pair<const uint8_t*, Version>, boost::hash<std::pair<const uint8_t*, Version>>>
visited;
state std::unordered_set<std::pair<std::string, Version>, boost::hash<std::pair<std::string, Version>>> visited;
// find all active granules (that comprise the range) and add to the queue
state KeyRangeMap<UID>::Ranges activeRanges = self->workerAssignments.intersectingRanges(range);
@ -3407,8 +3421,9 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
state KeyRangeMap<UID>::iterator activeRange;
for (activeRange = activeRanges.begin(); activeRange != activeRanges.end(); ++activeRange) {
if (BM_DEBUG) {
fmt::print("Checking if active range [{0} - {1}), owned by BW {2}, should be purged\n",
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Checking if active range [{1} - {2}), owned by BW {3}, should be purged\n",
self->epoch,
activeRange.begin().printable(),
activeRange.end().printable(),
activeRange.value().toString());
@ -3416,6 +3431,10 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
// assumption: purge boundaries must respect granule boundaries
if (activeRange.begin() < range.begin || activeRange.end() > range.end) {
TraceEvent(SevWarn, "GranulePurgeRangesUnaligned", self->id)
.detail("Epoch", self->epoch)
.detail("PurgeRange", range)
.detail("GranuleRange", activeRange.range());
continue;
}
@ -3425,20 +3444,29 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
loop {
try {
if (BM_DEBUG) {
fmt::print("Fetching latest history entry for range [{0} - {1})\n",
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Fetching latest history entry for range [{1} - {2})\n",
self->epoch,
activeRange.begin().printable(),
activeRange.end().printable());
}
// FIXME: doing this serially will likely be too slow for large purges
Optional<GranuleHistory> history = wait(getLatestGranuleHistory(&tr, activeRange.range()));
// TODO: can we tell from the krm that this range is not valid, so that we don't need to do a
// get
if (history.present()) {
if (BM_DEBUG) {
printf("Adding range to history queue\n");
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Adding range to history queue: [{1} - {2}) @ {3} ({4})\n",
self->epoch,
activeRange.begin().printable(),
activeRange.end().printable(),
history.get().version,
(void*)(activeRange.range().begin.begin()));
}
visited.insert({ activeRange.range().begin.begin(), history.get().version });
visited.insert({ activeRange.range().begin.toString(), history.get().version });
historyEntryQueue.push({ activeRange.range(), history.get().version, MAX_VERSION });
} else if (BM_PURGE_DEBUG) {
fmt::print("BM {0} No history for range, ignoring\n", self->epoch);
}
break;
} catch (Error& e) {
@ -3447,8 +3475,12 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
}
}
if (BM_DEBUG) {
printf("Beginning BFS traversal of history\n");
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Beginning BFS traversal of {1} history items for range [{2} - {3}) \n",
self->epoch,
historyEntryQueue.size(),
range.begin.printable(),
range.end.printable());
}
while (!historyEntryQueue.empty()) {
// process the node at the front of the queue and remove it
@ -3458,8 +3490,9 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
std::tie(currRange, startVersion, endVersion) = historyEntryQueue.front();
historyEntryQueue.pop();
if (BM_DEBUG) {
fmt::print("Processing history node [{0} - {1}) with versions [{2}, {3})\n",
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Processing history node [{1} - {2}) with versions [{3}, {4})\n",
self->epoch,
currRange.begin.printable(),
currRange.end.printable(),
startVersion,
@ -3484,11 +3517,15 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
}
if (!foundHistory) {
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} No history for this node, skipping\n", self->epoch);
}
continue;
}
if (BM_DEBUG) {
fmt::print("Found history entry for this node. It's granuleID is {0}\n",
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Found history entry for this node. It's granuleID is {1}\n",
self->epoch,
currHistoryNode.granuleID.toString());
}
@ -3499,33 +3536,45 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
// and so this granule should be partially deleted
// - otherwise, this granule is active, so don't schedule it for deletion
if (force || endVersion <= purgeVersion) {
if (BM_DEBUG) {
fmt::print("Granule {0} will be FULLY deleted\n", currHistoryNode.granuleID.toString());
if (BM_PURGE_DEBUG) {
fmt::print(
"BM {0} Granule {1} will be FULLY deleted\n", self->epoch, currHistoryNode.granuleID.toString());
}
toFullyDelete.push_back({ currHistoryNode.granuleID, historyKey, currRange });
} else if (startVersion < purgeVersion) {
if (BM_DEBUG) {
fmt::print("Granule {0} will be partially deleted\n", currHistoryNode.granuleID.toString());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Granule {1} will be partially deleted\n",
self->epoch,
currHistoryNode.granuleID.toString());
}
toPartiallyDelete.push_back({ currHistoryNode.granuleID, currRange });
}
// add all of the node's parents to the queue
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Checking {1} parents\n", self->epoch, currHistoryNode.parentVersions.size());
}
for (int i = 0; i < currHistoryNode.parentVersions.size(); i++) {
// for (auto& parent : currHistoryNode.parentVersions.size()) {
// if we already added this node to queue, skip it; otherwise, mark it as visited
KeyRangeRef parentRange(currHistoryNode.parentBoundaries[i], currHistoryNode.parentBoundaries[i + 1]);
Version parentVersion = currHistoryNode.parentVersions[i];
if (visited.count({ parentRange.begin.begin(), parentVersion })) {
if (BM_DEBUG) {
fmt::print("Already added {0} to queue, so skipping it\n", currHistoryNode.granuleID.toString());
std::string beginStr = parentRange.begin.toString();
if (!visited.insert({ beginStr, parentVersion }).second) {
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Already added [{1} - {2}) @ {3} - {4} to queue, so skipping it\n",
self->epoch,
parentRange.begin.printable(),
parentRange.end.printable(),
parentVersion,
startVersion);
}
continue;
}
visited.insert({ parentRange.begin.begin(), parentVersion });
if (BM_DEBUG) {
fmt::print("Adding parent [{0} - {1}) with versions [{2} - {3}) to queue\n",
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Adding parent [{1} - {2}) @ {3} - {4} to queue\n",
self->epoch,
parentRange.begin.printable(),
parentRange.end.printable(),
parentVersion,
@ -3553,10 +3602,19 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
// we won't run into any issues with trying to "re-delete" a blob file since deleting
// a file that doesn't exist is considered successful
TraceEvent("PurgeGranulesTraversalComplete", self->id)
.detail("Epoch", self->epoch)
.detail("Range", range)
.detail("PurgeVersion", purgeVersion)
.detail("Force", force)
.detail("VisitedCount", visited.size())
.detail("DeletingFullyCount", toFullyDelete.size())
.detail("DeletingPartiallyCount", toPartiallyDelete.size());
state std::vector<Future<Void>> partialDeletions;
state int i;
if (BM_DEBUG) {
fmt::print("{0} granules to fully delete\n", toFullyDelete.size());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0}: {1} granules to fully delete\n", self->epoch, toFullyDelete.size());
}
for (i = toFullyDelete.size() - 1; i >= 0; --i) {
state UID granuleId;
@ -3564,22 +3622,22 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
KeyRange keyRange;
std::tie(granuleId, historyKey, keyRange) = toFullyDelete[i];
// FIXME: consider batching into a single txn (need to take care of txn size limit)
if (BM_DEBUG) {
fmt::print("About to fully delete granule {0}\n", granuleId.toString());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0}: About to fully delete granule {1}\n", self->epoch, granuleId.toString());
}
wait(fullyDeleteGranule(self, granuleId, historyKey, purgeVersion, range));
}
if (BM_DEBUG) {
fmt::print("{0} granules to partially delete\n", toPartiallyDelete.size());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0}: {1} granules to partially delete\n", self->epoch, toPartiallyDelete.size());
}
for (i = toPartiallyDelete.size() - 1; i >= 0; --i) {
UID granuleId;
KeyRange range;
std::tie(granuleId, range) = toPartiallyDelete[i];
if (BM_DEBUG) {
fmt::print("About to partially delete granule {0}\n", granuleId.toString());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0}: About to partially delete granule {1}\n", self->epoch, granuleId.toString());
}
partialDeletions.emplace_back(partiallyDeleteGranule(self, granuleId, purgeVersion, range));
}
@ -3591,8 +3649,9 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
// another purgeIntent that got written for this table while we were processing this one.
// If that is the case, we should not clear the key. Otherwise, we can just clear the key.
if (BM_DEBUG) {
fmt::print("Successfully purged range [{0} - {1}) at purgeVersion={2}\n",
if (BM_PURGE_DEBUG) {
fmt::print("BM {0}: Successfully purged range [{1} - {2}) at purgeVersion={3}\n",
self->epoch,
range.begin.printable(),
range.end.printable(),
purgeVersion);
@ -3604,6 +3663,8 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
.detail("PurgeVersion", purgeVersion)
.detail("Force", force);
CODE_PROBE(true, "range purge complete");
++self->stats.purgesProcessed;
return Void();
}
@ -3654,6 +3715,7 @@ ACTOR Future<Void> monitorPurgeKeys(Reference<BlobManagerData> self) {
// TODO: replace 10000 with a knob
state RangeResult purgeIntents = wait(tr->getRange(blobGranulePurgeKeys, BUGGIFY ? 1 : 10000));
if (purgeIntents.size()) {
CODE_PROBE(true, "BM found purges to process");
int rangeIdx = 0;
for (; rangeIdx < purgeIntents.size(); ++rangeIdx) {
Version purgeVersion;
@ -3675,8 +3737,9 @@ ACTOR Future<Void> monitorPurgeKeys(Reference<BlobManagerData> self) {
}
purgeMap.insert(range, std::make_pair(purgeVersion, force));
if (BM_DEBUG) {
fmt::print("about to purge range [{0} - {1}) @ {2}, force={3}\n",
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} about to purge range [{1} - {2}) @ {3}, force={4}\n",
self->epoch,
range.begin.printable(),
range.end.printable(),
purgeVersion,
@ -3728,9 +3791,11 @@ ACTOR Future<Void> monitorPurgeKeys(Reference<BlobManagerData> self) {
}
}
if (BM_DEBUG) {
printf("Done clearing current set of purge intents.\n");
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Done clearing current set of purge intents.\n", self->epoch);
}
CODE_PROBE(true, "BM finished processing purge intents");
}
}

View File

@ -602,7 +602,20 @@ ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData,
state std::string fileName = randomBGFilename(bwData->id, granuleID, currentDeltaVersion, ".delta");
state Value serialized = ObjectWriter::toValue(deltasToWrite, Unversioned());
state Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx;
state Optional<BlobGranuleCipherKeysMeta> cipherKeysMeta;
state Arena arena;
// TODO support encryption, figure out proper state stuff
/*if (isBlobFileEncryptionSupported()) {
BlobGranuleCipherKeysCtx ciphKeysCtx = wait(getLatestGranuleCipherKeys(bwData, keyRange, &arena));
cipherKeysCtx = ciphKeysCtx;
cipherKeysMeta = BlobGranuleCipherKeysCtx::toCipherKeysMeta(cipherKeysCtx.get());
}*/
Optional<CompressionFilter> compressFilter = getBlobFileCompressFilter();
state Value serialized = serializeChunkedDeltaFile(
deltasToWrite, keyRange, SERVER_KNOBS->BG_DELTA_FILE_TARGET_CHUNK_BYTES, compressFilter, cipherKeysCtx);
state size_t serializedSize = serialized.size();
// Free up deltasToWrite here to reduce memory
@ -640,7 +653,7 @@ ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData,
Key dfKey = blobGranuleFileKeyFor(granuleID, currentDeltaVersion, 'D');
// TODO change once we support file multiplexing
Value dfValue = blobGranuleFileValueFor(fname, 0, serializedSize, serializedSize);
Value dfValue = blobGranuleFileValueFor(fname, 0, serializedSize, serializedSize, cipherKeysMeta);
tr->set(dfKey, dfValue);
if (oldGranuleComplete.present()) {
@ -668,7 +681,7 @@ ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData,
wait(delay(deterministicRandom()->random01()));
}
// FIXME: change when we implement multiplexing
return BlobFileIndex(currentDeltaVersion, fname, 0, serializedSize, serializedSize);
return BlobFileIndex(currentDeltaVersion, fname, 0, serializedSize, serializedSize, cipherKeysMeta);
} catch (Error& e) {
wait(tr->onError(e));
}
@ -753,8 +766,8 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
}
Optional<CompressionFilter> compressFilter = getBlobFileCompressFilter();
state Value serialized =
serializeChunkedSnapshot(snapshot, SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_CHUNKS, compressFilter, cipherKeysCtx);
state Value serialized = serializeChunkedSnapshot(
snapshot, SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES, compressFilter, cipherKeysCtx);
state size_t serializedSize = serialized.size();
// free snapshot to reduce memory
@ -970,6 +983,7 @@ ACTOR Future<BlobFileIndex> compactFromBlob(Reference<BlobWorkerData> bwData,
snapshotF.cipherKeysMeta);
// TODO: optimization - batch 'encryption-key' lookup given the GranuleFile set is known
// FIXME: get cipher keys for delta as well!
if (chunk.snapshotFile.get().cipherKeysMetaRef.present()) {
ASSERT(isBlobFileEncryptionSupported());
BlobGranuleCipherKeysCtx cipherKeysCtx =
@ -3187,6 +3201,8 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
getGranuleCipherKeys(bwData, chunk.snapshotFile.get().cipherKeysMetaRef.get(), &rep.arena);
}
// FIXME: get cipher keys for delta files too!
// new deltas (if version is larger than version of last delta file)
// FIXME: do trivial key bounds here if key range is not fully contained in request key
// range

View File

@ -368,3 +368,15 @@ Future<Void> MovableCoordinatedState::setExclusive(Value v) {
Future<Void> MovableCoordinatedState::move(ClusterConnectionString const& nc) {
return MovableCoordinatedStateImpl::move(impl.get(), nc);
}
// Rewrites the cluster key embedded in a MovableValue's connection string.
// Deserializes |movableVal|; if its optional "other" connection string starts
// with |oldClusterKey|, swaps that prefix for |newClusterKey| and returns the
// re-serialized value. Returns an empty Optional when no rewrite was needed.
Optional<Value> updateCCSInMovableValue(ValueRef movableVal, KeyRef oldClusterKey, KeyRef newClusterKey) {
	MovableValue parsed = BinaryReader::fromStringRef<MovableValue>(
	    movableVal, IncludeVersion(ProtocolVersion::withMovableCoordinatedStateV2()));
	// Nothing to do unless the embedded connection string carries the old key.
	if (!parsed.other.present() || !parsed.other.get().startsWith(oldClusterKey)) {
		return Optional<Value>();
	}
	TraceEvent(SevDebug, "UpdateCCSInMovableValue").detail("OldConnectionString", parsed.other.get());
	parsed.other = parsed.other.get().removePrefix(oldClusterKey).withPrefix(newClusterKey);
	return BinaryWriter::toValue(parsed, IncludeVersion(ProtocolVersion::withMovableCoordinatedStateV2()));
}

View File

@ -776,3 +776,78 @@ ACTOR Future<Void> coordinationServer(std::string dataFolder,
throw;
}
}
// Rewrites every occurrence of |oldClusterKey| (the "description:id" cluster
// key) in one coordinator's on-disk kv-store so that it reads |newClusterKey|.
// Opens the "coordination-" store under |datafolder|, scans all keys, patches
// the affected entries in place, and commits once at the end.
ACTOR Future<Void> changeClusterDescription(std::string datafolder, KeyRef newClusterKey, KeyRef oldClusterKey) {
	state UID myID = deterministicRandom()->randomUniqueID();
	state OnDemandStore store(datafolder, myID, "coordination-");
	RangeResult res = wait(store->readRange(allKeys));
	// Context, in coordinators' kv-store
	// cluster description and the random id are always appear together as the clusterKey
	// The old cluster key, (call it oldCKey) below can appear in the following scenarios:
	// 1. oldCKey is a key in the store: the value is a binary format of _GenerationRegVal_ which contains a different
	// clusterKey(either movedFrom or moveTo)
	// 2. oldCKey appears in a key for forwarding message:
	// 2.1: the prefix is _fwdKeys.begin_: the value is the new connection string
	// 2.2: the prefix is _fwdTimeKeys.begin_: the value is the time
	// 3. oldCKey does not appear in any keys but in a value:
	// 3.1: it's in the value of a forwarding message(see 2.1)
	// 3.2: it's inside the value of _GenerationRegVal_ (see 1), which is a cluster connection string.
	// it seems that even we do not change it the cluster should still be good, but to be safe we still update it.
	for (auto& [key, value] : res) {
		if (key.startsWith(fwdKeys.begin)) {
			// Case 2: forwarding key -- re-key it under the new cluster key, or
			// rewrite the connection-string value's prefix (case 3.1).
			if (key.removePrefix(fwdKeys.begin) == oldClusterKey) {
				store->clear(singleKeyRange(key));
				store->set(KeyValueRef(newClusterKey.withPrefix(fwdKeys.begin), value));
			} else if (value.startsWith(oldClusterKey)) {
				store->set(KeyValueRef(key, value.removePrefix(oldClusterKey).withPrefix(newClusterKey)));
			}
		} else if (key.startsWith(fwdTimeKeys.begin) && key.removePrefix(fwdTimeKeys.begin) == oldClusterKey) {
			// Case 2.2: forwarding-time key -- re-key it under the new cluster key.
			store->clear(singleKeyRange(key));
			store->set(KeyValueRef(newClusterKey.withPrefix(fwdTimeKeys.begin), value));
		} else if (key == oldClusterKey) {
			// Case 1: the cluster key itself -- re-key it, keeping its value.
			store->clear(singleKeyRange(key));
			store->set(KeyValueRef(newClusterKey, value));
		} else {
			// Case 3.2: the old cluster key may be embedded inside a
			// GenerationRegVal's serialized connection string; parse the value part
			GenerationRegVal regVal = BinaryReader::fromStringRef<GenerationRegVal>(value, IncludeVersion());
			if (regVal.val.present()) {
				Optional<Value> newVal = updateCCSInMovableValue(regVal.val.get(), oldClusterKey, newClusterKey);
				if (newVal.present()) {
					regVal.val = newVal.get();
					store->set(KeyValueRef(
					    key, BinaryWriter::toValue(regVal, IncludeVersion(ProtocolVersion::withGenerationRegVal()))));
				}
			}
		}
	}
	// Persist all rewrites atomically in a single commit.
	wait(store->commit());
	return Void();
}
// Updates the cluster key stored in every coordinator data directory found
// under |dataFolder|. Each immediate subdirectory that contains a file whose
// name starts with "coordination-" is treated as a coordinator store and is
// rewritten via changeClusterDescription(). Returns when all rewrites commit.
Future<Void> coordChangeClusterKey(std::string dataFolder, KeyRef newClusterKey, KeyRef oldClusterKey) {
	TraceEvent(SevInfo, "CoordChangeClusterKey")
	    .detail("DataFolder", dataFolder)
	    .detail("NewClusterKey", newClusterKey)
	    .detail("OldClusterKey", oldClusterKey);
	std::string absDataFolder = abspath(dataFolder);
	std::vector<std::string> returnList = platform::listDirectories(absDataFolder);
	std::vector<Future<Void>> futures;
	for (const auto& dirEntry : returnList) {
		if (dirEntry == "." || dirEntry == "..") {
			continue;
		}
		// Build the process dir from the absolute path so it matches the
		// directory we actually listed above (previously used the raw
		// dataFolder, which could be relative).
		std::string processDir = absDataFolder + "/" + dirEntry;
		TraceEvent(SevInfo, "UpdatingCoordDataForProcess").detail("ProcessDataDir", processDir);
		// A directory is a coordinator's iff it holds a "coordination-" file.
		std::vector<std::string> returnFiles = platform::listFiles(processDir, "");
		bool isCoord = false;
		for (const auto& fileEntry : returnFiles) {
			if (fileEntry.rfind("coordination-", 0) == 0) {
				isCoord = true;
				break; // one match is enough; stop scanning
			}
		}
		if (!isCoord)
			continue;
		futures.push_back(changeClusterDescription(processDir, newClusterKey, oldClusterKey));
	}
	// Complete when every coordinator store has been rewritten and committed.
	return waitForAll(futures);
}

View File

@ -41,6 +41,7 @@
#include "fdbserver/TenantCache.h"
#include "fdbserver/TLogInterface.h"
#include "fdbserver/WaitFailure.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "flow/ActorCollection.h"
#include "flow/Arena.h"
#include "flow/BooleanParam.h"
@ -290,6 +291,7 @@ ACTOR Future<Void> pollMoveKeysLock(Database cx, MoveKeysLock lock, const DDEnab
}
struct DataDistributor : NonCopyable, ReferenceCounted<DataDistributor> {
public:
Reference<AsyncVar<ServerDBInfo> const> dbInfo;
UID ddId;
PromiseStream<Future<Void>> addActor;
@ -311,7 +313,9 @@ struct DataDistributor : NonCopyable, ReferenceCounted<DataDistributor> {
// fully-functional.
DDTeamCollection* teamCollection;
Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure;
PromiseStream<RelocateShard> relocationProducer, relocationConsumer; // comsumer is a yield stream from producer
// consumer is a yield stream from producer. The RelocateShard is pushed into relocationProducer and popped from
// relocationConsumer (by DDQueue)
PromiseStream<RelocateShard> relocationProducer, relocationConsumer;
DataDistributor(Reference<AsyncVar<ServerDBInfo> const> const& db, UID id)
: dbInfo(db), ddId(id), txnProcessor(nullptr), initialDDEventHolder(makeReference<EventCacheHolder>("InitialDD")),
@ -436,11 +440,7 @@ struct DataDistributor : NonCopyable, ReferenceCounted<DataDistributor> {
return Void();
}
// Resume inflight relocations from the previous DD
// TODO: add a test to verify the inflight relocation correctness and measure the memory usage with 4 million shards
ACTOR static Future<Void> resumeRelocations(Reference<DataDistributor> self) {
ASSERT(self->shardsAffectedByTeamFailure); // has to be allocated
ACTOR static Future<Void> resumeFromShards(Reference<DataDistributor> self, bool traceShard) {
state int shard = 0;
for (; shard < self->initData->shards.size() - 1; shard++) {
const DDShardInfo& iShard = self->initData->shards[shard];
@ -452,8 +452,8 @@ struct DataDistributor : NonCopyable, ReferenceCounted<DataDistributor> {
if (self->configuration.usableRegions > 1) {
teams.push_back(ShardsAffectedByTeamFailure::Team(iShard.remoteSrc, false));
}
if (g_network->isSimulated()) {
TraceEvent("DDInitShard")
if (traceShard) {
TraceEvent(SevDebug, "DDInitShard")
.detail("Keys", keys)
.detail("PrimarySrc", describe(iShard.primarySrc))
.detail("RemoteSrc", describe(iShard.remoteSrc))
@ -472,20 +472,27 @@ struct DataDistributor : NonCopyable, ReferenceCounted<DataDistributor> {
if (!unhealthy && self->configuration.usableRegions > 1) {
unhealthy = iShard.remoteSrc.size() != self->configuration.storageTeamSize;
}
self->relocationProducer.send(RelocateShard(keys,
unhealthy ? SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY
: SERVER_KNOBS->PRIORITY_RECOVER_MOVE,
RelocateReason::OTHER));
self->relocationProducer.send(
RelocateShard(keys,
unhealthy ? DataMovementReason::TEAM_UNHEALTHY : DataMovementReason::RECOVER_MOVE,
RelocateReason::OTHER));
}
wait(yield(TaskPriority::DataDistribution));
}
return Void();
}
// TODO: unit test needed
ACTOR static Future<Void> resumeFromDataMoves(Reference<DataDistributor> self, Future<Void> readyToStart) {
state KeyRangeMap<std::shared_ptr<DataMove>>::iterator it = self->initData->dataMoveMap.ranges().begin();
wait(readyToStart);
for (; it != self->initData->dataMoveMap.ranges().end(); ++it) {
const DataMoveMetaData& meta = it.value()->meta;
if (it.value()->isCancelled() || (it.value()->valid && !CLIENT_KNOBS->SHARD_ENCODE_LOCATION_METADATA)) {
RelocateShard rs(meta.range, SERVER_KNOBS->PRIORITY_RECOVER_MOVE, RelocateReason::OTHER);
RelocateShard rs(meta.range, DataMovementReason::RECOVER_MOVE, RelocateReason::OTHER);
rs.dataMoveId = meta.id;
rs.cancelled = true;
self->relocationProducer.send(rs);
@ -494,7 +501,7 @@ struct DataDistributor : NonCopyable, ReferenceCounted<DataDistributor> {
TraceEvent(SevDebug, "DDInitFoundDataMove", self->ddId).detail("DataMove", meta.toString());
ASSERT(meta.range == it.range());
// TODO: Persist priority in DataMoveMetaData.
RelocateShard rs(meta.range, SERVER_KNOBS->PRIORITY_RECOVER_MOVE, RelocateReason::OTHER);
RelocateShard rs(meta.range, DataMovementReason::RECOVER_MOVE, RelocateReason::OTHER);
rs.dataMoveId = meta.id;
rs.dataMove = it.value();
std::vector<ShardsAffectedByTeamFailure::Team> teams;
@ -517,6 +524,16 @@ struct DataDistributor : NonCopyable, ReferenceCounted<DataDistributor> {
}
return Void();
}
// Resume inflight relocations from the previous DD
// TODO: The initialDataDistribution is unused once resumeRelocations and
// DataDistributionTracker::trackInitialShards are done. In the future, we can release the object to save memory
// usage if it turns out to be a problem.
Future<Void> resumeRelocations() {
ASSERT(shardsAffectedByTeamFailure); // has to be allocated
Future<Void> shardsReady = resumeFromShards(Reference<DataDistributor>::addRef(this), g_network->isSimulated());
return resumeFromDataMoves(Reference<DataDistributor>::addRef(this), shardsReady);
}
};
// Runs the data distribution algorithm for FDB, including the DD Queue, DD tracker, and DD team collection
@ -564,18 +581,19 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
state Reference<AsyncVar<bool>> processingUnhealthy(new AsyncVar<bool>(false));
state Reference<AsyncVar<bool>> processingWiggle(new AsyncVar<bool>(false));
state Promise<Void> readyToStart;
self->shardsAffectedByTeamFailure = makeReference<ShardsAffectedByTeamFailure>();
wait(DataDistributor::resumeRelocations(self));
wait(self->resumeRelocations());
std::vector<TeamCollectionInterface> tcis;
std::vector<TeamCollectionInterface> tcis; // primary and remote region interface
Reference<AsyncVar<bool>> anyZeroHealthyTeams; // true if primary or remote has zero healthy team
std::vector<Reference<AsyncVar<bool>>> zeroHealthyTeams; // primary and remote
Reference<AsyncVar<bool>> anyZeroHealthyTeams;
std::vector<Reference<AsyncVar<bool>>> zeroHealthyTeams;
tcis.push_back(TeamCollectionInterface());
zeroHealthyTeams.push_back(makeReference<AsyncVar<bool>>(true));
int storageTeamSize = self->configuration.storageTeamSize;
std::vector<Future<Void>> actors;
std::vector<Future<Void>> actors; // the container of ACTORs
if (self->configuration.usableRegions > 1) {
tcis.push_back(TeamCollectionInterface());
storageTeamSize = 2 * self->configuration.storageTeamSize;
@ -1379,6 +1397,16 @@ static Future<ErrorOr<Void>> badTestFuture(double duration, Error e) {
return tag(delay(duration), ErrorOr<Void>(e));
}
// Test helper: builds a DDShardInfo whose begin key encodes |d| and whose
// primary source server id is derived from |d| (no real location metadata).
// When |hasDest| is set, a destination server id of d+1 is attached as well.
inline DDShardInfo doubleToNoLocationShardInfo(double d, bool hasDest) {
	const uint64_t serverBits = (uint64_t)d;
	DDShardInfo info(doubleToTestKey(d), anonymousShardId, anonymousShardId);
	info.primarySrc.emplace_back(serverBits, 0);
	if (!hasDest) {
		return info;
	}
	info.hasDest = true;
	info.primaryDest.emplace_back(serverBits + 1, 0);
	return info;
}
} // namespace data_distribution_test
TEST_CASE("/DataDistribution/WaitForMost") {
@ -1440,3 +1468,44 @@ TEST_CASE("/DataDistributor/StorageWiggler/Order") {
ASSERT(!wiggler.getNextServerId().present());
return Void();
}
// Unit test: DataDistributor::resumeFromShards must emit one RECOVER_MOVE
// relocation per shard that carries a destination team, in shard order, and
// register every shard with shardsAffectedByTeamFailure.
TEST_CASE("/DataDistributor/Initialization/ResumeFromShard") {
	state Reference<AsyncVar<ServerDBInfo> const> dbInfo;
	state Reference<DataDistributor> self(new DataDistributor(dbInfo, UID()));
	self->shardsAffectedByTeamFailure = makeReference<ShardsAffectedByTeamFailure>();
	self->initData = makeReference<InitialDataDistribution>();
	self->configuration.usableRegions = 1;
	self->configuration.storageTeamSize = 1;
	// add DDShardInfo
	self->shardsAffectedByTeamFailure->setCheckMode(
	    ShardsAffectedByTeamFailure::CheckMode::ForceNoCheck); // skip check when build
	int shardNum = deterministicRandom()->randomInt(1000, CLIENT_KNOBS->TOO_MANY * 5); // 2000000000; OOM
	std::cout << "generating " << shardNum << " shards...\n";
	// The first DD_MOVE_KEYS_PARALLELISM shards carry a destination team, so
	// resumeFromShards is expected to produce a relocation for each of them.
	for (int i = 1; i <= SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM; ++i) {
		self->initData->shards.emplace_back(data_distribution_test::doubleToNoLocationShardInfo(i, true));
	}
	// The remaining shards have no destination and should produce no relocation.
	for (int i = SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM + 1; i <= shardNum; ++i) {
		self->initData->shards.emplace_back(data_distribution_test::doubleToNoLocationShardInfo(i, false));
	}
	self->initData->shards.emplace_back(DDShardInfo(allKeys.end)); // sentinel end-of-keyspace entry
	std::cout << "Start resuming...\n";
	wait(DataDistributor::resumeFromShards(self, false));
	std::cout << "Start validation...\n";
	auto relocateFuture = self->relocationProducer.getFuture();
	for (int i = 0; i < SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM; ++i) {
		ASSERT(relocateFuture.isReady());
		auto rs = relocateFuture.pop();
		ASSERT(rs.isRestore() == false);
		ASSERT(rs.cancelled == false);
		ASSERT(rs.dataMoveId == anonymousShardId);
		ASSERT(rs.priority == SERVER_KNOBS->PRIORITY_RECOVER_MOVE);
		// std::cout << rs.keys.begin.toString() << " " << self->initData->shards[i].key.toString() << " \n";
		ASSERT(rs.keys.begin.compare(self->initData->shards[i].key) == 0);
		ASSERT(rs.keys.end == self->initData->shards[i + 1].key);
	}
	// Re-enable the expensive consistency check now that building is done,
	// and validate the team/shard mapping built during resume.
	self->shardsAffectedByTeamFailure->setCheckMode(ShardsAffectedByTeamFailure::CheckMode::ForceCheck);
	self->shardsAffectedByTeamFailure->check();
	return Void();
}

View File

@ -41,12 +41,6 @@
typedef Reference<IDataDistributionTeam> ITeamRef;
typedef std::pair<ITeamRef, ITeamRef> SrcDestTeamPair;
// FIXME: Always use DataMovementReason to invoke these functions.
inline bool isDiskRebalancePriority(int priority) {
return priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM ||
priority == SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM;
}
inline bool isDataMovementForDiskBalancing(DataMovementReason reason) {
return reason == DataMovementReason::REBALANCE_UNDERUTILIZED_TEAM ||
reason == DataMovementReason::REBALANCE_OVERUTILIZED_TEAM;
@ -57,16 +51,12 @@ inline bool isDataMovementForReadBalancing(DataMovementReason reason) {
reason == DataMovementReason::REBALANCE_READ_UNDERUTIL_TEAM;
}
inline bool isMountainChopperPriority(int priority) {
return priority == SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM ||
priority == SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM;
}
// True iff |reason| is one of the two "mountain chopper" rebalance moves
// (moving load off an over-utilized team, by disk or by read load).
inline bool isDataMovementForMountainChopper(DataMovementReason reason) {
	switch (reason) {
	case DataMovementReason::REBALANCE_OVERUTILIZED_TEAM:
	case DataMovementReason::REBALANCE_READ_OVERUTIL_TEAM:
		return true;
	default:
		return false;
	}
}
// FIXME: Always use DataMovementReason to invoke these functions.
inline bool isValleyFillerPriority(int priority) {
return priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM ||
priority == SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM;
@ -80,6 +70,9 @@ inline bool isDataMovementForValleyFiller(DataMovementReason reason) {
int dataMovementPriority(DataMovementReason reason) {
int priority;
switch (reason) {
case DataMovementReason::INVALID:
priority = -1;
break;
case DataMovementReason::RECOVER_MOVE:
priority = SERVER_KNOBS->PRIORITY_RECOVER_MOVE;
break;
@ -162,9 +155,9 @@ struct RelocateData {
: keys(rs.keys), priority(rs.priority), boundaryPriority(isBoundaryPriority(rs.priority) ? rs.priority : -1),
healthPriority(isHealthPriority(rs.priority) ? rs.priority : -1), reason(rs.reason), startTime(now()),
randomId(deterministicRandom()->randomUniqueID()), dataMoveId(rs.dataMoveId), workFactor(0),
wantsNewServers(isMountainChopperPriority(rs.priority) || isValleyFillerPriority(rs.priority) ||
rs.priority == SERVER_KNOBS->PRIORITY_SPLIT_SHARD ||
rs.priority == SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT),
wantsNewServers(
isDataMovementForMountainChopper(rs.moveReason) || isDataMovementForValleyFiller(rs.moveReason) ||
rs.moveReason == DataMovementReason::SPLIT_SHARD || rs.moveReason == DataMovementReason::TEAM_REDUNDANT),
cancellable(true), interval("QueuedRelocation"), dataMove(rs.dataMove) {
if (dataMove != nullptr) {
this->src.insert(this->src.end(), dataMove->meta.src.begin(), dataMove->meta.src.end());
@ -813,7 +806,6 @@ struct DDQueueData {
}
ACTOR static Future<Void> getSourceServersForRange(DDQueueData* self,
Database cx,
RelocateData input,
PromiseStream<RelocateData> output,
Reference<FlowLock> fetchLock) {
@ -929,7 +921,7 @@ struct DDQueueData {
fetchingSourcesQueue.insert(rrs);
getSourceActors.insert(
rrs.keys, getSourceServersForRange(this, cx, rrs, fetchSourceServersComplete, fetchSourceLock));
rrs.keys, getSourceServersForRange(this, rrs, fetchSourceServersComplete, fetchSourceLock));
} else {
RelocateData newData(rrs);
newData.keys = affectedQueuedItems[r];
@ -1739,7 +1731,7 @@ inline double getWorstCpu(const HealthMetrics& metrics, const std::vector<UID>&
// Move the shard with the top K highest read density of sourceTeam's to destTeam if sourceTeam has much more read load
// than destTeam
ACTOR Future<bool> rebalanceReadLoad(DDQueueData* self,
int priority,
DataMovementReason moveReason,
Reference<IDataDistributionTeam> sourceTeam,
Reference<IDataDistributionTeam> destTeam,
bool primary,
@ -1807,7 +1799,7 @@ ACTOR Future<bool> rebalanceReadLoad(DDQueueData* self,
ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary));
for (int i = 0; i < shards.size(); i++) {
if (shard == shards[i]) {
self->output.send(RelocateShard(shard, priority, RelocateReason::REBALANCE_READ));
self->output.send(RelocateShard(shard, moveReason, RelocateReason::REBALANCE_READ));
self->updateLastAsSource(sourceTeam->getServerIDs());
return true;
}
@ -1818,7 +1810,7 @@ ACTOR Future<bool> rebalanceReadLoad(DDQueueData* self,
// Move a random shard from sourceTeam if sourceTeam has much more data than provided destTeam
ACTOR static Future<bool> rebalanceTeams(DDQueueData* self,
int priority,
DataMovementReason moveReason,
Reference<IDataDistributionTeam const> sourceTeam,
Reference<IDataDistributionTeam const> destTeam,
bool primary,
@ -1879,7 +1871,7 @@ ACTOR static Future<bool> rebalanceTeams(DDQueueData* self,
ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary));
for (int i = 0; i < shards.size(); i++) {
if (moveShard == shards[i]) {
self->output.send(RelocateShard(moveShard, priority, RelocateReason::REBALANCE_DISK));
self->output.send(RelocateShard(moveShard, moveReason, RelocateReason::REBALANCE_DISK));
return true;
}
}
@ -2008,9 +2000,9 @@ ACTOR Future<Void> BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex,
// clang-format off
if (sourceTeam.isValid() && destTeam.isValid()) {
if (readRebalance) {
wait(store(moved,rebalanceReadLoad(self, ddPriority, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent)));
wait(store(moved,rebalanceReadLoad(self, reason, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent)));
} else {
wait(store(moved,rebalanceTeams(self, ddPriority, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent)));
wait(store(moved,rebalanceTeams(self, reason, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent)));
}
}
// clang-format on
@ -2106,7 +2098,7 @@ ACTOR Future<Void> BgDDMountainChopper(DDQueueData* self, int teamCollectionInde
if (loadedTeam.first.present()) {
bool _moved = wait(rebalanceTeams(self,
SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM,
DataMovementReason::REBALANCE_OVERUTILIZED_TEAM,
loadedTeam.first.get(),
randomTeam.first.get(),
teamCollectionIndex == 0,
@ -2205,7 +2197,7 @@ ACTOR Future<Void> BgDDValleyFiller(DDQueueData* self, int teamCollectionIndex)
if (unloadedTeam.first.present()) {
bool _moved = wait(rebalanceTeams(self,
SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM,
DataMovementReason::REBALANCE_UNDERUTILIZED_TEAM,
randomTeam.first.get(),
unloadedTeam.first.get(),
teamCollectionIndex == 0,
@ -2267,8 +2259,8 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
for (int i = 0; i < teamCollections.size(); i++) {
// FIXME: Use BgDDLoadBalance for disk rebalance too after DD simulation test proof.
// balancingFutures.push_back(BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM));
// balancingFutures.push_back(BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM));
// balancingFutures.push_back(BgDDLoadRebalance(&self, i, DataMovementReason::REBALANCE_OVERUTILIZED_TEAM));
// balancingFutures.push_back(BgDDLoadRebalance(&self, i, DataMovementReason::REBALANCE_UNDERUTILIZED_TEAM));
if (SERVER_KNOBS->READ_SAMPLING_ENABLED) {
balancingFutures.push_back(BgDDLoadRebalance(&self, i, DataMovementReason::REBALANCE_READ_OVERUTIL_TEAM));
balancingFutures.push_back(BgDDLoadRebalance(&self, i, DataMovementReason::REBALANCE_READ_UNDERUTIL_TEAM));

View File

@ -524,12 +524,12 @@ ACTOR Future<Void> shardSplitter(DataDistributionTracker* self,
for (int i = 0; i < skipRange; i++) {
KeyRangeRef r(splitKeys[i], splitKeys[i + 1]);
self->shardsAffectedByTeamFailure->defineShard(r);
self->output.send(RelocateShard(r, SERVER_KNOBS->PRIORITY_SPLIT_SHARD, RelocateReason::OTHER));
self->output.send(RelocateShard(r, DataMovementReason::SPLIT_SHARD, RelocateReason::OTHER));
}
for (int i = numShards - 1; i > skipRange; i--) {
KeyRangeRef r(splitKeys[i], splitKeys[i + 1]);
self->shardsAffectedByTeamFailure->defineShard(r);
self->output.send(RelocateShard(r, SERVER_KNOBS->PRIORITY_SPLIT_SHARD, RelocateReason::OTHER));
self->output.send(RelocateShard(r, DataMovementReason::SPLIT_SHARD, RelocateReason::OTHER));
}
self->sizeChanges.add(changeSizes(self, keys, shardSize->get().get().metrics.bytes));
@ -675,7 +675,7 @@ Future<Void> shardMerger(DataDistributionTracker* self,
}
restartShardTrackers(self, mergeRange, ShardMetrics(endingStats, lastLowBandwidthStartTime, shardCount));
self->shardsAffectedByTeamFailure->defineShard(mergeRange);
self->output.send(RelocateShard(mergeRange, SERVER_KNOBS->PRIORITY_MERGE_SHARD, RelocateReason::OTHER));
self->output.send(RelocateShard(mergeRange, DataMovementReason::MERGE_SHARD, RelocateReason::OTHER));
// We are about to be cancelled by the call to restartShardTrackers
return Void();
@ -1189,8 +1189,14 @@ void ShardsAffectedByTeamFailure::finishMove(KeyRangeRef keys) {
}
}
// Overrides whether check() runs its consistency validation
// (ForceCheck / ForceNoCheck / default knob-driven behavior).
void ShardsAffectedByTeamFailure::setCheckMode(CheckMode mode) {
	this->checkMode = mode;
}
void ShardsAffectedByTeamFailure::check() const {
if (EXPENSIVE_VALIDATION) {
if (checkMode == CheckMode::ForceNoCheck)
return;
if (EXPENSIVE_VALIDATION || checkMode == CheckMode::ForceCheck) {
for (auto t = team_shards.begin(); t != team_shards.end(); ++t) {
auto i = shard_teams.rangeContaining(t->second.begin);
if (i->range() != t->second || !std::count(i->value().first.begin(), i->value().first.end(), t->first)) {

View File

@ -79,6 +79,10 @@ static_assert((ROCKSDB_MAJOR == 6 && ROCKSDB_MINOR == 27) ? ROCKSDB_PATCH >= 3 :
namespace {
using rocksdb::BackgroundErrorReason;
struct SharedRocksDBState {
bool closing = false;
};
// Returns string representation of RocksDB background error reason.
// Error reason code:
// https://github.com/facebook/rocksdb/blob/12d798ac06bcce36be703b057d5f5f4dab3b270c/include/rocksdb/listener.h#L125
@ -737,6 +741,7 @@ ACTOR Future<Void> flowLockLogger(UID id, const FlowLock* readLock, const FlowLo
}
ACTOR Future<Void> rocksDBMetricLogger(UID id,
std::shared_ptr<SharedRocksDBState> sharedState,
std::shared_ptr<rocksdb::Statistics> statistics,
std::shared_ptr<PerfContextMetrics> perfContextMetrics,
rocksdb::DB* db,
@ -780,6 +785,7 @@ ACTOR Future<Void> rocksDBMetricLogger(UID id,
{ "CountIterSkippedKeys", rocksdb::NUMBER_ITER_SKIP, 0 },
};
state std::vector<std::pair<const char*, std::string>> intPropertyStats = {
{ "NumImmutableMemtables", rocksdb::DB::Properties::kNumImmutableMemTable },
{ "NumImmutableMemtablesFlushed", rocksdb::DB::Properties::kNumImmutableMemTableFlushed },
@ -823,6 +829,9 @@ ACTOR Future<Void> rocksDBMetricLogger(UID id,
loop {
wait(delay(SERVER_KNOBS->ROCKSDB_METRICS_DELAY));
if (sharedState->closing) {
break;
}
TraceEvent e("RocksDBMetrics", id);
uint64_t stat;
for (auto& [name, ticker, cum] : tickerStats) {
@ -873,6 +882,8 @@ ACTOR Future<Void> rocksDBMetricLogger(UID id,
perfContextMetrics->log(true);
}
}
return Void();
}
void logRocksDBError(UID id,
@ -921,6 +932,8 @@ struct RocksDBKeyValueStore : IKeyValueStore {
DB& db;
CF& cf;
std::unordered_set<rocksdb::ColumnFamilyHandle*> cfHandles;
UID id;
std::shared_ptr<rocksdb::RateLimiter> rateLimiter;
std::shared_ptr<ReadIteratorPool> readIterPool;
@ -954,15 +967,10 @@ struct RocksDBKeyValueStore : IKeyValueStore {
}
}
~Writer() override {
if (db) {
delete db;
}
}
// No-op: the database appears to be opened via an OpenAction posted to the
// writer thread (see OpenAction below) rather than in init() — confirm.
void init() override {}
struct OpenAction : TypedAction<Writer, OpenAction> {
std::shared_ptr<SharedRocksDBState> sharedState;
std::string path;
ThreadReturnPromise<Void> done;
Optional<Future<Void>>& metrics;
@ -970,14 +978,15 @@ struct RocksDBKeyValueStore : IKeyValueStore {
const FlowLock* fetchLock;
std::shared_ptr<RocksDBErrorListener> errorListener;
Counters& counters;
OpenAction(std::string path,
OpenAction(std::shared_ptr<SharedRocksDBState> sharedState,
std::string path,
Optional<Future<Void>>& metrics,
const FlowLock* readLock,
const FlowLock* fetchLock,
std::shared_ptr<RocksDBErrorListener> errorListener,
Counters& counters)
: path(std::move(path)), metrics(metrics), readLock(readLock), fetchLock(fetchLock),
errorListener(errorListener), counters(counters) {}
: sharedState(sharedState), path(std::move(path)), metrics(metrics), readLock(readLock),
fetchLock(fetchLock), errorListener(errorListener), counters(counters) {}
double getTimeEstimate() const override { return SERVER_KNOBS->COMMIT_TIME_ESTIMATE; }
};
@ -1004,6 +1013,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
std::vector<rocksdb::ColumnFamilyHandle*> handles;
status = rocksdb::DB::Open(options, a.path, descriptors, &handles, &db);
cfHandles.insert(handles.begin(), handles.end());
if (!status.ok()) {
logRocksDBError(id, status, "Open");
@ -1020,6 +1030,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
if (cf == nullptr) {
status = db->CreateColumnFamily(cfOptions, SERVER_KNOBS->DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY, &cf);
cfHandles.insert(cf);
if (!status.ok()) {
logRocksDBError(id, status, "Open");
a.done.sendError(statusToError(status));
@ -1037,13 +1048,20 @@ struct RocksDBKeyValueStore : IKeyValueStore {
// The current thread and main thread are same when the code runs in simulation.
// blockUntilReady() is getting the thread into deadlock state, so directly calling
// the metricsLogger.
a.metrics = rocksDBMetricLogger(
id, options.statistics, perfContextMetrics, db, readIterPool, &a.counters, cf) &&
flowLockLogger(id, a.readLock, a.fetchLock) && refreshReadIteratorPool(readIterPool);
a.metrics =
rocksDBMetricLogger(
id, a.sharedState, options.statistics, perfContextMetrics, db, readIterPool, &a.counters, cf) &&
flowLockLogger(id, a.readLock, a.fetchLock) && refreshReadIteratorPool(readIterPool);
} else {
onMainThread([&] {
a.metrics = rocksDBMetricLogger(
id, options.statistics, perfContextMetrics, db, readIterPool, &a.counters, cf) &&
a.metrics = rocksDBMetricLogger(id,
a.sharedState,
options.statistics,
perfContextMetrics,
db,
readIterPool,
&a.counters,
cf) &&
flowLockLogger(id, a.readLock, a.fetchLock) && refreshReadIteratorPool(readIterPool);
return Future<bool>(true);
}).blockUntilReady();
@ -1182,6 +1200,12 @@ struct RocksDBKeyValueStore : IKeyValueStore {
a.done.send(Void());
return;
}
for (rocksdb::ColumnFamilyHandle* handle : cfHandles) {
if (handle != nullptr) {
db->DestroyColumnFamilyHandle(handle);
}
}
cfHandles.clear();
auto s = db->Close();
if (!s.ok()) {
logRocksDBError(id, s, "Close");
@ -1547,35 +1571,9 @@ struct RocksDBKeyValueStore : IKeyValueStore {
}
};
DB db = nullptr;
std::shared_ptr<PerfContextMetrics> perfContextMetrics;
std::string path;
rocksdb::ColumnFamilyHandle* defaultFdbCF = nullptr;
UID id;
Reference<IThreadPool> writeThread;
Reference<IThreadPool> readThreads;
std::shared_ptr<RocksDBErrorListener> errorListener;
Future<Void> errorFuture;
Promise<Void> closePromise;
Future<Void> openFuture;
std::unique_ptr<rocksdb::WriteBatch> writeBatch;
Optional<Future<Void>> metrics;
FlowLock readSemaphore;
int numReadWaiters;
FlowLock fetchSemaphore;
int numFetchWaiters;
std::shared_ptr<ReadIteratorPool> readIterPool;
std::vector<std::unique_ptr<ThreadReturnPromiseStream<std::pair<std::string, double>>>> metricPromiseStreams;
// ThreadReturnPromiseStream pair.first stores the histogram name and
// pair.second stores the corresponding measured latency (seconds)
Future<Void> actorErrorListener;
Future<Void> collection;
PromiseStream<Future<Void>> addActor;
Counters counters;
explicit RocksDBKeyValueStore(const std::string& path, UID id)
: path(path), id(id), perfContextMetrics(new PerfContextMetrics()),
readIterPool(new ReadIteratorPool(id, db, defaultFdbCF)),
: sharedState(std::make_shared<SharedRocksDBState>()), path(path), id(id),
perfContextMetrics(new PerfContextMetrics()), readIterPool(new ReadIteratorPool(id, db, defaultFdbCF)),
readSemaphore(SERVER_KNOBS->ROCKSDB_READ_QUEUE_SOFT_MAX),
fetchSemaphore(SERVER_KNOBS->ROCKSDB_FETCH_QUEUE_SOFT_MAX),
numReadWaiters(SERVER_KNOBS->ROCKSDB_READ_QUEUE_HARD_MAX - SERVER_KNOBS->ROCKSDB_READ_QUEUE_SOFT_MAX),
@ -1730,6 +1728,8 @@ struct RocksDBKeyValueStore : IKeyValueStore {
Future<Void> getError() const override { return errorFuture; }
ACTOR static void doClose(RocksDBKeyValueStore* self, bool deleteOnClose) {
self->sharedState->closing = true;
// The metrics future retains a reference to the DB, so stop it before we delete it.
self->metrics.reset();
@ -1740,8 +1740,12 @@ struct RocksDBKeyValueStore : IKeyValueStore {
self->writeThread->post(a);
wait(f);
wait(self->writeThread->stop());
if (self->closePromise.canBeSet())
if (self->closePromise.canBeSet()) {
self->closePromise.send(Void());
}
if (self->db != nullptr) {
delete self->db;
}
delete self;
}
@ -1765,7 +1769,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
return openFuture;
}
auto a = std::make_unique<Writer::OpenAction>(
path, metrics, &readSemaphore, &fetchSemaphore, errorListener, counters);
this->sharedState, path, metrics, &readSemaphore, &fetchSemaphore, errorListener, counters);
openFuture = a->done.getFuture();
writeThread->post(a.release());
return openFuture;
@ -1978,6 +1982,33 @@ struct RocksDBKeyValueStore : IKeyValueStore {
}
return Void();
}
DB db = nullptr;
std::shared_ptr<SharedRocksDBState> sharedState;
std::shared_ptr<PerfContextMetrics> perfContextMetrics;
std::string path;
rocksdb::ColumnFamilyHandle* defaultFdbCF = nullptr;
UID id;
Reference<IThreadPool> writeThread;
Reference<IThreadPool> readThreads;
std::shared_ptr<RocksDBErrorListener> errorListener;
Future<Void> errorFuture;
Promise<Void> closePromise;
Future<Void> openFuture;
std::unique_ptr<rocksdb::WriteBatch> writeBatch;
Optional<Future<Void>> metrics;
FlowLock readSemaphore;
int numReadWaiters;
FlowLock fetchSemaphore;
int numFetchWaiters;
std::shared_ptr<ReadIteratorPool> readIterPool;
std::vector<std::unique_ptr<ThreadReturnPromiseStream<std::pair<std::string, double>>>> metricPromiseStreams;
// ThreadReturnPromiseStream pair.first stores the histogram name and
// pair.second stores the corresponding measured latency (seconds)
Future<Void> actorErrorListener;
Future<Void> collection;
PromiseStream<Future<Void>> addActor;
Counters counters;
};
void RocksDBKeyValueStore::Writer::action(CheckpointAction& a) {
@ -1987,7 +2018,7 @@ void RocksDBKeyValueStore::Writer::action(CheckpointAction& a) {
.detail("Format", static_cast<int>(a.request.format))
.detail("CheckpointDir", a.request.checkpointDir);
rocksdb::Checkpoint* checkpoint;
rocksdb::Checkpoint* checkpoint = nullptr;
rocksdb::Status s = rocksdb::Checkpoint::Create(db, &checkpoint);
if (!s.ok()) {
logRocksDBError(id, s, "Checkpoint");
@ -2051,9 +2082,15 @@ void RocksDBKeyValueStore::Writer::action(CheckpointAction& a) {
.detail("RocksSequenceNumber", debugCheckpointSeq)
.detail("CheckpointDir", checkpointDir);
} else {
if (checkpoint != nullptr) {
delete checkpoint;
}
throw not_implemented();
}
if (checkpoint != nullptr) {
delete checkpoint;
}
res.setState(CheckpointMetaData::Complete);
a.reply.send(res);
}
@ -2081,6 +2118,8 @@ void RocksDBKeyValueStore::Writer::action(RestoreAction& a) {
if (cf != nullptr) {
ASSERT(db->DropColumnFamily(cf).ok());
db->DestroyColumnFamilyHandle(cf);
cfHandles.erase(cf);
}
rocksdb::ExportImportFilesMetaData metaData = getMetaData(a.checkpoints[0]);
@ -2088,6 +2127,7 @@ void RocksDBKeyValueStore::Writer::action(RestoreAction& a) {
importOptions.move_files = true;
status = db->CreateColumnFamilyWithImport(
getCFOptions(), SERVER_KNOBS->DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY, importOptions, metaData, &cf);
cfHandles.insert(cf);
if (!status.ok()) {
logRocksDBError(id, status, "Restore");
@ -2101,6 +2141,7 @@ void RocksDBKeyValueStore::Writer::action(RestoreAction& a) {
} else if (format == RocksDB) {
if (cf == nullptr) {
status = db->CreateColumnFamily(getCFOptions(), SERVER_KNOBS->DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY, &cf);
cfHandles.insert(cf);
TraceEvent("RocksDBServeRestoreRange", id)
.detail("Path", a.path)
.detail("Checkpoint", describe(a.checkpoints));
@ -2217,7 +2258,7 @@ TEST_CASE("noSim/fdbserver/KeyValueStoreRocksDB/RocksDBBasic") {
}
Future<Void> closed = kvStore->onClosed();
kvStore->close();
kvStore->dispose();
wait(closed);
platform::eraseDirectoryRecursive(rocksDBTestDir);
@ -2250,7 +2291,7 @@ TEST_CASE("noSim/fdbserver/KeyValueStoreRocksDB/RocksDBReopen") {
ASSERT(Optional<Value>(LiteralStringRef("bar")) == val);
Future<Void> closed = kvStore->onClosed();
kvStore->close();
kvStore->dispose();
wait(closed);
platform::eraseDirectoryRecursive(rocksDBTestDir);
@ -2295,8 +2336,8 @@ TEST_CASE("noSim/fdbserver/KeyValueStoreRocksDB/CheckpointRestoreColumnFamily")
std::vector<Future<Void>> closes;
closes.push_back(kvStore->onClosed());
closes.push_back(kvStoreCopy->onClosed());
kvStore->close();
kvStoreCopy->close();
kvStore->dispose();
kvStoreCopy->dispose();
wait(waitForAll(closes));
platform::eraseDirectoryRecursive(rocksDBTestDir);
@ -2346,7 +2387,7 @@ TEST_CASE("noSim/fdbserver/KeyValueStoreRocksDB/CheckpointRestoreKeyValues") {
std::vector<Future<Void>> closes;
closes.push_back(cpReader->close());
closes.push_back(kvStore->onClosed());
kvStore->close();
kvStore->dispose();
wait(waitForAll(closes));
platform::eraseDirectoryRecursive(rocksDBTestDir);

View File

@ -110,9 +110,9 @@ class RocksDBErrorListener : public rocksdb::EventListener {
public:
RocksDBErrorListener(){};
void OnBackgroundError(rocksdb::BackgroundErrorReason reason, rocksdb::Status* bg_error) override {
TraceEvent(SevError, "RocksDBBGError")
TraceEvent(SevError, "ShardedRocksDBBGError")
.detail("Reason", getErrorReason(reason))
.detail("RocksDBSeverity", bg_error->severity())
.detail("ShardedRocksDBSeverity", bg_error->severity())
.detail("Status", bg_error->ToString());
std::unique_lock<std::mutex> lock(mutex);
if (!errorPromise.isValid())
@ -186,8 +186,8 @@ std::vector<std::pair<KeyRange, std::string>> decodeShardMapping(const RangeResu
void logRocksDBError(const rocksdb::Status& status, const std::string& method) {
auto level = status.IsTimedOut() ? SevWarn : SevError;
TraceEvent e(level, "RocksDBError");
e.detail("Error", status.ToString()).detail("Method", method).detail("RocksDBSeverity", status.severity());
TraceEvent e(level, "ShardedRocksDBError");
e.detail("Error", status.ToString()).detail("Method", method).detail("ShardedRocksDBSeverity", status.severity());
if (status.IsIOError()) {
e.detail("SubCode", status.subcode());
}
@ -219,7 +219,7 @@ const char* ShardOpToString(ShardOp op) {
}
}
void logShardEvent(StringRef name, ShardOp op, Severity severity = SevInfo, const std::string& message = "") {
TraceEvent e(severity, "KVSShardEvent");
TraceEvent e(severity, "ShardedRocksKVSShardEvent");
e.detail("Name", name).detail("Action", ShardOpToString(op));
if (!message.empty()) {
e.detail("Message", message);
@ -230,7 +230,7 @@ void logShardEvent(StringRef name,
ShardOp op,
Severity severity = SevInfo,
const std::string& message = "") {
TraceEvent e(severity, "KVSShardEvent");
TraceEvent e(severity, "ShardedRocksKVSShardEvent");
e.detail("Name", name).detail("Action", ShardOpToString(op)).detail("Begin", range.begin).detail("End", range.end);
if (message != "") {
e.detail("Message", message);
@ -343,7 +343,7 @@ public:
ASSERT(cf);
readRangeOptions.background_purge_on_iterator_cleanup = true;
readRangeOptions.auto_prefix_mode = (SERVER_KNOBS->ROCKSDB_PREFIX_LEN > 0);
TraceEvent(SevDebug, "ReadIteratorPool")
TraceEvent(SevVerbose, "ShardedRocksReadIteratorPool")
.detail("Path", path)
.detail("KnobRocksDBReadRangeReuseIterators", SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS)
.detail("KnobRocksDBPrefixLen", SERVER_KNOBS->ROCKSDB_PREFIX_LEN);
@ -425,7 +425,7 @@ private:
ACTOR Future<Void> flowLockLogger(const FlowLock* readLock, const FlowLock* fetchLock) {
loop {
wait(delay(SERVER_KNOBS->ROCKSDB_METRICS_DELAY));
TraceEvent e("RocksDBFlowLock");
TraceEvent e("ShardedRocksDBFlowLock");
e.detail("ReadAvailable", readLock->available());
e.detail("ReadActivePermits", readLock->activePermits());
e.detail("ReadWaiters", readLock->waiters());
@ -588,13 +588,13 @@ public:
if (rState->closing) {
break;
}
TraceEvent(SevInfo, "KVSPhysialShardMetrics")
TraceEvent(SevInfo, "ShardedRocksKVSPhysialShardMetrics")
.detail("NumActiveShards", shardManager->numActiveShards())
.detail("TotalPhysicalShards", shardManager->numPhysicalShards());
}
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled) {
TraceEvent(SevError, "ShardMetricsLoggerError").errorUnsuppressed(e);
TraceEvent(SevError, "ShardedRocksShardMetricsLoggerError").errorUnsuppressed(e);
}
}
return Void();
@ -602,7 +602,7 @@ public:
rocksdb::Status init() {
// Open instance.
TraceEvent(SevVerbose, "ShardManagerInitBegin", this->logId).detail("DataPath", path);
TraceEvent(SevInfo, "ShardedRocksShardManagerInitBegin", this->logId).detail("DataPath", path);
std::vector<std::string> columnFamilies;
rocksdb::Options options = getOptions();
rocksdb::Status status = rocksdb::DB::ListColumnFamilies(options, path, &columnFamilies);
@ -632,6 +632,8 @@ public:
}
if (foundMetadata) {
TraceEvent(SevInfo, "ShardedRocksInitLoadPhysicalShards", this->logId)
.detail("PhysicalShardCount", handles.size());
for (auto handle : handles) {
if (handle->GetName() == "kvs-metadata") {
metadataShard = std::make_shared<PhysicalShard>(db, "kvs-metadata", handle);
@ -639,7 +641,8 @@ public:
physicalShards[handle->GetName()] = std::make_shared<PhysicalShard>(db, handle->GetName(), handle);
}
columnFamilyMap[handle->GetID()] = handle;
TraceEvent(SevInfo, "ShardedRocskDB").detail("FoundShard", handle->GetName()).detail("Action", "Init");
TraceEvent(SevVerbose, "ShardedRocksInitPhysicalShard", this->logId)
.detail("PhysicalShard", handle->GetName());
}
RangeResult metadata;
readRangeInDb(metadataShard.get(), prefixRange(shardMappingPrefix), UINT16_MAX, UINT16_MAX, &metadata);
@ -647,7 +650,7 @@ public:
std::vector<std::pair<KeyRange, std::string>> mapping = decodeShardMapping(metadata, shardMappingPrefix);
for (const auto& [range, name] : mapping) {
TraceEvent(SevDebug, "ShardedRocksLoadPhysicalShard", this->logId)
TraceEvent(SevVerbose, "ShardedRocksLoadRange", this->logId)
.detail("Range", range)
.detail("PhysicalShard", name);
auto it = physicalShards.find(name);
@ -662,10 +665,10 @@ public:
activePhysicalShardIds.emplace(name);
}
// TODO: remove unused column families.
} else {
// DB is opened with default shard.
ASSERT(handles.size() == 1);
// Add SpecialKeys range. This range should not be modified.
std::shared_ptr<PhysicalShard> defaultShard = std::make_shared<PhysicalShard>(db, "default", handles[0]);
columnFamilyMap[defaultShard->cf->GetID()] = defaultShard->cf;
@ -688,7 +691,7 @@ public:
return status;
}
metadataShard->readIterPool->update();
TraceEvent(SevInfo, "InitializeMetaDataShard", this->logId)
TraceEvent(SevInfo, "ShardedRocksInitializeMetaDataShard", this->logId)
.detail("MetadataShardCF", metadataShard->cf->GetID());
}
physicalShards["kvs-metadata"] = metadataShard;
@ -696,7 +699,7 @@ public:
writeBatch = std::make_unique<rocksdb::WriteBatch>();
dirtyShards = std::make_unique<std::set<PhysicalShard*>>();
TraceEvent(SevDebug, "ShardManagerInitEnd", this->logId).detail("DataPath", path);
TraceEvent(SevInfo, "ShardedRocksShardManagerInitEnd", this->logId).detail("DataPath", path);
return status;
}
@ -712,7 +715,7 @@ public:
for (auto it = rangeIterator.begin(); it != rangeIterator.end(); ++it) {
if (it.value() == nullptr) {
TraceEvent(SevDebug, "ShardedRocksDB")
TraceEvent(SevVerbose, "ShardedRocksDB")
.detail("Info", "ShardNotFound")
.detail("BeginKey", range.begin)
.detail("EndKey", range.end);
@ -724,9 +727,10 @@ public:
}
PhysicalShard* addRange(KeyRange range, std::string id) {
TraceEvent(SevVerbose, "ShardedRocksAddRangeBegin", this->logId)
TraceEvent(SevInfo, "ShardedRocksAddRangeBegin", this->logId)
.detail("Range", range)
.detail("PhysicalShardID", id);
// Newly added range should not overlap with any existing range.
auto ranges = dataShardMap.intersectingRanges(range);
@ -750,7 +754,7 @@ public:
validate();
TraceEvent(SevVerbose, "ShardedRocksAddRangeEnd", this->logId)
TraceEvent(SevInfo, "ShardedRocksAddRangeEnd", this->logId)
.detail("Range", range)
.detail("PhysicalShardID", id);
@ -758,7 +762,7 @@ public:
}
std::vector<std::string> removeRange(KeyRange range) {
TraceEvent(SevVerbose, "ShardedRocksRemoveRangeBegin", this->logId).detail("Range", range);
TraceEvent(SevInfo, "ShardedRocksRemoveRangeBegin", this->logId).detail("Range", range);
std::vector<std::string> shardIds;
@ -796,6 +800,7 @@ public:
}
continue;
}
// Range modification could result in more than one segments. Remove the original segment key here.
existingShard->dataShards.erase(shardRange.begin.toString());
if (shardRange.begin < range.begin) {
@ -826,7 +831,7 @@ public:
validate();
TraceEvent(SevVerbose, "ShardedRocksRemoveRangeEnd", this->logId).detail("Range", range);
TraceEvent(SevInfo, "ShardedRocksRemoveRangeEnd", this->logId).detail("Range", range);
return shardIds;
}
@ -849,7 +854,7 @@ public:
TraceEvent(SevError, "ShardedRocksDB").detail("Error", "write to non-exist shard").detail("WriteKey", key);
return;
}
TraceEvent(SevVerbose, "ShardManagerPut", this->logId)
TraceEvent(SevVerbose, "ShardedRocksShardManagerPut", this->logId)
.detail("WriteKey", key)
.detail("Value", value)
.detail("MapRange", it.range())
@ -859,7 +864,9 @@ public:
ASSERT(dirtyShards != nullptr);
writeBatch->Put(it.value()->physicalShard->cf, toSlice(key), toSlice(value));
dirtyShards->insert(it.value()->physicalShard);
TraceEvent(SevVerbose, "ShardManagerPutEnd", this->logId).detail("WriteKey", key).detail("Value", value);
TraceEvent(SevVerbose, "ShardedRocksShardManagerPutEnd", this->logId)
.detail("WriteKey", key)
.detail("Value", value);
}
void clear(KeyRef key) {
@ -884,7 +891,7 @@ public:
}
void persistRangeMapping(KeyRangeRef range, bool isAdd) {
TraceEvent(SevDebug, "ShardedRocksDB")
TraceEvent(SevDebug, "ShardedRocksDB", this->logId)
.detail("Info", "RangeToPersist")
.detail("BeginKey", range.begin)
.detail("EndKey", range.end);
@ -902,7 +909,7 @@ public:
writeBatch->Put(metadataShard->cf,
getShardMappingKey(it.range().begin, shardMappingPrefix),
it.value()->physicalShard->id);
TraceEvent(SevDebug, "ShardedRocksDB")
TraceEvent(SevDebug, "ShardedRocksDB", this->logId)
.detail("Action", "PersistRangeMapping")
.detail("BeginKey", it.range().begin)
.detail("EndKey", it.range().end)
@ -911,7 +918,7 @@ public:
} else {
// Empty range.
writeBatch->Put(metadataShard->cf, getShardMappingKey(it.range().begin, shardMappingPrefix), "");
TraceEvent(SevDebug, "ShardedRocksDB")
TraceEvent(SevDebug, "ShardedRocksDB", this->logId)
.detail("Action", "PersistRangeMapping")
.detail("BeginKey", it.range().begin)
.detail("EndKey", it.range().end)
@ -921,7 +928,7 @@ public:
}
} else {
writeBatch->Put(metadataShard->cf, getShardMappingKey(range.begin, shardMappingPrefix), "");
TraceEvent(SevDebug, "ShardedRocksDB")
TraceEvent(SevDebug, "ShardedRocksDB", this->logId)
.detail("Action", "PersistRangeMapping")
.detail("RemoveRange", "True")
.detail("BeginKey", range.begin)
@ -972,7 +979,7 @@ public:
if (!s.ok()) {
logRocksDBError(s, "DestroyDB");
}
TraceEvent("RocksDB").detail("Info", "DBDestroyed");
TraceEvent("ShardedRocksDB", this->logId).detail("Info", "DBDestroyed");
}
rocksdb::DB* getDb() const { return db; }
@ -997,9 +1004,9 @@ public:
}
void validate() {
TraceEvent(SevVerbose, "ValidateShardManager", this->logId);
TraceEvent(SevVerbose, "ShardedRocksValidateShardManager", this->logId);
for (auto s = dataShardMap.ranges().begin(); s != dataShardMap.ranges().end(); ++s) {
TraceEvent e(SevVerbose, "ValidateDataShardMap", this->logId);
TraceEvent e(SevVerbose, "ShardedRocksValidateDataShardMap", this->logId);
e.detail("Range", s->range());
const DataShard* shard = s->value();
e.detail("ShardAddress", reinterpret_cast<std::uintptr_t>(shard));
@ -1008,6 +1015,13 @@ public:
} else {
e.detail("Shard", "Empty");
}
if (shard != nullptr) {
ASSERT(shard->range == static_cast<KeyRangeRef>(s->range()));
ASSERT(shard->physicalShard != nullptr);
auto it = shard->physicalShard->dataShards.find(shard->range.begin.toString());
ASSERT(it != shard->physicalShard->dataShards.end());
ASSERT(it->second.get() == shard);
}
}
}
@ -1338,7 +1352,7 @@ std::shared_ptr<rocksdb::Statistics> RocksDBMetrics::getStatsObjForRocksDB() {
}
void RocksDBMetrics::logStats(rocksdb::DB* db) {
TraceEvent e("RocksDBMetrics");
TraceEvent e("ShardedRocksDBMetrics");
uint64_t stat;
for (auto& [name, ticker, cumulation] : tickerStats) {
stat = stats->getTickerCount(ticker);
@ -1361,7 +1375,7 @@ void RocksDBMetrics::logStats(rocksdb::DB* db) {
}
void RocksDBMetrics::logMemUsagePerShard(std::string shardName, rocksdb::DB* db) {
TraceEvent e("RocksDBShardMemMetrics");
TraceEvent e("ShardedRocksDBShardMemMetrics");
uint64_t stat;
ASSERT(db != nullptr);
ASSERT(db->GetIntProperty(rocksdb::DB::Properties::kBlockCacheUsage, &stat));
@ -1387,7 +1401,7 @@ void RocksDBMetrics::setPerfContext(int index) {
}
void RocksDBMetrics::logPerfContext(bool ignoreZeroMetric) {
TraceEvent e("RocksDBPerfContextMetrics");
TraceEvent e("ShardedRocksDBPerfContextMetrics");
e.setMaxEventLength(20000);
for (auto& [name, metric, vals] : perfContextMetrics) {
uint64_t s = 0;
@ -1650,7 +1664,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
return;
}
TraceEvent(SevInfo, "RocksDB").detail("Method", "Open");
TraceEvent(SevInfo, "ShardedRocksDB").detail("Method", "Open");
a.done.send(Void());
}
@ -1841,7 +1855,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
} else {
a.shardManager->closeAllShards();
}
TraceEvent(SevInfo, "RocksDB").detail("Method", "Close");
TraceEvent(SevInfo, "ShardedRocksDB").detail("Method", "Close");
a.done.send(Void());
}
};
@ -1908,7 +1922,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
traceBatch.get().addEvent("GetValueDebug", a.debugID.get().first(), "Reader.Before");
}
if (readBeginTime - a.startTime > readValueTimeout) {
TraceEvent(SevWarn, "RocksDBError")
TraceEvent(SevWarn, "ShardedRocksDBError")
.detail("Error", "Read value request timedout")
.detail("Method", "ReadValueAction")
.detail("Timeout value", readValueTimeout);
@ -1995,7 +2009,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
"Reader.Before"); //.detail("TaskID", g_network->getCurrentTask());
}
if (readBeginTime - a.startTime > readValuePrefixTimeout) {
TraceEvent(SevWarn, "RocksDBError")
TraceEvent(SevWarn, "ShardedRocksDBError")
.detail("Error", "Read value prefix request timedout")
.detail("Method", "ReadValuePrefixAction")
.detail("Timeout value", readValuePrefixTimeout);
@ -2080,7 +2094,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
rocksDBMetrics->getReadRangeQueueWaitHistogram(threadIndex)->sampleSeconds(readBeginTime - a.startTime);
}
if (readBeginTime - a.startTime > readRangeTimeout) {
TraceEvent(SevWarn, "KVSReadTimeout")
TraceEvent(SevWarn, "ShardedRocksKVSReadTimeout")
.detail("Error", "Read range request timedout")
.detail("Method", "ReadRangeAction")
.detail("Timeout value", readRangeTimeout);
@ -2127,10 +2141,6 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
}
}
Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, "ShardedRocksDBNumShardsInRangeRead"_sr, Histogram::Unit::countLinear)
->sample(numShards);
result.more =
(result.size() == a.rowLimit) || (result.size() == -a.rowLimit) || (accumulatedBytes >= a.byteLimit);
if (result.more) {
@ -2184,7 +2194,8 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
readThreads = createGenericThreadPool();
}
writeThread->addThread(new Writer(id, 0, shardManager.getColumnFamilyMap(), rocksDBMetrics), "fdb-rocksdb-wr");
TraceEvent("RocksDBReadThreads").detail("KnobRocksDBReadParallelism", SERVER_KNOBS->ROCKSDB_READ_PARALLELISM);
TraceEvent("ShardedRocksDBReadThreads", id)
.detail("KnobRocksDBReadParallelism", SERVER_KNOBS->ROCKSDB_READ_PARALLELISM);
for (unsigned i = 0; i < SERVER_KNOBS->ROCKSDB_READ_PARALLELISM; ++i) {
readThreads->addThread(new Reader(id, i, rocksDBMetrics), "fdb-rocksdb-re");
}
@ -2302,7 +2313,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
auto* shard = shardManager.getDataShard(key);
if (shard == nullptr || !shard->physicalShard->initialized()) {
// TODO: read non-exist system key range should not cause an error.
TraceEvent(SevWarnAlways, "ShardedRocksDB")
TraceEvent(SevWarnAlways, "ShardedRocksDB", this->id)
.detail("Detail", "Read non-exist key range")
.detail("ReadKey", key);
return Optional<Value>();
@ -2330,7 +2341,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
auto* shard = shardManager.getDataShard(key);
if (shard == nullptr || !shard->physicalShard->initialized()) {
// TODO: read non-exist system key range should not cause an error.
TraceEvent(SevWarnAlways, "ShardedRocksDB")
TraceEvent(SevWarnAlways, "ShardedRocksDB", this->id)
.detail("Detail", "Read non-exist key range")
.detail("ReadKey", key);
return Optional<Value>();
@ -2452,7 +2463,7 @@ IKeyValueStore* keyValueStoreShardedRocksDB(std::string const& path,
#ifdef SSD_ROCKSDB_EXPERIMENTAL
return new ShardedRocksDBKeyValueStore(path, logID);
#else
TraceEvent(SevError, "RocksDBEngineInitFailure").detail("Reason", "Built without RocksDB");
TraceEvent(SevError, "ShardedRocksDBEngineInitFailure").detail("Reason", "Built without RocksDB");
ASSERT(false);
return nullptr;
#endif // SSD_ROCKSDB_EXPERIMENTAL

View File

@ -21,7 +21,7 @@
#include "fdbserver/RESTKmsConnector.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/HTTP.h"
#include "fdbrpc/HTTP.h"
#include "flow/IAsyncFile.h"
#include "fdbserver/KmsConnectorInterface.h"
#include "fdbserver/Knobs.h"

View File

@ -162,6 +162,7 @@ private:
CF cf;
Key begin;
Key end;
std::vector<rocksdb::ColumnFamilyHandle*> handles;
double readRangeTimeout;
std::unique_ptr<rocksdb::Iterator> cursor;
};
@ -233,7 +234,6 @@ void RocksDBCheckpointReader::Reader::action(RocksDBCheckpointReader::Reader::Op
descriptors.push_back(rocksdb::ColumnFamilyDescriptor{ name, cfOptions });
}
std::vector<rocksdb::ColumnFamilyHandle*> handles;
status = rocksdb::DB::OpenForReadOnly(options, a.path, descriptors, &handles, &db);
if (!status.ok()) {
@ -288,6 +288,14 @@ void RocksDBCheckpointReader::Reader::action(RocksDBCheckpointReader::Reader::Cl
return;
}
for (rocksdb::ColumnFamilyHandle* handle : handles) {
if (handle != nullptr) {
TraceEvent("RocksDBCheckpointReaderDestroyCF").detail("Path", a.path).detail("CF", handle->GetName());
db->DestroyColumnFamilyHandle(handle);
}
}
handles.clear();
rocksdb::Status s = db->Close();
if (!s.ok()) {
logRocksDBError(s, "Close");
@ -385,6 +393,9 @@ ACTOR Future<Void> RocksDBCheckpointReader::doClose(RocksDBCheckpointReader* sel
}
if (self != nullptr) {
if (self->db != nullptr) {
delete self->db;
}
delete self;
}

View File

@ -284,6 +284,13 @@ class TestConfig {
if (attrib == "blobGranulesEnabled") {
blobGranulesEnabled = strcmp(value.c_str(), "true") == 0;
}
if (attrib == "injectSSTargetedRestart") {
injectTargetedSSRestart = strcmp(value.c_str(), "true") == 0;
}
if (attrib == "injectSSDelay") {
injectSSDelay = strcmp(value.c_str(), "true") == 0;
}
}
ifs.close();
@ -334,6 +341,8 @@ public:
bool allowDefaultTenant = true;
bool allowDisablingTenants = true;
bool allowCreatingTenants = true;
bool injectTargetedSSRestart = false;
bool injectSSDelay = false;
ConfigDBType getConfigDBType() const { return configDBType; }
@ -394,7 +403,10 @@ public:
.add("allowDefaultTenant", &allowDefaultTenant)
.add("allowDisablingTenants", &allowDisablingTenants)
.add("allowCreatingTenants", &allowCreatingTenants)
.add("randomlyRenameZoneId", &randomlyRenameZoneId);
.add("randomlyRenameZoneId", &randomlyRenameZoneId)
.add("randomlyRenameZoneId", &randomlyRenameZoneId)
.add("injectTargetedSSRestart", &injectTargetedSSRestart)
.add("injectSSDelay", &injectSSDelay);
try {
auto file = toml::parse(testFile);
if (file.contains("configuration") && toml::find(file, "configuration").is_table()) {
@ -1401,7 +1413,7 @@ void SimulationConfig::setDatacenters(const TestConfig& testConfig) {
void SimulationConfig::setStorageEngine(const TestConfig& testConfig) {
// Using [0, 4) to disable the RocksDB storage engine.
// TODO: Figure out what is broken with the RocksDB engine in simulation.
int storage_engine_type = deterministicRandom()->randomInt(0, 4);
int storage_engine_type = deterministicRandom()->randomInt(0, 6);
if (testConfig.storageEngineType.present()) {
storage_engine_type = testConfig.storageEngineType.get();
} else {
@ -1409,7 +1421,7 @@ void SimulationConfig::setStorageEngine(const TestConfig& testConfig) {
while (std::find(testConfig.storageEngineExcludeTypes.begin(),
testConfig.storageEngineExcludeTypes.end(),
storage_engine_type) != testConfig.storageEngineExcludeTypes.end()) {
storage_engine_type = deterministicRandom()->randomInt(0, 5);
storage_engine_type = deterministicRandom()->randomInt(0, 6);
}
}
@ -1452,6 +1464,8 @@ void SimulationConfig::setStorageEngine(const TestConfig& testConfig) {
TraceEvent(SevWarnAlways, "RocksDBNonDeterminism")
.detail("Explanation", "The Sharded RocksDB storage engine is threaded and non-deterministic");
noUnseed = true;
auto& g_knobs = IKnobCollection::getMutableGlobalKnobCollection();
g_knobs.setKnob("shard_encode_location_metadata", KnobValueRef::create(bool{ true }));
break;
}
default:
@ -2393,6 +2407,13 @@ ACTOR void setupAndRun(std::string dataFolder,
testConfig.readFromConfig(testFile);
g_simulator.hasDiffProtocolProcess = testConfig.startIncompatibleProcess;
g_simulator.setDiffProtocol = false;
if (testConfig.injectTargetedSSRestart && deterministicRandom()->random01() < 0.25) {
g_simulator.injectTargetedSSRestartTime = 60.0 + 340.0 * deterministicRandom()->random01();
}
if (testConfig.injectSSDelay && deterministicRandom()->random01() < 0.25) {
g_simulator.injectSSDelayTime = 60.0 + 240.0 * deterministicRandom()->random01();
}
// Build simulator allow list
allowList.addTrustedSubnet("0.0.0.0/2"sv);
@ -2406,6 +2427,7 @@ ACTOR void setupAndRun(std::string dataFolder,
// https://github.com/apple/foundationdb/issues/5155
if (std::string_view(testFile).find("restarting") != std::string_view::npos) {
testConfig.storageEngineExcludeTypes.push_back(4);
testConfig.storageEngineExcludeTypes.push_back(5);
// Disable the default tenant in restarting tests for now
// TODO: persist the chosen default tenant in the restartInfo.ini file for the second test
@ -2418,6 +2440,7 @@ ACTOR void setupAndRun(std::string dataFolder,
// Re-enable the backup and restore related simulation tests when the tests are passing again.
if (std::string_view(testFile).find("Backup") != std::string_view::npos) {
testConfig.storageEngineExcludeTypes.push_back(4);
testConfig.storageEngineExcludeTypes.push_back(5);
}
// Disable the default tenant in backup and DR tests for now. This is because backup does not currently duplicate
@ -2432,6 +2455,7 @@ ACTOR void setupAndRun(std::string dataFolder,
// in the build.
if (!rocksDBEnabled) {
testConfig.storageEngineExcludeTypes.push_back(4);
testConfig.storageEngineExcludeTypes.push_back(5);
}
state ProtocolVersion protocolVersion = currentProtocolVersion;

View File

@ -742,6 +742,7 @@ ACTOR static Future<JsonBuilderObject> processStatusFetcher(
std::vector<std::pair<GrvProxyInterface, EventMap>> grvProxies,
std::vector<BlobWorkerInterface> blobWorkers,
ServerCoordinators coordinators,
std::vector<NetworkAddress> coordinatorAddresses,
Database cx,
Optional<DatabaseConfiguration> configuration,
Optional<Key> healthyZone,
@ -839,8 +840,7 @@ ACTOR static Future<JsonBuilderObject> processStatusFetcher(
}
}
std::vector<NetworkAddress> addressVec = wait(coordinators.ccr->getConnectionString().tryResolveHostnames());
for (const auto& coordinator : addressVec) {
for (const auto& coordinator : coordinatorAddresses) {
roles.addCoordinatorRole(coordinator);
}
@ -2751,6 +2751,9 @@ ACTOR Future<JsonBuilderObject> lockedStatusFetcher(Reference<AsyncVar<ServerDBI
try {
wait(tr.onError(e));
} catch (Error& e) {
if (e.code() == error_code_actor_cancelled)
throw;
incomplete_reasons->insert(format("Unable to determine if database is locked (%s).", e.what()));
break;
}
@ -3041,6 +3044,7 @@ ACTOR Future<StatusReply> clusterGetStatus(
statusObj["machines"] = machineStatusFetcher(mMetrics, workers, configuration, &status_incomplete_reasons);
state std::vector<NetworkAddress> coordinatorAddresses;
if (configuration.present()) {
// Do the latency probe by itself to avoid interference from other status activities
state bool isAvailable = true;
@ -3133,8 +3137,9 @@ ACTOR Future<StatusReply> clusterGetStatus(
state std::vector<JsonBuilderObject> workerStatuses = wait(getAll(futures2));
wait(success(primaryDCFO));
std::vector<NetworkAddress> coordinatorAddresses =
wait(coordinators.ccr->getConnectionString().tryResolveHostnames());
std::vector<NetworkAddress> addresses =
wait(timeoutError(coordinators.ccr->getConnectionString().tryResolveHostnames(), 5.0));
coordinatorAddresses = std::move(addresses);
int logFaultTolerance = 100;
if (db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS) {
@ -3275,6 +3280,7 @@ ACTOR Future<StatusReply> clusterGetStatus(
grvProxies,
blobWorkers,
coordinators,
coordinatorAddresses,
cx,
configuration,
loadResult.present() ? loadResult.get().healthyZone : Optional<Key>(),

View File

@ -518,7 +518,8 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
Deque<std::pair<Version, Standalone<VectorRef<uint8_t>>>> messageBlocks;
std::vector<std::vector<Reference<TagData>>> tag_data; // tag.locality | tag.id
int unpoppedRecoveredTags;
int unpoppedRecoveredTagCount;
std::set<Tag> unpoppedRecoveredTags;
std::map<Tag, Promise<Void>> waitingTags;
Reference<TagData> getTagData(Tag tag) {
@ -642,7 +643,7 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
std::string context)
: stopped(false), initialized(false), queueCommittingVersion(0), knownCommittedVersion(0),
durableKnownCommittedVersion(0), minKnownCommittedVersion(0), queuePoppedVersion(0), minPoppedTagVersion(0),
minPoppedTag(invalidTag), unpoppedRecoveredTags(0), cc("TLog", interf.id().toString()),
minPoppedTag(invalidTag), unpoppedRecoveredTagCount(0), cc("TLog", interf.id().toString()),
bytesInput("BytesInput", cc), bytesDurable("BytesDurable", cc), blockingPeeks("BlockingPeeks", cc),
blockingPeekTimeouts("BlockingPeekTimeouts", cc), emptyPeeks("EmptyPeeks", cc),
nonEmptyPeeks("NonEmptyPeeks", cc), logId(interf.id()), protocolVersion(protocolVersion),
@ -1196,14 +1197,20 @@ ACTOR Future<Void> tLogPopCore(TLogData* self, Tag inputTag, Version to, Referen
if (tagData->unpoppedRecovered && upTo > logData->recoveredAt) {
tagData->unpoppedRecovered = false;
logData->unpoppedRecoveredTags--;
logData->unpoppedRecoveredTagCount--;
logData->unpoppedRecoveredTags.erase(tag);
TraceEvent("TLogPoppedTag", logData->logId)
.detail("Tags", logData->unpoppedRecoveredTags)
.detail("Tags", logData->unpoppedRecoveredTagCount)
.detail("Tag", tag.toString())
.detail("DurableKCVer", logData->durableKnownCommittedVersion)
.detail("RecoveredAt", logData->recoveredAt);
if (logData->unpoppedRecoveredTags == 0 && logData->durableKnownCommittedVersion >= logData->recoveredAt &&
logData->recoveryComplete.canBeSet()) {
.detail("RecoveredAt", logData->recoveredAt)
.detail("UnpoppedTags", describe(logData->unpoppedRecoveredTags));
if (logData->unpoppedRecoveredTagCount == 0 &&
logData->durableKnownCommittedVersion >= logData->recoveredAt && logData->recoveryComplete.canBeSet()) {
TraceEvent("TLogRecoveryComplete", logData->logId)
.detail("Tags", logData->unpoppedRecoveredTagCount)
.detail("DurableKCVer", logData->durableKnownCommittedVersion)
.detail("RecoveredAt", logData->recoveredAt);
logData->recoveryComplete.send(Void());
}
}
@ -2153,10 +2160,10 @@ ACTOR Future<Void> doQueueCommit(TLogData* self,
ASSERT(ver > logData->queueCommittedVersion.get());
logData->durableKnownCommittedVersion = knownCommittedVersion;
if (logData->unpoppedRecoveredTags == 0 && knownCommittedVersion >= logData->recoveredAt &&
if (logData->unpoppedRecoveredTagCount == 0 && knownCommittedVersion >= logData->recoveredAt &&
logData->recoveryComplete.canBeSet()) {
TraceEvent("TLogRecoveryComplete", logData->logId)
.detail("Tags", logData->unpoppedRecoveredTags)
.detail("Tags", logData->unpoppedRecoveredTagCount)
.detail("DurableKCVer", logData->durableKnownCommittedVersion)
.detail("RecoveredAt", logData->recoveredAt);
logData->recoveryComplete.send(Void());
@ -3408,7 +3415,8 @@ ACTOR Future<Void> tLogStart(TLogData* self, InitializeTLogRequest req, Locality
logData->queueCommittedVersion.set(logData->unrecoveredBefore - 1);
logData->version.set(logData->unrecoveredBefore - 1);
logData->unpoppedRecoveredTags = req.allTags.size();
logData->unpoppedRecoveredTagCount = req.allTags.size();
logData->unpoppedRecoveredTags = std::set<Tag>(req.allTags.begin(), req.allTags.end());
wait(initPersistentState(self, logData) || logData->removed);
TraceEvent("TLogRecover", self->dbgid)

View File

@ -69,6 +69,7 @@
#include "fdbserver/TesterInterface.actor.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/pubsub.h"
#include "fdbserver/OnDemandStore.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "flow/ArgParseUtil.h"
#include "flow/DeterministicRandom.h"
@ -111,7 +112,8 @@ enum {
OPT_TRACECLOCK, OPT_NUMTESTERS, OPT_DEVHELP, OPT_PRINT_CODE_PROBES, OPT_ROLLSIZE, OPT_MAXLOGS, OPT_MAXLOGSSIZE, OPT_KNOB, OPT_UNITTESTPARAM, OPT_TESTSERVERS, OPT_TEST_ON_SERVERS, OPT_METRICSCONNFILE,
OPT_METRICSPREFIX, OPT_LOGGROUP, OPT_LOCALITY, OPT_IO_TRUST_SECONDS, OPT_IO_TRUST_WARN_ONLY, OPT_FILESYSTEM, OPT_PROFILER_RSS_SIZE, OPT_KVFILE,
OPT_TRACE_FORMAT, OPT_WHITELIST_BINPATH, OPT_BLOB_CREDENTIAL_FILE, OPT_CONFIG_PATH, OPT_USE_TEST_CONFIG_DB, OPT_FAULT_INJECTION, OPT_PROFILER, OPT_PRINT_SIMTIME,
OPT_FLOW_PROCESS_NAME, OPT_FLOW_PROCESS_ENDPOINT, OPT_IP_TRUSTED_MASK, OPT_KMS_CONN_DISCOVERY_URL_FILE, OPT_KMS_CONN_VALIDATION_TOKEN_DETAILS, OPT_KMS_CONN_GET_ENCRYPTION_KEYS_ENDPOINT
OPT_FLOW_PROCESS_NAME, OPT_FLOW_PROCESS_ENDPOINT, OPT_IP_TRUSTED_MASK, OPT_KMS_CONN_DISCOVERY_URL_FILE, OPT_KMS_CONN_VALIDATION_TOKEN_DETAILS, OPT_KMS_CONN_GET_ENCRYPTION_KEYS_ENDPOINT,
OPT_NEW_CLUSTER_KEY
};
CSimpleOpt::SOption g_rgOptions[] = {
@ -205,9 +207,11 @@ CSimpleOpt::SOption g_rgOptions[] = {
{ OPT_FLOW_PROCESS_NAME, "--process-name", SO_REQ_SEP },
{ OPT_FLOW_PROCESS_ENDPOINT, "--process-endpoint", SO_REQ_SEP },
{ OPT_IP_TRUSTED_MASK, "--trusted-subnet-", SO_REQ_SEP },
{ OPT_NEW_CLUSTER_KEY, "--new-cluster-key", SO_REQ_SEP },
{ OPT_KMS_CONN_DISCOVERY_URL_FILE, "--discover-kms-conn-url-file", SO_REQ_SEP},
{ OPT_KMS_CONN_VALIDATION_TOKEN_DETAILS, "--kms-conn-validation-token-details", SO_REQ_SEP},
{ OPT_KMS_CONN_GET_ENCRYPTION_KEYS_ENDPOINT, "--kms-conn-get-encryption-keys-endpoint", SO_REQ_SEP},
TLS_OPTION_FLAGS,
SO_END_OF_OPTIONS
};
@ -735,6 +739,17 @@ static void printUsage(const char* name, bool devhelp) {
" - FDB_DUMP_STARTKEY: start key for the dump, default is empty\n"
" - FDB_DUMP_ENDKEY: end key for the dump, default is \"\\xff\\xff\"\n"
" - FDB_DUMP_DEBUG: print key-values to stderr in escaped format\n");
printf(
"\n"
"The 'changedescription' role replaces the old cluster key in all coordinators' data file to the specified "
"new cluster key,\n"
"which is passed in by '--new-cluster-key'. In particular, cluster key means '[description]:[id]'.\n"
"'--datadir' is supposed to point to the top level directory of FDB's data, where subdirectories are for "
"each process's data.\n"
"The given cluster file passed in by '-C, --cluster-file' is considered to contain the old cluster key.\n"
"It is used before restoring a snapshotted cluster to let the cluster have a different cluster key.\n"
"Please make sure run it on every host in the cluster with the same '--new-cluster-key'.\n");
} else {
printOptionUsage("--dev-help", "Display developer-specific help and exit.");
}
@ -980,10 +995,12 @@ void restoreRoleFilesHelper(std::string dirSrc, std::string dirToMove, std::stri
namespace {
enum class ServerRole {
ChangeClusterKey,
ConsistencyCheck,
CreateTemplateDatabase,
DSLTest,
FDBD,
FlowProcess,
KVFileGenerateIOLogChecksums,
KVFileIntegrityCheck,
KVFileDump,
@ -996,13 +1013,12 @@ enum class ServerRole {
SkipListTest,
Test,
VersionedMapTest,
UnitTests,
FlowProcess
UnitTests
};
struct CLIOptions {
std::string commandLine;
std::string fileSystemPath, dataFolder, connFile, seedConnFile, seedConnString, logFolder = ".", metricsConnFile,
metricsPrefix;
metricsPrefix, newClusterKey;
std::string logGroup = "default";
uint64_t rollsize = TRACE_DEFAULT_ROLL_SIZE;
uint64_t maxLogsSize = TRACE_DEFAULT_MAX_LOGS_SIZE;
@ -1250,6 +1266,8 @@ private:
role = ServerRole::UnitTests;
else if (!strcmp(sRole, "flowprocess"))
role = ServerRole::FlowProcess;
else if (!strcmp(sRole, "changeclusterkey"))
role = ServerRole::ChangeClusterKey;
else {
fprintf(stderr, "ERROR: Unknown role `%s'\n", sRole);
printHelpTeaser(argv[0]);
@ -1653,6 +1671,19 @@ private:
knobs.emplace_back("rest_kms_connector_get_encryption_keys_endpoint", args.OptionArg());
break;
}
case OPT_NEW_CLUSTER_KEY: {
newClusterKey = args.OptionArg();
try {
ClusterConnectionString ccs;
// make sure the new cluster key is in valid format
ccs.parseKey(newClusterKey);
} catch (Error& e) {
std::cerr << "Invalid cluster key(description:id) '" << newClusterKey << "' from --new-cluster-key"
<< std::endl;
flushAndExit(FDB_EXIT_ERROR);
}
break;
}
}
}
@ -1748,6 +1779,21 @@ private:
flushAndExit(FDB_EXIT_ERROR);
}
if (role == ServerRole::ChangeClusterKey) {
bool error = false;
if (!newClusterKey.size()) {
fprintf(stderr, "ERROR: please specify --new-cluster-key\n");
error = true;
} else if (connectionFile->getConnectionString().clusterKey() == newClusterKey) {
fprintf(stderr, "ERROR: the new cluster key is the same as the old one\n");
error = true;
}
if (error) {
printHelpTeaser(argv[0]);
flushAndExit(FDB_EXIT_ERROR);
}
}
// Interpret legacy "maxLogs" option in the most sensible and unsurprising way we can while eliminating its code
// path
if (maxLogsSet) {
@ -2272,6 +2318,11 @@ int main(int argc, char* argv[]) {
} else if (role == ServerRole::KVFileDump) {
f = stopAfter(KVFileDump(opts.kvFile));
g_network->run();
} else if (role == ServerRole::ChangeClusterKey) {
Key newClusterKey(opts.newClusterKey);
Key oldClusterKey = opts.connectionFile->getConnectionString().clusterKey();
f = stopAfter(coordChangeClusterKey(opts.dataFolder, newClusterKey, oldClusterKey));
g_network->run();
}
int rc = FDB_EXIT_SUCCESS;

View File

@ -51,6 +51,8 @@ bool compareFDBAndBlob(RangeResult fdb,
Version v,
bool debug);
void printGranuleChunks(const Standalone<VectorRef<BlobGranuleChunkRef>>& chunks);
ACTOR Future<Void> clearAndAwaitMerge(Database cx, KeyRange range);
#include "flow/unactorcompiler.h"

View File

@ -236,4 +236,9 @@ Future<Void> coordinationServer(std::string const& dataFolder,
Reference<ConfigNode> const&,
ConfigBroadcastInterface const&);
// Read a value of MovableValue and if old cluster key presents in it, update to the new key
Optional<Value> updateCCSInMovableValue(ValueRef movableVal, KeyRef oldClusterKey, KeyRef newClusterKey);
Future<Void> coordChangeClusterKey(std::string dataFolder, KeyRef newClusterKey, KeyRef oldClusterKey);
#endif

View File

@ -39,6 +39,7 @@ enum class RelocateReason { INVALID = -1, OTHER, REBALANCE_DISK, REBALANCE_READ
// One-to-one relationship to the priority knobs
enum class DataMovementReason {
INVALID,
RECOVER_MOVE,
REBALANCE_UNDERUTILIZED_TEAM,
REBALANCE_OVERUTILIZED_TEAM,
@ -60,6 +61,8 @@ enum class DataMovementReason {
struct DDShardInfo;
extern int dataMovementPriority(DataMovementReason moveReason);
// Represents a data move in DD.
struct DataMove {
DataMove() : meta(DataMoveMetaData()), restore(false), valid(false), cancelled(false) {}
@ -89,9 +92,14 @@ struct RelocateShard {
std::shared_ptr<DataMove> dataMove; // Not null if this is a restored data move.
UID dataMoveId;
RelocateReason reason;
RelocateShard() : priority(0), cancelled(false), dataMoveId(anonymousShardId), reason(RelocateReason::INVALID) {}
RelocateShard(KeyRange const& keys, int priority, RelocateReason reason)
: keys(keys), priority(priority), cancelled(false), dataMoveId(anonymousShardId), reason(reason) {}
DataMovementReason moveReason;
RelocateShard()
: priority(0), cancelled(false), dataMoveId(anonymousShardId), reason(RelocateReason::INVALID),
moveReason(DataMovementReason::INVALID) {}
RelocateShard(KeyRange const& keys, DataMovementReason moveReason, RelocateReason reason)
: keys(keys), cancelled(false), dataMoveId(anonymousShardId), reason(reason), moveReason(moveReason) {
priority = dataMovementPriority(moveReason);
}
bool isRestore() const { return this->dataMove != nullptr; }
};
@ -286,6 +294,7 @@ class ShardsAffectedByTeamFailure : public ReferenceCounted<ShardsAffectedByTeam
public:
ShardsAffectedByTeamFailure() {}
enum class CheckMode { Normal = 0, ForceCheck, ForceNoCheck };
struct Team {
std::vector<UID> servers; // sorted
bool primary;
@ -335,6 +344,8 @@ public:
void finishMove(KeyRangeRef keys);
void check() const;
void setCheckMode(CheckMode);
PromiseStream<KeyRange> restartShardTracker;
private:
@ -348,6 +359,7 @@ private:
}
};
CheckMode checkMode = CheckMode::Normal;
KeyRangeMap<std::pair<std::vector<Team>, std::vector<Team>>>
shard_teams; // A shard can be affected by the failure of multiple teams if it is a queued merge, or when
// usable_regions > 1

View File

@ -294,8 +294,8 @@ Future<Void> bulkSetup(Database cx,
// Here we wait for data in flight to go to 0 (this will not work on a database with other users)
if (postSetupWarming != 0) {
try {
wait(delay(5.0) >>
waitForLowInFlight(cx, workload)); // Wait for the data distribution in a small test to start
wait(delay(5.0));
wait(waitForLowInFlight(cx, workload)); // Wait for the data distribution in a small test to start
} catch (Error& e) {
if (e.code() == error_code_actor_cancelled)
throw;

File diff suppressed because it is too large Load Diff

View File

@ -385,7 +385,9 @@ ACTOR Future<Reference<TestWorkload>> getWorkloadIface(WorkloadRequest work,
wcx.sharedRandomNumber = work.sharedRandomNumber;
workload = IWorkloadFactory::create(testName.toString(), wcx);
wait(workload->initialized());
if (workload) {
wait(workload->initialized());
}
auto unconsumedOptions = checkAllOptionsConsumed(workload ? workload->options : VectorRef<KeyValueRef>());
if (!workload || unconsumedOptions.size()) {

View File

@ -237,57 +237,64 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
while (timeTravelIt != timeTravelChecks.end() && currentTime >= timeTravelIt->first) {
state OldRead oldRead = timeTravelIt->second;
timeTravelChecksMemory -= oldRead.oldResult.expectedSize();
// advance iterator before doing read, so if it gets error we don't retry it
timeTravelIt = timeTravelChecks.erase(timeTravelIt);
if (prevPurgeVersion == -1) {
prevPurgeVersion = oldRead.v;
}
// advance iterator before doing read, so if it gets error we don't retry it
try {
state Version newPurgeVersion = 0;
state bool doPurging = allowPurging && deterministicRandom()->random01() < 0.5;
if (doPurging) {
Version maxPurgeVersion = oldRead.v;
for (auto& it : timeTravelChecks) {
maxPurgeVersion = std::min(it.second.v, maxPurgeVersion);
}
if (prevPurgeVersion < maxPurgeVersion) {
newPurgeVersion = deterministicRandom()->randomInt64(prevPurgeVersion, maxPurgeVersion);
prevPurgeVersion = std::max(prevPurgeVersion, newPurgeVersion);
Key purgeKey = wait(cx->purgeBlobGranules(normalKeys, newPurgeVersion, {}, false));
wait(cx->waitPurgeGranulesComplete(purgeKey));
self->purges++;
} else {
doPurging = false;
}
// before doing read, purge just before read version
state Version newPurgeVersion = 0;
state bool doPurging = allowPurging && deterministicRandom()->random01() < 0.5;
if (doPurging) {
CODE_PROBE(true, "BGV considering purge");
Version maxPurgeVersion = oldRead.v;
for (auto& it : timeTravelChecks) {
maxPurgeVersion = std::min(it.second.v, maxPurgeVersion);
}
if (prevPurgeVersion < maxPurgeVersion) {
CODE_PROBE(true, "BGV doing purge");
newPurgeVersion = deterministicRandom()->randomInt64(prevPurgeVersion, maxPurgeVersion);
prevPurgeVersion = std::max(prevPurgeVersion, newPurgeVersion);
if (BGV_DEBUG) {
fmt::print("BGV Purging @ {0}\n", newPurgeVersion);
}
try {
Key purgeKey = wait(cx->purgeBlobGranules(normalKeys, newPurgeVersion, {}, false));
if (BGV_DEBUG) {
fmt::print("BGV Purged @ {0}, waiting\n", newPurgeVersion);
}
wait(cx->waitPurgeGranulesComplete(purgeKey));
} catch (Error& e) {
if (e.code() == error_code_operation_cancelled) {
throw e;
}
// purging shouldn't error, it should retry.
if (BGV_DEBUG) {
fmt::print("Unexpected error {0} purging @ {1}!\n", e.name(), newPurgeVersion);
}
ASSERT(false);
}
CODE_PROBE(true, "BGV purge complete");
if (BGV_DEBUG) {
fmt::print("BGV Purge complete @ {0}\n", newPurgeVersion);
}
self->purges++;
} else {
doPurging = false;
}
}
// do time travel read
try {
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> reReadResult =
wait(readFromBlob(cx, self->bstore, oldRead.range, 0, oldRead.v));
if (!compareFDBAndBlob(oldRead.oldResult, reReadResult, oldRead.range, oldRead.v, BGV_DEBUG)) {
self->mismatches++;
}
self->timeTravelReads++;
if (doPurging) {
wait(self->killBlobWorkers(cx, self));
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> versionRead =
wait(readFromBlob(cx, self->bstore, oldRead.range, 0, prevPurgeVersion));
try {
Version minSnapshotVersion = newPurgeVersion;
for (auto& it : versionRead.second) {
minSnapshotVersion = std::min(minSnapshotVersion, it.snapshotVersion);
}
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> versionRead =
wait(readFromBlob(cx, self->bstore, oldRead.range, 0, minSnapshotVersion - 1));
ASSERT(false);
} catch (Error& e) {
if (e.code() == error_code_actor_cancelled) {
throw;
}
ASSERT(e.code() == error_code_blob_granule_transaction_too_old);
}
}
} catch (Error& e) {
fmt::print("Error TT: {0}\n", e.name());
if (e.code() == error_code_blob_granule_transaction_too_old) {
self->timeTravelTooOld++;
// TODO: add debugging info for when this is a failure
@ -297,6 +304,51 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
oldRead.v);
}
}
// if purged just before read, verify that purge cleaned up data by restarting blob workers and
// reading older than the purge version
if (doPurging) {
wait(self->killBlobWorkers(cx, self));
if (BGV_DEBUG) {
fmt::print("BGV Reading post-purge [{0} - {1}) @ {2}\n",
oldRead.range.begin.printable(),
oldRead.range.end.printable(),
prevPurgeVersion);
}
// ensure purge version exactly is still readable
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> versionRead1 =
wait(readFromBlob(cx, self->bstore, oldRead.range, 0, prevPurgeVersion));
if (BGV_DEBUG) {
fmt::print("BGV Post-purge first read:\n");
printGranuleChunks(versionRead1.second);
}
try {
// read at purgeVersion - 1, should NOT be readable
Version minSnapshotVersion = newPurgeVersion;
for (auto& it : versionRead1.second) {
minSnapshotVersion = std::min(minSnapshotVersion, it.snapshotVersion);
}
if (BGV_DEBUG) {
fmt::print("BGV Reading post-purge again [{0} - {1}) @ {2}\n",
oldRead.range.begin.printable(),
oldRead.range.end.printable(),
minSnapshotVersion - 1);
}
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> versionRead2 =
wait(readFromBlob(cx, self->bstore, oldRead.range, 0, minSnapshotVersion - 1));
if (BGV_DEBUG) {
fmt::print("BGV ERROR: data not purged! Read successful!!\n");
printGranuleChunks(versionRead2.second);
}
ASSERT(false);
} catch (Error& e) {
if (e.code() == error_code_actor_cancelled) {
throw;
}
ASSERT(e.code() == error_code_blob_granule_transaction_too_old);
CODE_PROBE(true, "BGV verified too old after purge");
}
}
}
// pick a random range
@ -471,6 +523,8 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
// For some reason simulation is still passing when this fails?.. so assert for now
ASSERT(result);
// FIXME: if doPurging was set, possibly do one last purge here, and verify it succeeds with no errors
if (self->clientId == 0 && SERVER_KNOBS->BG_ENABLE_MERGING && deterministicRandom()->random01() < 0.1) {
CODE_PROBE(true, "BGV clearing database and awaiting merge");
wait(clearAndAwaitMerge(cx, normalKeys));

View File

@ -0,0 +1,767 @@
/*
* ChangeFeedOperations.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/SystemData.h"
#include "fdbserver/TesterInterface.actor.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbserver/workloads/BulkSetup.actor.h"
#include "flow/Arena.h"
#include "flow/IRandom.h"
#include "flow/Trace.h"
#include "flow/Util.h"
#include "flow/serialize.h"
#include <cstring>
#include <limits>
#include "flow/actorcompiler.h" // This must be the last #include.
// enable to debug specific operations for a given change feed
#define DEBUG_KEY ""_sr
#define DEBUG_CF(feedKey) (feedKey.printable() == DEBUG_KEY)
// Pops the change feed `feedID` through `version` and records completion in *doneOut.
// `key` is used only for debug tracing (see DEBUG_CF); the pop itself targets `feedID`.
// *doneOut is advanced monotonically: concurrent pops may complete out of order, so a
// later-issued pop finishing first must not be regressed by an earlier one.
ACTOR Future<Void> doPop(Database cx, Key key, Key feedID, Version version, Version* doneOut) {
    wait(cx->popChangeFeedMutations(feedID, version));
    if (*doneOut < version) {
        *doneOut = version;
    }
    if (DEBUG_CF(key)) {
        fmt::print("DBG) {0} Popped through {1}\n", key.printable(), version);
    }
    // TODO: could strengthen pop checking by validating that a read immediately after the pop completes has no data
    return Void();
}
// Per-key model of one change feed under test. Tracks every committed write, the window of
// writes still awaiting verification, and the pop/stop/destroy lifecycle, so the reader
// actors can validate change feed behavior against this in-memory model.
struct FeedTestData : ReferenceCounted<FeedTestData>, NonCopyable {
    Key key; // the single data key this feed covers
    KeyRange keyRange; // [key, keyAfter(key)) — the feed's range
    Key feedID; // feed identifier: the key prefixed with "CF"
    int nextVal; // counter used by nextValue() to generate unique, increasing values
    Future<Void> liveReader; // long-running live-read validator for this feed
    bool lastCleared = false;

    std::vector<Future<Void>> pops; // outstanding async pop operations (doPop futures)
    Version poppingVersion; // highest version a pop has been *issued* through (strictly increasing, see pop())
    Version poppedVersion; // highest version a pop has *completed* through (updated by doPop)
    Optional<Version> stopVersion; // once set, the feed is stopped and update() ignores further writes
    bool destroying;
    bool destroyed;
    bool complete; // test finished writing; readers may drain remaining checks and exit

    int popWindow; // number of oldest verified writes removed by each pop (-1 = pops disabled)
    int popDelayWindow; // number of verified writes retained after each pop (-1 = pops disabled)

    std::deque<std::pair<Version, Optional<Value>>> writesByVersion; // model of all unpopped committed writes
    // these were all committed
    std::deque<std::pair<Version, Optional<Value>>> pendingCheck; // committed writes not yet verified by the reader
    NotifiedVersion checkVersion; // latest committed version; set() wakes the live reader

    FeedTestData(Key key, bool doPops)
      : key(key), keyRange(KeyRangeRef(key, keyAfter(key))), feedID(key.withPrefix(LiteralStringRef("CF"))), nextVal(0),
        lastCleared(false), poppingVersion(0), poppedVersion(0), destroying(false), destroyed(false), complete(false),
        checkVersion(0) {
        if (doPops) {
            // randomize pop batch size and delay so different feeds exercise different pop timing
            popWindow = deterministicRandom()->randomExp(1, 8);
            popDelayWindow = deterministicRandom()->randomInt(0, 2) * deterministicRandom()->randomExp(1, 4);
        } else {
            popWindow = -1;
            popDelayWindow = -1;
        }
    }

    // Produces the next unique value to write for this key (stringified counter).
    Value nextValue() {
        std::string v = std::to_string(nextVal);
        nextVal++;
        return Value(v);
    }

    // Records a committed write (set, or clear when `value` is absent) at `version` into the
    // model and wakes the live reader via checkVersion.
    void update(Version version, Optional<Value> value) {
        if (!stopVersion.present()) {
            // if feed is stopped, value should not get read
            writesByVersion.push_back({ version, value });
            pendingCheck.push_back(writesByVersion.back());
            checkVersion.set(version);
        }
    }

    // Marks the test complete and bumps checkVersion so any reader blocked on it wakes up.
    void testComplete() {
        complete = true;
        checkVersion.set(checkVersion.get() + 1);
    }

    // Issues a pop through version `v`: drops modeled writes/checks strictly below v
    // (matching what the pop removes server-side) and starts the async pop.
    void pop(Database cx, Version v) {
        if (DEBUG_CF(key)) {
            fmt::print("DBG) {0} Popping through {1}\n", key.printable(), v);
        }
        ASSERT(poppingVersion < v);
        poppingVersion = v;
        while (!writesByVersion.empty() && v > writesByVersion.front().first) {
            writesByVersion.pop_front();
        }
        while (!pendingCheck.empty() && v > pendingCheck.front().first) {
            pendingCheck.pop_front();
        }
        pops.push_back(doPop(cx, key, feedID, v, &poppedVersion));
    }
};
// Applies a rollback mutation to the locally buffered feed data: decodes the version to
// roll back to from the mutation payload and discards every buffered entry newer than it,
// mirroring what the server-side rollback did to the feed. `version` is the version the
// rollback mutation itself arrived at (traced for debugging).
static void rollbackFeed(Key key,
                         std::deque<Standalone<MutationsAndVersionRef>>& buffered,
                         Version version,
                         MutationRef rollbackMutation) {
    // param2 of a rollback mutation encodes the target version to roll back to
    BinaryReader reader(rollbackMutation.param2, Unversioned());
    Version targetVersion;
    reader >> targetVersion;
    TraceEvent("ChangeFeedRollback").detail("Key", key).detail("Ver", version).detail("RollbackVer", targetVersion);
    if (DEBUG_CF(key)) {
        fmt::print("DBG) {0} Rolling back {1} -> {2}\n", key.printable(), version, targetVersion);
    }
    // drop buffered mutations from the back until nothing newer than the target remains
    for (; !buffered.empty(); buffered.pop_back()) {
        if (buffered.back().version <= targetVersion) {
            break;
        }
        TraceEvent("ChangeFeedRollbackVer").detail("Ver", buffered.back().version);
    }
}
// Validates the oldest buffered feed mutation against the oldest expected committed write
// in `checkData`, asserting the change feed invariants, then consumes the buffered entry
// (and the expected entry too, when the versions match exactly).
static void checkNextResult(Key key,
                            std::deque<Standalone<MutationsAndVersionRef>>& buffered,
                            std::deque<std::pair<Version, Optional<Value>>>& checkData) {
    // First asserts are checking data is in the form the test is supposed to produce
    ASSERT(!buffered.empty());
    auto& got = buffered.front();
    ASSERT(got.mutations.size() == 1);
    ASSERT(got.mutations[0].param1 == key);

    // Below asserts are correctness of change feed invariants.

    // Handle case where txn retried and wrote same value twice. checkData's version is the committed one, so the same
    // update may appear at an earlier version. This is fine, as long as it then actually appears at the committed
    // version
    // TODO: could strengthen this check a bit and only allow it to appear at the lower version if the txn retried on
    // commit_unknown_result?
    auto& expected = checkData.front();
    if (expected.first < got.version) {
        // about to fail the assert below; dump both sides for debugging first
        fmt::print("ERROR. {0} Check version {1} != {2}.\n Check: {3} {4}\n Buffered: {5} {6}\n",
                   key.printable(),
                   expected.first,
                   got.version,
                   expected.second.present() ? "SET" : "CLEAR",
                   expected.second.present() ? expected.second.get().printable() : keyAfter(key).printable(),
                   got.mutations[0].type == MutationRef::SetValue ? "SET" : "CLEAR",
                   got.mutations[0].param2.printable());
    }
    ASSERT(expected.first >= got.version);

    if (expected.second.present()) {
        // expected a set: the feed must report a set of the same value
        ASSERT(got.mutations[0].type == MutationRef::SetValue);
        ASSERT(got.mutations[0].param2 == expected.second.get());
    } else {
        // expected a clear of exactly this key's single-key range
        ASSERT(got.mutations[0].type == MutationRef::ClearRange);
        ASSERT(got.mutations[0].param2 == keyAfter(key));
    }

    // Only consume the expected entry on an exact version match; an earlier duplicate
    // (retried txn) leaves it in place to be matched again at the committed version.
    if (expected.first == got.version) {
        checkData.pop_front();
    }
    buffered.pop_front();
}
// Continuously tails the change feed for `data` from `begin` and verifies, in order, that
// every committed write recorded in data->pendingCheck eventually appears in the stream
// with the right type/value (via checkNextResult). Also drives pops once enough writes
// have been verified, and tolerates data disappearing due to concurrent pops.
// Returns once the test is complete and all pending checks have drained.
ACTOR Future<Void> liveReader(Database cx, Reference<FeedTestData> data, Version begin) {
    state Version lastCheckVersion = 0;
    state Version nextCheckVersion = 0;
    state std::deque<Standalone<MutationsAndVersionRef>> buffered;
    state Reference<ChangeFeedData> results = makeReference<ChangeFeedData>();
    state Future<Void> stream =
        cx->getChangeFeedStream(results, data->feedID, begin, std::numeric_limits<Version>::max(), data->keyRange);
    try {
        loop {
            // done when the test has finished writing and every committed write was verified
            if (data->complete && data->pendingCheck.empty()) {
                return Void();
            }
            // snapshot the front pending version so the whenAtLeast branch can detect
            // that it was popped out from under us while waiting
            nextCheckVersion = data->pendingCheck.empty() ? invalidVersion : data->pendingCheck.front().first;
            choose {
                when(Standalone<VectorRef<MutationsAndVersionRef>> res = waitNext(results->mutations.getFuture())) {
                    for (auto& it : res) {
                        // a single mutation on lastEpochEndPrivateKey marks a recovery rollback
                        if (it.mutations.size() == 1 && it.mutations.back().param1 == lastEpochEndPrivateKey) {
                            rollbackFeed(data->key, buffered, it.version, it.mutations.back());
                        } else {
                            if (it.mutations.size() == 0) {
                                // FIXME: THIS SHOULD NOT HAPPEN
                                // FIXME: these are also getting sent past stopVersion!!
                            } else {
                                // a stopped feed must never emit data past its stop version
                                if (data->stopVersion.present()) {
                                    if (it.version > data->stopVersion.get()) {
                                        fmt::print("DBG) {0} Read data with version {1} > stop version {2} ({3})\n",
                                                   data->key.printable(),
                                                   it.version,
                                                   data->stopVersion.get(),
                                                   it.mutations.size());
                                    }
                                    ASSERT(it.version <= data->stopVersion.get());
                                }
                                buffered.push_back(Standalone<MutationsAndVersionRef>(it));
                                if (DEBUG_CF(data->key)) {
                                    fmt::print("DBG) {0} Live read through {1} ({2})\n",
                                               data->key.printable(),
                                               it.version,
                                               it.mutations.size());
                                }
                            }
                        }
                    }
                }
                when(wait(data->checkVersion.whenAtLeast(lastCheckVersion + 1))) {
                    // wake loop and start new whenAtLeast whenever checkVersion is set
                    lastCheckVersion = data->checkVersion.get();
                }
                when(wait(data->pendingCheck.empty() ? Never()
                                                     : results->whenAtLeast(data->pendingCheck.front().first))) {
                    if (data->pendingCheck.empty() || data->pendingCheck.front().first > nextCheckVersion) {
                        // pendingCheck wasn't empty before whenAtLeast, and nextCheckVersion = the front version, so if
                        // either of these are true, the data was popped concurrently and we can move on to checking the
                        // next value
                        CODE_PROBE(true, "popped while waiting for whenAtLeast to check next value");
                        continue;
                    }
                    while (!buffered.empty() && buffered.front().version < data->poppingVersion) {
                        CODE_PROBE(true, "live reader ignoring data that is being popped");
                        buffered.pop_front();
                    }
                    if (buffered.empty()) {
                        // nothing buffered at a version we expected: only legal if a pop covered it
                        if (data->poppingVersion < data->pendingCheck.front().first) {
                            fmt::print("DBG) {0} Buffered empty after ready for check, and data not popped! popped "
                                       "{1}, popping {2}, check {3}\n",
                                       data->key.printable(),
                                       data->poppedVersion,
                                       data->poppingVersion,
                                       data->pendingCheck.front().first);
                        }
                        ASSERT(data->poppingVersion >= data->pendingCheck.front().first);
                        data->pendingCheck.pop_front();
                    } else {
                        Version v = buffered.front().version;
                        if (DEBUG_CF(data->key)) {
                            fmt::print("DBG) {0} Live checking through {1}\n",
                                       data->key.printable(),
                                       data->pendingCheck.front().first);
                        }
                        checkNextResult(data->key, buffered, data->pendingCheck);
                        if (DEBUG_CF(data->key)) {
                            fmt::print("DBG) {0} Live Checked through {1}\n", data->key.printable(), v);
                        }
                        // once popWindow + popDelayWindow writes are verified, pop the oldest
                        // popWindow of them, leaving popDelayWindow in the model
                        if (data->popDelayWindow >= 0 && data->popWindow >= 0 &&
                            data->writesByVersion.size() == data->popWindow + data->popDelayWindow) {
                            data->pop(cx, data->writesByVersion[data->popWindow - 1].first + 1);
                            ASSERT(data->writesByVersion.size() == data->popDelayWindow);
                        }
                    }
                }
            }
        }
    } catch (Error& e) {
        // NOTE(review): 'throw e;' rethrows a copy; plain 'throw;' would be the usual
        // idiom — confirm whether the copy is intentional here
        throw e;
    }
}
// Performs a single bounded ("historical") read of the change feed for `data` over
// [begin, end) and verifies the received mutations against the expected writes recorded
// in data->writesByVersion.
// If skipPopped is true, both expected and received data below data->poppingVersion are
// discarded before comparison, since a concurrent pop may legitimately have removed them.
ACTOR Future<Void> historicReader(Database cx,
Reference<FeedTestData> data,
Version begin,
Version end,
bool skipPopped) {
// Expected (version, value) pairs falling inside the read window.
state std::deque<std::pair<Version, Optional<Value>>> checkData;
// Mutation batches actually received from the feed stream.
state std::deque<Standalone<MutationsAndVersionRef>> buffered;
state Reference<ChangeFeedData> results = makeReference<ChangeFeedData>();
state Future<Void> stream = cx->getChangeFeedStream(results, data->feedID, begin, end, data->keyRange);
state Version poppedVersionAtStart = data->poppedVersion;
if (DEBUG_CF(data->key)) {
fmt::print("DBG) {0} Starting historical read {1} - {2}\n", data->key.printable(), begin, end);
}
// TODO could cpu optimize this
// Collect the expected writes whose commit version falls in [begin, end).
for (auto& it : data->writesByVersion) {
if (it.first >= end) {
break;
}
if (it.first >= begin) {
checkData.push_back(it);
}
}
try {
// Drain the stream until end_of_stream, buffering everything received.
loop {
Standalone<VectorRef<MutationsAndVersionRef>> res = waitNext(results->mutations.getFuture());
for (auto& it : res) {
// A single mutation carrying lastEpochEndPrivateKey signals a rollback;
// let rollbackFeed apply it to the buffered data.
if (it.mutations.size() == 1 && it.mutations.back().param1 == lastEpochEndPrivateKey) {
rollbackFeed(data->key, buffered, it.version, it.mutations.back());
} else {
if (it.mutations.size() == 0) {
// FIXME: THIS SHOULD NOT HAPPEN
// FIXME: these are also getting sent past stopVersion!!
} else {
if (data->stopVersion.present()) {
// no data should be produced past the version the feed was stopped at
ASSERT(it.version <= data->stopVersion.get());
}
buffered.push_back(Standalone<MutationsAndVersionRef>(it));
}
}
}
}
} catch (Error& e) {
// end_of_stream is the normal termination of a bounded feed read; anything else is real.
if (e.code() != error_code_end_of_stream) {
throw;
}
}
if (skipPopped) {
while (!buffered.empty() && buffered.front().version < data->poppingVersion) {
// ignore data
buffered.pop_front();
}
while (!checkData.empty() && checkData.front().first < data->poppingVersion) {
checkData.pop_front();
}
}
// Compare received mutations against expected writes, one version at a time.
while (!checkData.empty() && !buffered.empty()) {
checkNextResult(data->key, buffered, checkData);
}
// Change feed missing data it should have
ASSERT(checkData.empty());
// Change feed read extra data it shouldn't have
ASSERT(buffered.empty());
// check pop version of cursor
// TODO: this check might not always work if read is for old data and SS is way behind
// FIXME: this check doesn't work for now, probably due to above comment
/*if (data->poppingVersion != 0) {
ASSERT(results->popVersion >= poppedVersionAtStart && results->popVersion <= data->poppingVersion);
}*/
return Void();
}
// Operation types randomly chosen by the workload client each tick, selected with a
// weighted distribution via pickRandomOp(). Some weights may be zeroed at construction
// (STOP when stops are disabled, CREATE_DELETE when disabled, POP unless op-driven pop mode).
enum Op {
CREATE_DELETE = 0, // create a new feed or destroy an existing one (50/50, keeps feed count near target)
READ = 1, // small random historical read of one feed
UPDATE_CLEAR = 2, // set or clear the feed's key in a transaction
STOP = 3, // stop a feed (only when doStops was rolled)
POP = 4, // pop a feed (only in op-driven pop mode, popMode == 2)
OP_COUNT = 5 /* keep this last */
};
// Simulation workload that exercises change feed operations (create/destroy, historical
// and live reads, update/clear, stop, pop) against randomly generated single-key feeds,
// then verifies at check() time that every surviving feed saw exactly the data it should.
struct ChangeFeedOperationsWorkload : TestWorkload {
// test settings
double testDuration; // seconds the client loop runs before check()
int operationsPerSecond; // poisson rate of the client operation loop
int targetFeeds; // number of feeds created in setup (per client)
bool clientsDisjointKeyspace; // if true, split keyspace into per-client ranges
bool clearKeyWhenDestroy; // if true, also clear the feed's key when destroying it
double clearFrequency; // probability an UPDATE_CLEAR clears instead of sets
int popMode; // 0=none, 1=read-driven, 2=op-driven
int opWeights[Op::OP_COUNT]; // per-op random weights for pickRandomOp
int totalOpWeight; // sum of opWeights
Future<Void> client; // the running operations client, cancelled in check()
std::unordered_set<Key> usedKeys; // keys already assigned to a feed (live or destroyed)
std::vector<Reference<FeedTestData>> data; // live (non-destroyed) feeds
ChangeFeedOperationsWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
testDuration = getOption(options, "testDuration"_sr, 60.0);
// NOTE(review): int field initialized from a double default (100.0) — value is truncated.
operationsPerSecond = getOption(options, "opsPerSecond"_sr, 100.0);
// Derive test parameters deterministically from the shared random number so all
// clients agree on the same configuration; consume digits of `rand` one knob at a time.
int64_t rand = wcx.sharedRandomNumber;
targetFeeds = deterministicRandom()->randomExp(1, 1 + rand % 10);
targetFeeds *= (0.8 + (deterministicRandom()->random01() * 0.4));
targetFeeds = std::max(1, targetFeeds / clientCount);
rand /= 10;
clientsDisjointKeyspace = rand % 2;
rand /= 2;
clearKeyWhenDestroy = rand % 2;
rand /= 2;
bool doStops = rand % 2;
rand /= 2;
bool noCreateDelete = rand % 10 == 0;
rand /= 10;
popMode = rand % 3; // 0=none, 1=read-driven, 2=op-driven
rand /= 3;
ASSERT(clientId >= 0);
ASSERT(clientId < clientCount);
ASSERT(clientCount < 255);
clearFrequency = deterministicRandom()->random01();
// Random positive weight for each op; some are zeroed below based on the knobs above.
for (int i = 0; i < Op::OP_COUNT; i++) {
int randWeight = deterministicRandom()->randomExp(0, 5);
ASSERT(randWeight > 0);
opWeights[i] = randWeight;
}
if (!doStops) {
opWeights[Op::STOP] = 0;
}
if (noCreateDelete) {
opWeights[Op::CREATE_DELETE] = 0;
}
if (popMode != 2) {
opWeights[Op::POP] = 0;
}
std::string weightString = "|";
totalOpWeight = 0;
for (int i = 0; i < Op::OP_COUNT; i++) {
totalOpWeight += opWeights[i];
weightString += std::to_string(opWeights[i]) + "|";
}
TraceEvent("ChangeFeedOperationsInit")
.detail("TargetFeeds", targetFeeds)
.detail("DisjointKeyspace", clientsDisjointKeyspace)
.detail("ClearWhenDestroy", clearKeyWhenDestroy)
.detail("DoStops", doStops)
.detail("NoCreateDelete", noCreateDelete)
.detail("Weights", weightString);
}
// Returns a random key not yet used by any feed; loops until insert into usedKeys succeeds.
Key unusedNewRandomKey() {
while (true) {
Key k = newRandomKey();
if (usedKeys.insert(k).second) {
return k;
}
}
}
// Generates a random key guaranteed disjoint across clients, either by partitioning the
// double-keyspace into per-client ranges or by stamping the clientId into the last byte.
Key newRandomKey() {
if (clientsDisjointKeyspace) {
double keyspaceRange = (1.0 / clientCount);
double randPartOfRange = deterministicRandom()->random01() * (keyspaceRange - 0.0001);
double randomDouble = clientId * keyspaceRange + 0.0001 + randPartOfRange;
return doubleToTestKey(randomDouble);
} else {
// this is kinda hacky but it guarantees disjoint keys per client
Key ret = doubleToTestKey(deterministicRandom()->random01());
std::string str = ret.toString();
str.back() = (uint8_t)clientId;
return Key(str);
}
}
// Pick op with weighted average
// Walks the weight array subtracting each weight from r until the owning bucket is found;
// zero-weight (disabled) ops are always skipped.
Op pickRandomOp() {
int r = deterministicRandom()->randomInt(0, totalOpWeight);
int i = 0;
while (i < Op::OP_COUNT && (opWeights[i] <= r || opWeights[i] == 0)) {
r -= opWeights[i];
i++;
}
ASSERT(i < Op::OP_COUNT);
return (Op)i;
}
// Creates a new feed on a fresh key: writes an initial value and registers the feed in the
// same transaction, then starts its live reader and adds it to `data`.
ACTOR Future<Void> createNewFeed(Database cx, ChangeFeedOperationsWorkload* self) {
state Transaction tr(cx);
state Key key = self->unusedNewRandomKey();
state Reference<FeedTestData> feedData = makeReference<FeedTestData>(key, self->popMode == 1);
state Value initialValue = feedData->nextValue();
if (DEBUG_CF(key)) {
fmt::print("DBG) Creating {0}\n", key.printable());
}
loop {
try {
tr.set(key, initialValue);
wait(updateChangeFeed(&tr, feedData->feedID, ChangeFeedStatus::CHANGE_FEED_CREATE, feedData->keyRange));
wait(tr.commit());
Version createVersion = tr.getCommittedVersion();
if (DEBUG_CF(key)) {
fmt::print("DBG) Created {0} @ {1}\n", key.printable(), createVersion);
}
feedData->update(createVersion, initialValue);
feedData->liveReader = liveReader(cx, feedData, createVersion);
self->data.push_back(feedData);
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
std::string description() const override { return "ChangeFeedOperationsWorkload"; }
Future<Void> setup(Database const& cx) override { return _setup(cx, this); }
ACTOR Future<Void> _setup(Database cx, ChangeFeedOperationsWorkload* self) {
// create initial targetFeeds feeds
TraceEvent("ChangeFeedOperationsSetup").detail("InitialFeeds", self->targetFeeds).log();
state int i;
for (i = 0; i < self->targetFeeds; i++) {
wait(self->createNewFeed(cx, self));
}
TraceEvent("ChangeFeedOperationsSetupComplete");
return Void();
}
Future<Void> start(Database const& cx) override {
client = changeFeedOperationsClient(cx->clone(), this);
return delay(testDuration);
}
Future<bool> check(Database const& cx) override {
// Cancel the operations client before verifying final state.
client = Future<Void>();
return _check(cx, this);
}
// Final verification for one feed: waits for its live reader and pending pops, re-reads all
// non-popped data, and (with popped-data checking currently disabled) reads [0, poppedVersion).
ACTOR Future<Void> checkFeed(Database cx, ChangeFeedOperationsWorkload* self, Reference<FeedTestData> feedData) {
state int popIdx;
feedData->testComplete();
if (DEBUG_CF(feedData->key)) {
fmt::print("Final check {0} waiting on live reader\n", feedData->key.printable());
}
// wait on live reader and pops to make sure they complete without error
wait(feedData->liveReader);
if (DEBUG_CF(feedData->key)) {
fmt::print("Final check {0} waiting on {1} pops\n", feedData->key.printable(), feedData->pops.size());
}
for (popIdx = 0; popIdx < feedData->pops.size(); popIdx++) {
wait(feedData->pops[popIdx]);
}
// do final check, read everything not popped
if (DEBUG_CF(feedData->key)) {
// NOTE(review): format string only uses {0}; the pops.size() argument is unused by fmt.
fmt::print("Final check {0} waiting on data check\n", feedData->key.printable(), feedData->pops.size());
}
wait(self->doRead(cx, feedData, feedData->writesByVersion.size()));
// ensure reading [0, poppedVersion) returns no results
if (feedData->poppedVersion > 0) {
if (DEBUG_CF(feedData->key)) {
// NOTE(review): format string only uses {0}; the pops.size() argument is unused by fmt.
fmt::print(
"Final check {0} waiting on read popped check\n", feedData->key.printable(), feedData->pops.size());
}
// FIXME: re-enable checking for popped data by changing skipPopped back to false!
wait(historicReader(cx, feedData, 0, feedData->poppedVersion, true));
}
return Void();
}
ACTOR Future<bool> _check(Database cx, ChangeFeedOperationsWorkload* self) {
TraceEvent("ChangeFeedOperationsCheck").detail("FeedCount", self->data.size()).log();
fmt::print("Checking {0} feeds\n", self->data.size()); // TODO REMOVE
state std::vector<Future<Void>> feedChecks;
// Only verify feeds that were not destroyed; destroyed feeds were removed from `data`,
// but one may still be mid-destroy (destroying == true).
for (int i = 0; i < self->data.size(); i++) {
if (self->data[i]->destroying) {
continue;
}
if (DEBUG_CF(self->data[i]->key)) {
fmt::print("Final check {0}\n", self->data[i]->key.printable());
}
feedChecks.push_back(self->checkFeed(cx, self, self->data[i]));
}
wait(waitForAll(feedChecks));
// FIXME: check that all destroyed feeds are actually destroyed?
TraceEvent("ChangeFeedOperationsCheckComplete");
return true;
}
void getMetrics(std::vector<PerfMetric>& m) override {}
// Stops the feed (no new mutations are expected past the committed version) and records
// the first stop version on feedData.
ACTOR Future<Void> stopFeed(Database cx, Reference<FeedTestData> feedData) {
state Transaction tr(cx);
if (DEBUG_CF(feedData->key)) {
fmt::print("DBG) {0} Stopping\n", feedData->key.printable());
}
loop {
try {
wait(updateChangeFeed(&tr, feedData->feedID, ChangeFeedStatus::CHANGE_FEED_STOP, feedData->keyRange));
wait(tr.commit());
Version stopVersion = tr.getCommittedVersion();
if (!feedData->stopVersion.present()) {
feedData->stopVersion = stopVersion;
}
if (DEBUG_CF(feedData->key)) {
fmt::print("DBG) {0} Stopped @ {1}\n", feedData->key.printable(), stopVersion);
}
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Op-driven pop: pops the feed just past its oldest recorded write, if it has any.
void popFeed(Database cx, Reference<FeedTestData> feedData) {
if (!feedData->writesByVersion.empty()) {
feedData->pop(cx, feedData->writesByVersion.front().first + 1);
}
}
// Destroys the feed (optionally clearing its key) and removes it from `data` via swapAndPop.
ACTOR Future<Void> destroyFeed(Database cx, ChangeFeedOperationsWorkload* self, int feedIdx) {
state Reference<FeedTestData> feedData = self->data[feedIdx];
state Transaction tr(cx);
feedData->destroying = true;
if (DEBUG_CF(feedData->key)) {
fmt::print("DBG) {0} Destroying\n", feedData->key.printable());
}
loop {
try {
wait(
updateChangeFeed(&tr, feedData->feedID, ChangeFeedStatus::CHANGE_FEED_DESTROY, feedData->keyRange));
if (self->clearKeyWhenDestroy) {
tr.clear(feedData->key);
}
wait(tr.commit());
feedData->destroyed = true;
// remove feed from list
ASSERT(self->data[feedIdx]->key == feedData->key);
swapAndPop(&self->data, feedIdx);
if (DEBUG_CF(feedData->key)) {
fmt::print("DBG) {0} Destroyed @ {1}\n", feedData->key.printable(), tr.getCommittedVersion());
}
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Historical read of roughly targetReadWidth consecutive writes of the feed; reads the
// whole feed when targetReadWidth covers it. End version is chosen so that the last
// included version (endVersion - 1) is a committed write, which the checker relies on.
ACTOR Future<Void> doRead(Database cx, Reference<FeedTestData> feedData, int targetReadWidth) {
if (feedData->writesByVersion.empty()) {
return Void();
}
Version beginVersion;
Version endVersion;
if (targetReadWidth >= feedData->writesByVersion.size()) {
beginVersion = feedData->writesByVersion.front().first;
endVersion = feedData->writesByVersion.back().first + 1;
} else {
// either up to or including end
int randStart = deterministicRandom()->randomInt(0, feedData->writesByVersion.size() - targetReadWidth);
beginVersion = feedData->writesByVersion[randStart].first;
int end = randStart + targetReadWidth;
if (end == feedData->writesByVersion.size()) {
endVersion = feedData->writesByVersion.back().first + 1;
} else {
// Make sure last included value (end version -1) is a committed version for checking
endVersion = feedData->writesByVersion[end].first + 1;
}
}
if (DEBUG_CF(feedData->key)) {
fmt::print("DBG) {0} Reading @ {1} - {2}\n", feedData->key.printable(), beginVersion, endVersion);
}
// FIXME: this sometimes reads popped data!
wait(historicReader(cx, feedData, beginVersion, endVersion, true));
if (DEBUG_CF(feedData->key)) {
fmt::print("DBG) {0} Read complete\n", feedData->key.printable());
}
return Void();
}
// Randomly sets or clears the feed's key in a transaction and records the committed
// write on feedData so readers can verify it later.
ACTOR Future<Void> doUpdateClear(Database cx,
ChangeFeedOperationsWorkload* self,
Reference<FeedTestData> feedData) {
state Transaction tr(cx);
state Optional<Value> updateValue;
// if value is already not set, don't do a clear, otherwise pick either
if (feedData->lastCleared || deterministicRandom()->random01() > self->clearFrequency) {
updateValue = feedData->nextValue();
if (DEBUG_CF(feedData->key)) {
fmt::print("DBG) {0} Setting {1}\n", feedData->key.printable(), updateValue.get().printable());
}
} else if (DEBUG_CF(feedData->key)) {
fmt::print("DBG) {0} Clearing\n", feedData->key.printable());
}
loop {
try {
if (updateValue.present()) {
tr.set(feedData->key, updateValue.get());
} else {
tr.clear(feedData->key);
}
wait(tr.commit());
Version writtenVersion = tr.getCommittedVersion();
if (DEBUG_CF(feedData->key) && updateValue.present()) {
fmt::print("DBG) {0} Set {1} @ {2}\n",
feedData->key.printable(),
updateValue.get().printable(),
writtenVersion);
}
if (DEBUG_CF(feedData->key) && !updateValue.present()) {
fmt::print("DBG) {0} Cleared @ {1}\n", feedData->key.printable(), writtenVersion);
}
feedData->update(writtenVersion, updateValue);
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Main client loop: once per poisson-spaced tick, picks a weighted random op and a random
// feed and executes the op. Runs until cancelled by check().
ACTOR Future<Void> changeFeedOperationsClient(Database cx, ChangeFeedOperationsWorkload* self) {
state double last = now();
loop {
state Future<Void> waitNextOp = poisson(&last, 1.0 / self->operationsPerSecond);
Op op = self->pickRandomOp();
// feedIdx is chosen up front; unused when CREATE_DELETE creates a new feed.
int feedIdx = deterministicRandom()->randomInt(0, self->data.size());
if (op == Op::CREATE_DELETE) {
// bundle these together so random creates/deletes keep about the target number of feeds
if (deterministicRandom()->random01() < 0.5 || self->data.size() == 1) {
wait(self->createNewFeed(cx, self));
} else {
wait(self->destroyFeed(cx, self, feedIdx));
}
} else if (op == Op::READ) {
// relatively small random read
wait(self->doRead(cx, self->data[feedIdx], deterministicRandom()->randomExp(2, 8)));
} else if (op == Op::UPDATE_CLEAR) {
wait(self->doUpdateClear(cx, self, self->data[feedIdx]));
} else if (op == Op::STOP) {
wait(self->stopFeed(cx, self->data[feedIdx]));
} else if (op == Op::POP) {
self->popFeed(cx, self->data[feedIdx]);
} else {
ASSERT(false);
}
wait(waitNextOp);
}
}
};
// Registers the workload under the test name "ChangeFeedOperations".
WorkloadFactory<ChangeFeedOperationsWorkload> ChangeFeedOperationsWorkloadFactory("ChangeFeedOperations");

View File

@ -325,6 +325,7 @@ struct PhysicalShardMoveWorkLoad : TestWorkload {
TraceEvent("TestCancelDataMoveEnd").detail("DataMove", dataMove.toString());
}
TraceEvent("TestMoveShardStartMoveKeys").detail("DataMove", dataMoveId);
wait(moveKeys(cx,
dataMoveId,
keys,

View File

@ -215,7 +215,8 @@ struct SkewedReadWriteWorkload : ReadWriteCommon {
self->startReadWriteClients(cx, clients);
wait(timeout(waitForAll(clients), self->testDuration / self->skewRound, Void()));
clients.clear();
wait(delay(5.0) >> updateServerShards(cx, self));
wait(delay(5.0));
wait(updateServerShards(cx, self));
}
return Void();

View File

@ -175,6 +175,10 @@ struct SSCheckpointRestoreWorkload : TestWorkload {
ASSERT(res[i] == kvRange[i]);
}
Future<Void> close = kvStore->onClosed();
kvStore->dispose();
wait(close);
int ignore = wait(setDDMode(cx, 1));
return Void();
}

View File

@ -30,7 +30,6 @@
#include "fdbclient/TenantManagement.actor.h"
#include "fdbclient/TenantSpecialKeys.actor.h"
#include "fdbclient/ThreadSafeTransaction.h"
#include "fdbclient/libb64/decode.h"
#include "fdbrpc/simulator.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbserver/Knobs.h"
@ -38,6 +37,7 @@
#include "flow/IRandom.h"
#include "flow/ThreadHelper.actor.h"
#include "flow/flow.h"
#include "libb64/decode.h"
#include "flow/actorcompiler.h" // This must be the last #include.
struct TenantManagementWorkload : TestWorkload {

View File

@ -519,7 +519,12 @@ void FastAllocator<Size>::getMagazine() {
--g_allocation_tracing_disabled;
}
#endif
block = (void**)::allocate(magazine_size * Size, /*allowLargePages*/ false, /*includeGuardPages*/ true);
#ifdef VALGRIND
const bool includeGuardPages = false;
#else
const bool includeGuardPages = true;
#endif
block = (void**)::allocate(magazine_size * Size, /*allowLargePages*/ false, includeGuardPages);
#endif
// void** block = new void*[ magazine_size * PSize ];

View File

@ -168,6 +168,13 @@ void FlowKnobs::initialize(Randomize randomize, IsSimulated isSimulated) {
init( LOW_PRIORITY_DELAY_COUNT, 5 );
init( LOW_PRIORITY_MAX_DELAY, 5.0 );
// HTTP
init( HTTP_READ_SIZE, 128*1024 );
init( HTTP_SEND_SIZE, 32*1024 );
init( HTTP_VERBOSE_LEVEL, 0 );
init( HTTP_REQUEST_ID_HEADER, "" );
init( HTTP_RESPONSE_SKIP_VERIFY_CHECKSUM_FOR_PARTIAL_CONTENT, false );
//IAsyncFile
init( INCREMENTAL_DELETE_TRUNCATE_AMOUNT, 5e8 ); //500MB
init( INCREMENTAL_DELETE_INTERVAL, 1.0 ); //every 1 second

View File

@ -146,8 +146,8 @@ public:
void initMetrics() override;
// INetworkConnections interface
Future<Reference<IConnection>> connect(NetworkAddress toAddr, const std::string& host) override;
Future<Reference<IConnection>> connectExternal(NetworkAddress toAddr, const std::string& host) override;
Future<Reference<IConnection>> connect(NetworkAddress toAddr, tcp::socket* existingSocket = nullptr) override;
Future<Reference<IConnection>> connectExternal(NetworkAddress toAddr) override;
Future<Reference<IUDPSocket>> createUDPSocket(NetworkAddress toAddr) override;
Future<Reference<IUDPSocket>> createUDPSocket(bool isV6) override;
// The mock DNS methods should only be used in simulation.
@ -507,7 +507,7 @@ public:
UID getDebugID() const override { return id; }
tcp::socket& getSocket() { return socket; }
tcp::socket& getSocket() override { return socket; }
private:
UID id;
@ -839,10 +839,15 @@ public:
: id(nondeterministicRandom()->randomUniqueID()), socket(io_service), ssl_sock(socket, context->mutate()),
sslContext(context) {}
explicit SSLConnection(Reference<ReferencedObject<boost::asio::ssl::context>> context, tcp::socket* existingSocket)
: id(nondeterministicRandom()->randomUniqueID()), socket(std::move(*existingSocket)),
ssl_sock(socket, context->mutate()), sslContext(context) {}
// This is not part of the IConnection interface, because it is wrapped by INetwork::connect()
ACTOR static Future<Reference<IConnection>> connect(boost::asio::io_service* ios,
Reference<ReferencedObject<boost::asio::ssl::context>> context,
NetworkAddress addr) {
NetworkAddress addr,
tcp::socket* existingSocket = nullptr) {
std::pair<IPAddress, uint16_t> peerIP = std::make_pair(addr.ip, addr.port);
auto iter(g_network->networkInfo.serverTLSConnectionThrottler.find(peerIP));
if (iter != g_network->networkInfo.serverTLSConnectionThrottler.end()) {
@ -857,9 +862,15 @@ public:
}
}
if (existingSocket != nullptr) {
Reference<SSLConnection> self(new SSLConnection(context, existingSocket));
self->peer_address = addr;
self->init();
return self;
}
state Reference<SSLConnection> self(new SSLConnection(*ios, context));
self->peer_address = addr;
try {
auto to = tcpEndpoint(self->peer_address);
BindPromise p("N2_ConnectError", self->id);
@ -869,7 +880,7 @@ public:
wait(onConnected);
self->init();
return self;
} catch (Error& e) {
} catch (Error&) {
// Either the connection failed, or was cancelled by the caller
self->closeSocket();
throw;
@ -1097,7 +1108,7 @@ public:
UID getDebugID() const override { return id; }
tcp::socket& getSocket() { return socket; }
tcp::socket& getSocket() override { return socket; }
ssl_socket& getSSLSocket() { return ssl_sock; }
@ -1818,17 +1829,17 @@ THREAD_HANDLE Net2::startThread(THREAD_FUNC_RETURN (*func)(void*), void* arg, in
return ::startThread(func, arg, stackSize, name);
}
Future<Reference<IConnection>> Net2::connect(NetworkAddress toAddr, const std::string& host) {
Future<Reference<IConnection>> Net2::connect(NetworkAddress toAddr, tcp::socket* existingSocket) {
if (toAddr.isTLS()) {
initTLS(ETLSInitState::CONNECT);
return SSLConnection::connect(&this->reactor.ios, this->sslContextVar.get(), toAddr);
return SSLConnection::connect(&this->reactor.ios, this->sslContextVar.get(), toAddr, existingSocket);
}
return Connection::connect(&this->reactor.ios, toAddr);
}
Future<Reference<IConnection>> Net2::connectExternal(NetworkAddress toAddr, const std::string& host) {
return connect(toAddr, host);
Future<Reference<IConnection>> Net2::connectExternal(NetworkAddress toAddr) {
return connect(toAddr);
}
Future<Reference<IUDPSocket>> Net2::createUDPSocket(NetworkAddress toAddr) {

View File

@ -173,6 +173,19 @@ public:
}
bool coinflip() { return (this->random01() < 0.5); }
// Picks a number between 2^minExp and 2^maxExp, but uniformly distributed over exponential buckets 2^n - 2^n+1
// For example, randomExp(0, 4) would have a 25% chance of returning 1, a 25% chance of returning 2-3, a 25% chance
// of returning 4-7, and a 25% chance of returning 8-15
// Similar in Expected Value to doing 1 << randomInt(minExp, maxExp+1), except numbers returned aren't just powers
// of 2
int randomExp(int minExp, int maxExp) {
if (minExp == maxExp) { // N=2, case
return 1 << minExp;
}
int val = 1 << this->randomInt(minExp, maxExp);
return this->randomInt(val, val * 2);
}
};
extern FILE* randLog;

View File

@ -235,6 +235,13 @@ public:
int LOW_PRIORITY_DELAY_COUNT;
double LOW_PRIORITY_MAX_DELAY;
// HTTP
int HTTP_READ_SIZE;
int HTTP_SEND_SIZE;
int HTTP_VERBOSE_LEVEL;
std::string HTTP_REQUEST_ID_HEADER;
bool HTTP_RESPONSE_SKIP_VERIFY_CHECKSUM_FOR_PARTIAL_CONTENT; // skip verify md5 checksum for 206 response
// IAsyncFile
int64_t INCREMENTAL_DELETE_TRUNCATE_AMOUNT;
double INCREMENTAL_DELETE_INTERVAL;

View File

@ -175,6 +175,7 @@ public: // introduced features
PROTOCOL_VERSION_FEATURE(0x0FDB00B072000000LL, ShardEncodeLocationMetaData);
PROTOCOL_VERSION_FEATURE(0x0FDB00B072000000LL, Tenants);
PROTOCOL_VERSION_FEATURE(0x0FDB00B072000000LL, Metacluster);
PROTOCOL_VERSION_FEATURE(0x0FDB00B072000000LL, BlobGranuleFile);
};
template <>

View File

@ -1978,22 +1978,25 @@ Future<decltype(std::declval<Fun>()(std::declval<T>()).getValue())> runAfter(Fut
return res;
}
ACTOR template <class T, class U>
Future<U> runAfter(Future<T> lhs, Future<U> rhs) {
T val1 = wait(lhs);
U res = wait(rhs);
return res;
}
template <class T, class Fun>
auto operator>>=(Future<T> lhs, Fun&& rhs) -> Future<decltype(rhs(std::declval<T>()))> {
return runAfter(lhs, std::forward<Fun>(rhs));
}
/*
* NOTE: This implementation doesn't really enforce the ACTOR execution order. See issue #7708
ACTOR template <class T, class U>
Future<U> runAfter(Future<T> lhs, Future<U> rhs) {
T val1 = wait(lhs);
U res = wait(rhs);
return res;
}
template <class T, class U>
Future<U> operator>>(Future<T> const& lhs, Future<U> const& rhs) {
return runAfter(lhs, rhs);
return runAfter(lhs, rhs);
}
*/
/*
* IAsyncListener is similar to AsyncVar, but it decouples the input and output, so the translation unit

View File

@ -472,6 +472,8 @@ public:
// At present, implemented by Sim2Conn where we want to disable bits flip for connections between parent process and
// child process, also reduce latency for this kind of connection
virtual bool isStableConnection() const { throw unsupported_operation(); }
virtual boost::asio::ip::tcp::socket& getSocket() = 0;
};
class IListener {
@ -688,9 +690,10 @@ public:
// Make an outgoing connection to the given address. May return an error or block indefinitely in case of
// connection problems!
virtual Future<Reference<IConnection>> connect(NetworkAddress toAddr, const std::string& host = "") = 0;
virtual Future<Reference<IConnection>> connect(NetworkAddress toAddr,
boost::asio::ip::tcp::socket* existingSocket = nullptr) = 0;
virtual Future<Reference<IConnection>> connectExternal(NetworkAddress toAddr, const std::string& host = "") = 0;
virtual Future<Reference<IConnection>> connectExternal(NetworkAddress toAddr) = 0;
// Make an outgoing udp connection and connect to the passed address.
virtual Future<Reference<IUDPSocket>> createUDPSocket(NetworkAddress toAddr) = 0;

View File

@ -293,7 +293,7 @@ Future<Reference<IConnection>> INetworkConnections::connect(const std::string& h
std::function<Future<Reference<IConnection>>(NetworkAddress const&)>,
Reference<IConnection>>(
pickEndpoint,
[=](NetworkAddress const& addr) -> Future<Reference<IConnection>> { return connectExternal(addr, host); });
[=](NetworkAddress const& addr) -> Future<Reference<IConnection>> { return connectExternal(addr); });
}
IUDPSocket::~IUDPSocket() {}

View File

@ -130,8 +130,7 @@ if(WITH_PYTHON)
add_fdb_test(TEST_FILES fast/BackupToDBCorrectnessClean.toml)
add_fdb_test(TEST_FILES fast/BlobGranuleVerifySmall.toml)
add_fdb_test(TEST_FILES fast/BlobGranuleVerifySmallClean.toml)
add_fdb_test(TEST_FILES fast/BlobGranuleVerifyAtomicOps.toml)
add_fdb_test(TEST_FILES fast/BlobGranuleVerifyCycle.toml)
add_fdb_test(TEST_FILES fast/BlobGranuleMoveVerifyCycle.toml)
add_fdb_test(TEST_FILES fast/CacheTest.toml)
add_fdb_test(TEST_FILES fast/CloggedSideband.toml)
add_fdb_test(TEST_FILES fast/CompressionUtilsUnit.toml)
@ -140,6 +139,7 @@ if(WITH_PYTHON)
add_fdb_test(TEST_FILES fast/CycleAndLock.toml)
add_fdb_test(TEST_FILES fast/CycleTest.toml)
add_fdb_test(TEST_FILES fast/ChangeFeeds.toml)
add_fdb_test(TEST_FILES fast/ChangeFeedOperations.toml)
add_fdb_test(TEST_FILES fast/DataLossRecovery.toml)
add_fdb_test(TEST_FILES fast/EncryptionOps.toml)
# TODO: fix failures and renable the test
@ -199,6 +199,8 @@ if(WITH_PYTHON)
add_fdb_test(TEST_FILES fast/PhysicalShardMove.toml IGNORE)
add_fdb_test(TEST_FILES fast/StorageServerCheckpointRestore.toml IGNORE)
endif()
add_fdb_test(TEST_FILES rare/BlobGranuleVerifyAtomicOps.toml)
add_fdb_test(TEST_FILES rare/BlobGranuleVerifyCycle.toml)
add_fdb_test(TEST_FILES rare/CheckRelocation.toml)
add_fdb_test(TEST_FILES rare/ClogUnclog.toml)
add_fdb_test(TEST_FILES rare/CloggedCycleWithKills.toml)

View File

@ -0,0 +1,48 @@
[configuration]
blobGranulesEnabled = true
allowDefaultTenant = false
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4]
[[knobs]]
bg_range_source = "blobRangeKeys"
[[test]]
testTitle = 'BlobGranuleMoveVerifyCycle'
[[test.workload]]
testName = 'Cycle'
transactionsPerSecond = 250.0
testDuration = 60.0
expectedRate = 0
[[test.workload]]
testName = 'RandomMoveKeys'
testDuration = 60.0
[[test.workload]]
testName = 'BlobGranuleVerifier'
testDuration = 60.0
[[test.workload]]
testName = 'RandomClogging'
testDuration = 60.0
[[test.workload]]
testName = 'Rollback'
meanDelay = 60.0
testDuration = 60.0
[[test.workload]]
testName = 'Attrition'
machinesToKill = 10
machinesToLeave = 3
reboot = true
testDuration = 60.0
[[test.workload]]
testName = 'Attrition'
machinesToKill = 10
machinesToLeave = 3
reboot = true
testDuration = 60.0

View File

@ -1,9 +1,11 @@
[configuration]
blobGranulesEnabled = true
allowDefaultTenant = false
injectTargetedSSRestart = true
injectSSDelay = true
# FIXME: exclude redwood because WriteDuringRead can write massive KV pairs and we don't chunk change feed data on disk yet
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [3, 4]
storageEngineExcludeTypes = [3, 4, 5]
[[knobs]]
bg_range_source = "blobRangeKeys"

View File

@ -3,7 +3,7 @@ blobGranulesEnabled = true
allowDefaultTenant = false
# FIXME: exclude redwood because WriteDuringRead can write massive KV pairs and we don't chunk change feed data on disk yet
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [3, 4]
storageEngineExcludeTypes = [3, 4, 5]
[[knobs]]
bg_range_source = "blobRangeKeys"

View File

@ -0,0 +1,10 @@
[configuration]
allowDefaultTenant = false
# TODO add failure events, and then add a version that also supports randomMoveKeys
[[test]]
testTitle = 'ChangeFeedOperationsTest'
[[test.workload]]
testName = 'ChangeFeedOperations'

View File

@ -1,8 +1,10 @@
[configuration]
blobGranulesEnabled = true
allowDefaultTenant = false
injectTargetedSSRestart = true
injectSSDelay = true
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4]
storageEngineExcludeTypes = [4, 5]
[[knobs]]
bg_range_source = "blobRangeKeys"

View File

@ -1,8 +1,10 @@
[configuration]
blobGranulesEnabled = true
allowDefaultTenant = false
injectTargetedSSRestart = true
injectSSDelay = true
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4]
storageEngineExcludeTypes = [4, 5]
[[knobs]]
bg_range_source = "blobRangeKeys"

View File

@ -1,4 +1,4 @@
storageEngineExcludeTypes=3
storageEngineExcludeTypes=[3, 4, 5]
;Take snap and do cycle test
testTitle=SnapCyclePre

View File

@ -1,3 +1,4 @@
storageEngineExcludeTypes=[4, 5]
buggify=off
testTitle=SnapCycleRestore

View File

@ -1,4 +1,4 @@
storageEngineExcludeTypes=3
storageEngineExcludeTypes=[3, 4, 5]
logAntiQuorum = 0

View File

@ -1,3 +1,5 @@
storageEngineExcludeTypes=[4, 5]
testTitle=RestoreBackup
simBackupAgents=BackupToFile
clearAfterTest=false

View File

@ -1,4 +1,4 @@
storageEngineExcludeTypes=3
storageEngineExcludeTypes=[3, 4, 5]
;write 1000 Keys ending with even numbers
testTitle=SnapTestPre

View File

@ -1,3 +1,5 @@
storageEngineExcludeTypes=[4, 5]
buggify=off
; verify all keys are even numbered

Some files were not shown because too many files have changed in this diff Show More