Merge remote-tracking branch 'origin/master' into add-master-logging

sfc-gh-tclinkenbeard 2021-05-03 14:49:08 -07:00
commit 0a9289a580
212 changed files with 11087 additions and 1966 deletions

View File

@ -152,6 +152,7 @@ if(CMAKE_SYSTEM_NAME STREQUAL "FreeBSD")
endif()
include(CompileBoost)
include(GetMsgpack)
add_subdirectory(flow)
add_subdirectory(fdbrpc)
add_subdirectory(fdbclient)

View File

@ -36,7 +36,7 @@ Members of the Apple FoundationDB team are part of the core committers helping r
## Contributing
### Opening a Pull Request
We love pull requests! For minor changes, feel free to open up a PR directly. For larger feature development and any changes that may require community discussion, we ask that you discuss your ideas on the [community forums](https://forums.foundationdb.org) prior to opening a PR, and then reference that thread within your PR comment.
We love pull requests! For minor changes, feel free to open up a PR directly. For larger feature development and any changes that may require community discussion, we ask that you discuss your ideas on the [community forums](https://forums.foundationdb.org) prior to opening a PR, and then reference that thread within your PR comment. Please refer to [FoundationDB Commit Process](https://github.com/apple/foundationdb/wiki/FoundationDB-Commit-Process) for more detailed guidelines.
CI will be run automatically for core committers, and for community PRs it will be initiated by the request of a core committer. Tests can also be run locally via `ctest`, and core committers can run additional validation on pull requests prior to merging them.

View File

@ -23,6 +23,7 @@
#define FDB_INCLUDE_LEGACY_TYPES
#include "fdbclient/MultiVersionTransaction.h"
#include "fdbclient/MultiVersionAssignmentVars.h"
#include "foundationdb/fdb_c.h"
int g_api_version = 0;
@ -364,6 +365,22 @@ extern "C" DLLEXPORT double fdb_database_get_main_thread_busyness(FDBDatabase* d
return DB(d)->getMainThreadBusyness();
}
// Returns the protocol version reported by the coordinator this client is connected to.
// If a non-zero expected version is given, the future is not set until the protocol version differs from the expected one.
// Note: this will never return if the server is running a protocol from FDB 5.0 or older.
extern "C" DLLEXPORT FDBFuture* fdb_database_get_server_protocol(FDBDatabase* db, uint64_t expected_version) {
Optional<ProtocolVersion> expected;
if (expected_version > 0) {
expected = ProtocolVersion(expected_version);
}
return (
FDBFuture*)(mapThreadFuture<ProtocolVersion,
uint64_t>(DB(db)->getServerProtocol(expected), [](ErrorOr<ProtocolVersion> result) {
return result.map<uint64_t>([](ProtocolVersion pv) { return pv.versionWithFlags(); });
}).extractPtr());
}
extern "C" DLLEXPORT void fdb_transaction_destroy(FDBTransaction* tr) {
try {
TXN(tr)->delref();
@ -583,10 +600,6 @@ extern "C" DLLEXPORT FDBFuture* fdb_transaction_get_approximate_size(FDBTransact
return (FDBFuture*)TXN(tr)->getApproximateSize().extractPtr();
}
extern "C" DLLEXPORT FDBFuture* fdb_get_server_protocol(const char* clusterFilePath) {
return (FDBFuture*)(API->getServerProtocol(clusterFilePath ? clusterFilePath : "").extractPtr());
}
extern "C" DLLEXPORT FDBFuture* fdb_transaction_get_versionstamp(FDBTransaction* tr) {
return (FDBFuture*)(TXN(tr)->getVersionstamp().extractPtr());
}
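
For illustration only, a minimal ctypes sketch of how a binding might wrap the new fdb_database_get_server_protocol entry point; the library path, the raw db pointer, and the wrapper name are assumptions, not part of this commit:

import ctypes

# Assumption: libfdb_c.so is on the loader path; db_ptr is an opaque FDBDatabase*.
lib = ctypes.CDLL("libfdb_c.so")
lib.fdb_database_get_server_protocol.argtypes = [ctypes.c_void_p, ctypes.c_uint64]
lib.fdb_database_get_server_protocol.restype = ctypes.c_void_p  # FDBFuture*

def get_server_protocol(db_ptr, expected_version=0):
    # A non-zero expected_version makes the returned future wait until the
    # coordinator reports a protocol version different from the expected one.
    return lib.fdb_database_get_server_protocol(db_ptr, expected_version)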

View File

@ -189,6 +189,8 @@ DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_create_snapshot(FDBDatabase
DLLEXPORT WARN_UNUSED_RESULT double fdb_database_get_main_thread_busyness(FDBDatabase* db);
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_get_server_protocol(FDBDatabase* db, uint64_t expected_version);
DLLEXPORT void fdb_transaction_destroy(FDBTransaction* tr);
DLLEXPORT void fdb_transaction_cancel(FDBTransaction* tr);
@ -281,8 +283,6 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_transaction_get_committed_version(F
*/
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_approximate_size(FDBTransaction* tr);
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_get_server_protocol(const char* clusterFilePath);
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_versionstamp(FDBTransaction* tr);
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_on_error(FDBTransaction* tr, fdb_error_t error);

View File

@ -1513,17 +1513,17 @@ TEST_CASE("fdb_transaction_get_approximate_size") {
}
}
TEST_CASE("fdb_get_server_protocol") {
TEST_CASE("fdb_database_get_server_protocol") {
// We don't really have any expectations other than "don't crash" here
FDBFuture* protocolFuture = fdb_get_server_protocol(clusterFilePath.c_str());
FDBFuture* protocolFuture = fdb_database_get_server_protocol(db, 0);
uint64_t out;
fdb_check(fdb_future_block_until_ready(protocolFuture));
fdb_check(fdb_future_get_uint64(protocolFuture, &out));
fdb_future_destroy(protocolFuture);
// "Default" cluster file version
protocolFuture = fdb_get_server_protocol(nullptr);
// Passing in an expected version that differs from the cluster's version
protocolFuture = fdb_database_get_server_protocol(db, 0x0FDB00A200090000LL);
fdb_check(fdb_future_block_until_ready(protocolFuture));
fdb_check(fdb_future_get_uint64(protocolFuture, &out));
fdb_future_destroy(protocolFuture);

View File

@ -580,6 +580,20 @@ JNIEXPORT void JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1setOpti
}
}
// Get network thread busyness (updated every 1s)
// A value of 0 indicates that the client is more or less idle
// A value of 1 (or more) indicates that the client is saturated
JNIEXPORT jdouble JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1getMainThreadBusyness(JNIEnv* jenv,
jobject,
jlong dbPtr) {
if (!dbPtr) {
throwParamNotNull(jenv);
return 0;
}
FDBDatabase* database = (FDBDatabase*)dbPtr;
return (jdouble)fdb_database_get_main_thread_busyness(database);
}
JNIEXPORT jboolean JNICALL Java_com_apple_foundationdb_FDB_Error_1predicate(JNIEnv* jenv,
jobject,
jint predicate,

View File

@ -0,0 +1,42 @@
package com.apple.foundationdb.tuple;
import java.nio.charset.Charset;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
public class ByteArrayUtilTest {
@Test
void printableWorksForAllByteValues(){
//Quick test to make sure that no bytes are unprintable
byte[] bytes = new byte[2*((int)Byte.MAX_VALUE+1)];
for(int i=0; i< bytes.length;i++){
bytes[i] = (byte)(i & 0xff);
}
String value = ByteArrayUtil.printable(bytes);
String expected = "\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\x09\\x0a\\x0b\\x0c\\x0d\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd\\xfe\\xff";
Assertions.assertEquals(expected,value,"Incorrect printable string");
}
@Test
void printableWorksForAsciiStrings(){
char[] asciiChars = new char[]{
'!','"','#','$','%','&','\'','(',')','*','+',',','~','.','/',
'0','1','2','3','4','5','6','7','8','9',':',';','<','?','@',
'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',
'[','\\',']','^','_','`',
'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','{','|','}','~',(char)127
};
for(int i=0;i<asciiChars.length;i++){
String substring = new String(asciiChars,0,i);
byte[] asciiBytes = substring.getBytes(Charset.forName("UTF-8"));
String printable = ByteArrayUtil.printable(asciiBytes);
String expected = substring.replace("\\", "\\\\");
Assertions.assertEquals(expected,printable,"Incorrect printable string");
}
}
}

View File

@ -80,6 +80,15 @@ public interface Database extends AutoCloseable, TransactionContext {
*/
DatabaseOptions options();
/**
* Returns a value that indicates the saturation of the client.
* <br>
* <b>Note:</b> By default, this value is updated every second
*
* @return a value where 0 indicates that the client is idle and 1 (or larger) indicates that the client is saturated.
*/
double getMainThreadBusyness();
/**
* Runs a read-only transactional function against this {@code Database} with retry logic.
* {@link Function#apply(Object) apply(ReadTransaction)} will be called on the

View File

@ -150,6 +150,16 @@ class FDBDatabase extends NativeObjectWrapper implements Database, OptionConsume
}
}
@Override
public double getMainThreadBusyness() {
pointerReadLock.lock();
try {
return Database_getMainThreadBusyness(getPtr());
} finally {
pointerReadLock.unlock();
}
}
@Override
public Executor getExecutor() {
return executor;
@ -163,4 +173,5 @@ class FDBDatabase extends NativeObjectWrapper implements Database, OptionConsume
private native long Database_createTransaction(long cPtr);
private native void Database_dispose(long cPtr);
private native void Database_setOption(long cPtr, int code, byte[] value) throws FDBException;
private native double Database_getMainThreadBusyness(long cPtr);
}

View File

@ -419,6 +419,9 @@ public class ByteArrayUtil extends FastByteComparisons {
return ByteBuffer.wrap(src).order(ByteOrder.LITTLE_ENDIAN).getLong();
}
private static final char[] hexChars =
new char[] { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
/**
* Gets a human-readable version of a byte array. The bytes that correspond to
* ASCII printable characters [32-127) are passed through. Other bytes are
@ -437,7 +440,14 @@ public class ByteArrayUtil extends FastByteComparisons {
byte b = val[i];
if (b >= 32 && b < 127 && b != '\\') s.append((char)b);
else if (b == '\\') s.append("\\\\");
else s.append(String.format("\\x%02x", b));
else {
//use a lookup table here to avoid doing an expensive String.format() call
s.append("\\x");
int nib = (b & 0xF0) >> 4;
s.append(hexChars[nib]);
nib = b & 0x0F;
s.append(hexChars[nib]);
}
}
return s.toString();
}
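
The same lookup-table technique, sketched in Python for illustration (the function and names below are mine, not part of the commit):

_HEX = "0123456789abcdef"

def printable(data):
    # Pass ASCII printable bytes [32, 127) through, escape the backslash,
    # and hex-escape everything else without a per-byte format() call.
    out = []
    for b in data:
        if 32 <= b < 127 and b != 0x5c:
            out.append(chr(b))
        elif b == 0x5c:  # '\'
            out.append("\\\\")
        else:
            out.append("\\x" + _HEX[(b >> 4) & 0x0f] + _HEX[b & 0x0f])
    return "".join(out)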

View File

@ -510,6 +510,12 @@ public class AsyncStackTester {
db.options().setTransactionCausalReadRisky();
db.options().setTransactionIncludePortInAddress();
// Test network busyness
double busyness = db.getMainThreadBusyness();
if (busyness < 0) {
throw new IllegalStateException("Network busyness cannot be less than 0");
}
tr.options().setPrioritySystemImmediate();
tr.options().setPriorityBatch();
tr.options().setCausalReadRisky();

View File

@ -28,6 +28,7 @@
set(JAVA_JUNIT_TESTS
src/junit/com/apple/foundationdb/tuple/ArrayUtilSortTest.java
src/junit/com/apple/foundationdb/tuple/ArrayUtilTest.java
src/junit/com/apple/foundationdb/tuple/ByteArrayUtilTest.java
src/junit/com/apple/foundationdb/tuple/TupleComparisonTest.java
src/junit/com/apple/foundationdb/tuple/TuplePackingTest.java
src/junit/com/apple/foundationdb/tuple/TupleSerializationTest.java

View File

@ -95,7 +95,6 @@ def api_version(ver):
'transactional',
'options',
'StreamingMode',
'get_server_protocol'
)
_add_symbols(fdb.impl, list)

View File

@ -1531,9 +1531,6 @@ def init_c_api():
_capi.fdb_transaction_get_approximate_size.argtypes = [ctypes.c_void_p]
_capi.fdb_transaction_get_approximate_size.restype = ctypes.c_void_p
_capi.fdb_get_server_protocol.argtypes = [ctypes.c_char_p]
_capi.fdb_get_server_protocol.restype = ctypes.c_void_p
_capi.fdb_transaction_get_versionstamp.argtypes = [ctypes.c_void_p]
_capi.fdb_transaction_get_versionstamp.restype = ctypes.c_void_p
@ -1733,13 +1730,6 @@ open_databases = {}
cacheLock = threading.Lock()
def get_server_protocol(clusterFilePath=None):
with _network_thread_reentrant_lock:
if not _network_thread:
init()
return FutureUInt64(_capi.fdb_get_server_protocol(optionalParamToBytes(clusterFilePath)[0]))
def open(cluster_file=None, event_model=None):
"""Opens the given database (or the default database of the cluster indicated
by the fdb.cluster file in a platform-specific location, if no cluster_file

View File

@ -37,6 +37,7 @@ RUN sed -i -e '/enabled/d' /etc/yum.repos.d/CentOS-Base.repo && \
lz4-devel \
lz4-static \
mono-devel \
redhat-lsb-core \
rpm-build \
tcl-devel \
unzip \
@ -216,6 +217,14 @@ RUN source /opt/rh/devtoolset-8/enable && \
cd .. && \
rm -rf /tmp/*
# download old fdbserver binaries
ARG FDB_VERSION="6.2.29"
RUN mkdir -p /opt/foundationdb/old && \
curl -Ls https://www.foundationdb.org/downloads/misc/fdbservers-${FDB_VERSION}.tar.gz | \
tar --no-same-owner --directory /opt/foundationdb/old -xz && \
chmod +x /opt/foundationdb/old/* && \
ln -sf /opt/foundationdb/old/fdbserver-${FDB_VERSION} /opt/foundationdb/old/fdbserver
# build/install distcc
RUN source /opt/rh/devtoolset-8/enable && \
source /opt/rh/rh-python36/enable && \

View File

@ -28,7 +28,7 @@ RUN source /opt/rh/devtoolset-8/enable && \
subprocess32 && \
mkdir fdb-joshua && \
cd fdb-joshua && \
git clone --branch code_pipeline https://github.com/FoundationDB/fdb-joshua . && \
git clone https://github.com/FoundationDB/fdb-joshua . && \
pip3 install /tmp/fdb-joshua && \
cd /tmp && \
curl -Ls https://amazon-eks.s3.us-west-2.amazonaws.com/1.18.9/2020-11-02/bin/linux/amd64/kubectl -o kubectl && \
@ -43,20 +43,13 @@ RUN source /opt/rh/devtoolset-8/enable && \
./aws/install && \
rm -rf /tmp/*
ARG OLD_FDB_BINARY_DIR=/app/deploy/global_data/oldBinaries/
ARG OLD_TLS_LIBRARY_DIR=/app/deploy/runtime/.tls_5_1/
ARG FDB_VERSION="6.2.29"
RUN mkdir -p ${OLD_FDB_BINARY_DIR} \
${OLD_TLS_LIBRARY_DIR} \
/usr/lib/foundationdb/plugins && \
curl -Ls https://www.foundationdb.org/downloads/misc/fdbservers-${FDB_VERSION}.tar.gz | tar -xz -C ${OLD_FDB_BINARY_DIR} && \
rm -f ${OLD_FDB_BINARY_DIR}/*.sha256 && \
chmod +x ${OLD_FDB_BINARY_DIR}/* && \
curl -Ls https://www.foundationdb.org/downloads/misc/joshua_tls_library.tar.gz | tar -xz -C ${OLD_TLS_LIBRARY_DIR} --strip-components=1 && \
RUN mkdir -p /usr/lib/foundationdb/plugins && \
curl -Ls https://www.foundationdb.org/downloads/misc/joshua_tls_library.tar.gz | \
tar --strip-components=1 --no-same-owner --directory /usr/lib/foundationdb/plugins -xz && \
ln -sf /usr/lib/foundationdb/plugins/FDBGnuTLS.so /usr/lib/foundationdb/plugins/fdb-libressl-plugin.so && \
curl -Ls https://www.foundationdb.org/downloads/${FDB_VERSION}/linux/libfdb_c_${FDB_VERSION}.so -o /usr/lib64/libfdb_c_${FDB_VERSION}.so && \
ln -s /usr/lib64/libfdb_c_${FDB_VERSION}.so /usr/lib64/libfdb_c.so && \
ln -s ${OLD_TLS_LIBRARY_DIR}/FDBGnuTLS.so /usr/lib/foundationdb/plugins/fdb-libressl-plugin.so && \
ln -s ${OLD_TLS_LIBRARY_DIR}/FDBGnuTLS.so /usr/lib/foundationdb/plugins/FDBGnuTLS.so
ln -sf /usr/lib64/libfdb_c_${FDB_VERSION}.so /usr/lib64/libfdb_c.so
WORKDIR /root
RUN rm -f /root/anaconda-ks.cfg && \
@ -65,8 +58,13 @@ RUN rm -f /root/anaconda-ks.cfg && \
'source /opt/rh/rh-python36/enable' \
'source /opt/rh/rh-ruby26/enable' \
'' \
'function cmk_ci() {' \
' cmake -S ${HOME}/src/foundationdb -B ${HOME}/build_output -D USE_CCACHE=ON -D USE_WERROR=ON -D RocksDB_ROOT=/opt/rocksdb-6.10.1 -D RUN_JUNIT_TESTS=ON -D RUN_JAVA_INTEGRATION_TESTS=ON -G Ninja && \' \
' ninja -v -C ${HOME}/build_output -j 84 all packages strip_targets' \
'}' \
'function cmk() {' \
' cmake -S ${HOME}/src/foundationdb -B ${HOME}/build_output -D USE_CCACHE=ON -D USE_WERROR=ON -D RocksDB_ROOT=/opt/rocksdb-6.10.1 -D RUN_JUNIT_TESTS=ON -D RUN_JAVA_INTEGRATION_TESTS=ON -G Ninja && ninja -C ${HOME}/build_output -j 84' \
' cmake -S ${HOME}/src/foundationdb -B ${HOME}/build_output -D USE_CCACHE=ON -D USE_WERROR=ON -D RocksDB_ROOT=/opt/rocksdb-6.10.1 -D RUN_JUNIT_TESTS=ON -D RUN_JAVA_INTEGRATION_TESTS=ON -G Ninja && \' \
' ninja -C ${HOME}/build_output -j 84' \
'}' \
'function ct() {' \
' cd ${HOME}/build_output && ctest -j 32 --no-compress-output -T test --output-on-failure' \
@ -78,4 +76,9 @@ RUN rm -f /root/anaconda-ks.cfg && \
' j start --tarball $(find ${HOME}/build_output/packages -name correctness\*.tar.gz) "${@}"' \
'}' \
'' \
'USER_BASHRC="$HOME/src/.bashrc.local"' \
'if test -f "$USER_BASHRC"; then' \
' source $USER_BASHRC' \
'fi' \
'' \
>> .bashrc

View File

@ -19,7 +19,6 @@ RUN rpmkeys --import mono-project.com.rpmkey.pgp && \
debbuild \
devtoolset-8 \
devtoolset-8-libubsan-devel \
devtoolset-8-valgrind-devel \
devtoolset-8-systemtap-sdt-devel \
docker-ce \
dos2unix \
@ -35,6 +34,7 @@ RUN rpmkeys --import mono-project.com.rpmkey.pgp && \
lz4-devel \
lz4-static \
mono-devel \
redhat-lsb-core \
rpm-build \
tcl-devel \
unzip \
@ -200,6 +200,14 @@ RUN source /opt/rh/devtoolset-8/enable && \
cd .. && \
rm -rf /tmp/*
# download old fdbserver binaries
ARG FDB_VERSION="6.2.29"
RUN mkdir -p /opt/foundationdb/old && \
curl -Ls https://www.foundationdb.org/downloads/misc/fdbservers-${FDB_VERSION}.tar.gz | \
tar --no-same-owner --directory /opt/foundationdb/old -xz && \
chmod +x /opt/foundationdb/old/* && \
ln -sf /opt/foundationdb/old/fdbserver-${FDB_VERSION} /opt/foundationdb/old/fdbserver
# build/install distcc
RUN source /opt/rh/devtoolset-8/enable && \
if [ "$(uname -p)" == "aarch64" ]; then \
@ -220,4 +228,18 @@ RUN source /opt/rh/devtoolset-8/enable && \
cd .. && \
rm -rf /tmp/*
# valgrind
RUN source /opt/rh/devtoolset-8/enable && \
curl -Ls https://sourceware.org/pub/valgrind/valgrind-3.17.0.tar.bz2 -o valgrind-3.17.0.tar.bz2 && \
echo "ad3aec668e813e40f238995f60796d9590eee64a16dff88421430630e69285a2 valgrind-3.17.0.tar.bz2" > valgrind-sha.txt && \
sha256sum -c valgrind-sha.txt && \
mkdir valgrind && \
tar --strip-components 1 --no-same-owner --no-same-permissions --directory valgrind -xjf valgrind-3.17.0.tar.bz2 && \
cd valgrind && \
./configure && \
make && \
make install && \
cd .. && \
rm -rf /tmp/*
RUN curl -Ls https://github.com/manticoresoftware/manticoresearch/raw/master/misc/junit/ctest2junit.xsl -o /opt/ctest2junit.xsl

View File

@ -31,7 +31,7 @@ RUN source /opt/rh/devtoolset-8/enable && \
subprocess32 && \
mkdir fdb-joshua && \
cd fdb-joshua && \
git clone --branch code_pipeline https://github.com/FoundationDB/fdb-joshua . && \
git clone https://github.com/FoundationDB/fdb-joshua . && \
pip3 install /tmp/fdb-joshua && \
cd /tmp && \
curl -Ls https://amazon-eks.s3.us-west-2.amazonaws.com/1.18.9/2020-11-02/bin/linux/amd64/kubectl -o kubectl && \
@ -46,20 +46,13 @@ RUN source /opt/rh/devtoolset-8/enable && \
./aws/install && \
rm -rf /tmp/*
ARG OLD_FDB_BINARY_DIR=/app/deploy/global_data/oldBinaries/
ARG OLD_TLS_LIBRARY_DIR=/app/deploy/runtime/.tls_5_1/
ARG FDB_VERSION="6.2.29"
RUN mkdir -p ${OLD_FDB_BINARY_DIR} \
${OLD_TLS_LIBRARY_DIR} \
/usr/lib/foundationdb/plugins && \
curl -Ls https://www.foundationdb.org/downloads/misc/fdbservers-${FDB_VERSION}.tar.gz | tar -xz -C ${OLD_FDB_BINARY_DIR} && \
rm -f ${OLD_FDB_BINARY_DIR}/*.sha256 && \
chmod +x ${OLD_FDB_BINARY_DIR}/* && \
curl -Ls https://www.foundationdb.org/downloads/misc/joshua_tls_library.tar.gz | tar -xz -C ${OLD_TLS_LIBRARY_DIR} --strip-components=1 && \
RUN mkdir -p /usr/lib/foundationdb/plugins && \
curl -Ls https://www.foundationdb.org/downloads/misc/joshua_tls_library.tar.gz | \
tar --strip-components=1 --no-same-owner --directory /usr/lib/foundationdb/plugins -xz && \
ln -sf /usr/lib/foundationdb/plugins/FDBGnuTLS.so /usr/lib/foundationdb/plugins/fdb-libressl-plugin.so && \
curl -Ls https://www.foundationdb.org/downloads/${FDB_VERSION}/linux/libfdb_c_${FDB_VERSION}.so -o /usr/lib64/libfdb_c_${FDB_VERSION}.so && \
ln -s /usr/lib64/libfdb_c_${FDB_VERSION}.so /usr/lib64/libfdb_c.so && \
ln -s ${OLD_TLS_LIBRARY_DIR}/FDBGnuTLS.so /usr/lib/foundationdb/plugins/fdb-libressl-plugin.so && \
ln -s ${OLD_TLS_LIBRARY_DIR}/FDBGnuTLS.so /usr/lib/foundationdb/plugins/FDBGnuTLS.so
ln -sf /usr/lib64/libfdb_c_${FDB_VERSION}.so /usr/lib64/libfdb_c.so
WORKDIR /root
RUN curl -Ls https://update.code.visualstudio.com/latest/server-linux-x64/stable -o /tmp/vscode-server-linux-x64.tar.gz && \
@ -93,8 +86,13 @@ RUN rm -f /root/anaconda-ks.cfg && \
'source /opt/rh/rh-python36/enable' \
'source /opt/rh/rh-ruby26/enable' \
'' \
'function cmk_ci() {' \
' cmake -S ${HOME}/src/foundationdb -B ${HOME}/build_output -D USE_CCACHE=ON -D USE_WERROR=ON -D RocksDB_ROOT=/opt/rocksdb-6.10.1 -D RUN_JUNIT_TESTS=ON -D RUN_JAVA_INTEGRATION_TESTS=ON -G Ninja && \' \
' ninja -v -C ${HOME}/build_output -j 84 all packages strip_targets' \
'}' \
'function cmk() {' \
' cmake -S ${HOME}/src/foundationdb -B ${HOME}/build_output -D USE_CCACHE=ON -D USE_WERROR=ON -D RocksDB_ROOT=/opt/rocksdb-6.10.1 -D RUN_JUNIT_TESTS=ON -D RUN_JAVA_INTEGRATION_TESTS=ON -G Ninja && ninja -C ${HOME}/build_output -j 84' \
' cmake -S ${HOME}/src/foundationdb -B ${HOME}/build_output -D USE_CCACHE=ON -D USE_WERROR=ON -D RocksDB_ROOT=/opt/rocksdb-6.10.1 -D RUN_JUNIT_TESTS=ON -D RUN_JAVA_INTEGRATION_TESTS=ON -G Ninja && \' \
' ninja -C ${HOME}/build_output -j 84' \
'}' \
'function ct() {' \
' cd ${HOME}/build_output && ctest -j 32 --no-compress-output -T test --output-on-failure' \
@ -106,4 +104,10 @@ RUN rm -f /root/anaconda-ks.cfg && \
' j start --tarball $(find ${HOME}/build_output/packages -name correctness\*.tar.gz) "${@}"' \
'}' \
'' \
'USER_BASHRC="$HOME/src/.bashrc.local"' \
'if test -f "$USER_BASHRC"; then' \
' source $USER_BASHRC' \
'fi' \
'' \
'bash ${HOME}/docker_proxy.sh' \
>> .bashrc

View File

@ -280,7 +280,12 @@ else()
-Wno-unknown-attributes)
endif()
add_compile_options(
-Wall -Wextra
-Wall
-Wextra
-Wredundant-move
-Wpessimizing-move
-Woverloaded-virtual
-Wshift-sign-overflow
# Here's the current set of warnings we need to explicitly disable to compile warning-free with clang 10
-Wno-comment
-Wno-dangling-else
@ -288,16 +293,12 @@ else()
-Wno-format
-Wno-mismatched-tags
-Wno-missing-field-initializers
-Wno-overloaded-virtual
-Wno-reorder
-Wno-reorder-ctor
-Wno-sign-compare
-Wno-tautological-pointer-compare
-Wno-undefined-var-template
-Wno-tautological-pointer-compare
-Wredundant-move
-Wpessimizing-move
-Woverloaded-virtual
-Wno-unknown-pragmas
-Wno-unknown-warning-option
-Wno-unused-function

cmake/GetMsgpack.cmake Normal file
View File

@ -0,0 +1,19 @@
find_package(msgpack 3.3.0 EXACT QUIET CONFIG)
add_library(msgpack INTERFACE)
if(msgpack_FOUND)
target_link_libraries(msgpack INTERFACE msgpackc-cxx)
else()
include(ExternalProject)
ExternalProject_add(msgpackProject
URL "https://github.com/msgpack/msgpack-c/releases/download/cpp-3.3.0/msgpack-3.3.0.tar.gz"
URL_HASH SHA256=6e114d12a5ddb8cb11f669f83f32246e484a8addd0ce93f274996f1941c1f07b
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
)
ExternalProject_Get_property(msgpackProject SOURCE_DIR)
target_include_directories(msgpack SYSTEM INTERFACE "${SOURCE_DIR}/include")
endif()

View File

@ -103,54 +103,10 @@ function(symlink_files)
endif()
endfunction()
# 'map' from (destination, package) to path
# format vars like install_destination_for_${destination}_${package}
set(install_destination_for_bin_tgz "bin")
set(install_destination_for_bin_deb "usr/bin")
set(install_destination_for_bin_el6 "usr/bin")
set(install_destination_for_bin_el7 "usr/bin")
set(install_destination_for_bin_pm "usr/local/bin")
set(install_destination_for_sbin_tgz "sbin")
set(install_destination_for_sbin_deb "usr/sbin")
set(install_destination_for_sbin_el6 "usr/sbin")
set(install_destination_for_sbin_el7 "usr/sbin")
set(install_destination_for_sbin_pm "usr/local/libexec")
set(install_destination_for_lib_tgz "lib")
set(install_destination_for_lib_deb "usr/lib")
set(install_destination_for_lib_el6 "usr/lib64")
set(install_destination_for_lib_el7 "usr/lib64")
set(install_destination_for_lib_pm "usr/local/lib")
set(install_destination_for_fdbmonitor_tgz "sbin")
set(install_destination_for_fdbmonitor_deb "usr/lib/foundationdb")
set(install_destination_for_fdbmonitor_el6 "usr/lib/foundationdb")
set(install_destination_for_fdbmonitor_el7 "usr/lib/foundationdb")
set(install_destination_for_fdbmonitor_pm "usr/local/libexec")
set(install_destination_for_include_tgz "include")
set(install_destination_for_include_deb "usr/include")
set(install_destination_for_include_el6 "usr/include")
set(install_destination_for_include_el7 "usr/include")
set(install_destination_for_include_pm "usr/local/include")
set(install_destination_for_etc_tgz "etc/foundationdb")
set(install_destination_for_etc_deb "etc/foundationdb")
set(install_destination_for_etc_el6 "etc/foundationdb")
set(install_destination_for_etc_el7 "etc/foundationdb")
set(install_destination_for_etc_pm "usr/local/etc/foundationdb")
set(install_destination_for_log_tgz "log/foundationdb")
set(install_destination_for_log_deb "var/log/foundationdb")
set(install_destination_for_log_el6 "var/log/foundationdb")
set(install_destination_for_log_el7 "var/log/foundationdb")
set(install_destination_for_log_pm "usr/local/foundationdb/logs")
set(install_destination_for_data_tgz "lib/foundationdb")
set(install_destination_for_data_deb "var/lib/foundationdb/data")
set(install_destination_for_data_el6 "var/lib/foundationdb/data")
set(install_destination_for_data_el7 "var/lib/foundationdb/data")
set(install_destination_for_data_pm "usr/local/foundationdb/data")
fdb_install_packages(TGZ DEB EL7 PM VERSIONED)
fdb_install_dirs(BIN SBIN LIB FDBMONITOR INCLUDE ETC LOG DATA)
message(STATUS "FDB_INSTALL_DIRS -> ${FDB_INSTALL_DIRS}")
# 'map' from (destination, package) to path
# format vars like install_destination_for_${destination}_${package}
install_destinations(TGZ
BIN bin
SBIN sbin
@ -169,7 +125,7 @@ install_destinations(DEB
INCLUDE usr/include
ETC etc/foundationdb
LOG var/log/foundationdb
DATA var/lib/foundationdb)
DATA var/lib/foundationdb/data)
copy_install_destinations(DEB EL7)
install_destinations(EL7 LIB usr/lib64)
install_destinations(PM
@ -227,6 +183,13 @@ set(LIB_DIR lib64)
configure_file("${PROJECT_SOURCE_DIR}/packaging/multiversion/clients/postinst" "${script_dir}/clients/postinst-el7" @ONLY)
configure_file("${PROJECT_SOURCE_DIR}/packaging/multiversion/clients/prerm" "${script_dir}/clients" @ONLY)
################################################################################
# Move Docker Setup
################################################################################
file(COPY "${PROJECT_SOURCE_DIR}/packaging/docker" DESTINATION "${PROJECT_BINARY_DIR}/packages/")
################################################################################
# General CPack configuration
################################################################################

View File

@ -0,0 +1,134 @@
#!/usr/bin/env python3
#
# grv_test.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import inspect
import sys
import rate_model
import workload_model
import proxy_model
import ratekeeper_model
from priority import Priority
from plot import Plotter
parser = argparse.ArgumentParser()
parser.add_argument('-w', '--workload', type=str, help='Name of workload to run')
parser.add_argument('-r', '--ratekeeper', type=str, help='Name of ratekeeper model')
parser.add_argument('-d', '--duration', type=int, default=240, help='Duration of simulated test, in seconds. Defaults to 240.')
parser.add_argument('-L', '--limiter', type=str, default='Original', help='Name of limiter implementation. Defaults to \'Original\'.')
parser.add_argument('-p', '--proxy', type=str, default='ProxyModel', help='Name of proxy implementation. Defaults to \'ProxyModel\'.')
parser.add_argument('--list', action='store_true', default=False, help='List options for all models.')
parser.add_argument('--no-graph', action='store_true', default=False, help='Disable graphical output.')
args = parser.parse_args()
def print_choices_list(context=None):
if context == 'workload' or context is None:
print('Workloads:')
for w in workload_model.predefined_workloads.keys():
print(' %s' % w)
if context == 'ratekeeper' or context is None:
print('\nRatekeeper models:')
for r in ratekeeper_model.predefined_ratekeeper.keys():
print(' %s' % r)
proxy_model_classes = [c for c in [getattr(proxy_model, a) for a in dir(proxy_model)] if inspect.isclass(c)]
if context == 'proxy' or context is None:
print('\nProxy models:')
for p in proxy_model_classes:
if issubclass(p, proxy_model.ProxyModel):
print(' %s' % p.__name__)
if context == 'limiter' or context is None:
print('\nProxy limiters:')
for p in proxy_model_classes:
if issubclass(p, proxy_model.Limiter) and p != proxy_model.Limiter:
name = p.__name__
if name.endswith('Limiter'):
name = name[0:-len('Limiter')]
print(' %s' % name)
if args.list:
print_choices_list()
sys.exit(0)
if args.workload is None or args.ratekeeper is None:
print('ERROR: A workload (-w/--workload) and ratekeeper model (-r/--ratekeeper) must be specified.\n')
print_choices_list()
sys.exit(1)
def validate_class_type(var, name, superclass):
cls = getattr(var, name, None)
return cls is not None and inspect.isclass(cls) and issubclass(cls, superclass)
if args.ratekeeper not in ratekeeper_model.predefined_ratekeeper:
print('Invalid ratekeeper model `%s\'' % args.ratekeeper)
print_choices_list('ratekeeper')
sys.exit(1)
if args.workload not in workload_model.predefined_workloads:
print('Invalid workload model `%s\'' % args.workload)
print_choices_list('workload')
sys.exit(1)
if not validate_class_type(proxy_model, args.proxy, proxy_model.ProxyModel):
print('Invalid proxy model `%s\'' % args.proxy)
print_choices_list('proxy')
sys.exit(1)
limiter_name = args.limiter
if not validate_class_type(proxy_model, limiter_name, proxy_model.Limiter):
limiter_name += 'Limiter'
if not validate_class_type(proxy_model, limiter_name, proxy_model.Limiter):
print('Invalid proxy limiter `%s\'' % args.limiter)
print_choices_list('limiter')
sys.exit(1)
ratekeeper = ratekeeper_model.predefined_ratekeeper[args.ratekeeper]
workload = workload_model.predefined_workloads[args.workload]
limiter = getattr(proxy_model, limiter_name)
proxy = getattr(proxy_model, args.proxy)(args.duration, ratekeeper, workload, limiter)
proxy.run()
for priority in workload.priorities():
latencies = sorted([p for t in proxy.results.latencies[priority].values() for p in t])
total_started = sum(proxy.results.started[priority].values())
still_queued = sum([r.count for r in proxy.request_queue if r.priority == priority])
if len(latencies) > 0:
print('\n%s: %d requests in %d seconds (rate=%f). %d still queued.' % (priority, total_started, proxy.time, float(total_started)/proxy.time, still_queued))
print(' Median latency: %f' % latencies[len(latencies)//2])
print(' 90%% latency: %f' % latencies[int(0.9*len(latencies))])
print(' 99%% latency: %f' % latencies[int(0.99*len(latencies))])
print(' 99.9%% latency: %f' % latencies[int(0.999*len(latencies))])
print(' Max latency: %f' % latencies[-1])
print('')
if not args.no_graph:
plotter = Plotter(proxy.results)
plotter.display()
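
The simulator can also be driven programmatically rather than through the flags above; a minimal sketch, assuming the sibling modules in contrib/grv_proxy_model are importable:

import proxy_model
import ratekeeper_model
import workload_model

# Equivalent to: grv_test.py -w fixed_uniform -r default200_batch100 -L Smoothing -d 60
ratekeeper = ratekeeper_model.predefined_ratekeeper['default200_batch100']
workload = workload_model.predefined_workloads['fixed_uniform']
proxy = proxy_model.ProxyModel(60, ratekeeper, workload, proxy_model.SmoothingLimiter)
proxy.run()

for priority in workload.priorities():
    total = sum(proxy.results.started[priority].values())
    print('%s: %d requests started in %d seconds' % (priority, total, proxy.time))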

contrib/grv_proxy_model/plot.py Executable file
View File

@ -0,0 +1,107 @@
#
# plot.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import matplotlib.pyplot as plt
class Plotter:
def __init__(self, results):
self.results = results
def add_plot(data, time_resolution, label, use_avg=False):
out_data = {}
counts = {}
for t in data.keys():
out_data.setdefault(t//time_resolution*time_resolution, 0)
counts.setdefault(t//time_resolution*time_resolution, 0)
out_data[t//time_resolution*time_resolution] += data[t]
counts[t//time_resolution*time_resolution] += 1
if use_avg:
out_data = { t: v/counts[t] for t,v in out_data.items() }
plt.plot(list(out_data.keys()), list(out_data.values()), label=label)
def add_plot_with_times(data, label):
plt.plot(list(data.keys()), list(data.values()), label=label)
def display(self, time_resolution=0.1):
plt.figure(figsize=(40,9))
plt.subplot(3, 3, 1)
for priority in self.results.started.keys():
Plotter.add_plot(self.results.started[priority], time_resolution, priority)
plt.xlabel('Time (s)')
plt.ylabel('Released/s')
plt.legend()
plt.subplot(3, 3, 2)
for priority in self.results.queued.keys():
Plotter.add_plot(self.results.queued[priority], time_resolution, priority)
plt.xlabel('Time (s)')
plt.ylabel('Requests/s')
plt.legend()
plt.subplot(3, 3, 3)
for priority in self.results.unprocessed_queue_sizes.keys():
data = {k: max(v) for (k,v) in self.results.unprocessed_queue_sizes[priority].items()}
Plotter.add_plot(data, time_resolution, priority)
plt.xlabel('Time (s)')
plt.ylabel('Max queue size')
plt.legend()
num = 4
for priority in self.results.latencies.keys():
plt.subplot(3, 3, num)
median_latencies = {k: v[int(0.5*len(v))] if len(v) > 0 else 0 for (k,v) in self.results.latencies[priority].items()}
percentile90_latencies = {k: v[int(0.9*len(v))] if len(v) > 0 else 0 for (k,v) in self.results.latencies[priority].items()}
max_latencies = {k: max(v) if len(v) > 0 else 0 for (k,v) in self.results.latencies[priority].items()}
Plotter.add_plot(median_latencies, time_resolution, 'median')
Plotter.add_plot(percentile90_latencies, time_resolution, '90th percentile')
Plotter.add_plot(max_latencies, time_resolution, 'max')
plt.xlabel('Time (s)')
plt.ylabel(str(priority) + ' Latency (s)')
plt.yscale('log')
plt.legend()
num += 1
for priority in self.results.rate.keys():
plt.subplot(3, 3, num)
if len(self.results.rate[priority]) > 0:
Plotter.add_plot(self.results.rate[priority], time_resolution, 'Rate', use_avg=True)
if len(self.results.released[priority]) > 0:
Plotter.add_plot(self.results.released[priority], time_resolution, 'Released', use_avg=True)
if len(self.results.limit[priority]) > 0:
Plotter.add_plot(self.results.limit[priority], time_resolution, 'Limit', use_avg=True)
if len(self.results.limit_and_budget[priority]) > 0:
Plotter.add_plot(self.results.limit_and_budget[priority], time_resolution, 'Limit and budget', use_avg=True)
if len(self.results.budget[priority]) > 0:
Plotter.add_plot(self.results.budget[priority], time_resolution, 'Budget', use_avg=True)
plt.xlabel('Time (s)')
plt.ylabel('Value (' + str(priority) + ')')
plt.legend()
num += 1
plt.show()

View File

@ -0,0 +1,40 @@
#
# priority.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import functools
@functools.total_ordering
class Priority:
def __init__(self, priority_value, label):
self.priority_value = priority_value
self.label = label
def __lt__(self, other):
return self.priority_value < other.priority_value
def __str__(self):
return self.label
def __repr__(self):
return repr(self.label)
Priority.SYSTEM = Priority(0, "System")
Priority.DEFAULT = Priority(1, "Default")
Priority.BATCH = Priority(2, "Batch")
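
Since Priority is totally ordered on priority_value, instances compare and sort directly; a quick illustration (not part of the commit):

from priority import Priority

# System (0) < Default (1) < Batch (2), via @functools.total_ordering and __lt__
assert Priority.SYSTEM < Priority.DEFAULT < Priority.BATCH
print(sorted([Priority.BATCH, Priority.SYSTEM, Priority.DEFAULT]))
# -> ['System', 'Default', 'Batch'] (each item shown via __repr__ of its label)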

View File

@ -0,0 +1,338 @@
#
# proxy_model.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import functools
import heapq
from priority import Priority
from smoother import Smoother
@functools.total_ordering
class Task:
def __init__(self, time, fxn):
self.time = time
self.fxn = fxn
def __lt__(self, other):
return self.time < other.time
class Limiter:
class UpdateRateParams:
def __init__(self, time):
self.time = time
class UpdateLimitParams:
def __init__(self, time, elapsed):
self.time = time
self.elapsed = elapsed
class CanStartParams:
def __init__(self, time, num_started, count):
self.time = time
self.num_started = num_started
self.count = count
class UpdateBudgetParams:
def __init__(self, time, num_started, num_started_at_priority, min_priority, last_batch, queue_empty, elapsed):
self.time = time
self.num_started = num_started
self.num_started_at_priority = num_started_at_priority
self.min_priority = min_priority
self.last_batch = last_batch
self.queue_empty = queue_empty
self.elapsed = elapsed
def __init__(self, priority, ratekeeper_model, proxy_model):
self.priority = priority
self.ratekeeper_model = ratekeeper_model
self.proxy_model = proxy_model
self.limit = 0
self.rate = self.ratekeeper_model.get_limit(0, self.priority)
def update_rate(self, params):
pass
def update_limit(self, params):
pass
def can_start(self, params):
pass
def update_budget(self, params):
pass
class OriginalLimiter(Limiter):
def __init__(self, priority, limit_rate_model, proxy_model):
Limiter.__init__(self, priority, limit_rate_model, proxy_model)
def update_rate(self, params):
self.rate = self.ratekeeper_model.get_limit(params.time, self.priority)
def update_limit(self, params):
self.limit = min(0, self.limit) + params.elapsed * self.rate
self.limit = min(self.limit, self.rate * 0.01)
self.limit = min(self.limit, 100000)
self.proxy_model.results.rate[self.priority][params.time] = self.rate
self.proxy_model.results.limit[self.priority][params.time] = self.limit
def can_start(self, params):
return params.num_started < self.limit
def update_budget(self, params):
self.limit -= params.num_started
class PositiveBudgetLimiter(OriginalLimiter):
def __init__(self, priority, limit_rate_model, proxy_model):
OriginalLimiter.__init__(self, priority, limit_rate_model, proxy_model)
def update_limit(self, params):
self.limit += params.elapsed * self.rate
self.limit = min(self.limit, 2.0 * self.rate)
class ClampedBudgetLimiter(PositiveBudgetLimiter):
def __init__(self, priority, limit_rate_model, proxy_model):
PositiveBudgetLimiter.__init__(self, priority, limit_rate_model, proxy_model)
def update_budget(self, params):
min_budget = -self.rate * 5.0
if self.limit > min_budget:
self.limit = max(self.limit - params.num_started, min_budget)
class TimeLimiter(PositiveBudgetLimiter):
def __init__(self, priority, limit_rate_model, proxy_model):
PositiveBudgetLimiter.__init__(self, priority, limit_rate_model, proxy_model)
self.locked_until = 0
def can_start(self, params):
return params.time >= self.locked_until and PositiveBudgetLimiter.can_start(self, params)
def update_budget(self, params):
#print('Start update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s, last_batch=%d' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority, params.last_batch))
if params.min_priority >= self.priority or params.num_started < self.limit:
self.limit -= params.num_started
else:
self.limit = min(self.limit, max(self.limit - params.num_started, -params.last_batch))
self.locked_until = min(params.time + 2.0, max(params.time, self.locked_until) + (params.num_started - self.limit)/self.rate)
#print('End update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority))
class TimePositiveBudgetLimiter(PositiveBudgetLimiter):
def __init__(self, priority, limit_rate_model, proxy_model):
PositiveBudgetLimiter.__init__(self, priority, limit_rate_model, proxy_model)
self.locked_until = 0
def update_limit(self, params):
if params.time >= self.locked_until:
PositiveBudgetLimiter.update_limit(self, params)
def can_start(self, params):
return params.num_started + params.count <= self.limit
def update_budget(self, params):
#if params.num_started > 0:
#print('Start update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s, last_batch=%d' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority, params.last_batch))
if params.num_started > self.limit:
# Lock out new starts in proportion to how far the batch overshot the budget, as in TimeLimiter
self.locked_until = min(params.time + 2.0, max(params.time, self.locked_until) + (params.num_started - self.limit)/self.rate)
self.limit = 0
else:
self.limit -= params.num_started
#if params.num_started > 0:
#print('End update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority))
class SmoothingLimiter(OriginalLimiter):
def __init__(self, priority, limit_rate_model, proxy_model):
OriginalLimiter.__init__(self, priority, limit_rate_model, proxy_model)
self.smooth_released = Smoother(2)
self.smooth_rate_limit = Smoother(2)
self.rate_set = False
def update_rate(self, params):
OriginalLimiter.update_rate(self, params)
if not self.rate_set:
self.rate_set = True
self.smooth_rate_limit.reset(self.rate)
else:
self.smooth_rate_limit.set_total(params.time, self.rate)
def update_limit(self, params):
self.limit = 2.0 * (self.smooth_rate_limit.smooth_total(params.time) - self.smooth_released.smooth_rate(params.time))
def can_start(self, params):
return params.num_started + params.count <= self.limit
def update_budget(self, params):
self.smooth_released.add_delta(params.time, params.num_started)
class SmoothingBudgetLimiter(SmoothingLimiter):
def __init__(self, priority, limit_rate_model, proxy_model):
SmoothingLimiter.__init__(self, priority, limit_rate_model, proxy_model)
#self.smooth_filled = Smoother(2)
self.budget = 0
def update_limit(self, params):
release_rate = (self.smooth_rate_limit.smooth_total(params.time) - self.smooth_released.smooth_rate(params.time))
#self.smooth_filled.set_total(params.time, 1 if release_rate > 0 else 0)
self.limit = 2.0 * release_rate
self.proxy_model.results.rate[self.priority][params.time] = self.smooth_rate_limit.smooth_total(params.time)
self.proxy_model.results.released[self.priority][params.time] = self.smooth_released.smooth_rate(params.time)
self.proxy_model.results.limit[self.priority][params.time] = self.limit
self.proxy_model.results.limit_and_budget[self.priority][params.time] = self.limit + self.budget
self.proxy_model.results.budget[self.priority][params.time] = self.budget
#self.budget = max(0, self.budget + params.elapsed * self.smooth_rate_limit.smooth_total(params.time))
#if self.smooth_filled.smooth_total(params.time) >= 0.1:
#self.budget += params.elapsed * self.smooth_rate_limit.smooth_total(params.time)
#print('Update limit: time=%f, priority=%s, limit=%f, rate=%f, released=%f, budget=%f' % (params.time, self.priority, self.limit, self.smooth_rate_limit.smooth_total(params.time), self.smooth_released.smooth_rate(params.time), self.budget))
def can_start(self, params):
return params.num_started + params.count <= self.limit + self.budget #or params.num_started + params.count <= self.budget
def update_budget(self, params):
self.budget = max(0, self.budget + (self.limit - params.num_started_at_priority) / 2 * params.elapsed)
if params.queue_empty:
self.budget = min(10, self.budget)
self.smooth_released.add_delta(params.time, params.num_started_at_priority)
class ProxyModel:
class Results:
def __init__(self, priorities, duration):
self.started = self.init_result(priorities, 0, duration)
self.queued = self.init_result(priorities, 0, duration)
self.latencies = self.init_result(priorities, [], duration)
self.unprocessed_queue_sizes = self.init_result(priorities, [], duration)
self.rate = {p:{} for p in priorities}
self.released = {p:{} for p in priorities}
self.limit = {p:{} for p in priorities}
self.limit_and_budget = {p:{} for p in priorities}
self.budget = {p:{} for p in priorities}
def init_result(self, priorities, starting_value, duration):
return {p: {s: copy.copy(starting_value) for s in range(0, duration)} for p in priorities}
def __init__(self, duration, ratekeeper_model, workload_model, Limiter):
self.time = 0
self.log_time = 0
self.duration = duration
self.priority_limiters = { priority: Limiter(priority, ratekeeper_model, self) for priority in workload_model.priorities() }
self.workload_model = workload_model
self.request_scheduled = { p: False for p in self.workload_model.priorities()}
self.tasks = []
self.request_queue = []
self.results = ProxyModel.Results(self.workload_model.priorities(), duration)
def run(self):
self.update_rate()
self.process_requests(self.time)
for priority in self.workload_model.priorities():
next_request = self.workload_model.next_request(self.time, priority)
assert next_request is not None
heapq.heappush(self.tasks, Task(next_request.time, lambda next_request=next_request: self.receive_request(next_request)))
self.request_scheduled[priority] = True
while True:# or len(self.request_queue) > 0:
if int(self.time) > self.log_time:
self.log_time = int(self.time)
#print(self.log_time)
task = heapq.heappop(self.tasks)
self.time = task.time
if self.time >= self.duration:
break
task.fxn()
def update_rate(self):
for limiter in self.priority_limiters.values():
limiter.update_rate(Limiter.UpdateRateParams(self.time))
heapq.heappush(self.tasks, Task(self.time + 0.01, lambda: self.update_rate()))
def receive_request(self, request):
heapq.heappush(self.request_queue, request)
self.results.queued[request.priority][int(self.time)] += request.count
next_request = self.workload_model.next_request(self.time, request.priority)
if next_request is not None and next_request.time < self.duration:
heapq.heappush(self.tasks, Task(next_request.time, lambda: self.receive_request(next_request)))
else:
self.request_scheduled[request.priority] = False
def process_requests(self, last_time):
elapsed = self.time - last_time
for limiter in self.priority_limiters.values():
limiter.update_limit(Limiter.UpdateLimitParams(self.time, elapsed))
current_started = 0
started = {p:0 for p in self.workload_model.priorities()}
min_priority = Priority.SYSTEM
last_batch = 0
while len(self.request_queue) > 0:
request = self.request_queue[0]
if not self.priority_limiters[request.priority].can_start(Limiter.CanStartParams(self.time, current_started, request.count)):
break
min_priority = request.priority
last_batch = request.count
if self.workload_model.request_completed(request) and not self.request_scheduled[request.priority]:
next_request = self.workload_model.next_request(self.time, request.priority)
assert next_request is not None
heapq.heappush(self.tasks, Task(next_request.time, lambda next_request=next_request: self.receive_request(next_request)))
self.request_scheduled[request.priority] = True
current_started += request.count
started[request.priority] += request.count
heapq.heappop(self.request_queue)
self.results.started[request.priority][int(self.time)] += request.count
self.results.latencies[request.priority][int(self.time)].append(self.time-request.time)
if len(self.request_queue) == 0:
min_priority = Priority.BATCH
for priority, limiter in self.priority_limiters.items():
started_at_priority = sum([v for p,v in started.items() if p <= priority])
limiter.update_budget(Limiter.UpdateBudgetParams(self.time, current_started, started_at_priority, min_priority, last_batch, len(self.request_queue) == 0 or self.request_queue[0].priority > priority, elapsed))
for priority in self.workload_model.priorities():
self.results.unprocessed_queue_sizes[priority][int(self.time)].append(self.workload_model.workload_models[priority].outstanding)
current_time = self.time
delay = 0.001
heapq.heappush(self.tasks, Task(self.time + delay, lambda: self.process_requests(current_time)))

View File

@ -0,0 +1,83 @@
#
# rate_model.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy
class RateModel:
def __init__(self):
pass
def get_rate(self, time):
pass
class FixedRateModel(RateModel):
def __init__(self, rate):
RateModel.__init__(self)
self.rate = rate
def get_rate(self, time):
return self.rate
class UnlimitedRateModel(FixedRateModel):
def __init__(self):
self.rate = 1e9
class IntervalRateModel(RateModel):
def __init__(self, intervals):
self.intervals = sorted(intervals)
def get_rate(self, time):
if len(self.intervals) == 0 or time < self.intervals[0][0]:
return 0
target_interval = len(self.intervals)-1
for i in range(1, len(self.intervals)):
if time < self.intervals[i][0]:
target_interval = i-1
break
self.intervals = self.intervals[target_interval:]
return self.intervals[0][1]
class SawtoothRateModel(RateModel):
def __init__(self, low, high, frequency):
self.low = low
self.high = high
self.frequency = frequency
def get_rate(self, time):
if int(2*time/self.frequency) % 2 == 0:
return self.low
else:
return self.high
class DistributionRateModel(RateModel):
def __init__(self, distribution, frequency):
self.distribution = distribution
self.frequency = frequency
self.last_change = 0
self.rate = None
def get_rate(self, time):
if self.frequency == 0 or int((time - self.last_change) / self.frequency) > int(self.last_change / self.frequency) or self.rate is None:
self.last_change = time
self.rate = self.distribution()
return self.rate
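
For illustration, sampling two of these models (note that IntervalRateModel trims its interval list as it goes, so it assumes queries with non-decreasing time):

import rate_model

sawtooth = rate_model.SawtoothRateModel(10, 200, 1)
for t in (0.0, 0.25, 0.5, 0.75):
    print(t, sawtooth.get_rate(t))  # 10, 10, 200, 200: flips every half period

steps = rate_model.IntervalRateModel([(0, 50), (60, 150)])
print(steps.get_rate(30), steps.get_rate(90))  # 50 150: stepwise schedule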

View File

@ -0,0 +1,67 @@
#
# ratekeeper.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy
import rate_model
from priority import Priority
class RatekeeperModel:
def __init__(self, limit_models):
self.limit_models = limit_models
def get_limit(self, time, priority):
return self.limit_models[priority].get_rate(time)
predefined_ratekeeper = {}
predefined_ratekeeper['default200_batch100'] = RatekeeperModel(
{
Priority.SYSTEM: rate_model.UnlimitedRateModel(),
Priority.DEFAULT: rate_model.FixedRateModel(200),
Priority.BATCH: rate_model.FixedRateModel(100)
})
predefined_ratekeeper['default_sawtooth'] = RatekeeperModel(
{
Priority.SYSTEM: rate_model.UnlimitedRateModel(),
Priority.DEFAULT: rate_model.SawtoothRateModel(10, 200, 1),
Priority.BATCH: rate_model.FixedRateModel(0)
})
predefined_ratekeeper['default_uniform_random'] = RatekeeperModel(
{
Priority.SYSTEM: rate_model.UnlimitedRateModel(),
Priority.DEFAULT: rate_model.DistributionRateModel(lambda: numpy.random.uniform(10, 200), 1),
Priority.BATCH: rate_model.FixedRateModel(0)
})
predefined_ratekeeper['default_trickle'] = RatekeeperModel(
{
Priority.SYSTEM: rate_model.UnlimitedRateModel(),
Priority.DEFAULT: rate_model.FixedRateModel(3),
Priority.BATCH: rate_model.FixedRateModel(0)
})
predefined_ratekeeper['default1000'] = RatekeeperModel(
{
Priority.SYSTEM: rate_model.UnlimitedRateModel(),
Priority.DEFAULT: rate_model.FixedRateModel(1000),
Priority.BATCH: rate_model.FixedRateModel(500)
})

View File

@ -0,0 +1,53 @@
#
# smoother.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import math
class Smoother:
def __init__(self, folding_time):
self.folding_time = folding_time
self.reset(0)
def reset(self, value):
self.time = 0
self.total = value
self.estimate = value
def set_total(self, time, total):
self.add_delta(time, total-self.total)
def add_delta(self, time, delta):
self.update(time)
self.total += delta
def smooth_total(self, time):
self.update(time)
return self.estimate
def smooth_rate(self, time):
self.update(time)
return (self.total-self.estimate) / self.folding_time
def update(self, time):
elapsed = time - self.time
if elapsed > 0:
self.time = time
self.estimate += (self.total-self.estimate) * (1-math.exp(-elapsed/self.folding_time))
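
A worked example of the exponential smoothing above, assuming the file is importable as smoother:

from smoother import Smoother

s = Smoother(folding_time=2)
s.set_total(0, 100)                   # step the raw total to 100 at t=0
print(round(s.smooth_total(2), 1))    # 63.2: after one folding time, 1 - e**-1 of the step
print(round(s.smooth_total(10), 1))   # 99.3: the estimate converges toward the total
print(round(s.smooth_rate(10), 2))    # 0.34: (total - estimate) / folding_time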

View File

@ -0,0 +1,201 @@
#
# workload_model.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import functools
import numpy
import math
import rate_model
from priority import Priority
@functools.total_ordering
class Request:
    def __init__(self, time, count, priority):
        self.time = time
        self.count = count
        self.priority = priority

    def __lt__(self, other):
        return self.priority < other.priority

class PriorityWorkloadModel:
    def __init__(self, priority, rate_model, batch_model, generator, max_outstanding=1e9):
        self.priority = priority
        self.rate_model = rate_model
        self.batch_model = batch_model
        self.generator = generator
        self.max_outstanding = max_outstanding
        self.outstanding = 0

    def next_request(self, time):
        if self.outstanding >= self.max_outstanding:
            return None

        batch_size = self.batch_model.next_batch()
        self.outstanding += batch_size

        interval = self.generator.next_request_interval(self.rate_model.get_rate(time))
        return Request(time + interval, batch_size, self.priority)

    def request_completed(self, request):
        was_full = self.max_outstanding <= self.outstanding
        self.outstanding -= request.count

        return was_full and self.outstanding < self.max_outstanding

class WorkloadModel:
    def __init__(self, workload_models):
        self.workload_models = workload_models

    def priorities(self):
        return list(self.workload_models.keys())

    def next_request(self, time, priority):
        return self.workload_models[priority].next_request(time)

    def request_completed(self, request):
        return self.workload_models[request.priority].request_completed(request)

class Distribution:
    EXPONENTIAL = lambda x: numpy.random.exponential(x)
    UNIFORM = lambda x: numpy.random.uniform(0, 2.0*x)
    FIXED = lambda x: x

class BatchGenerator:
    def __init__(self):
        pass

    def next_batch(self):
        pass

class DistributionBatchGenerator(BatchGenerator):
    def __init__(self, distribution, size):
        BatchGenerator.__init__(self)
        self.distribution = distribution
        self.size = size

    def next_batch(self):
        return math.ceil(self.distribution(self.size))

class RequestGenerator:
    def __init__(self):
        pass

    def next_request_interval(self, rate):
        pass

class DistributionRequestGenerator(RequestGenerator):
    def __init__(self, distribution):
        RequestGenerator.__init__(self)
        self.distribution = distribution

    def next_request_interval(self, rate):
        if rate == 0:
            return 1e9

        return self.distribution(1.0/rate)
predefined_workloads = {}
predefined_workloads['slow_exponential'] = WorkloadModel(
{
Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT,
rate_model.FixedRateModel(100),
DistributionBatchGenerator(Distribution.FIXED, 1),
DistributionRequestGenerator(Distribution.EXPONENTIAL),
max_outstanding=100
)
})
predefined_workloads['fixed_uniform'] = WorkloadModel(
{
Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM,
rate_model.FixedRateModel(0),
DistributionBatchGenerator(Distribution.FIXED, 1),
DistributionRequestGenerator(Distribution.UNIFORM),
max_outstanding=10
),
Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT,
rate_model.FixedRateModel(95),
DistributionBatchGenerator(Distribution.FIXED, 10),
DistributionRequestGenerator(Distribution.UNIFORM),
max_outstanding=200
),
Priority.BATCH: PriorityWorkloadModel(Priority.BATCH,
rate_model.FixedRateModel(1),
DistributionBatchGenerator(Distribution.UNIFORM, 500),
DistributionRequestGenerator(Distribution.UNIFORM),
max_outstanding=200
)
})
predefined_workloads['batch_starvation'] = WorkloadModel(
{
Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM,
rate_model.FixedRateModel(1),
DistributionBatchGenerator(Distribution.FIXED, 1),
DistributionRequestGenerator(Distribution.UNIFORM),
max_outstanding=10
),
Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT,
rate_model.IntervalRateModel([(0,50), (60,150), (120,90)]),
DistributionBatchGenerator(Distribution.FIXED, 1),
DistributionRequestGenerator(Distribution.UNIFORM),
max_outstanding=200
),
Priority.BATCH: PriorityWorkloadModel(Priority.BATCH,
rate_model.FixedRateModel(100),
DistributionBatchGenerator(Distribution.FIXED, 1),
DistributionRequestGenerator(Distribution.UNIFORM),
max_outstanding=200
)
})
predefined_workloads['default_low_high_low'] = WorkloadModel(
{
Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM,
rate_model.FixedRateModel(0),
DistributionBatchGenerator(Distribution.FIXED, 1),
DistributionRequestGenerator(Distribution.UNIFORM),
max_outstanding=10
),
Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT,
rate_model.IntervalRateModel([(0,100), (60,300), (120,100)]),
DistributionBatchGenerator(Distribution.FIXED, 1),
DistributionRequestGenerator(Distribution.UNIFORM),
max_outstanding=200
),
Priority.BATCH: PriorityWorkloadModel(Priority.BATCH,
rate_model.FixedRateModel(0),
DistributionBatchGenerator(Distribution.FIXED, 1),
DistributionRequestGenerator(Distribution.UNIFORM),
max_outstanding=200
)
})
for rate in [83, 100, 180, 190, 200]:
    predefined_workloads['default%d' % rate] = WorkloadModel(
        {
            Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT,
                rate_model.FixedRateModel(rate),
                DistributionBatchGenerator(Distribution.FIXED, 1),
                DistributionRequestGenerator(Distribution.EXPONENTIAL),
                max_outstanding=1000
            )
        })
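A minimal driver sketch for the models above (illustrative; it relies on the `rate_model` and `priority` modules imported at the top of this file):

model = predefined_workloads['default100']
t = 0.0
for _ in range(3):
    req = model.next_request(t, Priority.DEFAULT)  # None if too many requests are outstanding
    if req is not None:
        t = req.time                  # advance the clock to the request's arrival time
        model.request_completed(req)  # release the outstanding-request slot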

View File

@ -0,0 +1,87 @@
#!/usr/bin/env python3
#
# fdb_c_version.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ctypes
import sys
import platform
import os
def error(message):
    print(message)
    sys.exit(1)

def get_version_string(library_path):
    try:
        lib = ctypes.cdll.LoadLibrary(library_path)
    except Exception as e:
        error('Could not load library %r: %s' % (library_path, str(e)))

    lib.fdb_get_error.restype = ctypes.c_char_p

    try:
        r = lib.fdb_select_api_version_impl(410, 410)
        if r != 0:
            error('Error setting API version: %s (%d)' % (lib.fdb_get_error(r).decode('utf-8'), r))
    except Exception as e:
        error('Error calling fdb_select_api_version_impl: %s' % str(e))

    try:
        lib.fdb_get_client_version.restype = ctypes.c_char_p
        version_str = lib.fdb_get_client_version().decode('utf-8')
    except Exception as e:
        error('Error getting version information from client library: %s' % str(e))

    version_components = version_str.split(',')
    package_version = '.'.join(version_components[0].split('.')[0:2])

    version_str = 'FoundationDB Client %s (v%s)\n' % (package_version, version_components[0])
    version_str += 'source version %s\n' % version_components[1]
    version_str += 'protocol %s' % version_components[2]

    return version_str

if __name__ == '__main__':
    if platform.system() == 'Linux':
        default_lib = 'libfdb_c.so'
        platform_name = 'Linux'
        dlopen = 'dlopen'
    elif platform.system() == 'Windows':
        default_lib = 'fdb_c.dll'
        platform_name = 'Windows'
        dlopen = 'LoadLibrary'
    elif platform.system() == 'Darwin':
        default_lib = 'libfdb_c.dylib'
        platform_name = 'macOS'
        dlopen = 'dlopen'
    else:
        error('Unsupported platform: %s' % platform.system())

    parser = argparse.ArgumentParser(description='Prints version information for an FDB client library (e.g. %s). Must be run on a library built for the current platform (%s).' % (default_lib, platform_name))
    parser.add_argument('library_path', type=str, help='Path to the client library. If not specified, the library will be searched for according to the procedures for %s on the current platform (%s).' % (dlopen, platform_name), default=None, nargs='?')
    args = parser.parse_args()

    if args.library_path is None:
        args.library_path = default_lib
    elif not os.path.isfile(args.library_path):
        error('Library does not exist: %r' % args.library_path)

    print(get_version_string(args.library_path))
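A hypothetical invocation (the library path is illustrative):

python3 fdb_c_version.py /usr/lib/libfdb_c.so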

design/Commit/Commit.svg (new file, 67 KiB): file diff suppressed because one or more lines are too long
(new SVG file, 38 KiB): file diff suppressed because one or more lines are too long
design/Commit/GRV.svg (new file, 29 KiB): file diff suppressed because one or more lines are too long
design/Commit/Get.svg (new file, 29 KiB): file diff suppressed because one or more lines are too long
(new SVG file, 30 KiB): file diff suppressed because one or more lines are too long
(new SVG file, 39 KiB): file diff suppressed because one or more lines are too long

View File

@ -0,0 +1,215 @@
# How a commit is done in FDB
This doc describes how a commit is done in FDB 6.3+.
The commit path in FDB 6.3 and before is documented in [documentation/sphinx/source/read-write-path.rst](https://github.com/apple/foundationdb/pull/4099).
## Overall description
Legend:
* `alt` means alternative paths
* The texts in `[]` are conditions
* The texts above the arrow are messages.
The diagrams are generated using https://sequencediagram.org. The source code for the diagrams is in the `*.sequence` files.
![CommitOverall](CommitOverall.svg)
## Description of each sections
Before all RPCs mentioned below, the client first verifies whether the commit proxies and GRV proxies have changed, by comparing the client information ID it holds with the ID held by the cluster coordinator. If the IDs differ, the proxies have changed and the client refreshes its proxy list.
### GetReadVersion Section
* The GRV Proxy sends a request to the master to retrieve the current commit version. This version is the read version of the request.
### Preresolution Section
* The commit proxy sends a request for a commit version, with a request number.
  - The request number is a monotonically increasing number, unique per commit proxy.
  - This ensures that, for each proxy, the master processes its requests in order.
* The master server waits until the request number is current. When the current request number is larger than the incoming request number:
  * If a commit version is already assigned to the incoming request number, return that commit version together with the previous commit version (i.e. `prevVersion`).
  * Otherwise, return `Never`.
* Increase the current commit version and return it to the commit proxy.
* Only one process serves as master. Thus the commit version is unique for each cluster.
* The monotonically increasing commit version will ensure that each transaction is processed in a strict serial order.
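The following sketch models this version-assignment logic (a simplified illustration, not FDB source; all names are made up):
```
class MasterVersionModel:
    def __init__(self):
        self.current_version = 0
        self.latest_request_num = 0
        self.replies = {}  # request number -> (prevVersion, commit version)

    def get_commit_version(self, request_num):
        if request_num <= self.latest_request_num:
            # A stale or duplicate request: replay the stored reply if one
            # exists, otherwise never answer (modeled here as None).
            return self.replies.get(request_num)
        self.latest_request_num = request_num
        prev_version = self.current_version
        self.current_version += 1
        self.replies[request_num] = (prev_version, self.current_version)
        return self.replies[request_num]
```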
### Resolution section
* The commit proxy sends the transaction to the resolver.
* The resolver waits until its version reaches `prevVersion`.
  * This ensures that all transactions with versions smaller than this transaction's have been resolved.
* The resolver detects conflicts for the given transaction:
  * If there is no conflict, the returned status is `TransactionCommitted`.
  * If there is any conflict, the status is `TransactionConflict`.
  * If the read snapshot is older than the MVCC window, the status is `TransactionTooOld`.
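A simplified model of the three outcomes (an illustration, not FDB source; `recent_writes` maps each key to the version of its most recent commit within the MVCC window):
```
def resolve(txn, recent_writes, oldest_version):
    if txn.read_version < oldest_version:
        return 'TransactionTooOld'        # read snapshot fell out of the MVCC window
    for key in txn.read_set:
        if recent_writes.get(key, 0) > txn.read_version:
            return 'TransactionConflict'  # a later commit overlapped our reads
    return 'TransactionCommitted'
```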
### Post Resolution section
* The proxy waits until the local batch number is current
* The proxy updates the metadata keys and attaches corresponding storage servers' tags to all mutations.
* The proxy then waits until the commit version is current, i.e. the proxy's committed version catches up to the commit version of the batch so that the two versions are within the MVCC window.
* The proxy pushes the commit data to the TLogs.
* Each TLog waits for the commit version to be current, then persists the commit.
* The proxy waits until *all* TLogs return the transaction result.
### Reply section
* The proxy updates the master with the committed version, so the master can serve subsequent GRV requests.
* The proxy replies to the client, based on the result from the resolver.
## Tracking the process using `g_traceBatch`
`g_traceBatch` can be used for querying the transactions and commits. A typical query in the trace logs is:
```
Type=type Location=location
```
The format of `location` is, in general, `<source_file_name>.<function/actor name>.<log information>`, e.g.
```
NativeAPI.getConsistentReadVersion.Before
```
means the `location` is at `NativeAPI.actor.cpp`, `ACTOR` `getConsistentReadVersion`, `Before` requesting the read version from GRV Proxy.
Some example queries are:
```
Type=TransactionDebug Location=NativeAPI*
```
```
LogGroup=loggroup Type=CommitDebug Location=Resolver.resolveBatch.*
```
In the following sections, a <span style="color:green">green</span> tag indicates an attach; a <span style="color:blue">blue</span> tag indicates an event whose location follows the format mentioned above, where only the `<log information>` is included; a <span style="color:lightblue">light-blue</span> tag indicates an event whose location does not follow the format, where the full location is included. All the `g_traceBatch` events are tabularized after the diagram.
`contrib/commit_debug.py` can be used to visualize the commit process.
### Get Read Version
![GetReadVersion](GRV.svg)
| **Role** | **File name** | **Function/Actor** | **Trace** | **Type** | **Location** |
| ------------ | -------------- | --------------------------- | --------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ |
| **Client** | NativeAPI | Transaction::getReadVersion | | | |
| | | readVersionBatcher | | [*TransactionAttachID*](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L4639) | |
| | | getConsistentReadVersion | Before | TransactionDebug | [NativeAPI.getConsistentReadVersion.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L4564) |
| **GRVProxy** | GrvProxyServer | queueGetReadVersionRequests | Before | TransactionDebug | [GrvProxyServer.queueTransactionStartRequests.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/GrvProxyServer.actor.cpp#L373-L375) |
| | | transactionStarter | | [*TransactionAttachID*](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/GrvProxyServer.actor.cpp#L734-L735) | |
| | | | AskLiveCommittedVersionFromMaster | TransactionDebug | [GrvProxyServer.transactionStarter.AskLiveCommittedVersionFromMaster](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/GrvProxyServer.actor.cpp#L787-L789) |
| | | getLiveCommittedVersion | confirmEpochLive | TransactionDebug | [GrvProxyServer.getLiveCommittedVersion.confirmEpochLive](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/GrvProxyServer.actor.cpp#L479-L480) |
| **Master** | MasterServer | serveLiveCommittedVersion | GetRawCommittedVersion | TransactionDebug | [MasterServer.serveLiveCommittedVersion.GetRawCommittedVersion](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/masterserver.actor.cpp#L1187-L1189) |
| **GRVProxy** | GrvProxyServer | getLiveCommittedVersion | After | TransactionDebug | [GrvProxyServer.getLiveCommittedVersion.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/GrvProxyServer.actor.cpp#L500-L501) |
| **Client** | NativeAPI | getConsistentReadVersion | After | TransactionDebug | [NativeAPI.getConsistentReadVersion.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L4594-L4595) |
### Get
![Get](Get.svg)
| **Role** | **File name** | **Function/Actor** | **Trace** | **Name** | **Location** | **Notes** |
| ------------------ | ------------------- | ----------------------------------- | ------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
| **Client** | NativeAPI | Transaction::get | | | | |
| | | Transaction::getReadVersion | | | *(Refer to GetReadVersion)* | |
| | | getKeyLocation | Before | TransactionDebug | [NativeAPI.getKeyLocation.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L1975-L1976) | getKeyLocation is called by getValue, getKeyLocation actually calls getKeyLocation_internal |
| | | | After | TransactionDebug | [NativeAPI.getKeyLocation.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L1988-L1989) | |
| | | getValue | | [*GetValueAttachID*](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2164) | | |
| | | | Before | GetValueDebug | [NativeAPI.getValue.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2165-L2167) | |
| **Storage Server** | StorageServer | serveGetValueRequests | received | GetValueDebug | [StorageServer.received](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L4325-L4327) | |
| | | getValueQ | DoRead | GetValueDebug | [getValueQ.DoRead](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1115-L1117) | |
| | | | AfterVersion | GetValueDebug | [getValueQ.AfterVersion](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1122-L1124) | |
| | KeyValueStoreSQLite | KeyValueStoreSQLite::Reader::action | Before | GetValueDebug | [Reader.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/KeyValueStoreSQLite.actor.cpp#L1654-L1656) | |
| | | | After | GetValueDebug | [Reader.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/KeyValueStoreSQLite.actor.cpp#L1662-L1664) | |
| | StorageServer | | AfterRead | GetValueDebug | [getValueQ.AfterRead](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1185-L1187) | |
| **Client** | NativeAPI | getValue | After | GetValueDebug | [NativeAPI.getValue.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2216-L2218) | (When successful) |
| | | | Error | GetValueDebug | [NativeAPI.getValue.Error](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2232-L2234) | (On failure) |
### Get Range
![GetRange](GetRange.svg)
| **Role** | **File name** | **Function/Actor** | **Trace** | **Name** | **Location** | **Notes** |
| ------------------ | ------------- | --------------------------- | -------------- | ---------------- | ------------------------------------------------------------ | ------------------------------------ |
| **Client** | NativeAPI | Transaction::getRange | | | | |
| | | Transaction::getReadVersion | | | *(Refer to GetReadVersion)* | |
| | | getKeyLocation | Before | TransactionDebug | [NativeAPI.getKeyLocation.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L1975) | getKeyLocation is called by getRange |
| | | | After | TransactionDebug | [NativeAPI.getKeyLocation.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L1988-L1989) | |
| | | getRange | Before | TransactionDebug | [NativeAPI.getRange.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L3004) | |
| **Storage Server** | storageserver | getKeyValuesQ | Before | TransactionDebug | [storageserver.getKeyValues.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1812) | |
| | | | AfterVersion | TransactionDebug | [storageserver.getKeyValues.AfterVersion](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1821) | |
| | | | AfterKeys | TransactionDebug | [storageserver.getKeyValues.AfterKeys](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1846) | |
| | | | Send | TransactionDebug | [storageserver.getKeyValues.Send](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1866) | (When no keys found) |
| | | | AfterReadRange | TransactionDebug | [storageserver.getKeyValues.AfterReadRange](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1886) | (When found keys in this SS) |
| **Client** | NativeAPI | getRange | After | TransactionDebug | [NativeAPI.getRange.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L3044-L3046) | (When successful) |
| | | | Error | TransactionDebug | [NativeAPI.getRange.Error](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L3155-L3156) | (On failure) |
### GetRange Fallback
![GetRangeFallback](GetRangeFallback.svg)
| **Role** | **File name** | **Function/Actor** | **Trace** | **Type** | **Location** | **Notes** |
| ---------- | ------------- | -------------------- | ------------ | ---------------- | ------------------------------------------------------------ | ----------------------------------------------- |
| **Client** | NativeAPI | getRangeFallback | | | | |
| | | getKey | | | *GetKeyAttachID* | |
| | | | AfterVersion | GetKeyDebug | [NativeAPI.getKey.AfterVersion](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2263-L2266) | |
| | | | Before | GetKeyDebug | [NativeAPI.getKey.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2285-L2288) | |
| | | | After | GetKeyDebug | [NativeAPI.getKey.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2316-L2318) | Success |
| | | | Error | GetKeyDebug | [NativeAPI.getKey.Error](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2326) | Error |
| | | getReadVersion | | | | *(Refer to GetReadVersion)* |
| | | getKeyRangeLocations | Before | TransactionDebug | [NativeAPI.getKeyLocations.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2029) | |
| | | | After | TransactionDebug | [NativeAPI.getKeyLocations.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2044) | |
| | | getExactRange | Before | TransactionDebug | [NativeAPI.getExactRange.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2674) | getKeyRangeLocations is called by getExactRange |
| | | | After | TransactionDebug | [NativeAPI.getExactRange.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2707) | |
### Commit
![Commit](Commit.svg)
| **Role** | **File name** | **Function/Actor** | **Trace** | **Type** | **Location** | **Notes** |
| ---------------- | ----------------- | ------------------------------------------- | -------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | --------- |
| **Client** | NativeAPI | Transaction::commit | | | | |
| | | commitAndWatch | | | | |
| | | tryCommit | | *[commitAttachID](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L4100)* | | |
| | | | Before | CommitDebug | [NativeAPI.commit.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L4101-L4102) | |
| **Commit Proxy** | CommitProxyServer | commitBatcher | batcher | CommitDebug | [CommitProxyServer.batcher](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L244-L245) | |
| | | commitBatch | | | | |
| | | CommitBatchContext::setupTraceBatch | | *[CommitAttachID](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L526)* | | |
| | | | Before | CommitDebug | [CommitProxyServer.commitBatch.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L532) | |
| | | CommitBatchContext::preresolutionProcessing | GettingCommitVersion | CommitDebug | [CommitProxyServer.commitBatch.GettingCommitVersion](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L616-L617) | |
| | | | GotCommitVersion | CommitDebug | [CommitProxyServer.commitBatch.GotCommitVersion](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L643) | |
| **Resolver** | Resolver | resolveBatch | | *[CommitAttachID](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/Resolver.actor.cpp#L116)* | | |
| | | | Before | CommitDebug | [Resolver.resolveBatch.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/Resolver.actor.cpp#L117) | |
| | | | AfterQueueSizeCheck | CommitDebug | [Resolver.resolveBatch.AfterQueueSizeCheck](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/Resolver.actor.cpp#L137) | |
| | | | AfterOrderer | CommitDebug | [Resolver.resolveBatch.AfterOrderer](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/Resolver.actor.cpp#L172) | |
| | | | After | CommitDebug | [Resolver.resolveBatch.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/Resolver.actor.cpp#L296) | |
| **Commit Proxy** | CommitProxyServer | CommitBatchContext::postResolution | ProcessingMutations | CommitDebug | [CommitProxyServer.CommitBatch.ProcessingMutations](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L1074) | |
| | | | AfterStoreCommits | CommitDebug | [CommitProxyServer.CommitBatch.AfterStoreCommits](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L1154) | |
| **TLog** | TLogServer | tLogCommit | | *[commitAttachID](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/TLogServer.actor.cpp#L2047)* | | |
| | | | BeforeWaitForVersion | CommitDebug | [TLogServer.tLogCommit.BeforeWaitForVersion](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/TLogServer.actor.cpp#L2048) | |
| | | | Before | CommitDebug | [TLog.tLogCommit.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/TLogServer.actor.cpp#L2083) | |
| | | | AfterTLogCommit | CommitDebug | [TLog.tLogCommit.AfterTLogCommit](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/TLogServer.actor.cpp#L2107) | |
| | | | After | CommitDebug | [TLog.tLogCommit.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/TLogServer.actor.cpp#L2125) | |
| **Commit Proxy** | CommitProxyServer | CommitBatchContext::reply | AfterLogPush | CommitDebug | [CommitProxyServer.CommitBatch.AfterLogPush](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L1263) | |
| **Client** | NativeAPI | tryCommit | After | CommitDebug | [NativeAPI.commit.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L4152) | |
| | | commitAndWatch | | | | |
| | | watchValue | | *[WatchValueAttachID](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2408)* | | |
| | | | Before | WatchValueDebug | [NativeAPI.watchValue.Before]() | |
| | | | After | WatchValueDebug | [NativeAPI.watchValue.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2431-L2433) | |

View File

@ -0,0 +1,148 @@
title Commit
participantgroup **Client** (NativeAPI.actor.cpp)
participant "Transaction::commit" as tC
participant "commitAndWatch" as cAW
participant "tryCommit" as Commit
participant "watchValue" as wV
end
participantgroup **CommitProxy** (CommitProxyServer.actor.cpp)
participant "commitBatcher" as cB
participant "commitBatch" as Batch
participant "TagPartitionedLogSystem" as TPLS
end
participantgroup **Master**
participant "getVersion" as gV
participant "serveLiveCommittedVersion" as sLCV
end
participantgroup **Resolver** (Resolver.actor.cpp)
participant "resolveBatch" as rB
end
participantgroup **TLog** (TLogServer.actor.cpp)
participant "tLogCommit" as tLC
end
participantgroup **Storage Server** (storageserver.actor.cpp)
participant "serveWatchValueRequests" as sWVR
end
autoactivation off
tC -> cAW:
cAW -> Commit: CommitTransactionRequest
note right of Commit: <color:#green>//CommitAttachID//</color>
note right of Commit: <color:#lightblue>NativeAPI.commit.Before</color>
Commit -> cB: CommitTransactionRequest
loop Batch requests
box over cB: Batch commit requests
end
cB -> Batch: Batched CommitTransactionRequests
note right of Batch: <color:#lightblue>--CommitProxyServer.batcher--</color>
box over Batch: Preresolution
note right of Batch: <color:#blue>GettingCommitVersion</color>
Batch -> gV: GetCommitVersionRequest
gV -> Batch: GetCommitVersionReply
note right of Batch: <color:#blue>GotCommitVersion</color>
box over Batch: Resolve
Batch -> rB: ResolveTransactionBatchRequest
note right of rB: <color:#blue>Before</color>
box over rB: Wait for memory/needed version
note right of rB: <color:#blue>AfterQueueSizeCheck</color>
box over rB: Wait for resolver version
note right of rB: <color:#blue>AfterOrderer</color>
box over rB: Resolve the conflicts
note right of rB: <color:#blue>After</color>
rB --> Batch: ResolveTransactionBatchReply
note right of Batch: <color:#blue>ProcessingMutations</color>
box over Batch: Calculate the metadata
box over Batch: Determine which transactions should be committed
box over Batch: Assign storage server tags to mutations
loop Wait txn commit version enter the MVCC window
Batch -> sLCV: GetRawCommittedVersionRequest
sLCV --> Batch: GetRawCommittedVersionReply
end
note right of Batch: <color:#blue>AfterStoreCommits</color>
Batch -> TPLS: Version, LogPushData
TPLS -> tLC: TLogCommitRequest
note right of tLC: <color:#green>//CommitAttachID//</color>
note right of tLC: <color:#blue>BeforeWaitForVersion</color>
box over tLC: Wait for the version
note right of tLC: <color:#blue>Before</color>
box over tLC: Store the commit
box over tLC: Put commit into persistent queue
note right of tLC: <color:#blue>AfterTLogCommit</color>
box over tLC: Wait for all prior messages to be committed
note right of tLC: <color:#blue>After</color>
tLC --> TPLS: TLogCommitReply
TPLS -> Batch: Version (min)
note right of Batch: AfterLogPush
Batch --> Commit: CommitID
note right of Commit: <color:#lightblue>--NativeAPI.commit.After--</color>
Commit --> cAW:
cAW -> wV: Version
note right of wV: <color:#green>//WatchValueAttachID//</color>
note right of wV: <color:#blue>Before</color>
wV -> sWVR: WatchValueRequest
note right of sWVR: <color:#lightblue>--watchValueQ.Before--</color>
box over sWVR: Ensure version is not too old
note right of sWVR: <color:#lightblue>--watchValueQ.AfterVersion--</color>
loop Value not changed
box over sWVR: Check storageserver::getValueQ
note right of sWVR: <color:#lightblue>--watchValueQ.AfterRead--</color>
end
sWVR --> wV: Version
note right of wV: <color:#blue>After</color>
cAW --> tC:

View File

@ -0,0 +1,54 @@
title Commit in FoundationDB
participant "Client" as C
participant "GetReadVersionProxy" as GRV
participant "CommitProxy" as P
participant "Master" as M
participant "Resolver" as R
participant "TLog" as T
C ->> GRV: Request read version
GRV ->> M: Request committed version
M ->> GRV: Respond committed version
GRV ->> C: Respond read version
C ->> P: Commit a mutation with read version
box right of P: Pre-resolution
P ->> M: Request a commit version
alt New request
M ->> P: Commit version
else Replied before with a commit version
M ->> P: Commit version
else Replied before without commit version
M --x P: Never
end
box right of P: Resolution
P ->> R: Send the transaction to the resolver
alt No conflict
R ->> P: TransactionCommitted
else Conflict
R ->> P: TransactionConflict
else Read snapshot older than oldest version
R ->> P: TransactionTooOld
end
box right of P: Post-resolution
P ->> T: Push the transaction data to TLog
alt TLog not stopped
T ->> P: The version of the transactions that are already durable
else TLog stopped
T ->> P: tlog_stopped
end
box right of P: Reply
P ->> M: Report raw commit version
M -->> P: Void
alt Commit successful
P ->> C: Commit version
else Conflict
P ->> C: Not committed: conflict
else Transaction too old
P ->> C: Not committed: too old
end

View File

@ -0,0 +1,68 @@
title Get
participantgroup **Client** (NativeAPI.actor.cpp)
participant "Transaction::get" as get
participant "Transaction::getReadVersion" as gRV
participant "getValue" as gV
participant "getKeyLocation" as gKL
end
participantgroup **CommitProxy** (CommitProxyServer.actor.cpp)
participant "doKeyServerLocationRequest" as dKSLR
end
participantgroup **Storage Server** (storageserver.actor.cpp)
participant "serveGetValueRequests" as sGVR
participant "getValueQ" as gVQ
end
participantgroup **KeyValueStoreSQLite** (KeyValueStoreSQLite.actor.cpp)
participant "KeyValueStoreSQLite::Reader::action" as axn
end
autoactivation off
get -> gRV:
box over gRV: //Consult Get Read Version section//
gRV --> get: Version
get -> gV: Version, Key
gV -> gKL: Key
note right of gKL: <color:#blue>Before</color>
gKL -> dKSLR: GetKeyServerLocationsRequest
dKSLR --> gKL: GetKeyServerLocationsReply
note right of gKL: <color:#blue>After</color>
gKL --> gV: LocationInfo
note right of gV: <color:#green>//GetValueAttachID//</color>
note right of gV: <color:#blue>Before</color>
gV -> sGVR: GetValueRequest
note right of sGVR: <color:#lightblue>--storageServer.received--</color>
sGVR -> gVQ: GetValueRequest
note right of gVQ: <color:#lightblue>--getValueQ.DoRead--</color>
note right of gVQ: <color:#lightblue>--getValueQ.AfterVersion--</color>
gVQ -> axn: Key
note right of axn: <color:#lightblue>--Reader.Before--</color>
note right of axn: <color:#lightblue>--Reader.After--</color>
axn --> gVQ: Value
note right of gVQ: <color:#lightblue>--getValueQ.AfterRead--</color>
gVQ --> gV: GetValueReply
alt Error
note right of gV: <color:#blue>Error</color>
gV --> get: Error
else Success
note right of gV: <color:#blue>After</color>
gV --> get: Value
end

View File

@ -0,0 +1,60 @@
title GetRange
participantgroup **Client** (NativeAPI.actor.cpp)
participant "Transaction::getRange" as tGR
participant "Transaction::getReadVersion" as gRV
participant "getRange" as gR
participant "getKeyLocation" as gKL
end
participantgroup **Storage Server** (storageserver.actor.cpp)
participant "getKeyValuesQ" as gKVQ
end
autoactivation off
tGR -> gRV:
tGR -> gR: KeyRange
gRV -->(2) gR: Version
loop Keys in the range
gR -> gKL: Key
box over gKL: //Consult Get section//
gKL --> gR: LocationInfo
note right of gR: <color:#blue>Before</color>
gR -> gKVQ: GetKeyValuesRequest
note right of gKVQ: <color:#lightblue>--storageserver.getKeyValues.Before--</color>
box over gKVQ: Wait for the SS version
note right of gKVQ: <color:#lightblue>--storageserver.getKeyValues.AfterVersion--</color>
box over gKVQ: Realign the keys
note right of gKVQ: <color:#lightblue>--storageserver.getKeyValues.AfterKeys--</color>
alt No KV pair stored in this server
note right of gKVQ: <color:#lightblue>--storageserver.getKeyValues.Send--</color>
gKVQ --> gR: GetKeyValuesReply (empty)
else KV pair found
note right of gKVQ: <color:#lightblue>--storageserver.getKeyValues.AfterReadRange--</color>
gKVQ --> gR: GetKeyValuesReply
end
note right of gR: <color:#blue>After</color>
box over gR: Combine the results
end
alt Error
note right of gR: <color:#blue>Error</color>
box over gR: Fallback
gR -> tGR: RangeResultRef or Error
else Successful
gR -> tGR: RangeResultRef
end

View File

@ -0,0 +1,80 @@
title GetRange Fallback
participantgroup **Client** (NativeAPI.actor.cpp)
participant "getRangeFallback" as gRF
participant "getKey" as gK
participant "getExactRange" as gER
participant "getKeyRangeLocations" as gKRL
end
participantgroup **Storage Server** (storageserver.actor.cpp)
participant "serveGetKeyValuesRequests" as sGKVR
participant "serveGetKeyRequests" as sGKR
end
autoactivation off
opt Key needs resolution
gRF -> gK: KeySelector
box over gK: Wait for the version
note right of gK: <color:#green>//GetKeyAttachID//</color>
note right of gK: <color:#blue>AfterVersion</color>
box over gK: See getKeyLocation in Get
note right of gK: <color:#blue>Before</color>
gK -> sGKR: GetKeyRequest
sGKR --> gK: GetKeyReply
alt Success
note right of gK: <color:#blue>After</color>
gK --> gRF: Key
else Error
note right of gK: <color:#blue>Error</color>
end
end
box over gRF: Update read version if necessary
gRF -> gER: Version, KeyRangeRef
loop Loop over keys in the range
gER -> gKRL: KeyRange
note right of gKRL: <color:#blue>Before</color>
box over gKRL: Get the locations
note right of gKRL: <color:#blue>After</color>
gKRL --> gER: LocationInfo
loop Loop over shards
note right of gER: <color:#blue>Before</color>
gER -> sGKVR: GetKeyValuesRequest
note right of sGKVR: <color:#lightblue>--storageserver.getKeyValues.Before--</color>
box over sGKVR: Wait for the SS version
note right of sGKVR: <color:#lightblue>--storageserver.getKeyValues.AfterVersion--</color>
box over sGKVR: Realign the keys
note right of sGKVR: <color:#lightblue>--storageserver.getKeyValues.AfterKeys--</color>
alt No KV pair stored in this server
note right of sGKVR: <color:#lightblue>--storageserver.getKeyValues.Send--</color>
sGKVR --> gER: GetKeyValuesReply (empty)
else KV pair found
note right of sGKVR: <color:#lightblue>--storageserver.getKeyValues.AfterReadRange--</color>
sGKVR --> gER: GetKeyValuesReply
end
note right of gER: <color:#blue>After</color>
end
end
gER --> gRF: RangeResultRef

View File

@ -0,0 +1,66 @@
title Get Read Version
participantgroup **Client** (NativeAPI.actor.cpp)
participant "Transaction::getReadVersion" as gRV
participant "readVersionBatcher" as rVB
participant "getConsistentReadVersion" as gCRV
end
participantgroup **GRVProxy** (GrvProxyServer.actor.cpp)
participant "queueGetReadVersionRequests" as qGRVR
participant "transactionStarter" as tS
participant "getLiveCommittedVersion" as gLCV
end
participantgroup **Master** (masterserver.actor.cpp)
participant "serveLiveCommittedVersion" as sLCV
end
autoactivation off
gRV -> rVB: VersionRequest
loop Batch requests
box over rVB: Batch read version requests
end
note right of rVB: <color:#green>//TransactionAttachID//</color>
rVB -> gCRV:
note right of gCRV: <color:#blue>Before</color>
gCRV -> qGRVR: GetReadVersionRequest
loop Batch requests
box over qGRVR: Batch read version requests
end
note right of qGRVR: <color:#lightblue>--GrvProxyServer.queueTransactionStartRequests.Before--</color>
qGRVR -> tS:
note right of tS: <color:#green>//TransactionAttachID//</color>
note right of tS: <color:#blue>AskLiveCommittedVersionFromMaster</color>
tS -> gLCV:
note right of gLCV: <color:#blue>confirmEpochLive</color>
gLCV -> sLCV: GetRawCommittedVersionRequest
note right of sLCV: <color:#blue>GetRawCommittedVersion</color>
sLCV --> gLCV: GetRawCommittedVersionReply
note right of gLCV: <color:#blue>After</color>
gLCV --> gCRV: GetReadVersionReply
note right of gCRV: <color:#blue>After</color>
gCRV --> rVB: GetReadVersionReply
rVB --> gRV: GetReadVersionReply

View File

@ -244,6 +244,9 @@ The ``start`` subcommand is used to start a backup. If there is already a backu
``-s <DURATION>`` or ``--snapshot_interval <DURATION>``
Specifies the duration, in seconds, of the inconsistent snapshots written to the backup in continuous mode. The default is 864000, which is 10 days.
``--initial_snapshot_interval <DURATION>``
Specifies the duration, in seconds, of the first inconsistent snapshot written to the backup. The default is 0, which means as fast as possible.
``--partitioned_log_experimental``
Specifies that the backup uses the partitioned mutation logs generated by backup workers. As of FDB version 6.3, this option is experimental and requires using fast restore to restore the database from the generated files. The default is to use non-partitioned mutation logs generated by backup agents.
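For example, a hypothetical invocation combining these options (the destination URL is illustrative)::

    fdbbackup start -d file:///backups/mybackup -s 864000 --initial_snapshot_interval 60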
@ -487,6 +490,9 @@ The ``start`` command will start a new restore on the specified (or default) tag
``--orig_cluster_file <CONNFILE>``
The cluster file for the original database from which the backup was created. The original database is only needed to convert a --timestamp argument to a database version.
``--inconsistent_snapshot_only``
Ignore mutation log files during the restore to speed up the process. Because only range files are restored, this option yields an inconsistent snapshot in most cases and is not recommended for general use.
.. program:: fdbrestore abort
``abort``

View File

@ -315,7 +315,7 @@ and pass the test with ``-f``:
.. code-block:: sh
fdbserver -r simulator -f testfile.txt
fdbserver -r simulation -f testfile.txt
Running a Workload on an actual Cluster

View File

@ -949,6 +949,12 @@ that process, and wait for necessary data to be moved away.
#. ``\xff\xff/management/options/failed/force`` Read/write. Setting this key disables safety checks for writes to ``\xff\xff/management/failed/<exclusion>``. Setting this key only has an effect in the current transaction and is not persisted on commit.
#. ``\xff\xff/management/min_required_commit_version`` Read/write. Changing this key will change the corresponding system key ``\xff/minRequiredCommitVersion = [[Version]]``. The value of this special key is the literal text of the underlying ``Version``, which is ``int64_t``. If you set the key to a value that cannot be parsed as ``int64_t``, ``special_keys_api_failure`` will be thrown. In addition, the given ``Version`` should be larger than the current read version and smaller than the upper bound (``2**63-1-version_per_second*3600*24*365*1000``); otherwise, ``special_keys_api_failure`` is thrown. For more details, see the help text of the ``fdbcli`` command ``advanceversion``.
#. ``\xff\xff/management/profiling/<client_txn_sample_rate|client_txn_size_limit>`` Read/write. Changing these two keys will change the corresponding system keys ``\xff\x02/fdbClientInfo/<client_txn_sample_rate|client_txn_size_limit>``, respectively. The value of ``\xff\xff/management/client_txn_sample_rate`` is a literal text of ``double``, and the value of ``\xff\xff/management/client_txn_size_limit`` is a literal text of ``int64_t``. A special value ``default`` can be set to or read from these two keys, representing that client profiling is disabled. In addition, ``clear`` operations in this range are not allowed. For more details, see the help text of the ``fdbcli`` command ``profile client``.
#. ``\xff\xff/management/maintenance/<zone_id> := <seconds>`` Read/write. Setting or clearing a key in this range will change the corresponding system key ``\xff\x02/healthyZone``. The value is a literal text of a non-negative ``double`` which represents the remaining time for the zone to be in maintenance. Committing an invalid value will throw ``special_keys_api_failure``. Only one zone is allowed to be in maintenance at a time; setting a new key in the range will override the old one, and the transaction will throw a ``special_keys_api_failure`` error if more than one zone is given. For more details, see the help text of the ``fdbcli`` command ``maintenance``.
In addition, a special key ``\xff\xff/management/maintenance/IgnoreSSFailures`` in the range, if set, will disable data distribution for storage server failures.
It does the same thing as the ``fdbcli`` command ``datadistribution disable ssfailure``.
Maintenance mode cannot be used until the key is cleared, which is the same as the ``fdbcli`` command ``datadistribution enable ssfailure``.
While the key is set, any commit that tries to set a key in the range will fail with the ``special_keys_api_failure`` error.
#. ``\xff\xff/management/data_distribution/<mode|rebalance_ignored>`` Read/write. Changing these two keys will change the two corresponding system keys ``\xff/dataDistributionMode`` and ``\xff\x02/rebalanceDDIgnored``. The value of ``\xff\xff/management/data_distribution/mode`` is a literal text of ``0`` (disable) or ``1`` (enable). Transactions committed with invalid values will throw ``special_keys_api_failure``. The value of ``\xff\xff/management/data_distribution/rebalance_ignored`` is empty. If present, it means data distribution is disabled for rebalancing. Any transaction committed with a non-empty value for this key will throw ``special_keys_api_failure``. For more details, see the help text of the ``fdbcli`` command ``datadistribution``.
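For illustration, a transaction using the Python bindings might put a zone into maintenance as follows (a sketch only; ``zone1`` and the duration are made up, and it assumes the special-key-space write option available in recent API versions)::

    tr.options.set_special_key_space_enable_writes()
    tr[b'\xff\xff/management/maintenance/zone1'] = b'600'  # seconds remaining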
An exclusion is syntactically either an ip address (e.g. ``127.0.0.1``), or
an ip address and port (e.g. ``127.0.0.1:4500``). If no port is specified,

View File

@ -27,6 +27,7 @@
"storage",
"transaction",
"resolution",
"stateless",
"commit_proxy",
"grv_proxy",
"master",
@ -120,7 +121,7 @@
"counter":0,
"roughness":0.0
},
"grv_latency_statistics":{
"grv_latency_statistics":{ // GRV Latency metrics are grouped according to priority (currently batch or default).
"default":{
"count":0,
"min":0.0,
@ -132,6 +133,18 @@
"p95":0.0,
"p99":0.0,
"p99.9":0.0
},
"batch":{
"count":0,
"min":0.0,
"max":0.0,
"median":0.0,
"mean":0.0,
"p25":0.0,
"p90":0.0,
"p95":0.0,
"p99":0.0,
"p99.9":0.0
}
},
"read_latency_statistics":{

View File

@ -2,6 +2,10 @@
Release Notes
#############
6.3.13
======
* The multi-version client now requires at most two client connections with version 6.2 or larger, regardless of how many external clients are configured. Clients older than 6.2 will continue to create an additional connection each. `(PR #4667) <https://github.com/apple/foundationdb/pull/4667>`_
6.3.12
======
* Change the default for --knob_tls_server_handshake_threads to 64. The previous default was 1000. This avoids starting 1000 threads by default, but may adversely affect recovery time for large clusters using TLS. Users with large TLS clusters should consider explicitly setting this knob in their foundationdb.conf file. `(PR #4421) <https://github.com/apple/foundationdb/pull/4421>`_

View File

@ -15,7 +15,8 @@ Features
Performance
-----------
* Increased performance of dr_agent when copying the mutation log. The ``COPY_LOG_BLOCK_SIZE``, ``COPY_LOG_BLOCKS_PER_TASK``, ``COPY_LOG_PREFETCH_BLOCKS``, ``COPY_LOG_READ_AHEAD_BYTES`` and ``COPY_LOG_TASK_DURATION_NANOS`` knobs can be set. `(PR 3436) <https://github.com/apple/foundationdb/pull/3436>`_
* Increased performance of dr_agent when copying the mutation log. The ``COPY_LOG_BLOCK_SIZE``, ``COPY_LOG_BLOCKS_PER_TASK``, ``COPY_LOG_PREFETCH_BLOCKS``, ``COPY_LOG_READ_AHEAD_BYTES`` and ``COPY_LOG_TASK_DURATION_NANOS`` knobs can be set. `(PR #3436) <https://github.com/apple/foundationdb/pull/3436>`_
* Reduced the number of connections required by the multi-version client when loading external clients. When connecting to 7.0 clusters, only one connection with version 6.2 or larger will be used. With older clusters, at most two connections with version 6.2 or larger will be used. Clients older than version 6.2 will continue to create an additional connection each. `(PR #4667) <https://github.com/apple/foundationdb/pull/4667>`_
Reliability
-----------

View File

@ -0,0 +1,90 @@
/*
* BackupTLSConfig.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <iostream>
#include "fdbclient/NativeAPI.actor.h"
#include "flow/Arena.h"
#include "flow/Error.h"
#include "flow/network.h"
#include "fdbbackup/BackupTLSConfig.h"
void BackupTLSConfig::setupBlobCredentials() {
// Add blob credentials files from the environment to the list collected from the command line.
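// e.g. FDB_BLOB_CREDENTIALS=/etc/foundationdb/creds1.json:/etc/foundationdb/creds2.json
// (illustrative paths; entries are colon-separated, as parsed below)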
const char* blobCredsFromENV = getenv("FDB_BLOB_CREDENTIALS");
if (blobCredsFromENV != nullptr) {
StringRef t((uint8_t*)blobCredsFromENV, strlen(blobCredsFromENV));
do {
StringRef file = t.eat(":");
if (file.size() != 0)
blobCredentials.push_back(file.toString());
} while (t.size() != 0);
}
// Update the global blob credential files list
std::vector<std::string>* pFiles = (std::vector<std::string>*)g_network->global(INetwork::enBlobCredentialFiles);
if (pFiles != nullptr) {
for (auto& f : blobCredentials) {
pFiles->push_back(f);
}
}
}
bool BackupTLSConfig::setupTLS() {
if (tlsCertPath.size()) {
try {
setNetworkOption(FDBNetworkOptions::TLS_CERT_PATH, tlsCertPath);
} catch (Error& e) {
std::cerr << "ERROR: cannot set TLS certificate path to " << tlsCertPath << " (" << e.what() << ")\n";
return false;
}
}
if (tlsCAPath.size()) {
try {
setNetworkOption(FDBNetworkOptions::TLS_CA_PATH, tlsCAPath);
} catch (Error& e) {
std::cerr << "ERROR: cannot set TLS CA path to " << tlsCAPath << " (" << e.what() << ")\n";
return false;
}
}
if (tlsKeyPath.size()) {
try {
if (tlsPassword.size())
setNetworkOption(FDBNetworkOptions::TLS_PASSWORD, tlsPassword);
setNetworkOption(FDBNetworkOptions::TLS_KEY_PATH, tlsKeyPath);
} catch (Error& e) {
std::cerr << "ERROR: cannot set TLS key path to " << tlsKeyPath << " (" << e.what() << ")\n";
return false;
}
}
if (tlsVerifyPeers.size()) {
try {
setNetworkOption(FDBNetworkOptions::TLS_VERIFY_PEERS, tlsVerifyPeers);
} catch (Error& e) {
std::cerr << "ERROR: cannot set TLS peer verification to " << tlsVerifyPeers << " (" << e.what()
<< ")\n";
return false;
}
}
return true;
}

View File

@ -0,0 +1,41 @@
/*
* BackupTLSConfig.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBBACKUP_BACKUPTLSCONFIG_H
#define FDBBACKUP_BACKUPTLSCONFIG_H
#pragma once
#include <string>
#include <vector>
// TLS and blob credentials for backups and setup for these credentials.
struct BackupTLSConfig {
std::string tlsCertPath, tlsKeyPath, tlsCAPath, tlsPassword, tlsVerifyPeers;
std::vector<std::string> blobCredentials;
// Returns true if TLS setup is successful
bool setupTLS();
// Sets up blob credentials. Also adds the files specified by the FDB_BLOB_CREDENTIALS environment variable.
// Note this must be called after g_network is set up.
void setupBlobCredentials();
};
#endif // FDBBACKUP_BACKUPTLSCONFIG_H

View File

@ -1,5 +1,7 @@
set(FDBBACKUP_SRCS
backup.actor.cpp)
BackupTLSConfig.h
BackupTLSConfig.cpp
backup.actor.cpp)
add_flow_target(EXECUTABLE NAME fdbbackup SRCS ${FDBBACKUP_SRCS})
target_link_libraries(fdbbackup PRIVATE fdbclient)
@ -11,6 +13,8 @@ add_flow_target(EXECUTABLE NAME fdbconvert SRCS ${FDBCONVERT_SRCS})
target_link_libraries(fdbconvert PRIVATE fdbclient)
set(FDBDECODE_SRCS
BackupTLSConfig.h
BackupTLSConfig.cpp
FileDecoder.actor.cpp
FileConverter.h)
add_flow_target(EXECUTABLE NAME fdbdecode SRCS ${FDBDECODE_SRCS})

View File

@ -24,6 +24,7 @@
#include <cinttypes>
#include "flow/SimpleOpt.h"
#include "flow/TLSConfig.actor.h"
namespace file_converter {
@ -31,6 +32,7 @@ namespace file_converter {
enum {
OPT_CONTAINER,
OPT_BEGIN_VERSION,
OPT_BLOB_CREDENTIALS,
OPT_CRASHONERROR,
OPT_END_VERSION,
OPT_TRACE,
@ -55,6 +57,10 @@ CSimpleOpt::SOption gConverterOptions[] = { { OPT_CONTAINER, "-r", SO_REQ_SEP },
{ OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP },
{ OPT_INPUT_FILE, "-i", SO_REQ_SEP },
{ OPT_INPUT_FILE, "--input", SO_REQ_SEP },
{ OPT_BLOB_CREDENTIALS, "--blob_credentials", SO_REQ_SEP },
#ifndef TLS_DISABLED
TLS_OPTION_FLAGS
#endif
{ OPT_BUILD_FLAGS, "--build_flags", SO_NONE },
{ OPT_HELP, "-?", SO_NONE },
{ OPT_HELP, "-h", SO_NONE },

View File

@ -22,10 +22,12 @@
#include <iostream>
#include <vector>
#include "fdbbackup/BackupTLSConfig.h"
#include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/BackupContainer.h"
#include "fdbbackup/FileConverter.h"
#include "fdbclient/MutationList.h"
#include "flow/Trace.h"
#include "flow/flow.h"
#include "flow/serialize.h"
#include "fdbclient/BuildFlags.h"
@ -38,31 +40,52 @@ extern bool g_crashOnError;
namespace file_converter {
void printDecodeUsage() {
std::cout << "\n"
" -r, --container Container URL.\n"
" -i, --input FILE Log file to be decoded.\n"
" --crash Crash on serious error.\n"
" --build_flags Print build information and exit.\n"
"\n";
std::cout
<< "Decoder for FoundationDB backup mutation logs.\n"
"Usage: fdbdecode [OPTIONS]\n"
" -r, --container URL\n"
" Backup container URL, e.g., file:///some/path/.\n"
" -i, --input FILE\n"
" Log file filter, only matched files are decoded.\n"
" --log Enables trace file logging for the CLI session.\n"
" --logdir PATH Specifes the output directory for trace files. If\n"
" unspecified, defaults to the current directory. Has\n"
" no effect unless --log is specified.\n"
" --loggroup LOG_GROUP\n"
" Sets the LogGroup field with the specified value for all\n"
" events in the trace output (defaults to `default').\n"
" --trace_format FORMAT\n"
" Select the format of the trace files, xml (the default) or json.\n"
" Has no effect unless --log is specified.\n"
" --crash Crash on serious error.\n"
" --blob_credentials FILE\n"
" File containing blob credentials in JSON format.\n"
" The same credential format/file fdbbackup uses.\n"
#ifndef TLS_DISABLED
TLS_HELP
#endif
" --build_flags Print build information and exit.\n"
"\n";
return;
}
void printBuildInformation() {
printf("%s", jsonBuildInformation().c_str());
std::cout << jsonBuildInformation() << "\n";
}
struct DecodeParams {
std::string container_url;
std::string file;
std::string fileFilter; // only files matching the filter will be decoded
bool log_enabled = false;
std::string log_dir, trace_format, trace_log_group;
BackupTLSConfig tlsConfig;
std::string toString() {
std::string s;
s.append("ContainerURL: ");
s.append(container_url);
s.append(", File: ");
s.append(file);
s.append(", FileFilter: ");
s.append(fileFilter);
if (log_enabled) {
if (!log_dir.empty()) {
s.append(" LogDir:").append(log_dir);
@ -76,6 +99,8 @@ struct DecodeParams {
}
return s;
}
};
int parseDecodeCommandLine(DecodeParams* param, CSimpleOpt* args) {
@ -93,7 +118,6 @@ int parseDecodeCommandLine(DecodeParams* param, CSimpleOpt* args) {
int optId = args->OptionId();
switch (optId) {
case OPT_HELP:
printDecodeUsage();
return FDB_EXIT_ERROR;
case OPT_CONTAINER:
@ -105,7 +129,7 @@ int parseDecodeCommandLine(DecodeParams* param, CSimpleOpt* args) {
break;
case OPT_INPUT_FILE:
param->file = args->OptionArg();
param->fileFilter = args->OptionArg();
break;
case OPT_TRACE:
@ -127,6 +151,37 @@ int parseDecodeCommandLine(DecodeParams* param, CSimpleOpt* args) {
case OPT_TRACE_LOG_GROUP:
param->trace_log_group = args->OptionArg();
break;
case OPT_BLOB_CREDENTIALS:
param->tlsConfig.blobCredentials.push_back(args->OptionArg());
break;
#ifndef TLS_DISABLED
case TLSConfig::OPT_TLS_PLUGIN:
args->OptionArg();
break;
case TLSConfig::OPT_TLS_CERTIFICATES:
param->tlsConfig.tlsCertPath = args->OptionArg();
break;
case TLSConfig::OPT_TLS_PASSWORD:
param->tlsConfig.tlsPassword = args->OptionArg();
break;
case TLSConfig::OPT_TLS_CA_FILE:
param->tlsConfig.tlsCAPath = args->OptionArg();
break;
case TLSConfig::OPT_TLS_KEY:
param->tlsConfig.tlsKeyPath = args->OptionArg();
break;
case TLSConfig::OPT_TLS_VERIFY_PEERS:
param->tlsConfig.tlsVerifyPeers = args->OptionArg();
break;
#endif
case OPT_BUILD_FLAGS:
printBuildInformation();
return FDB_EXIT_ERROR;
@ -147,7 +202,7 @@ void printLogFiles(std::string msg, const std::vector<LogFile>& files) {
std::vector<LogFile> getRelevantLogFiles(const std::vector<LogFile>& files, const DecodeParams& params) {
std::vector<LogFile> filtered;
for (const auto& file : files) {
if (file.fileName.find(params.file) != std::string::npos) {
if (file.fileName.find(params.fileFilter) != std::string::npos) {
filtered.push_back(file);
}
}
@ -515,6 +570,11 @@ int main(int argc, char** argv) {
}
}
if (!param.tlsConfig.setupTLS()) {
TraceEvent(SevError, "TLSError");
throw tls_error();
}
platformInit();
Error::init();
@ -523,13 +583,14 @@ int main(int argc, char** argv) {
TraceEvent::setNetworkThread();
openTraceFile(NetworkAddress(), 10 << 20, 10 << 20, param.log_dir, "decode", param.trace_log_group);
param.tlsConfig.setupBlobCredentials();
auto f = stopAfter(decode_logs(param));
runNetwork();
return status;
} catch (Error& e) {
fprintf(stderr, "ERROR: %s\n", e.what());
std::cerr << "ERROR: " << e.what() << "\n";
return FDB_EXIT_ERROR;
} catch (std::exception& e) {
TraceEvent(SevError, "MainError").error(unknown_error()).detail("RootException", e.what());

View File

@ -18,6 +18,7 @@
* limitations under the License.
*/
#include "fdbbackup/BackupTLSConfig.h"
#include "fdbclient/JsonBuilder.h"
#include "flow/Arena.h"
#include "flow/Error.h"
@ -105,6 +106,7 @@ enum {
// Backup constants
OPT_DESTCONTAINER,
OPT_SNAPSHOTINTERVAL,
OPT_INITIAL_SNAPSHOT_INTERVAL,
OPT_ERRORLIMIT,
OPT_NOSTOPWHENDONE,
OPT_EXPIRE_BEFORE_VERSION,
@ -144,6 +146,7 @@ enum {
OPT_RESTORE_CLUSTERFILE_DEST,
OPT_RESTORE_CLUSTERFILE_ORIG,
OPT_RESTORE_BEGIN_VERSION,
OPT_RESTORE_INCONSISTENT_SNAPSHOT_ONLY,
// Shared constants
OPT_CLUSTERFILE,
@ -232,6 +235,7 @@ CSimpleOpt::SOption g_rgBackupStartOptions[] = {
{ OPT_USE_PARTITIONED_LOG, "--partitioned_log_experimental", SO_NONE },
{ OPT_SNAPSHOTINTERVAL, "-s", SO_REQ_SEP },
{ OPT_SNAPSHOTINTERVAL, "--snapshot_interval", SO_REQ_SEP },
{ OPT_INITIAL_SNAPSHOT_INTERVAL, "--initial_snapshot_interval", SO_REQ_SEP },
{ OPT_TAGNAME, "-t", SO_REQ_SEP },
{ OPT_TAGNAME, "--tagname", SO_REQ_SEP },
{ OPT_BACKUPKEYS, "-k", SO_REQ_SEP },
@ -691,6 +695,7 @@ CSimpleOpt::SOption g_rgRestoreOptions[] = {
{ OPT_BLOB_CREDENTIALS, "--blob_credentials", SO_REQ_SEP },
{ OPT_INCREMENTALONLY, "--incremental", SO_NONE },
{ OPT_RESTORE_BEGIN_VERSION, "--begin_version", SO_REQ_SEP },
{ OPT_RESTORE_INCONSISTENT_SNAPSHOT_ONLY, "--inconsistent_snapshot_only", SO_NONE },
#ifndef TLS_DISABLED
TLS_OPTION_FLAGS
#endif
@ -1879,6 +1884,7 @@ ACTOR Future<Void> submitDBBackup(Database src,
ACTOR Future<Void> submitBackup(Database db,
std::string url,
int initialSnapshotIntervalSeconds,
int snapshotIntervalSeconds,
Standalone<VectorRef<KeyRangeRef>> backupRanges,
std::string tagName,
@ -1935,6 +1941,7 @@ ACTOR Future<Void> submitBackup(Database db,
else {
wait(backupAgent.submitBackup(db,
KeyRef(url),
initialSnapshotIntervalSeconds,
snapshotIntervalSeconds,
tagName,
backupRanges,
@ -2251,7 +2258,8 @@ ACTOR Future<Void> runRestore(Database db,
bool waitForDone,
std::string addPrefix,
std::string removePrefix,
bool incrementalBackupOnly) {
bool onlyAppyMutationLogs,
bool inconsistentSnapshotOnly) {
if (ranges.empty()) {
ranges.push_back_deep(ranges.arena(), normalKeys);
}
@ -2297,7 +2305,7 @@ ACTOR Future<Void> runRestore(Database db,
BackupDescription desc = wait(bc->describeBackup());
if (incrementalBackupOnly && desc.contiguousLogEnd.present()) {
if (onlyAppyMutationLogs && desc.contiguousLogEnd.present()) {
targetVersion = desc.contiguousLogEnd.get() - 1;
} else if (desc.maxRestorableVersion.present()) {
targetVersion = desc.maxRestorableVersion.get();
@ -2322,7 +2330,8 @@ ACTOR Future<Void> runRestore(Database db,
KeyRef(addPrefix),
KeyRef(removePrefix),
true,
incrementalBackupOnly,
onlyAppyMutationLogs,
inconsistentSnapshotOnly,
beginVersion));
if (waitForDone && verbose) {
@ -3212,6 +3221,8 @@ int main(int argc, char* argv[]) {
std::string destinationContainer;
bool describeDeep = false;
bool describeTimestamps = false;
int initialSnapshotIntervalSeconds =
0; // The initial snapshot has a desired duration of 0, meaning go as fast as possible.
int snapshotIntervalSeconds = CLIENT_KNOBS->BACKUP_DEFAULT_SNAPSHOT_INTERVAL_SEC;
std::string clusterFile;
std::string sourceClusterFile;
@ -3236,6 +3247,8 @@ int main(int argc, char* argv[]) {
bool stopWhenDone = true;
bool usePartitionedLog = false; // Set to true to use new backup system
bool incrementalBackupOnly = false;
bool onlyAppyMutationLogs = false;
bool inconsistentSnapshotOnly = false;
bool forceAction = false;
bool trace = false;
bool quietDisplay = false;
@ -3251,8 +3264,7 @@ int main(int argc, char* argv[]) {
LocalityData localities;
uint64_t memLimit = 8LL << 30;
Optional<uint64_t> ti;
std::vector<std::string> blobCredentials;
std::string tlsCertPath, tlsKeyPath, tlsCAPath, tlsPassword, tlsVerifyPeers;
BackupTLSConfig tlsConfig;
Version dumpBegin = 0;
Version dumpEnd = std::numeric_limits<Version>::max();
std::string restoreClusterFileDest;
@ -3467,6 +3479,7 @@ int main(int argc, char* argv[]) {
modifyOptions.destURL = destinationContainer;
break;
case OPT_SNAPSHOTINTERVAL:
case OPT_INITIAL_SNAPSHOT_INTERVAL:
case OPT_MOD_ACTIVE_INTERVAL: {
const char* a = args->OptionArg();
int seconds;
@ -3478,6 +3491,8 @@ int main(int argc, char* argv[]) {
if (optId == OPT_SNAPSHOTINTERVAL) {
snapshotIntervalSeconds = seconds;
modifyOptions.snapshotIntervalSeconds = seconds;
} else if (optId == OPT_INITIAL_SNAPSHOT_INTERVAL) {
initialSnapshotIntervalSeconds = seconds;
} else if (optId == OPT_MOD_ACTIVE_INTERVAL) {
modifyOptions.activeSnapshotIntervalSeconds = seconds;
}
@ -3497,6 +3512,7 @@ int main(int argc, char* argv[]) {
break;
case OPT_INCREMENTALONLY:
incrementalBackupOnly = true;
onlyAppyMutationLogs = true;
break;
case OPT_RESTORECONTAINER:
restoreContainer = args->OptionArg();
@ -3547,6 +3563,10 @@ int main(int argc, char* argv[]) {
restoreVersion = ver;
break;
}
case OPT_RESTORE_INCONSISTENT_SNAPSHOT_ONLY: {
inconsistentSnapshotOnly = true;
break;
}
#ifdef _WIN32
case OPT_PARENTPID: {
auto pid_str = args->OptionArg();
@ -3578,26 +3598,26 @@ int main(int argc, char* argv[]) {
memLimit = ti.get();
break;
case OPT_BLOB_CREDENTIALS:
blobCredentials.push_back(args->OptionArg());
tlsConfig.blobCredentials.push_back(args->OptionArg());
break;
#ifndef TLS_DISABLED
case TLSConfig::OPT_TLS_PLUGIN:
args->OptionArg();
break;
case TLSConfig::OPT_TLS_CERTIFICATES:
tlsCertPath = args->OptionArg();
tlsConfig.tlsCertPath = args->OptionArg();
break;
case TLSConfig::OPT_TLS_PASSWORD:
tlsPassword = args->OptionArg();
tlsConfig.tlsPassword = args->OptionArg();
break;
case TLSConfig::OPT_TLS_CA_FILE:
tlsCAPath = args->OptionArg();
tlsConfig.tlsCAPath = args->OptionArg();
break;
case TLSConfig::OPT_TLS_KEY:
tlsKeyPath = args->OptionArg();
tlsConfig.tlsKeyPath = args->OptionArg();
break;
case TLSConfig::OPT_TLS_VERIFY_PEERS:
tlsVerifyPeers = args->OptionArg();
tlsConfig.tlsVerifyPeers = args->OptionArg();
break;
#endif
case OPT_DUMP_BEGIN:
@ -3731,42 +3751,8 @@ int main(int argc, char* argv[]) {
setNetworkOption(FDBNetworkOptions::DISABLE_CLIENT_STATISTICS_LOGGING);
// deferred TLS options
if (tlsCertPath.size()) {
try {
setNetworkOption(FDBNetworkOptions::TLS_CERT_PATH, tlsCertPath);
} catch (Error& e) {
fprintf(stderr, "ERROR: cannot set TLS certificate path to `%s' (%s)\n", tlsCertPath.c_str(), e.what());
return 1;
}
}
if (tlsCAPath.size()) {
try {
setNetworkOption(FDBNetworkOptions::TLS_CA_PATH, tlsCAPath);
} catch (Error& e) {
fprintf(stderr, "ERROR: cannot set TLS CA path to `%s' (%s)\n", tlsCAPath.c_str(), e.what());
return 1;
}
}
if (tlsKeyPath.size()) {
try {
if (tlsPassword.size())
setNetworkOption(FDBNetworkOptions::TLS_PASSWORD, tlsPassword);
setNetworkOption(FDBNetworkOptions::TLS_KEY_PATH, tlsKeyPath);
} catch (Error& e) {
fprintf(stderr, "ERROR: cannot set TLS key path to `%s' (%s)\n", tlsKeyPath.c_str(), e.what());
return 1;
}
}
if (tlsVerifyPeers.size()) {
try {
setNetworkOption(FDBNetworkOptions::TLS_VERIFY_PEERS, tlsVerifyPeers);
} catch (Error& e) {
fprintf(
stderr, "ERROR: cannot set TLS peer verification to `%s' (%s)\n", tlsVerifyPeers.c_str(), e.what());
return 1;
}
if (!tlsConfig.setupTLS()) {
return 1;
}
Error::init();
@ -3806,25 +3792,8 @@ int main(int argc, char* argv[]) {
// are logged. This thread will eventually run the network, so call it now.
TraceEvent::setNetworkThread();
// Add blob credentials files from the environment to the list collected from the command line.
const char* blobCredsFromENV = getenv("FDB_BLOB_CREDENTIALS");
if (blobCredsFromENV != nullptr) {
StringRef t((uint8_t*)blobCredsFromENV, strlen(blobCredsFromENV));
do {
StringRef file = t.eat(":");
if (file.size() != 0)
blobCredentials.push_back(file.toString());
} while (t.size() != 0);
}
// Update the global blob credential files list
std::vector<std::string>* pFiles =
(std::vector<std::string>*)g_network->global(INetwork::enBlobCredentialFiles);
if (pFiles != nullptr) {
for (auto& f : blobCredentials) {
pFiles->push_back(f);
}
}
// Sets up blob credentials, including one from the environment FDB_BLOB_CREDENTIALS.
tlsConfig.setupBlobCredentials();
// Opens a trace file if trace is set (and if a trace file isn't already open)
// For most modes, initCluster() will open a trace file, but some fdbbackup operations do not require
@ -3888,6 +3857,7 @@ int main(int argc, char* argv[]) {
openBackupContainer(argv[0], destinationContainer);
f = stopAfter(submitBackup(db,
destinationContainer,
initialSnapshotIntervalSeconds,
snapshotIntervalSeconds,
backupKeys,
tagName,
@ -4064,7 +4034,8 @@ int main(int argc, char* argv[]) {
waitForDone,
addPrefix,
removePrefix,
incrementalBackupOnly));
onlyAppyMutationLogs,
inconsistentSnapshotOnly));
break;
case RestoreType::WAIT:
f = stopAfter(success(ba.waitRestore(db, KeyRef(tagName), true)));

View File

@ -24,6 +24,7 @@
#include "fdbclient/Status.h"
#include "fdbclient/StatusClient.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/GlobalConfig.actor.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/ClusterInterface.h"
@ -3841,25 +3842,16 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
is_error = true;
continue;
}
state Future<Optional<Standalone<StringRef>>> sampleRateFuture =
tr->get(fdbClientInfoTxnSampleRate);
state Future<Optional<Standalone<StringRef>>> sizeLimitFuture =
tr->get(fdbClientInfoTxnSizeLimit);
wait(makeInterruptable(success(sampleRateFuture) && success(sizeLimitFuture)));
const double sampleRateDbl = GlobalConfig::globalConfig().get<double>(
fdbClientInfoTxnSampleRate, std::numeric_limits<double>::infinity());
const int64_t sizeLimit =
GlobalConfig::globalConfig().get<int64_t>(fdbClientInfoTxnSizeLimit, -1);
std::string sampleRateStr = "default", sizeLimitStr = "default";
if (sampleRateFuture.get().present()) {
const double sampleRateDbl =
BinaryReader::fromStringRef<double>(sampleRateFuture.get().get(), Unversioned());
if (!std::isinf(sampleRateDbl)) {
sampleRateStr = boost::lexical_cast<std::string>(sampleRateDbl);
}
if (!std::isinf(sampleRateDbl)) {
sampleRateStr = boost::lexical_cast<std::string>(sampleRateDbl);
}
if (sizeLimitFuture.get().present()) {
const int64_t sizeLimit =
BinaryReader::fromStringRef<int64_t>(sizeLimitFuture.get().get(), Unversioned());
if (sizeLimit != -1) {
sizeLimitStr = boost::lexical_cast<std::string>(sizeLimit);
}
if (sizeLimit != -1) {
sizeLimitStr = boost::lexical_cast<std::string>(sizeLimit);
}
printf("Client profiling rate is set to %s and size limit is set to %s.\n",
sampleRateStr.c_str(),
@ -3897,8 +3889,12 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
continue;
}
}
tr->set(fdbClientInfoTxnSampleRate, BinaryWriter::toValue(sampleRate, Unversioned()));
tr->set(fdbClientInfoTxnSizeLimit, BinaryWriter::toValue(sizeLimit, Unversioned()));
Tuple rate = Tuple().appendDouble(sampleRate);
Tuple size = Tuple().append(sizeLimit);
tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSampleRate), rate.pack());
tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSizeLimit), size.pack());
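// Values under the global configuration keyspace are Tuple-encoded, and writes must go
// through the special key space, hence the SPECIAL_KEY_SPACE_ENABLE_WRITES option above.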
if (!intrans) {
wait(commitTransaction(tr));
}

View File

@ -0,0 +1,381 @@
/*
* ActorLineageProfiler.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "flow/flow.h"
#include "flow/singleton.h"
#include "fdbrpc/IAsyncFile.h"
#include "fdbclient/ActorLineageProfiler.h"
#include <msgpack.hpp>
#include <memory>
#include <boost/endian/conversion.hpp>
#include <boost/asio.hpp>
using namespace std::literals;
class Packer : public msgpack::packer<msgpack::sbuffer> {
struct visitor_t {
using VisitorMap = std::unordered_map<std::type_index, std::function<void(std::any const&, Packer& packer)>>;
VisitorMap visitorMap;
template <class T>
static void any_visitor(std::any const& val, Packer& packer) {
const T& v = std::any_cast<const T&>(val);
packer.pack(v);
}
template <class... Args>
struct populate_visitor_map;
template <class Head, class... Tail>
struct populate_visitor_map<Head, Tail...> {
static void populate(VisitorMap& map) {
map.emplace(std::type_index(typeid(Head)), any_visitor<Head>);
populate_visitor_map<Tail...>::populate(map);
}
};
template <class Head>
struct populate_visitor_map<Head> {
static void populate(VisitorMap&) {}
};
visitor_t() {
populate_visitor_map<int64_t,
uint64_t,
bool,
float,
double,
std::string,
std::string_view,
std::vector<std::any>,
std::map<std::string, std::any>,
std::map<std::string_view, std::any>,
std::vector<std::map<std::string_view, std::any>>>::populate(visitorMap);
}
void visit(const std::any& val, Packer& packer) {
auto iter = visitorMap.find(val.type());
if (iter == visitorMap.end()) {
TraceEvent(SevError, "PackerTypeNotFound").detail("Type", val.type().name());
} else {
iter->second(val, packer);
}
}
};
msgpack::sbuffer sbuffer;
// Initializing visitor_t involves building a type map. As this is a relatively expensive operation, we don't
// want to repeat it for every Packer object, so the visitor_t instance is shared through a singleton and each
// Packer merely consults it as a visitor.
crossbow::singleton<visitor_t> visitor;
public:
Packer() : msgpack::packer<msgpack::sbuffer>(sbuffer) {}
void pack(std::any const& val) { visitor->visit(val, *this); }
void pack(bool val) {
if (val) {
pack_true();
} else {
pack_false();
}
}
void pack(uint64_t val) {
if (val <= std::numeric_limits<uint8_t>::max()) {
pack_uint8(uint8_t(val));
} else if (val <= std::numeric_limits<uint16_t>::max()) {
pack_uint16(uint16_t(val));
} else if (val <= std::numeric_limits<uint32_t>::max()) {
pack_uint32(uint32_t(val));
} else {
pack_uint64(val);
}
}
void pack(int64_t val) {
if (val >= 0) {
this->pack(uint64_t(val));
} else if (val >= std::numeric_limits<int8_t>::min()) {
pack_int8(int8_t(val));
} else if (val >= std::numeric_limits<int16_t>::min()) {
pack_int16(int16_t(val));
} else if (val >= std::numeric_limits<int32_t>::min()) {
pack_int32(int32_t(val));
} else if (val >= std::numeric_limits<int64_t>::min()) {
pack_int64(int64_t(val));
}
}
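// Illustrative behavior of the two overloads above: pack() always chooses the narrowest
// msgpack encoding that fits, e.g. pack(uint64_t(300)) emits the uint16 representation and
// pack(int64_t(-5)) the int8 one, keeping the serialized samples compact.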
void pack(float val) { pack_float(val); }
void pack(double val) { pack_double(val); }
void pack(std::string const& str) {
pack_str(str.size());
pack_str_body(str.data(), str.size());
}
void pack(std::string_view val) {
pack_str(val.size());
pack_str_body(val.data(), val.size());
}
template <class K, class V>
void pack(std::map<K, V> const& map) {
pack_map(map.size());
for (const auto& p : map) {
pack(p.first);
pack(p.second);
}
}
template <class T>
void pack(std::vector<T> const& val) {
pack_array(val.size());
for (const auto& v : val) {
pack(v);
}
}
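// Note: getbuf() releases ownership of the underlying malloc'd buffer to the caller. In this
// profiler the returned pair is stored in Sample::data and freed in ~Sample via ::free(),
// which matches msgpack::sbuffer's malloc-based allocation.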
std::pair<char*, unsigned> getbuf() {
unsigned size = sbuffer.size();
return std::make_pair(sbuffer.release(), size);
}
};
IALPCollectorBase::IALPCollectorBase() {
SampleCollector::instance().addCollector(this);
}
std::map<std::string_view, std::any> SampleCollectorT::collect(ActorLineage* lineage) {
std::map<std::string_view, std::any> out;
for (auto& collector : collectors) {
auto val = collector->collect(lineage);
if (val.has_value()) {
out[collector->name()] = val.value();
}
}
return out;
}
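// The Sample returned below maps each WaitState to one msgpack-encoded array; every element of
// that array is the property map collected from a single ActorLineage by the registered collectors.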
std::shared_ptr<Sample> SampleCollectorT::collect() {
auto sample = std::make_shared<Sample>();
double time = g_network->now();
sample->time = time;
for (auto& p : getSamples) {
Packer packer;
std::vector<std::map<std::string_view, std::any>> samples;
auto sampleVec = p.second();
for (auto& val : sampleVec) {
auto m = collect(val.getPtr());
if (!m.empty()) {
samples.emplace_back(std::move(m));
}
}
if (!samples.empty()) {
packer.pack(samples);
sample->data[p.first] = packer.getbuf();
}
}
return sample;
}
void SampleCollection_t::refresh() {
auto sample = _collector->collect();
auto min = std::min(sample->time - windowSize, sample->time);
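// `min` is the oldest timestamp to keep: samples taken more than windowSize seconds before the
// newest sample are pruned below (the std::min simply caps the cutoff at the sample's own time).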
{
Lock _{ mutex };
data.emplace_back(std::move(sample));
}
double oldest = data.front()->time;
// we don't need to check for data.empty() in this loop (or the inner loop) as we know that we will end
// up with at least one entry which is the most recent sample
while (oldest < min) {
Lock _{ mutex };
// we remove at most 10 elements at a time. This is so we don't block the main thread for too long.
for (int i = 0; i < 10 && oldest < min; ++i) {
data.pop_front();
oldest = data.front()->time;
}
}
//config->ingest(sample);
}
std::vector<std::shared_ptr<Sample>> SampleCollection_t::get(double from /*= 0.0*/,
double to /*= std::numeric_limits<double>::max()*/) const {
Lock _{ mutex };
std::vector<std::shared_ptr<Sample>> res;
for (const auto& sample : data) {
if (sample->time > to) {
break;
} else if (sample->time >= from) {
res.push_back(sample);
}
}
return res;
}
struct ProfilerImpl {
boost::asio::io_context context;
boost::asio::executor_work_guard<decltype(context.get_executor())> workGuard;
boost::asio::steady_timer timer;
std::thread mainThread;
unsigned frequency;
SampleCollection collection;
ProfilerImpl() : workGuard(context.get_executor()), timer(context) {
mainThread = std::thread([this]() { context.run(); });
}
~ProfilerImpl() {
setFrequency(0);
workGuard.reset();
mainThread.join();
}
void profileHandler(boost::system::error_code const& ec) {
if (ec) {
return;
}
collection->refresh();
timer = boost::asio::steady_timer(context, std::chrono::microseconds(1000000 / frequency));
timer.async_wait([this](auto const& ec) { profileHandler(ec); });
}
void setFrequency(unsigned frequency) {
boost::asio::post(context, [this, frequency]() {
this->frequency = frequency;
timer.cancel();
if (frequency > 0) {
profileHandler(boost::system::error_code{});
}
});
}
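// Example: setFrequency(100) re-arms the timer roughly every 10ms (1000000 / 100 microseconds),
// while setFrequency(0) cancels the timer and stops sampling.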
};
ActorLineageProfilerT::ActorLineageProfilerT() : impl(new ProfilerImpl()) {
collection->collector()->addGetter(WaitState::Network,
std::bind(&ActorLineageSet::copy, std::ref(g_network->getActorLineageSet())));
collection->collector()->addGetter(
WaitState::Disk,
std::bind(&ActorLineageSet::copy, std::ref(IAsyncFileSystem::filesystem()->getActorLineageSet())));
collection->collector()->addGetter(WaitState::Running, []() {
auto res = currentLineageThreadSafe.get();
if (res.isValid()) {
return std::vector<Reference<ActorLineage>>({ res });
}
return std::vector<Reference<ActorLineage>>();
});
}
ActorLineageProfilerT::~ActorLineageProfilerT() {
delete impl;
}
void ActorLineageProfilerT::setFrequency(unsigned frequency) {
impl->setFrequency(frequency);
}
boost::asio::io_context& ActorLineageProfilerT::context() {
return impl->context;
}
SampleIngestor::~SampleIngestor() {}
// Callback used to update the sampling profiler's run frequency whenever the
// frequency changes.
void samplingProfilerUpdateFrequency(std::optional<std::any> freq) {
double frequency = 0;
if (freq.has_value()) {
frequency = std::any_cast<double>(freq.value());
}
TraceEvent(SevInfo, "SamplingProfilerUpdateFrequency").detail("Frequency", frequency);
ActorLineageProfiler::instance().setFrequency(frequency);
}
void ProfilerConfigT::reset(std::map<std::string, std::string> const& config) {
bool expectNoMore = false, useFluentD = false, useTCP = false;
std::string endpoint;
ConfigError err;
for (auto& kv : config) {
if (expectNoMore) {
err.description = format("Unexpected option %s", kv.first.c_str());
throw err;
}
if (kv.first == "collector") {
std::string val = kv.second;
std::transform(val.begin(), val.end(), val.begin(), [](auto c) { return std::tolower(c); });
if (val == "none") {
setBackend(std::make_shared<NoneIngestor>());
} else if (val == "fluentd") {
useFluentD = true;
} else {
err.description = format("Unsupported collector: %s", val.c_str());
throw err;
}
} else if (kv.first == "collector_endpoint") {
endpoint = kv.second;
} else if (kv.first == "collector_protocol") {
auto val = kv.second;
std::transform(val.begin(), val.end(), val.begin(), [](auto c) { return std::tolower(c); });
if (val == "tcp") {
useTCP = true;
} else if (val == "udp") {
useTCP = false;
} else {
err.description = format("Unsupported protocol for fluentd: %s", kv.second.c_str());
throw err;
}
} else {
err.description = format("Unknown option %s", kv.first.c_str());
throw err;
}
}
if (useFluentD) {
if (endpoint.empty()) {
err.description = "Endpoint is required for fluentd ingestor";
throw err;
}
NetworkAddress address;
try {
address = NetworkAddress::parse(endpoint);
} catch (Error& e) {
err.description = format("Can't parse address %s", endpoint.c_str());
throw err;
}
setBackend(std::make_shared<FluentDIngestor>(
useTCP ? FluentDIngestor::Protocol::TCP : FluentDIngestor::Protocol::UDP, address));
}
}
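// Illustrative configuration (hypothetical endpoint value) accepted by reset():
//   { {"collector", "fluentd"},
//     {"collector_endpoint", "127.0.0.1:8889"},
//     {"collector_protocol", "tcp"} }
// selects the FluentD ingestor over TCP, while { {"collector", "none"} } disables ingestion.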
std::map<std::string, std::string> ProfilerConfigT::getConfig() const {
std::map<std::string, std::string> res;
if (ingestor) {
ingestor->getConfig(res);
}
return res;
}
// Callback used to update the sample collector window size.
void samplingProfilerUpdateWindow(std::optional<std::any> window) {
double duration = 0;
if (window.has_value()) {
duration = std::any_cast<double>(window.value());
}
TraceEvent(SevInfo, "SamplingProfilerUpdateWindow").detail("Duration", duration);
SampleCollection::instance().setWindowSize(duration);
}

View File

@ -0,0 +1,192 @@
/*
* ActorLineageProfiler.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "fdbclient/AnnotateActor.h"
#include <optional>
#include <string>
#include <any>
#include <vector>
#include <mutex>
#include <condition_variable>
#include "flow/singleton.h"
#include "flow/flow.h"
void samplingProfilerUpdateFrequency(std::optional<std::any> freq);
void samplingProfilerUpdateWindow(std::optional<std::any> window);
struct IALPCollectorBase {
virtual std::optional<std::any> collect(ActorLineage*) = 0;
virtual const std::string_view& name() = 0;
IALPCollectorBase();
};
template <class T>
struct IALPCollector : IALPCollectorBase {
const std::string_view& name() override { return T::name; }
};
struct Sample : std::enable_shared_from_this<Sample> {
double time = 0.0;
Sample() {}
Sample(Sample const&) = delete;
Sample& operator=(Sample const&) = delete;
std::unordered_map<WaitState, std::pair<char*, unsigned>> data;
~Sample() {
std::for_each(data.begin(), data.end(), [](std::pair<WaitState, std::pair<char*, unsigned>> entry) {
::free(entry.second.first);
});
}
};
class SampleIngestor : std::enable_shared_from_this<SampleIngestor> {
public:
virtual ~SampleIngestor();
virtual void ingest(std::shared_ptr<Sample> const& sample) = 0;
virtual void getConfig(std::map<std::string, std::string>&) const = 0;
};
class NoneIngestor : public SampleIngestor {
public:
void ingest(std::shared_ptr<Sample> const& sample) override {}
void getConfig(std::map<std::string, std::string>& res) const override { res["ingestor"] = "none"; }
};
// The FluentD ingestor uses the pimpl idiom. This keeps compilation lightweight, as the
// implementation depends on boost::asio.
struct FluentDIngestorImpl;
class FluentDIngestor : public SampleIngestor {
public: // Public Types
enum class Protocol { TCP, UDP };
private: // members
FluentDIngestorImpl* impl;
public: // interface
void ingest(std::shared_ptr<Sample> const& sample) override;
FluentDIngestor(Protocol protocol, NetworkAddress& endpoint);
void getConfig(std::map<std::string, std::string>& res) const override;
~FluentDIngestor();
};
struct ConfigError {
std::string description;
};
class ProfilerConfigT {
private: // private types
using Lock = std::unique_lock<std::mutex>;
friend class crossbow::create_static<ProfilerConfigT>;
private: // members
std::shared_ptr<SampleIngestor> ingestor = std::make_shared<NoneIngestor>();
private: // construction
ProfilerConfigT() {}
ProfilerConfigT(ProfilerConfigT const&) = delete;
ProfilerConfigT& operator=(ProfilerConfigT const&) = delete;
void setBackend(std::shared_ptr<SampleIngestor> ingestor) { this->ingestor = ingestor; }
public:
void reset(std::map<std::string, std::string> const& config);
std::map<std::string, std::string> getConfig() const;
};
using ProfilerConfig = crossbow::singleton<ProfilerConfigT>;
class SampleCollectorT {
public: // Types
friend struct crossbow::create_static<SampleCollectorT>;
using Getter = std::function<std::vector<Reference<ActorLineage>>()>;
private:
std::vector<IALPCollectorBase*> collectors;
std::map<WaitState, Getter> getSamples;
SampleCollectorT() {}
std::map<std::string_view, std::any> collect(ActorLineage* lineage);
public:
void addCollector(IALPCollectorBase* collector) { collectors.push_back(collector); }
std::shared_ptr<Sample> collect();
void addGetter(WaitState waitState, Getter const& getter) { getSamples[waitState] = getter; };
};
using SampleCollector = crossbow::singleton<SampleCollectorT>;
class SampleCollection_t {
friend struct crossbow::create_static<SampleCollection_t>;
using Lock = std::unique_lock<std::mutex>;
SampleCollection_t() {}
SampleCollector _collector;
mutable std::mutex mutex;
std::atomic<double> windowSize = 0.0;
std::deque<std::shared_ptr<Sample>> data;
ProfilerConfig config;
public:
/**
* Defines how many samples the collection should keep. The window size is defined in the time dimension.
*
* \param duration How long a sample should be kept in the collection.
*/
void setWindowSize(double duration) { windowSize.store(duration); }
/**
* By default, returns reference-counted pointers to all samples. A window can be restricted in terms of absolute time.
*
* \param from The minimal age of all returned samples.
* \param to The max age of all returned samples.
*/
std::vector<std::shared_ptr<Sample>> get(double from = 0.0, double to = std::numeric_limits<double>::max()) const;
/**
* Collects all new samples from the sample collector and stores them in the collection.
*/
void refresh();
const SampleCollector& collector() const { return _collector; }
SampleCollector& collector() { return _collector; }
};
using SampleCollection = crossbow::singleton<SampleCollection_t>;
struct ProfilerImpl;
namespace boost {
namespace asio {
// forward declare io_context because including boost asio is super expensive
class io_context;
} // namespace asio
} // namespace boost
class ActorLineageProfilerT {
friend struct crossbow::create_static<ActorLineageProfilerT>;
ProfilerImpl* impl;
SampleCollection collection;
ActorLineageProfilerT();
public:
~ActorLineageProfilerT();
void setFrequency(unsigned frequency);
boost::asio::io_context& context();
};
using ActorLineageProfiler = crossbow::singleton<ActorLineageProfilerT>;

View File

@ -0,0 +1,23 @@
/*
* AnnotateActor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/AnnotateActor.h"
std::map<WaitState, std::function<std::vector<Reference<ActorLineage>>()>> samples;

fdbclient/AnnotateActor.h
View File

@ -0,0 +1,85 @@
/*
* AnnotateActor.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "flow/flow.h"
#include "flow/network.h"
#include <string_view>
// Used to manually instrument waiting actors to collect samples for the
// sampling profiler.
struct AnnotateActor {
unsigned index;
bool set;
AnnotateActor() : set(false) {}
AnnotateActor(Reference<ActorLineage> lineage) : set(true) {
index = g_network->getActorLineageSet().insert(lineage);
if (index == ActorLineageSet::npos) {
set = false;
}
}
AnnotateActor(const AnnotateActor& other) = delete;
AnnotateActor(AnnotateActor&& other) = delete;
AnnotateActor& operator=(const AnnotateActor& other) = delete;
AnnotateActor& operator=(AnnotateActor&& other) {
if (this == &other) {
return *this;
}
this->index = other.index;
this->set = other.set;
other.set = false;
return *this;
}
~AnnotateActor() {
if (set) {
g_network->getActorLineageSet().erase(index);
}
}
};
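// Sketch of intended use (hypothetical call site): keep an AnnotateActor alive across a blocking
// operation so the sampling profiler can observe the actor's lineage while it waits:
//   AnnotateActor annotation(lineage); // registers `lineage` in g_network's ActorLineageSet
//   wait(someAsyncOperation());        // profiler getters can copy the set and sample it here
//   // the slot is erased when `annotation` is destroyed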
enum class WaitState { Disk, Network, Running };
// Usually we shouldn't use `using namespace` in a header file, but literals are safe here: any
// user-defined literal must be prefixed with `_`, so the standard literals cannot collide.
using namespace std::literals;
constexpr std::string_view to_string(WaitState st) {
switch (st) {
case WaitState::Disk:
return "Disk"sv;
case WaitState::Network:
return "Network"sv;
case WaitState::Running:
return "Running"sv;
default:
return ""sv;
}
}
extern std::map<WaitState, std::function<std::vector<Reference<ActorLineage>>()>> samples;

View File

@ -294,7 +294,8 @@ public:
Key addPrefix = Key(),
Key removePrefix = Key(),
bool lockDB = true,
bool incrementalBackupOnly = false,
bool onlyAppyMutationLogs = false,
bool inconsistentSnapshotOnly = false,
Version beginVersion = -1);
Future<Version> restore(Database cx,
Optional<Database> cxOrig,
@ -307,7 +308,8 @@ public:
Key addPrefix = Key(),
Key removePrefix = Key(),
bool lockDB = true,
bool incrementalBackupOnly = false,
bool onlyAppyMutationLogs = false,
bool inconsistentSnapshotOnly = false,
Version beginVersion = -1) {
Standalone<VectorRef<KeyRangeRef>> rangeRef;
rangeRef.push_back_deep(rangeRef.arena(), range);
@ -322,7 +324,8 @@ public:
addPrefix,
removePrefix,
lockDB,
incrementalBackupOnly,
onlyAppyMutationLogs,
inconsistentSnapshotOnly,
beginVersion);
}
Future<Version> atomicRestore(Database cx,
@ -357,6 +360,7 @@ public:
Future<Void> submitBackup(Reference<ReadYourWritesTransaction> tr,
Key outContainer,
int initialSnapshotIntervalSeconds,
int snapshotIntervalSeconds,
std::string tagName,
Standalone<VectorRef<KeyRangeRef>> backupRanges,
@ -365,6 +369,7 @@ public:
bool incrementalBackupOnly = false);
Future<Void> submitBackup(Database cx,
Key outContainer,
int initialSnapshotIntervalSeconds,
int snapshotIntervalSeconds,
std::string tagName,
Standalone<VectorRef<KeyRangeRef>> backupRanges,
@ -374,6 +379,7 @@ public:
return runRYWTransactionFailIfLocked(cx, [=](Reference<ReadYourWritesTransaction> tr) {
return submitBackup(tr,
outContainer,
initialSnapshotIntervalSeconds,
snapshotIntervalSeconds,
tagName,
backupRanges,
@ -404,7 +410,8 @@ public:
Future<std::string> getStatus(Database cx, bool showErrors, std::string tagName);
Future<std::string> getStatusJSON(Database cx, std::string tagName);
Future<Optional<Version>> getLastRestorable(Reference<ReadYourWritesTransaction> tr, Key tagName,
Future<Optional<Version>> getLastRestorable(Reference<ReadYourWritesTransaction> tr,
Key tagName,
bool snapshot = false);
void setLastRestorable(Reference<ReadYourWritesTransaction> tr, Key tagName, Version version);
@ -488,6 +495,14 @@ public:
[=](Reference<ReadYourWritesTransaction> tr) { return unlockBackup(tr, tagName); });
}
// Specifies the action to take on the backup's destination key range
// before the backup begins.
enum PreBackupAction {
NONE = 0, // No action is taken
VERIFY = 1, // Verify the key range being restored to is empty.
CLEAR = 2 // Clear the key range being restored to.
};
Future<Void> submitBackup(Reference<ReadYourWritesTransaction> tr,
Key tagName,
Standalone<VectorRef<KeyRangeRef>> backupRanges,
@ -495,7 +510,7 @@ public:
Key addPrefix = StringRef(),
Key removePrefix = StringRef(),
bool lockDatabase = false,
bool databasesInSync = false);
PreBackupAction backupAction = PreBackupAction::VERIFY);
Future<Void> submitBackup(Database cx,
Key tagName,
Standalone<VectorRef<KeyRangeRef>> backupRanges,
@ -503,10 +518,10 @@ public:
Key addPrefix = StringRef(),
Key removePrefix = StringRef(),
bool lockDatabase = false,
bool databasesInSync = false) {
PreBackupAction backupAction = PreBackupAction::VERIFY) {
return runRYWTransaction(cx, [=](Reference<ReadYourWritesTransaction> tr) {
return submitBackup(
tr, tagName, backupRanges, stopWhenDone, addPrefix, removePrefix, lockDatabase, databasesInSync);
tr, tagName, backupRanges, stopWhenDone, addPrefix, removePrefix, lockDatabase, backupAction);
});
}
@ -835,6 +850,11 @@ public:
typedef KeyBackedMap<Key, bool> RangeDispatchMapT;
RangeDispatchMapT snapshotRangeDispatchMap() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); }
// Interval to use for the first (initial) snapshot.
KeyBackedProperty<int64_t> initialSnapshotIntervalSeconds() {
return configSpace.pack(LiteralStringRef(__FUNCTION__));
}
// Interval to use for determining the target end version for new snapshots
KeyBackedProperty<int64_t> snapshotIntervalSeconds() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); }
@ -864,8 +884,9 @@ public:
Future<Version> beginVersion = tr->getReadVersion();
Future<int64_t> defaultInterval = 0;
if (intervalSeconds < 0)
if (intervalSeconds < 0) {
defaultInterval = copy.snapshotIntervalSeconds().getOrThrow(tr);
}
// Make sure read version and possibly the snapshot interval value are ready, then clear/init the snapshot
// config members

View File

@ -743,6 +743,9 @@ ACTOR Future<Void> applyMutations(Database cx,
wait(coalesceKeyVersionCache(
uid, newEndVersion, keyVersion, commit, committedVersion, addActor, &commitLock));
beginVersion = newEndVersion;
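// BUGGIFY is only ever true in simulation; the injected delay below exercises races
// between consecutive apply batches.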
if (BUGGIFY) {
wait(delay(2.0));
}
}
} catch (Error& e) {
TraceEvent(e.code() == error_code_restore_missing_data ? SevWarnAlways : SevError, "ApplyMutationsError")

View File

@ -1,4 +1,7 @@
set(FDBCLIENT_SRCS
ActorLineageProfiler.h
ActorLineageProfiler.cpp
AnnotateActor.cpp
AsyncFileS3BlobStore.actor.cpp
AsyncFileS3BlobStore.actor.h
AsyncTaskThread.actor.cpp
@ -27,7 +30,11 @@ set(FDBCLIENT_SRCS
EventTypes.actor.h
FDBOptions.h
FDBTypes.h
FluentDSampleIngestor.cpp
FileBackupAgent.actor.cpp
GlobalConfig.h
GlobalConfig.actor.h
GlobalConfig.actor.cpp
GrvProxyInterface.h
HTTP.actor.cpp
IClientApi.h
@ -137,8 +144,7 @@ endif()
add_flow_target(STATIC_LIBRARY NAME fdbclient SRCS ${FDBCLIENT_SRCS} ADDL_SRCS ${options_srcs})
add_dependencies(fdbclient fdboptions)
target_link_libraries(fdbclient PUBLIC fdbrpc msgpack)
if(BUILD_AZURE_BACKUP)
target_link_libraries(fdbclient PUBLIC fdbrpc PRIVATE curl uuid azure-storage-lite)
else()
target_link_libraries(fdbclient PUBLIC fdbrpc)
target_link_libraries(fdbclient PRIVATE curl uuid azure-storage-lite)
endif()

View File

@ -31,6 +31,7 @@
#include "fdbclient/CommitTransaction.h"
#include "fdbserver/RatekeeperInterface.h"
#include "fdbclient/TagThrottle.h"
#include "fdbclient/GlobalConfig.h"
#include "fdbrpc/Stats.h"
#include "fdbrpc/TimedRequest.h"
@ -113,16 +114,10 @@ struct ClientDBInfo {
vector<CommitProxyInterface> commitProxies;
Optional<CommitProxyInterface>
firstCommitProxy; // not serialized, used for commitOnFirstProxy when the commit proxies vector has been shrunk
double clientTxnInfoSampleRate;
int64_t clientTxnInfoSizeLimit;
Optional<Value> forward;
double transactionTagSampleRate;
double transactionTagSampleCost;
vector<VersionHistory> history;
ClientDBInfo()
: clientTxnInfoSampleRate(std::numeric_limits<double>::infinity()), clientTxnInfoSizeLimit(-1),
transactionTagSampleRate(CLIENT_KNOBS->READ_TAG_SAMPLE_RATE),
transactionTagSampleCost(CLIENT_KNOBS->COMMIT_SAMPLE_COST) {}
ClientDBInfo() {}
bool operator==(ClientDBInfo const& r) const { return id == r.id; }
bool operator!=(ClientDBInfo const& r) const { return id != r.id; }
@ -132,15 +127,7 @@ struct ClientDBInfo {
if constexpr (!is_fb_function<Archive>) {
ASSERT(ar.protocolVersion().isValid());
}
serializer(ar,
grvProxies,
commitProxies,
id,
clientTxnInfoSampleRate,
clientTxnInfoSizeLimit,
forward,
transactionTagSampleRate,
transactionTagSampleCost);
serializer(ar, grvProxies, commitProxies, id, forward, history);
}
};

View File

@ -35,6 +35,7 @@ constexpr UID WLTOKEN_CLIENTLEADERREG_OPENDATABASE(-1, 3);
constexpr UID WLTOKEN_PROTOCOL_INFO(-1, 10);
// The coordinator interface as exposed to clients
struct ClientLeaderRegInterface {
RequestStream<struct GetLeaderRequest> getLeader;
RequestStream<struct OpenDatabaseCoordRequest> openDatabase;
@ -42,6 +43,10 @@ struct ClientLeaderRegInterface {
ClientLeaderRegInterface() {}
ClientLeaderRegInterface(NetworkAddress remote);
ClientLeaderRegInterface(INetwork* local);
bool operator==(const ClientLeaderRegInterface& rhs) const {
return getLeader == rhs.getLeader && openDatabase == rhs.openDatabase;
}
};
class ClusterConnectionString {
@ -107,8 +112,9 @@ private:
struct LeaderInfo {
constexpr static FileIdentifier file_identifier = 8338794;
// The first 7 bits of changeID represent cluster controller process class fitness, the lower the better
UID changeID;
static const uint64_t mask = ~(127ll << 57);
static const uint64_t changeIDMask = ~(uint64_t(0b1111111) << 57);
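// Bit layout of changeID.first() (see updateChangeID below): bits 57-59 hold the process class
// fitness, bit 60 the excluded flag, and bits 61-63 the DC fitness. changeIDMask clears those
// top seven bits, leaving only the per-process portion of the id.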
Value serializedInfo;
bool forward; // If true, serializedInfo is a connection string instead!
@ -125,13 +131,13 @@ struct LeaderInfo {
// The first 7 bits of ChangeID represent cluster controller process class fitness, the lower the better
void updateChangeID(ClusterControllerPriorityInfo info) {
changeID = UID(((uint64_t)info.processClassFitness << 57) | ((uint64_t)info.isExcluded << 60) |
((uint64_t)info.dcFitness << 61) | (changeID.first() & mask),
((uint64_t)info.dcFitness << 61) | (changeID.first() & changeIDMask),
changeID.second());
}
// All but the first 7 bits are used to represent process id
bool equalInternalId(LeaderInfo const& leaderInfo) const {
return ((changeID.first() & mask) == (leaderInfo.changeID.first() & mask)) &&
return ((changeID.first() & changeIDMask) == (leaderInfo.changeID.first() & changeIDMask)) &&
changeID.second() == leaderInfo.changeID.second();
}
@ -139,8 +145,10 @@ struct LeaderInfo {
// 1. the candidate has better process class fitness and the candidate is not the leader
// 2. the leader process class fitness becomes worse
bool leaderChangeRequired(LeaderInfo const& candidate) const {
return ((changeID.first() & ~mask) > (candidate.changeID.first() & ~mask) && !equalInternalId(candidate)) ||
((changeID.first() & ~mask) < (candidate.changeID.first() & ~mask) && equalInternalId(candidate));
return ((changeID.first() & ~changeIDMask) > (candidate.changeID.first() & ~changeIDMask) &&
!equalInternalId(candidate)) ||
((changeID.first() & ~changeIDMask) < (candidate.changeID.first() & ~changeIDMask) &&
equalInternalId(candidate));
}
ClusterControllerPriorityInfo getPriorityInfo() const {

View File

@ -1072,7 +1072,7 @@ struct CopyLogsTaskFunc : TaskFuncBase {
wait(waitForAll(addTaskVector) && taskBucket->finish(tr, task));
} else {
if (appliedVersion <= stopVersionData) {
if (appliedVersion < applyVersion) {
wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
wait(success(CopyLogsTaskFunc::addTask(
tr, taskBucket, task, prevBeginVersion, beginVersion, TaskCompletionKey::signal(onDone))));
@ -2243,17 +2243,18 @@ struct StartFullBackupTaskFunc : TaskFuncBase {
return Void();
}
ACTOR static Future<Key> addTask(Reference<ReadYourWritesTransaction> tr,
Reference<TaskBucket> taskBucket,
Key logUid,
Key backupUid,
Key keyAddPrefix,
Key keyRemovePrefix,
Key keyConfigBackupRanges,
Key tagName,
TaskCompletionKey completionKey,
Reference<TaskFuture> waitFor = Reference<TaskFuture>(),
bool databasesInSync = false) {
ACTOR static Future<Key> addTask(
Reference<ReadYourWritesTransaction> tr,
Reference<TaskBucket> taskBucket,
Key logUid,
Key backupUid,
Key keyAddPrefix,
Key keyRemovePrefix,
Key keyConfigBackupRanges,
Key tagName,
TaskCompletionKey completionKey,
Reference<TaskFuture> waitFor = Reference<TaskFuture>(),
DatabaseBackupAgent::PreBackupAction backupAction = DatabaseBackupAgent::PreBackupAction::VERIFY) {
Key doneKey = wait(completionKey.get(tr, taskBucket));
auto task = makeReference<Task>(StartFullBackupTaskFunc::name, StartFullBackupTaskFunc::version, doneKey);
@ -2264,7 +2265,7 @@ struct StartFullBackupTaskFunc : TaskFuncBase {
task->params[BackupAgentBase::keyConfigBackupRanges] = keyConfigBackupRanges;
task->params[BackupAgentBase::keyTagName] = tagName;
task->params[DatabaseBackupAgent::keyDatabasesInSync] =
databasesInSync ? LiteralStringRef("t") : LiteralStringRef("f");
backupAction == DatabaseBackupAgent::PreBackupAction::NONE ? LiteralStringRef("t") : LiteralStringRef("f");
if (!waitFor) {
return taskBucket->addTask(tr,
@ -2514,7 +2515,7 @@ public:
Key addPrefix,
Key removePrefix,
bool lockDB,
bool databasesInSync) {
DatabaseBackupAgent::PreBackupAction backupAction) {
state UID logUid = deterministicRandom()->randomUniqueID();
state Key logUidValue = BinaryWriter::toValue(logUid, Unversioned());
state UID logUidCurrent = wait(backupAgent->getLogUid(tr, tagName));
@ -2558,7 +2559,7 @@ public:
}
}
if (!databasesInSync) {
if (backupAction == DatabaseBackupAgent::PreBackupAction::VERIFY) {
// Make sure all of the ranges are empty before we backup into them.
state std::vector<Future<Standalone<RangeResultRef>>> backupIntoResults;
for (auto& backupRange : backupRanges) {
@ -2572,6 +2573,11 @@ public:
throw restore_destination_not_empty();
}
}
} else if (backupAction == DatabaseBackupAgent::PreBackupAction::CLEAR) {
// Clear out all ranges before we backup into them.
for (auto& backupRange : backupRanges) {
tr->clear(backupRange.removePrefix(removePrefix).withPrefix(addPrefix));
}
}
// Clear the backup ranges for the tag
@ -2610,7 +2616,7 @@ public:
tr->clear(KeyRangeRef(mapPrefix, mapEnd));
state Version readVersion = invalidVersion;
if (databasesInSync) {
if (backupAction == DatabaseBackupAgent::PreBackupAction::NONE) {
Transaction readTransaction(backupAgent->taskBucket->src);
readTransaction.setOption(FDBTransactionOptions::LOCK_AWARE);
Version _ = wait(readTransaction.getReadVersion());
@ -2629,7 +2635,7 @@ public:
tagName,
TaskCompletionKey::noSignal(),
Reference<TaskFuture>(),
databasesInSync));
backupAction));
if (lockDB)
wait(lockDatabase(tr, logUid));
@ -2772,8 +2778,14 @@ public:
TraceEvent("DBA_SwitchoverVersionUpgraded");
try {
wait(drAgent.submitBackup(
backupAgent->taskBucket->src, tagName, backupRanges, false, addPrefix, removePrefix, true, true));
wait(drAgent.submitBackup(backupAgent->taskBucket->src,
tagName,
backupRanges,
false,
addPrefix,
removePrefix,
true,
DatabaseBackupAgent::PreBackupAction::NONE));
} catch (Error& e) {
if (e.code() != error_code_backup_duplicate)
throw;
@ -3236,9 +3248,9 @@ Future<Void> DatabaseBackupAgent::submitBackup(Reference<ReadYourWritesTransacti
Key addPrefix,
Key removePrefix,
bool lockDatabase,
bool databasesInSync) {
PreBackupAction backupAction) {
return DatabaseBackupAgentImpl::submitBackup(
this, tr, tagName, backupRanges, stopWhenDone, addPrefix, removePrefix, lockDatabase, databasesInSync);
this, tr, tagName, backupRanges, stopWhenDone, addPrefix, removePrefix, lockDatabase, backupAction);
}
Future<Void> DatabaseBackupAgent::discontinueBackup(Reference<ReadYourWritesTransaction> tr, Key tagName) {

View File

@ -152,6 +152,7 @@ public:
return (DatabaseContext*)DatabaseContext::operator new(sizeof(DatabaseContext));
}
// Static constructor used by server processes to create a DatabaseContext
// For internal (fdbserver) use only
static Database create(Reference<AsyncVar<ClientDBInfo>> clientInfo,
Future<Void> clientInfoMonitor,
@ -164,9 +165,11 @@ public:
~DatabaseContext();
// Constructs a new copy of this DatabaseContext from the parameters of this DatabaseContext
Database clone() const {
return Database(new DatabaseContext(connectionFile,
clientInfo,
coordinator,
clientInfoMonitor,
taskID,
clientLocality,
@ -196,6 +199,11 @@ public:
Future<Void> onProxiesChanged();
Future<HealthMetrics> getHealthMetrics(bool detailed);
// Returns the protocol version reported by the coordinator this client is connected to
// If an expected version is given, the future won't return until the protocol version is different than expected
// Note: this will never return if the server is running a protocol from FDB 5.0 or older
Future<ProtocolVersion> getClusterProtocol(Optional<ProtocolVersion> expectedVersion = Optional<ProtocolVersion>());
// Update the watch counter for the database
void addWatch();
void removeWatch();
@ -247,6 +255,7 @@ public:
// private:
explicit DatabaseContext(Reference<AsyncVar<Reference<ClusterConnectionFile>>> connectionFile,
Reference<AsyncVar<ClientDBInfo>> clientDBInfo,
Reference<AsyncVar<Optional<ClientLeaderRegInterface>>> coordinator,
Future<Void> clientInfoMonitor,
TaskPriority taskID,
LocalityData const& clientLocality,
@ -380,6 +389,9 @@ public:
Future<Void> clientInfoMonitor;
Future<Void> connected;
// An AsyncVar that reports the coordinator this DatabaseContext is interacting with
Reference<AsyncVar<Optional<ClientLeaderRegInterface>>> coordinator;
Reference<AsyncVar<Optional<ClusterInterface>>> statusClusterInterface;
Future<Void> statusLeaderMon;
double lastStatusFetch;

View File

@ -866,22 +866,36 @@ struct TLogSpillType {
// Contains the amount of free and total space for a storage server, in bytes
struct StorageBytes {
// Free space on the filesystem
int64_t free;
// Total space on the filesystem
int64_t total;
int64_t used; // Used by *this* store, not total-free
int64_t available; // Amount of disk space that can be used by data structure, including free disk space and
// internally reusable space
// Used by *this* store, not total - free
int64_t used;
// Amount of space available for use by the store, which includes free space on the filesystem
// and internal free space within the store data that is immediately reusable.
int64_t available;
// Amount of space that could eventually be available for use after garbage collection
int64_t temp;
StorageBytes() {}
StorageBytes(int64_t free, int64_t total, int64_t used, int64_t available)
: free(free), total(total), used(used), available(available) {}
StorageBytes(int64_t free, int64_t total, int64_t used, int64_t available, int64_t temp = 0)
: free(free), total(total), used(used), available(available), temp(temp) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, free, total, used, available);
}
};
std::string toString() const {
return format("{%.2f MB total, %.2f MB free, %.2f MB available, %.2f MB used, %.2f MB temp}",
total / 1e6,
free / 1e6,
available / 1e6,
used / 1e6,
temp / 1e6);
}
};
struct LogMessageVersion {
// Each message pushed into the log system has a unique, totally ordered LogMessageVersion
// See ILogSystem::push() for how these are assigned

View File

@ -141,7 +141,8 @@ public:
}
KeyBackedProperty<Key> addPrefix() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); }
KeyBackedProperty<Key> removePrefix() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); }
KeyBackedProperty<bool> incrementalBackupOnly() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); }
KeyBackedProperty<bool> onlyAppyMutationLogs() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); }
KeyBackedProperty<bool> inconsistentSnapshotOnly() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); }
// XXX: Remove restoreRange() once it is safe to remove. It has been changed to restoreRanges
KeyBackedProperty<KeyRange> restoreRange() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); }
KeyBackedProperty<std::vector<KeyRange>> restoreRanges() {
@ -150,6 +151,7 @@ public:
KeyBackedProperty<Key> batchFuture() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); }
KeyBackedProperty<Version> beginVersion() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); }
KeyBackedProperty<Version> restoreVersion() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); }
KeyBackedProperty<Version> firstConsistentVersion() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); }
KeyBackedProperty<Reference<IBackupContainer>> sourceContainer() {
return configSpace.pack(LiteralStringRef(__FUNCTION__));
@ -303,6 +305,13 @@ public:
tr->set(uidPrefixKey(applyMutationsBeginRange.begin, uid), BinaryWriter::toValue(ver, Unversioned()));
}
Future<Version> getApplyBeginVersion(Reference<ReadYourWritesTransaction> tr) {
return map(tr->get(uidPrefixKey(applyMutationsBeginRange.begin, uid)),
[=](Optional<Value> const& value) -> Version {
return value.present() ? BinaryReader::fromStringRef<Version>(value.get(), Unversioned()) : 0;
});
}
void setApplyEndVersion(Reference<ReadYourWritesTransaction> tr, Version ver) {
tr->set(uidPrefixKey(applyMutationsEndRange.begin, uid), BinaryWriter::toValue(ver, Unversioned()));
}
@ -314,6 +323,21 @@ public:
});
}
ACTOR static Future<Version> getCurrentVersion_impl(RestoreConfig* self, Reference<ReadYourWritesTransaction> tr) {
state ERestoreState status = wait(self->stateEnum().getD(tr));
state Version version = -1;
if (status == ERestoreState::RUNNING) {
wait(store(version, self->getApplyBeginVersion(tr)));
} else if (status == ERestoreState::COMPLETED) {
wait(store(version, self->restoreVersion().getD(tr)));
}
return version;
}
Future<Version> getCurrentVersion(Reference<ReadYourWritesTransaction> tr) {
return getCurrentVersion_impl(this, tr);
}
ACTOR static Future<std::string> getProgress_impl(RestoreConfig restore, Reference<ReadYourWritesTransaction> tr);
Future<std::string> getProgress(Reference<ReadYourWritesTransaction> tr) { return getProgress_impl(*this, tr); }
@ -334,15 +358,17 @@ ACTOR Future<std::string> RestoreConfig::getProgress_impl(RestoreConfig restore,
state Future<int64_t> fileBlocksFinished = restore.fileBlocksFinished().getD(tr);
state Future<int64_t> bytesWritten = restore.bytesWritten().getD(tr);
state Future<StringRef> status = restore.stateText(tr);
state Future<Version> currentVersion = restore.getCurrentVersion(tr);
state Future<Version> lag = restore.getApplyVersionLag(tr);
state Future<Version> firstConsistentVersion = restore.firstConsistentVersion().getD(tr);
state Future<std::string> tag = restore.tag().getD(tr);
state Future<std::pair<std::string, Version>> lastError = restore.lastError().getD(tr);
// restore might no longer be valid after the first wait so make sure it is not needed anymore.
state UID uid = restore.getUid();
wait(success(fileCount) && success(fileBlockCount) && success(fileBlocksDispatched) &&
success(fileBlocksFinished) && success(bytesWritten) && success(status) && success(lag) && success(tag) &&
success(lastError));
success(fileBlocksFinished) && success(bytesWritten) && success(status) && success(currentVersion) &&
success(lag) && success(firstConsistentVersion) && success(tag) && success(lastError));
std::string errstr = "None";
if (lastError.get().second != 0)
@ -359,11 +385,13 @@ ACTOR Future<std::string> RestoreConfig::getProgress_impl(RestoreConfig restore,
.detail("FileBlocksTotal", fileBlockCount.get())
.detail("FileBlocksInProgress", fileBlocksDispatched.get() - fileBlocksFinished.get())
.detail("BytesWritten", bytesWritten.get())
.detail("CurrentVersion", currentVersion.get())
.detail("FirstConsistentVersion", firstConsistentVersion.get())
.detail("ApplyLag", lag.get())
.detail("TaskInstance", THIS_ADDR);
return format("Tag: %s UID: %s State: %s Blocks: %lld/%lld BlocksInProgress: %lld Files: %lld BytesWritten: "
"%lld ApplyVersionLag: %lld LastError: %s",
"%lld CurrentVersion: %lld FirstConsistentVersion: %lld ApplyVersionLag: %lld LastError: %s",
tag.get().c_str(),
uid.toString().c_str(),
status.get().toString().c_str(),
@ -372,6 +400,8 @@ ACTOR Future<std::string> RestoreConfig::getProgress_impl(RestoreConfig restore,
fileBlocksDispatched.get() - fileBlocksFinished.get(),
fileCount.get(),
bytesWritten.get(),
currentVersion.get(),
firstConsistentVersion.get(),
lag.get(),
errstr.c_str());
}
@ -2059,7 +2089,7 @@ struct BackupLogRangeTaskFunc : BackupTaskFuncBase {
.detail("Size", outFile->size())
.detail("BeginVersion", beginVersion)
.detail("EndVersion", endVersion)
.detail("LastReadVersion", latestVersion);
.detail("LastReadVersion", lastVersion);
Params.fileSize().set(task, outFile->size());
@ -2777,9 +2807,9 @@ struct StartFullBackupTaskFunc : BackupTaskFuncBase {
state Reference<TaskFuture> backupFinished = futureBucket->future(tr);
// Initialize the initial snapshot and create tasks to continually write logs and snapshots
// The initial snapshot has a desired duration of 0, meaning go as fast as possible.
wait(config.initNewSnapshot(tr, 0));
// Initialize the initial snapshot and create tasks to continually write logs and snapshots.
state Optional<int64_t> initialSnapshotIntervalSeconds = wait(config.initialSnapshotIntervalSeconds().get(tr));
wait(config.initNewSnapshot(tr, initialSnapshotIntervalSeconds.orDefault(0)));
// Using priority 1 for both of these to at least start both tasks soon
// Do not add snapshot task if we only want the incremental backup
@ -3544,9 +3574,9 @@ struct RestoreDispatchTaskFunc : RestoreTaskFuncBase {
state int64_t remainingInBatch = Params.remainingInBatch().get(task);
state bool addingToExistingBatch = remainingInBatch > 0;
state Version restoreVersion;
state Future<Optional<bool>> incrementalBackupOnly = restore.incrementalBackupOnly().get(tr);
state Future<Optional<bool>> onlyAppyMutationLogs = restore.onlyAppyMutationLogs().get(tr);
wait(store(restoreVersion, restore.restoreVersion().getOrThrow(tr)) && success(incrementalBackupOnly) &&
wait(store(restoreVersion, restore.restoreVersion().getOrThrow(tr)) && success(onlyAppyMutationLogs) &&
checkTaskVersion(tr->getDatabase(), task, name, version));
// If not adding to an existing batch then update the apply mutations end version so the mutations from the
@ -4013,6 +4043,8 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase {
state Version beginVersion;
state Reference<IBackupContainer> bc;
state std::vector<KeyRange> ranges;
state bool logsOnly;
state bool inconsistentSnapshotOnly;
loop {
try {
@ -4020,11 +4052,12 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase {
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
wait(checkTaskVersion(tr->getDatabase(), task, name, version));
Optional<Version> _beginVersion = wait(restore.beginVersion().get(tr));
beginVersion = _beginVersion.present() ? _beginVersion.get() : invalidVersion;
wait(store(beginVersion, restore.beginVersion().getD(tr, false, invalidVersion)));
wait(store(restoreVersion, restore.restoreVersion().getOrThrow(tr)));
wait(store(ranges, restore.getRestoreRangesOrDefault(tr)));
wait(store(logsOnly, restore.onlyAppyMutationLogs().getD(tr, false, false)));
wait(store(inconsistentSnapshotOnly, restore.inconsistentSnapshotOnly().getD(tr, false, false)));
wait(taskBucket->keepRunning(tr, task));
@ -4071,8 +4104,7 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase {
}
}
Optional<bool> _incremental = wait(restore.incrementalBackupOnly().get(tr));
state bool incremental = _incremental.present() ? _incremental.get() : false;
state Version firstConsistentVersion = invalidVersion;
if (beginVersion == invalidVersion) {
beginVersion = 0;
}
@ -4080,28 +4112,59 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase {
for (auto const& r : ranges) {
keyRangesFilter.push_back_deep(keyRangesFilter.arena(), KeyRangeRef(r));
}
Optional<RestorableFileSet> restorable =
wait(bc->getRestoreSet(restoreVersion, keyRangesFilter, incremental, beginVersion));
if (!incremental) {
beginVersion = restorable.get().snapshot.beginVersion;
}
state Optional<RestorableFileSet> restorable =
wait(bc->getRestoreSet(restoreVersion, keyRangesFilter, logsOnly, beginVersion));
if (!restorable.present())
throw restore_missing_data();
// First version for which log data should be applied
Params.firstVersion().set(task, beginVersion);
// Convert the two lists in restorable (logs and ranges) to a single list of RestoreFiles.
// Order does not matter, they will be put in order when written to the restoreFileMap below.
state std::vector<RestoreConfig::RestoreFile> files;
for (const RangeFile& f : restorable.get().ranges) {
files.push_back({ f.version, f.fileName, true, f.blockSize, f.fileSize });
if (!logsOnly) {
beginVersion = restorable.get().snapshot.beginVersion;
if (!inconsistentSnapshotOnly) {
for (const RangeFile& f : restorable.get().ranges) {
files.push_back({ f.version, f.fileName, true, f.blockSize, f.fileSize });
// In a restore with both snapshots and logs, the firstConsistentVersion is the highest version of
// any range file.
firstConsistentVersion = std::max(firstConsistentVersion, f.version);
}
} else {
for (int i = 0; i < restorable.get().ranges.size(); ++i) {
const RangeFile& f = restorable.get().ranges[i];
files.push_back({ f.version, f.fileName, true, f.blockSize, f.fileSize });
// In inconsistentSnapshotOnly mode, if all range files have the same version, then it is the
// firstConsistentVersion, otherwise unknown (use -1).
if (i != 0 && f.version != firstConsistentVersion) {
firstConsistentVersion = invalidVersion;
} else {
firstConsistentVersion = f.version;
}
}
}
} else {
// In logs-only (incremental) mode, the firstConsistentVersion should just be restore.beginVersion().
firstConsistentVersion = beginVersion;
}
if (!inconsistentSnapshotOnly) {
for (const LogFile& f : restorable.get().logs) {
files.push_back({ f.beginVersion, f.fileName, false, f.blockSize, f.fileSize, f.endVersion });
}
}
// First version for which log data should be applied
Params.firstVersion().set(task, beginVersion);
for (const LogFile& f : restorable.get().logs) {
files.push_back({ f.beginVersion, f.fileName, false, f.blockSize, f.fileSize, f.endVersion });
tr->reset();
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
restore.firstConsistentVersion().set(tr, firstConsistentVersion);
wait(tr->commit());
break;
} catch (Error& e) {
wait(tr->onError(e));
}
}
state std::vector<RestoreConfig::RestoreFile>::iterator start = files.begin();
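// The three firstConsistentVersion cases above reduce to the following decision
// (hypothetical standalone helper with simplified types, shown only for clarity):
//
//     Version firstConsistentVersionFor(const std::vector<Version>& rangeFileVersions,
//                                       bool logsOnly, bool inconsistentSnapshotOnly,
//                                       Version beginVersion) {
//         if (logsOnly)
//             return beginVersion; // consistent from the start of the applied logs
//         Version result = invalidVersion;
//         if (!inconsistentSnapshotOnly) {
//             for (Version v : rangeFileVersions) // highest range-file version wins
//                 result = std::max(result, v);
//         } else {
//             for (int i = 0; i < (int)rangeFileVersions.size(); ++i) {
//                 if (i != 0 && rangeFileVersions[i] != result)
//                     return invalidVersion; // mixed versions: first consistent version unknown
//                 result = rangeFileVersions[i];
//             }
//         }
//         return result;
//     }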
@ -4176,7 +4239,7 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase {
tr, taskBucket, task, 0, "", 0, CLIENT_KNOBS->RESTORE_DISPATCH_BATCH_SIZE)));
wait(taskBucket->finish(tr, task));
state Future<Optional<bool>> logsOnly = restore.incrementalBackupOnly().get(tr);
state Future<Optional<bool>> logsOnly = restore.onlyAppyMutationLogs().get(tr);
wait(success(logsOnly));
if (logsOnly.get().present() && logsOnly.get().get()) {
// If this is an incremental restore, we need to set the applyMutationsMapPrefix
@ -4440,6 +4503,7 @@ public:
ACTOR static Future<Void> submitBackup(FileBackupAgent* backupAgent,
Reference<ReadYourWritesTransaction> tr,
Key outContainer,
int initialSnapshotIntervalSeconds,
int snapshotIntervalSeconds,
std::string tagName,
Standalone<VectorRef<KeyRangeRef>> backupRanges,
@ -4555,6 +4619,7 @@ public:
config.backupContainer().set(tr, bc);
config.stopWhenDone().set(tr, stopWhenDone);
config.backupRanges().set(tr, normalizedRanges);
config.initialSnapshotIntervalSeconds().set(tr, initialSnapshotIntervalSeconds);
config.snapshotIntervalSeconds().set(tr, snapshotIntervalSeconds);
config.partitionedLogEnabled().set(tr, partitionedLog);
config.incrementalBackupOnly().set(tr, incrementalBackupOnly);
@ -4574,7 +4639,8 @@ public:
Key addPrefix,
Key removePrefix,
bool lockDB,
bool incrementalBackupOnly,
bool onlyAppyMutationLogs,
bool inconsistentSnapshotOnly,
Version beginVersion,
UID uid) {
KeyRangeMap<int> restoreRangeSet;
@ -4588,8 +4654,9 @@ public:
restoreRanges.push_back(KeyRange(KeyRangeRef(restoreRange.range().begin, restoreRange.range().end)));
}
}
for (auto& restoreRange : restoreRanges)
ASSERT(restoreRange.contains(removePrefix) || removePrefix.size() == 0);
for (auto& restoreRange : restoreRanges) {
ASSERT(restoreRange.begin.startsWith(removePrefix) && restoreRange.end.startsWith(removePrefix));
}
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
@ -4625,7 +4692,7 @@ public:
.removePrefix(removePrefix)
.withPrefix(addPrefix);
Standalone<RangeResultRef> existingRows = wait(tr->getRange(restoreIntoRange, 1));
if (existingRows.size() > 0 && !incrementalBackupOnly) {
if (existingRows.size() > 0 && !onlyAppyMutationLogs) {
throw restore_destination_not_empty();
}
}
@ -4642,7 +4709,8 @@ public:
restore.sourceContainer().set(tr, bc);
restore.stateEnum().set(tr, ERestoreState::QUEUED);
restore.restoreVersion().set(tr, restoreVersion);
restore.incrementalBackupOnly().set(tr, incrementalBackupOnly);
restore.onlyAppyMutationLogs().set(tr, onlyAppyMutationLogs);
restore.inconsistentSnapshotOnly().set(tr, inconsistentSnapshotOnly);
restore.beginVersion().set(tr, beginVersion);
if (BUGGIFY && restoreRanges.size() == 1) {
restore.restoreRange().set(tr, restoreRanges[0]);
@ -5181,7 +5249,8 @@ public:
}
ACTOR static Future<Optional<Version>> getLastRestorable(FileBackupAgent* backupAgent,
Reference<ReadYourWritesTransaction> tr, Key tagName,
Reference<ReadYourWritesTransaction> tr,
Key tagName,
bool snapshot) {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
@ -5215,7 +5284,9 @@ public:
// removePrefix: for each key to be restored, remove this prefix first.
// lockDB: if set lock the database with randomUid before performing restore;
// otherwise, check database is locked with the randomUid
// incrementalBackupOnly: only perform incremental backup
// onlyAppyMutationLogs: only perform an incremental restore, applying just the mutation logs
// inconsistentSnapshotOnly: ignore mutation log files during the restore to speed up the process.
//                           When set to true, the result is an inconsistent snapshot, so this is not recommended
// beginVersion: restore's begin version
// randomUid: the UID used to lock the database
ACTOR static Future<Version> restore(FileBackupAgent* backupAgent,
@ -5230,9 +5301,15 @@ public:
Key addPrefix,
Key removePrefix,
bool lockDB,
bool incrementalBackupOnly,
bool onlyAppyMutationLogs,
bool inconsistentSnapshotOnly,
Version beginVersion,
UID randomUid) {
// The restore command line tool won't allow ranges to be empty, but correctness workloads may still pass them.
if (ranges.empty()) {
throw restore_error();
}
state Reference<IBackupContainer> bc = IBackupContainer::openContainer(url.toString());
state BackupDescription desc = wait(bc->describeBackup(true));
@ -5244,12 +5321,12 @@ public:
if (targetVersion == invalidVersion && desc.maxRestorableVersion.present())
targetVersion = desc.maxRestorableVersion.get();
if (targetVersion == invalidVersion && incrementalBackupOnly && desc.contiguousLogEnd.present()) {
if (targetVersion == invalidVersion && onlyAppyMutationLogs && desc.contiguousLogEnd.present()) {
targetVersion = desc.contiguousLogEnd.get() - 1;
}
Optional<RestorableFileSet> restoreSet =
wait(bc->getRestoreSet(targetVersion, ranges, incrementalBackupOnly, beginVersion));
wait(bc->getRestoreSet(targetVersion, ranges, onlyAppyMutationLogs, beginVersion));
if (!restoreSet.present()) {
TraceEvent(SevWarn, "FileBackupAgentRestoreNotPossible")
@ -5281,7 +5358,8 @@ public:
addPrefix,
removePrefix,
lockDB,
incrementalBackupOnly,
onlyAppyMutationLogs,
inconsistentSnapshotOnly,
beginVersion,
randomUid));
wait(tr->commit());
@ -5437,6 +5515,7 @@ public:
removePrefix,
true,
false,
false,
invalidVersion,
randomUid));
return ver;
@ -5497,7 +5576,8 @@ Future<Version> FileBackupAgent::restore(Database cx,
Key addPrefix,
Key removePrefix,
bool lockDB,
bool incrementalBackupOnly,
bool onlyAppyMutationLogs,
bool inconsistentSnapshotOnly,
Version beginVersion) {
return FileBackupAgentImpl::restore(this,
cx,
@ -5511,7 +5591,8 @@ Future<Version> FileBackupAgent::restore(Database cx,
addPrefix,
removePrefix,
lockDB,
incrementalBackupOnly,
onlyAppyMutationLogs,
inconsistentSnapshotOnly,
beginVersion,
deterministicRandom()->randomUniqueID());
}
@ -5542,6 +5623,7 @@ Future<ERestoreState> FileBackupAgent::waitRestore(Database cx, Key tagName, boo
Future<Void> FileBackupAgent::submitBackup(Reference<ReadYourWritesTransaction> tr,
Key outContainer,
int initialSnapshotIntervalSeconds,
int snapshotIntervalSeconds,
std::string tagName,
Standalone<VectorRef<KeyRangeRef>> backupRanges,
@ -5551,6 +5633,7 @@ Future<Void> FileBackupAgent::submitBackup(Reference<ReadYourWritesTransaction>
return FileBackupAgentImpl::submitBackup(this,
tr,
outContainer,
initialSnapshotIntervalSeconds,
snapshotIntervalSeconds,
tagName,
backupRanges,
@ -5575,7 +5658,8 @@ Future<std::string> FileBackupAgent::getStatusJSON(Database cx, std::string tagN
return FileBackupAgentImpl::getStatusJSON(this, cx, tagName);
}
Future<Optional<Version>> FileBackupAgent::getLastRestorable(Reference<ReadYourWritesTransaction> tr, Key tagName,
Future<Optional<Version>> FileBackupAgent::getLastRestorable(Reference<ReadYourWritesTransaction> tr,
Key tagName,
bool snapshot) {
return FileBackupAgentImpl::getLastRestorable(this, tr, tagName, snapshot);
}

View File

@ -0,0 +1,257 @@
/*
* FluentDSampleIngestor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/ActorLineageProfiler.h"
#include <boost/asio.hpp>
#include <boost/asio/co_spawn.hpp>
#include <msgpack.hpp>
namespace {
boost::asio::ip::address ipAddress(IPAddress const& n) {
if (n.isV6()) {
return boost::asio::ip::address_v6(n.toV6());
} else {
return boost::asio::ip::address_v4(n.toV4());
}
}
template <class Protocol>
boost::asio::ip::basic_endpoint<Protocol> toEndpoint(NetworkAddress const n) {
return boost::asio::ip::basic_endpoint<Protocol>(ipAddress(n.ip), n.port);
}
struct FluentDSocket {
virtual ~FluentDSocket() {}
virtual void connect(NetworkAddress const& endpoint) = 0;
virtual void send(std::shared_ptr<Sample> const& sample) = 0;
virtual const boost::system::error_code& failed() const = 0;
};
template <class Protocol, class Callback>
class SampleSender : public std::enable_shared_from_this<SampleSender<Protocol, Callback>> {
using Socket = typename Protocol::socket;
using Iter = typename decltype(Sample::data)::iterator;
	Socket& socket;
	Callback callback;
	std::shared_ptr<Sample> sample; // keeps the sample's buffers alive while async sends still reference them
	Iter iter, end;
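	// Owns the heap-allocated serialized bytes for one key-value pair; the shared_ptr<Buf>
	// captured by the async completion handlers keeps the memory alive until the write finishes.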
struct Buf {
const char* data;
const unsigned size;
Buf(const char* data, unsigned size) : data(data), size(size) {}
Buf(Buf const&) = delete;
Buf& operator=(Buf const&) = delete;
~Buf() { delete[] data; }
};
void sendCompletionHandler(boost::system::error_code const& ec) {
if (ec) {
callback(ec);
} else {
++iter;
sendNext();
}
}
void send(boost::asio::ip::tcp::socket& socket, std::shared_ptr<Buf> const& buf) {
boost::asio::async_write(
socket,
boost::asio::const_buffer(buf->data, buf->size),
[buf, self = this->shared_from_this()](auto const& ec, size_t) { self->sendCompletionHandler(ec); });
}
void send(boost::asio::ip::udp::socket& socket, std::shared_ptr<Buf> const& buf) {
socket.async_send(
boost::asio::const_buffer(buf->data, buf->size),
[buf, self = this->shared_from_this()](auto const& ec, size_t) { self->sendCompletionHandler(ec); });
}
void sendNext() {
		if (iter == end) {
			callback(boost::system::error_code());
			return; // all entries sent; don't dereference the end iterator below
		}
// 1. calculate size of buffer
unsigned size = 1; // 1 for fixmap identifier byte
auto waitState = to_string(iter->first);
		if (waitState.size() < 32) {
			size += waitState.size() + 1; // fixstr: 1 header byte + string bytes
		} else {
			size += waitState.size() + 2; // str8: 2 header bytes + string bytes
		}
size += iter->second.second;
// 2. allocate the buffer
std::unique_ptr<char[]> buf(new char[size]);
unsigned off = 0;
// 3. serialize fixmap
buf[off++] = 0x81; // map of size 1
// 3.1 serialize key
if (waitState.size() < 32) {
buf[off++] = 0xa0 + waitState.size(); // fixstr
} else {
buf[off++] = 0xd9;
buf[off++] = char(waitState.size());
}
memcpy(buf.get() + off, waitState.data(), waitState.size());
off += waitState.size();
// 3.2 append serialized value
memcpy(buf.get() + off, iter->second.first, iter->second.second);
// 4. send the result to fluentd
send(socket, std::make_shared<Buf>(buf.release(), size));
}
public:
	SampleSender(Socket& socket, Callback const& callback, std::shared_ptr<Sample> const& sample)
	  : socket(socket), callback(callback), sample(sample), iter(sample->data.begin()), end(sample->data.end()) {}

	// Starts the send chain. This cannot run in the constructor: sendNext() captures
	// shared_from_this(), which is not valid until a shared_ptr owns this object.
	void start() { sendNext(); }
};
// Helper function to make instantiation of SampleSender easier
template <class Protocol, class Callback>
std::shared_ptr<SampleSender<Protocol, Callback>> makeSampleSender(typename Protocol::socket& socket,
                                                                   Callback const& callback,
                                                                   std::shared_ptr<Sample> const& sample) {
	auto sender = std::make_shared<SampleSender<Protocol, Callback>>(socket, callback, sample);
	sender->start();
	return sender;
}
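// A standalone sketch of the byte layout sendNext() produces (illustrative helper,
// mirroring the msgpack fixmap/fixstr/str8 framing; `value` must already be msgpack-encoded):
inline std::vector<char> encodeSingleEntryMap(const std::string& key, const char* value, unsigned valueSize) {
	std::vector<char> buf;
	buf.push_back(char(0x81)); // fixmap holding exactly one key-value pair
	if (key.size() < 32) {
		buf.push_back(char(0xa0 + key.size())); // fixstr: length lives in the header byte
	} else {
		buf.push_back(char(0xd9)); // str8: a one-byte length follows (keys up to 255 bytes)
		buf.push_back(char(key.size()));
	}
	buf.insert(buf.end(), key.begin(), key.end());
	buf.insert(buf.end(), value, value + valueSize);
	return buf;
}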
template <class Protocol>
struct FluentDSocketImpl : FluentDSocket, std::enable_shared_from_this<FluentDSocketImpl<Protocol>> {
static constexpr unsigned MAX_QUEUE_SIZE = 100;
boost::asio::io_context& context;
typename Protocol::socket socket;
FluentDSocketImpl(boost::asio::io_context& context) : context(context), socket(context) {}
bool ready = false;
std::deque<std::shared_ptr<Sample>> queue;
boost::system::error_code _failed;
const boost::system::error_code& failed() const override { return _failed; }
void sendCompletionHandler(boost::system::error_code const& ec) {
if (ec) {
// TODO: trace error
_failed = ec;
return;
}
if (queue.empty()) {
ready = true;
} else {
auto sample = queue.front();
queue.pop_front();
sendImpl(sample);
}
}
void sendImpl(std::shared_ptr<Sample> const& sample) {
makeSampleSender<Protocol>(socket, [self = this->shared_from_this()](boost::system::error_code const& ec){
self->sendCompletionHandler(ec);
}, sample);
}
void send(std::shared_ptr<Sample> const& sample) override {
if (_failed) {
return;
}
if (ready) {
ready = false;
sendImpl(sample);
} else {
if (queue.size() < MAX_QUEUE_SIZE) {
queue.push_back(sample);
} // TODO: else trace a warning
}
}
void connect(NetworkAddress const& endpoint) override {
auto to = toEndpoint<Protocol>(endpoint);
socket.async_connect(to, [self = this->shared_from_this()](boost::system::error_code const& ec) {
if (ec) {
// TODO: error handling
self->_failed = ec;
return;
}
self->ready = true;
});
}
};
} // namespace
struct FluentDIngestorImpl {
using Protocol = FluentDIngestor::Protocol;
Protocol protocol;
NetworkAddress endpoint;
boost::asio::io_context& io_context;
std::unique_ptr<FluentDSocket> socket;
boost::asio::steady_timer retryTimer;
FluentDIngestorImpl(Protocol protocol, NetworkAddress const& endpoint)
: protocol(protocol), endpoint(endpoint), io_context(ActorLineageProfiler::instance().context()),
retryTimer(io_context) {
connect();
}
~FluentDIngestorImpl() { retryTimer.cancel(); }
void connect() {
switch (protocol) {
case Protocol::TCP:
socket.reset(new FluentDSocketImpl<boost::asio::ip::tcp>(io_context));
break;
case Protocol::UDP:
socket.reset(new FluentDSocketImpl<boost::asio::ip::udp>(io_context));
break;
}
socket->connect(endpoint);
}
void retry() {
retryTimer = boost::asio::steady_timer(io_context, std::chrono::seconds(1));
retryTimer.async_wait([this](auto const& ec) {
if (ec) {
return;
}
connect();
});
socket.reset();
}
};
FluentDIngestor::~FluentDIngestor() {
delete impl;
}
FluentDIngestor::FluentDIngestor(Protocol protocol, NetworkAddress& endpoint)
: impl(new FluentDIngestorImpl(protocol, endpoint)) {}
void FluentDIngestor::ingest(const std::shared_ptr<Sample>& sample) {
if (!impl->socket) {
// the connection failed in the past and we wait for a timeout before we retry
return;
} else if (impl->socket->failed()) {
impl->retry();
return;
} else {
impl->socket->send(sample);
}
}
void FluentDIngestor::getConfig(std::map<std::string, std::string>& res) const {
res["ingestor"] = "fluentd";
res["collector_endpoint"] = impl->endpoint.toString();
res["collector_protocol"] = impl->protocol == Protocol::TCP ? "tcp" : "udp";
}

View File

@ -0,0 +1,257 @@
/*
* GlobalConfig.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/GlobalConfig.actor.h"
#include "fdbclient/SpecialKeySpace.actor.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/Tuple.h"
#include "flow/flow.h"
#include "flow/genericactors.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
const KeyRef fdbClientInfoTxnSampleRate = LiteralStringRef("config/fdb_client_info/client_txn_sample_rate");
const KeyRef fdbClientInfoTxnSizeLimit = LiteralStringRef("config/fdb_client_info/client_txn_size_limit");
const KeyRef transactionTagSampleRate = LiteralStringRef("config/transaction_tag_sample_rate");
const KeyRef transactionTagSampleCost = LiteralStringRef("config/transaction_tag_sample_cost");
const KeyRef samplingFrequency = LiteralStringRef("visibility/sampling/frequency");
const KeyRef samplingWindow = LiteralStringRef("visibility/sampling/window");
GlobalConfig::GlobalConfig() : lastUpdate(0) {}
void GlobalConfig::create(DatabaseContext* cx, Reference<AsyncVar<ClientDBInfo>> dbInfo) {
if (g_network->global(INetwork::enGlobalConfig) == nullptr) {
auto config = new GlobalConfig{};
config->cx = Database(cx);
config->dbInfo = dbInfo;
g_network->setGlobal(INetwork::enGlobalConfig, config);
config->_updater = updater(config);
}
}
GlobalConfig& GlobalConfig::globalConfig() {
void* res = g_network->global(INetwork::enGlobalConfig);
ASSERT(res);
return *reinterpret_cast<GlobalConfig*>(res);
}
void GlobalConfig::updateDBInfo(Reference<AsyncVar<ClientDBInfo>> dbInfo) {
// this->dbInfo = dbInfo;
}
Key GlobalConfig::prefixedKey(KeyRef key) {
return key.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::GLOBALCONFIG).begin);
}
const Reference<ConfigValue> GlobalConfig::get(KeyRef name) {
auto it = data.find(name);
if (it == data.end()) {
return Reference<ConfigValue>();
}
return it->second;
}
const std::map<KeyRef, Reference<ConfigValue>> GlobalConfig::get(KeyRangeRef range) {
std::map<KeyRef, Reference<ConfigValue>> results;
for (const auto& [key, value] : data) {
if (range.contains(key)) {
results[key] = value;
}
}
return results;
}
Future<Void> GlobalConfig::onInitialized() {
return initialized.getFuture();
}
Future<Void> GlobalConfig::onChange() {
return configChanged.onTrigger();
}
void GlobalConfig::trigger(KeyRef key, std::function<void(std::optional<std::any>)> fn) {
callbacks.emplace(key, std::move(fn));
}
void GlobalConfig::insert(KeyRef key, ValueRef value) {
data.erase(key);
Arena arena(key.expectedSize() + value.expectedSize());
KeyRef stableKey = KeyRef(arena, key);
try {
std::any any;
Tuple t = Tuple::unpack(value);
if (t.getType(0) == Tuple::ElementType::UTF8) {
any = StringRef(arena, t.getString(0).contents());
} else if (t.getType(0) == Tuple::ElementType::INT) {
any = t.getInt(0);
} else if (t.getType(0) == Tuple::ElementType::BOOL) {
any = t.getBool(0);
} else if (t.getType(0) == Tuple::ElementType::FLOAT) {
any = t.getFloat(0);
} else if (t.getType(0) == Tuple::ElementType::DOUBLE) {
any = t.getDouble(0);
} else {
ASSERT(false);
}
data[stableKey] = makeReference<ConfigValue>(std::move(arena), std::move(any));
if (callbacks.find(stableKey) != callbacks.end()) {
callbacks[stableKey](data[stableKey]->value);
}
} catch (Error& e) {
TraceEvent("GlobalConfigTupleParseError").detail("What", e.what());
}
}
void GlobalConfig::erase(Key key) {
erase(KeyRangeRef(key, keyAfter(key)));
}
void GlobalConfig::erase(KeyRangeRef range) {
auto it = data.begin();
while (it != data.end()) {
if (range.contains(it->first)) {
if (callbacks.find(it->first) != callbacks.end()) {
callbacks[it->first](std::nullopt);
}
it = data.erase(it);
} else {
++it;
}
}
}
// Older FDB versions used different keys for client profiling data. This
// function performs a one-time migration of data in these keys to the new
// global configuration key space.
ACTOR Future<Void> GlobalConfig::migrate(GlobalConfig* self) {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(self->cx);
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
state Key migratedKey("\xff\x02/fdbClientInfo/migrated/"_sr);
state Optional<Value> migrated = wait(tr->get(migratedKey));
if (migrated.present()) {
// Already performed migration.
return Void();
}
state Optional<Value> sampleRate = wait(tr->get(Key("\xff\x02/fdbClientInfo/client_txn_sample_rate/"_sr)));
state Optional<Value> sizeLimit = wait(tr->get(Key("\xff\x02/fdbClientInfo/client_txn_size_limit/"_sr)));
loop {
try {
tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
// The value doesn't matter too much, as long as the key is set.
tr->set(migratedKey.contents(), "1"_sr);
if (sampleRate.present()) {
const double sampleRateDbl =
BinaryReader::fromStringRef<double>(sampleRate.get().contents(), Unversioned());
Tuple rate = Tuple().appendDouble(sampleRateDbl);
tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSampleRate), rate.pack());
}
if (sizeLimit.present()) {
const int64_t sizeLimitInt =
BinaryReader::fromStringRef<int64_t>(sizeLimit.get().contents(), Unversioned());
Tuple size = Tuple().append(sizeLimitInt);
tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSizeLimit), size.pack());
}
wait(tr->commit());
return Void();
		} catch (Error& e) {
			// Retry on retryable errors; re-enable system key access, which onError resets
			wait(tr->onError(e));
			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
		}
}
}
// Updates local copy of global configuration by reading the entire key-range
// from storage.
ACTOR Future<Void> GlobalConfig::refresh(GlobalConfig* self) {
self->erase(KeyRangeRef(""_sr, "\xff"_sr));
Transaction tr(self->cx);
Standalone<RangeResultRef> result = wait(tr.getRange(globalConfigDataKeys, CLIENT_KNOBS->TOO_MANY));
for (const auto& kv : result) {
KeyRef systemKey = kv.key.removePrefix(globalConfigKeysPrefix);
self->insert(systemKey, kv.value);
}
return Void();
}
// Applies updates to the local copy of the global configuration when this
// process receives an updated history.
ACTOR Future<Void> GlobalConfig::updater(GlobalConfig* self) {
// wait(self->cx->onConnected());
wait(self->migrate(self));
wait(self->refresh(self));
self->initialized.send(Void());
loop {
try {
wait(self->dbInfo->onChange());
auto& history = self->dbInfo->get().history;
if (history.size() == 0) {
continue;
}
if (self->lastUpdate < history[0].version) {
// This process missed too many global configuration
// history updates or the protocol version changed, so it
// must re-read the entire configuration range.
wait(self->refresh(self));
if (self->dbInfo->get().history.size() > 0) {
self->lastUpdate = self->dbInfo->get().history.back().version;
}
} else {
// Apply history in order, from lowest version to highest
// version. Mutation history should already be stored in
// ascending version order.
for (const auto& vh : history) {
if (vh.version <= self->lastUpdate) {
continue; // already applied this mutation
}
for (const auto& mutation : vh.mutations.contents()) {
if (mutation.type == MutationRef::SetValue) {
self->insert(mutation.param1, mutation.param2);
} else if (mutation.type == MutationRef::ClearRange) {
self->erase(KeyRangeRef(mutation.param1, mutation.param2));
} else {
ASSERT(false);
}
}
ASSERT(vh.version > self->lastUpdate);
self->lastUpdate = vh.version;
}
}
self->configChanged.trigger();
} catch (Error& e) {
throw;
}
}
}
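// A minimal sketch of the corresponding write path (hypothetical helper; any global
// configuration key works, as long as the value is tuple-encoded and written under the
// special-key-space prefix):
ACTOR Future<Void> setSampleRateExample(Database cx, double rate) {
	state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
	loop {
		try {
			tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
			Tuple value = Tuple().appendDouble(rate);
			tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSampleRate), value.pack());
			wait(tr->commit());
			return Void();
		} catch (Error& e) {
			wait(tr->onError(e));
		}
	}
}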

View File

@ -0,0 +1,171 @@
/*
* GlobalConfig.actor.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_GLOBALCONFIG_ACTOR_G_H)
#define FDBCLIENT_GLOBALCONFIG_ACTOR_G_H
#include "fdbclient/GlobalConfig.actor.g.h"
#elif !defined(FDBCLIENT_GLOBALCONFIG_ACTOR_H)
#define FDBCLIENT_GLOBALCONFIG_ACTOR_H
#include <any>
#include <functional>
#include <map>
#include <optional>
#include <type_traits>
#include <unordered_map>
#include "fdbclient/CommitProxyInterface.h"
#include "fdbclient/GlobalConfig.h"
#include "fdbclient/ReadYourWrites.h"
#include "flow/actorcompiler.h" // has to be last include
// The global configuration is a series of typed key-value pairs synced to all
// nodes (server and client) in an FDB cluster in an eventually consistent
// manner. Only small key-value pairs should be stored in global configuration;
// an excessive amount of data can cause synchronization slowness.
// Keys
extern const KeyRef fdbClientInfoTxnSampleRate;
extern const KeyRef fdbClientInfoTxnSizeLimit;
extern const KeyRef transactionTagSampleRate;
extern const KeyRef transactionTagSampleCost;
extern const KeyRef samplingFrequency;
extern const KeyRef samplingWindow;
// Structure used to hold the values stored by global configuration. The arena
// is used as memory to store both the key and the value (the value is only
// stored in the arena if it is an object; primitives are just copied).
struct ConfigValue : ReferenceCounted<ConfigValue> {
Arena arena;
std::any value;
ConfigValue() {}
ConfigValue(Arena&& a, std::any&& v) : arena(a), value(v) {}
};
class GlobalConfig : NonCopyable {
public:
// Creates a GlobalConfig singleton, accessed by calling GlobalConfig().
// This function should only be called once by each process (however, it is
// idempotent and calling it multiple times will have no effect).
static void create(DatabaseContext* cx, Reference<AsyncVar<ClientDBInfo>> dbInfo);
// Returns a reference to the global GlobalConfig object. Clients should
// call this function whenever they need to read a value out of the global
// configuration.
static GlobalConfig& globalConfig();
// Updates the ClientDBInfo object used by global configuration to read new
// data. For server processes, this value needs to be set by the cluster
// controller, but global config is initialized before the cluster
// controller is, so this function provides a mechanism to update the
// object after initialization.
void updateDBInfo(Reference<AsyncVar<ClientDBInfo>> dbInfo);
// Use this function to turn a global configuration key defined above into
// the full path needed to set the value in the database.
//
// For example, given "config/a", returns "\xff\xff/global_config/config/a".
static Key prefixedKey(KeyRef key);
// Get a value from the framework. Values are returned as a ConfigValue
// reference which also contains the arena holding the object. As long as
// the caller keeps the ConfigValue reference, the value is guaranteed to
// be readable. An empty reference is returned if the value does not exist.
const Reference<ConfigValue> get(KeyRef name);
const std::map<KeyRef, Reference<ConfigValue>> get(KeyRangeRef range);
// For arithmetic value types, returns a copy of the value for the given
// key, or the supplied default value if the framework does not know about
// the key.
template <typename T, typename std::enable_if<std::is_arithmetic<T>{}, bool>::type = true>
const T get(KeyRef name, T defaultVal) {
try {
auto configValue = get(name);
if (configValue.isValid()) {
if (configValue->value.has_value()) {
return std::any_cast<T>(configValue->value);
}
}
return defaultVal;
} catch (Error& e) {
throw;
}
}
// Trying to write into the global configuration keyspace? To write data,
// submit a transaction to \xff\xff/global_config/<your-key> with
// <your-value> encoded using the FDB tuple typecodes. Use the helper
// function `prefixedKey` to correctly prefix your global configuration
// key.
// Triggers the returned future when the global configuration singleton has
// been created and is ready.
Future<Void> onInitialized();
// Triggers the returned future when any key-value pair in the global
// configuration changes.
Future<Void> onChange();
// Calls \ref fn when the value associated with \ref key is changed. \ref
// fn is passed the updated value for the key, or an empty optional if the
// key has been cleared. If the value is an allocated object, its memory
// remains in the control of the global configuration.
void trigger(KeyRef key, std::function<void(std::optional<std::any>)> fn);
private:
GlobalConfig();
// The functions below only affect the local copy of the global
// configuration keyspace! To insert or remove values across all nodes you
// must use a transaction (see the note above).
// Inserts the given key-value pair into the local copy of the global
// configuration keyspace, overwriting the old key-value pair if it exists.
// `value` must be encoded using the FDB tuple typecodes.
void insert(KeyRef key, ValueRef value);
// Removes the given key (and associated value) from the local copy of the
// global configuration keyspace.
void erase(Key key);
// Removes the given key range (and associated values) from the local copy
// of the global configuration keyspace.
void erase(KeyRangeRef range);
ACTOR static Future<Void> migrate(GlobalConfig* self);
ACTOR static Future<Void> refresh(GlobalConfig* self);
ACTOR static Future<Void> updater(GlobalConfig* self);
Database cx;
Reference<AsyncVar<ClientDBInfo>> dbInfo;
Future<Void> _updater;
Promise<Void> initialized;
AsyncTrigger configChanged;
std::unordered_map<StringRef, Reference<ConfigValue>> data;
Version lastUpdate;
std::unordered_map<KeyRef, std::function<void(std::optional<std::any>)>> callbacks;
};
#endif
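// A short usage sketch against the interface above (key choice and default value are illustrative):
//
//     GlobalConfig& config = GlobalConfig::globalConfig();
//     double sampleRate = config.get<double>(fdbClientInfoTxnSampleRate, /*defaultVal=*/1.0);
//     config.trigger(fdbClientInfoTxnSampleRate, [](std::optional<std::any> value) {
//         // empty optional => the key was cleared; otherwise value holds the new contents
//     });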

View File

@ -0,0 +1,45 @@
/*
* GlobalConfig.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/FDBTypes.h"
// Used to store a list of mutations made to the global configuration at a
// specific version.
struct VersionHistory {
constexpr static FileIdentifier file_identifier = 5863456;
VersionHistory() {}
VersionHistory(Version v) : version(v) {}
Version version;
Standalone<VectorRef<MutationRef>> mutations;
bool operator<(const VersionHistory& other) const { return version < other.version; }
int expectedSize() const { return sizeof(version) + mutations.expectedSize(); }
template <typename Ar>
void serialize(Ar& ar) {
serializer(ar, mutations, version);
}
};
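// A hypothetical construction sketch (assumes fdbclient/Tuple.h): recording one
// tuple-encoded set and one clear that took effect at version 100.
//
//     VersionHistory vh(100);
//     vh.mutations.push_back_deep(vh.mutations.arena(),
//                                 MutationRef(MutationRef::SetValue,
//                                             "config/sample"_sr,
//                                             Tuple().append(int64_t(5)).pack()));
//     vh.mutations.push_back_deep(vh.mutations.arena(),
//                                 MutationRef(MutationRef::ClearRange, "config/a"_sr, "config/b"_sr));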

View File

@ -28,6 +28,7 @@
#include "flow/ThreadHelper.actor.h"
// An interface that represents a transaction created by a client
class ITransaction {
public:
virtual ~ITransaction() {}
@ -90,6 +91,7 @@ public:
virtual void delref() = 0;
};
// An interface that represents a connection to a cluster made by a client
class IDatabase {
public:
virtual ~IDatabase() {}
@ -98,6 +100,12 @@ public:
virtual void setOption(FDBDatabaseOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) = 0;
virtual double getMainThreadBusyness() = 0;
// Returns the protocol version reported by the coordinator this client is connected to
// If an expected version is given, the future won't return until the protocol version is different than expected
// Note: this will never return if the server is running a protocol from FDB 5.0 or older
virtual ThreadFuture<ProtocolVersion> getServerProtocol(
Optional<ProtocolVersion> expectedVersion = Optional<ProtocolVersion>()) = 0;
virtual void addref() = 0;
virtual void delref() = 0;
@ -110,13 +118,16 @@ public:
virtual ThreadFuture<Void> createSnapshot(const StringRef& uid, const StringRef& snapshot_command) = 0;
};
// An interface that presents the top-level FDB client API as exposed through the C bindings
//
// This interface and its associated objects are intended to live outside the network thread, so its asynchronous
// operations use ThreadFutures and implementations should be thread safe.
class IClientApi {
public:
virtual ~IClientApi() {}
virtual void selectApiVersion(int apiVersion) = 0;
virtual const char* getClientVersion() = 0;
virtual ThreadFuture<uint64_t> getServerProtocol(const char* clusterFilePath) = 0;
virtual void setNetworkOption(FDBNetworkOptions::Option option,
Optional<StringRef> value = Optional<StringRef>()) = 0;

View File

@ -50,6 +50,7 @@ void ClientKnobs::initialize(bool randomize) {
init( RECOVERY_DELAY_SECONDS_PER_GENERATION, 60.0 );
init( MAX_GENERATIONS, 100 );
init( MAX_GENERATIONS_OVERRIDE, 0 );
init( MAX_GENERATIONS_SIM, 50 ); //Disable network connections after this many generations in simulation, should be less than RECOVERY_DELAY_START_GENERATION
init( COORDINATOR_RECONNECTION_DELAY, 1.0 );
init( CLIENT_EXAMPLE_AMOUNT, 20 );

View File

@ -42,6 +42,7 @@ public:
double RECOVERY_DELAY_SECONDS_PER_GENERATION;
double MAX_GENERATIONS;
double MAX_GENERATIONS_OVERRIDE;
double MAX_GENERATIONS_SIM;
double COORDINATOR_RECONNECTION_DELAY;
int CLIENT_EXAMPLE_AMOUNT;

View File

@ -432,7 +432,8 @@ Optional<std::pair<LeaderInfo, bool>> getLeader(const vector<Optional<LeaderInfo
for (int i = 0; i < nominees.size(); i++) {
if (nominees[i].present()) {
maskedNominees.push_back(std::make_pair(
UID(nominees[i].get().changeID.first() & LeaderInfo::mask, nominees[i].get().changeID.second()), i));
UID(nominees[i].get().changeID.first() & LeaderInfo::changeIDMask, nominees[i].get().changeID.second()),
i));
}
}
@ -495,7 +496,7 @@ ACTOR Future<MonitorLeaderInfo> monitorLeaderOneGeneration(Reference<ClusterConn
if (leader.get().first.forward) {
TraceEvent("MonitorLeaderForwarding")
.detail("NewConnStr", leader.get().first.serializedInfo.toString())
.detail("OldConnStr", info.intermediateConnFile->getConnectionString().toString());
.detail("OldConnStr", info.intermediateConnFile->getConnectionString().toString()).trackLatest("MonitorLeaderForwarding");
info.intermediateConnFile = makeReference<ClusterConnectionFile>(
connFile->getFilename(), ClusterConnectionString(leader.get().first.serializedInfo.toString()));
return info;
@ -757,6 +758,7 @@ void shrinkProxyList(ClientDBInfo& ni,
ACTOR Future<MonitorLeaderInfo> monitorProxiesOneGeneration(
Reference<ClusterConnectionFile> connFile,
Reference<AsyncVar<ClientDBInfo>> clientInfo,
Reference<AsyncVar<Optional<ClientLeaderRegInterface>>> coordinator,
MonitorLeaderInfo info,
Reference<ReferencedObject<Standalone<VectorRef<ClientVersionRef>>>> supportedVersions,
Key traceLogGroup) {
@ -774,6 +776,9 @@ ACTOR Future<MonitorLeaderInfo> monitorProxiesOneGeneration(
loop {
state ClientLeaderRegInterface clientLeaderServer(addrs[idx]);
state OpenDatabaseCoordRequest req;
coordinator->set(clientLeaderServer);
req.clusterKey = cs.clusterKey();
req.coordinators = cs.coordinators();
req.knownClientInfoID = clientInfo->get().id;
@ -840,13 +845,14 @@ ACTOR Future<MonitorLeaderInfo> monitorProxiesOneGeneration(
ACTOR Future<Void> monitorProxies(
Reference<AsyncVar<Reference<ClusterConnectionFile>>> connFile,
Reference<AsyncVar<ClientDBInfo>> clientInfo,
Reference<AsyncVar<Optional<ClientLeaderRegInterface>>> coordinator,
Reference<ReferencedObject<Standalone<VectorRef<ClientVersionRef>>>> supportedVersions,
Key traceLogGroup) {
state MonitorLeaderInfo info(connFile->get());
loop {
choose {
when(MonitorLeaderInfo _info = wait(monitorProxiesOneGeneration(
connFile->get(), clientInfo, info, supportedVersions, traceLogGroup))) {
connFile->get(), clientInfo, coordinator, info, supportedVersions, traceLogGroup))) {
info = _info;
}
when(wait(connFile->onChange())) {

View File

@ -76,6 +76,7 @@ Future<Void> monitorLeaderForProxies(Value const& key,
Future<Void> monitorProxies(
Reference<AsyncVar<Reference<ClusterConnectionFile>>> const& connFile,
Reference<AsyncVar<ClientDBInfo>> const& clientInfo,
Reference<AsyncVar<Optional<ClientLeaderRegInterface>>> const& coordinator,
Reference<ReferencedObject<Standalone<VectorRef<ClientVersionRef>>>> const& supportedVersions,
Key const& traceLogGroup);

View File

@ -289,12 +289,15 @@ void DLTransaction::reset() {
// DLDatabase
DLDatabase::DLDatabase(Reference<FdbCApi> api, ThreadFuture<FdbCApi::FDBDatabase*> dbFuture) : api(api), db(nullptr) {
addref();
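	// Hold a self-reference while dbFuture is outstanding; the continuation below
	// releases it after storing the database pointer or recording the error.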
ready = mapThreadFuture<FdbCApi::FDBDatabase*, Void>(dbFuture, [this](ErrorOr<FdbCApi::FDBDatabase*> db) {
if (db.isError()) {
delref();
return ErrorOr<Void>(db.getError());
}
this->db = db.get();
delref();
return ErrorOr<Void>(Void());
});
}
@ -356,7 +359,33 @@ double DLDatabase::getMainThreadBusyness() {
return 0;
}
// Returns the protocol version reported by the coordinator this client is connected to
// If an expected version is given, the future won't return until the protocol version is different than expected
// Note: this will never return if the server is running a protocol from FDB 5.0 or older
ThreadFuture<ProtocolVersion> DLDatabase::getServerProtocol(Optional<ProtocolVersion> expectedVersion) {
ASSERT(api->databaseGetServerProtocol != nullptr);
uint64_t expected =
expectedVersion.map<uint64_t>([](const ProtocolVersion& v) { return v.version(); }).orDefault(0);
FdbCApi::FDBFuture* f = api->databaseGetServerProtocol(db, expected);
return toThreadFuture<ProtocolVersion>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) {
uint64_t pv;
FdbCApi::fdb_error_t error = api->futureGetUInt64(f, &pv);
ASSERT(!error);
return ProtocolVersion(pv);
});
}
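// For comparison, the underlying C call this wrapper drives (sketch only; assumes an
// FDBDatabase* `db` and header version 700+; a real caller would attach a callback
// rather than blocking):
//
//     FDBFuture* f = fdb_database_get_server_protocol(db, /*expected_version=*/0);
//     fdb_future_block_until_ready(f);
//     uint64_t protocolVersion = 0;
//     fdb_error_t err = fdb_future_get_uint64(f, &protocolVersion);
//     fdb_future_destroy(f);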
// DLApi
// Loads the specified function from a dynamic library
//
// fp - The function pointer where the loaded function will be stored
// lib - The dynamic library where the function is loaded from
// libPath - The path of the dynamic library (used for logging)
// functionName - The function to load
// requireFunction - Determines the behavior if the function is not present. If true, an error is thrown. If false,
// the function pointer will be set to nullptr.
template <class T>
void loadClientFunction(T* fp, void* lib, std::string libPath, const char* functionName, bool requireFunction = true) {
*(void**)(fp) = loadFunction(lib, functionName);
@ -403,6 +432,8 @@ void DLApi::init() {
fdbCPath,
"fdb_database_get_main_thread_busyness",
headerVersion >= 700);
loadClientFunction(
&api->databaseGetServerProtocol, lib, fdbCPath, "fdb_database_get_server_protocol", headerVersion >= 700);
loadClientFunction(&api->databaseDestroy, lib, fdbCPath, "fdb_database_destroy");
loadClientFunction(&api->databaseRebootWorker, lib, fdbCPath, "fdb_database_reboot_worker", headerVersion >= 700);
loadClientFunction(&api->databaseForceRecoveryWithDataLoss,
@ -452,7 +483,7 @@ void DLApi::init() {
loadClientFunction(
&api->futureGetInt64, lib, fdbCPath, headerVersion >= 620 ? "fdb_future_get_int64" : "fdb_future_get_version");
loadClientFunction(&api->futureGetUInt64, lib, fdbCPath, "fdb_future_get_uint64");
loadClientFunction(&api->futureGetUInt64, lib, fdbCPath, "fdb_future_get_uint64", headerVersion >= 700);
loadClientFunction(&api->futureGetError, lib, fdbCPath, "fdb_future_get_error");
loadClientFunction(&api->futureGetKey, lib, fdbCPath, "fdb_future_get_key");
loadClientFunction(&api->futureGetValue, lib, fdbCPath, "fdb_future_get_value");
@ -488,11 +519,6 @@ const char* DLApi::getClientVersion() {
return api->getClientVersion();
}
ThreadFuture<uint64_t> DLApi::getServerProtocol(const char* clusterFilePath) {
ASSERT(false);
return ThreadFuture<uint64_t>();
}
void DLApi::setNetworkOption(FDBNetworkOptions::Option option, Optional<StringRef> value) {
throwIfError(api->setNetworkOption(
option, value.present() ? value.get().begin() : nullptr, value.present() ? value.get().size() : 0));
@ -855,35 +881,52 @@ MultiVersionDatabase::MultiVersionDatabase(MultiVersionApi* api,
int threadIdx,
std::string clusterFilePath,
Reference<IDatabase> db,
Reference<IDatabase> versionMonitorDb,
bool openConnectors)
: dbState(new DatabaseState()) {
: dbState(new DatabaseState(clusterFilePath, versionMonitorDb)) {
dbState->db = db;
dbState->dbVar->set(db);
if (!openConnectors) {
dbState->currentClientIndex = 0;
} else {
if (openConnectors) {
if (!api->localClientDisabled) {
dbState->currentClientIndex = 0;
dbState->addConnection(api->getLocalClient(), clusterFilePath);
} else {
dbState->currentClientIndex = -1;
dbState->addClient(api->getLocalClient());
}
api->runOnExternalClients(threadIdx, [this, clusterFilePath](Reference<ClientInfo> client) {
dbState->addConnection(client, clusterFilePath);
api->runOnExternalClients(threadIdx, [this](Reference<ClientInfo> client) { dbState->addClient(client); });
if (!externalClientsInitialized.test_and_set()) {
api->runOnExternalClientsAllThreads([&clusterFilePath](Reference<ClientInfo> client) {
// This creates a database to initialize some client state on the external library
// We only do this on 6.2+ clients to avoid some bugs associated with older versions
// This deletes the new database immediately to discard its connections
if (client->protocolVersion.hasCloseUnusedConnection()) {
Reference<IDatabase> newDb = client->api->createDatabase(clusterFilePath.c_str());
}
});
}
// For clients older than 6.2 we create and maintain our database connection
api->runOnExternalClients(threadIdx, [this, &clusterFilePath](Reference<ClientInfo> client) {
if (!client->protocolVersion.hasCloseUnusedConnection()) {
dbState->legacyDatabaseConnections[client->protocolVersion] =
client->api->createDatabase(clusterFilePath.c_str());
}
});
dbState->startConnections();
Reference<DatabaseState> dbStateRef = dbState;
onMainThreadVoid([dbStateRef]() { dbStateRef->protocolVersionMonitor = dbStateRef->monitorProtocolVersion(); },
nullptr);
}
}
MultiVersionDatabase::~MultiVersionDatabase() {
dbState->cancelConnections();
dbState->close();
}
// Create a MultiVersionDatabase that wraps an already created IDatabase object
// For internal use in testing
Reference<IDatabase> MultiVersionDatabase::debugCreateFromExistingDatabase(Reference<IDatabase> db) {
return Reference<IDatabase>(new MultiVersionDatabase(MultiVersionApi::api, 0, "", db, false));
return Reference<IDatabase>(new MultiVersionDatabase(MultiVersionApi::api, 0, "", db, db, false));
}
Reference<ITransaction> MultiVersionDatabase::createTransaction() {
@ -941,180 +984,279 @@ double MultiVersionDatabase::getMainThreadBusyness() {
return 0;
}
void MultiVersionDatabase::Connector::connect() {
addref();
onMainThreadVoid(
[this]() {
if (!cancelled) {
connected = false;
if (connectionFuture.isValid()) {
connectionFuture.cancel();
}
candidateDatabase = client->api->createDatabase(clusterFilePath.c_str());
if (client->external) {
connectionFuture = candidateDatabase.castTo<DLDatabase>()->onReady();
} else {
connectionFuture = ThreadFuture<Void>(Void());
}
connectionFuture = flatMapThreadFuture<Void, Void>(connectionFuture, [this](ErrorOr<Void> ready) {
if (ready.isError()) {
return ErrorOr<ThreadFuture<Void>>(ready.getError());
}
tr = candidateDatabase->createTransaction();
return ErrorOr<ThreadFuture<Void>>(
mapThreadFuture<Version, Void>(tr->getReadVersion(), [](ErrorOr<Version> v) {
// If the version attempt returns an error, we regard that as a connection (except
// operation_cancelled)
if (v.isError() && v.getError().code() == error_code_operation_cancelled) {
return ErrorOr<Void>(v.getError());
} else {
return ErrorOr<Void>(Void());
}
}));
});
int userParam;
connectionFuture.callOrSetAsCallback(this, userParam, 0);
} else {
delref();
}
},
nullptr);
// Returns the protocol version reported by the coordinator this client is connected to
// If an expected version is given, the future won't return until the protocol version is different than expected
// Note: this will never return if the server is running a protocol from FDB 5.0 or older
ThreadFuture<ProtocolVersion> MultiVersionDatabase::getServerProtocol(Optional<ProtocolVersion> expectedVersion) {
return dbState->versionMonitorDb->getServerProtocol(expectedVersion);
}
// Only called from main thread
void MultiVersionDatabase::Connector::cancel() {
connected = false;
cancelled = true;
if (connectionFuture.isValid()) {
connectionFuture.cancel();
}
}
MultiVersionDatabase::DatabaseState::DatabaseState(std::string clusterFilePath, Reference<IDatabase> versionMonitorDb)
: clusterFilePath(clusterFilePath), versionMonitorDb(versionMonitorDb),
dbVar(new ThreadSafeAsyncVar<Reference<IDatabase>>(Reference<IDatabase>(nullptr))) {}
void MultiVersionDatabase::Connector::fire(const Void& unused, int& userParam) {
onMainThreadVoid(
[this]() {
if (!cancelled) {
connected = true;
dbState->stateChanged();
}
delref();
},
nullptr);
}
// Adds a client (local or externally loaded) that can be used to connect to the cluster
void MultiVersionDatabase::DatabaseState::addClient(Reference<ClientInfo> client) {
ProtocolVersion baseVersion = client->protocolVersion.normalizedVersion();
auto [itr, inserted] = clients.insert({ baseVersion, client });
if (!inserted) {
// SOMEDAY: prefer client with higher release version if protocol versions are compatible
Reference<ClientInfo> keptClient = itr->second;
Reference<ClientInfo> discardedClient = client;
if (client->canReplace(itr->second)) {
std::swap(keptClient, discardedClient);
clients[baseVersion] = client;
}
discardedClient->failed = true;
TraceEvent(SevWarn, "DuplicateClientVersion")
.detail("Keeping", keptClient->libPath)
.detail("KeptProtocolVersion", keptClient->protocolVersion)
.detail("Disabling", discardedClient->libPath)
.detail("DisabledProtocolVersion", discardedClient->protocolVersion);
void MultiVersionDatabase::Connector::error(const Error& e, int& userParam) {
if (e.code() != error_code_operation_cancelled) {
// TODO: is it right to abandon this connection attempt?
client->failed = true;
MultiVersionApi::api->updateSupportedVersions();
TraceEvent(SevError, "DatabaseConnectionError").error(e).detail("ClientLibrary", this->client->libPath);
}
delref();
if (!client->protocolVersion.hasInexpensiveMultiVersionClient() && !client->failed) {
TraceEvent("AddingLegacyVersionMonitor")
.detail("LibPath", client->libPath)
.detail("ProtocolVersion", client->protocolVersion);
legacyVersionMonitors.emplace_back(new LegacyVersionMonitor(client));
}
}
MultiVersionDatabase::DatabaseState::DatabaseState()
: dbVar(new ThreadSafeAsyncVar<Reference<IDatabase>>(Reference<IDatabase>(nullptr))), currentClientIndex(-1) {}
// Watch the cluster protocol version for changes and update the database state when it does.
// Must be called from the main thread
ThreadFuture<Void> MultiVersionDatabase::DatabaseState::monitorProtocolVersion() {
startLegacyVersionMonitors();
// Only called from main thread
void MultiVersionDatabase::DatabaseState::stateChanged() {
int newIndex = -1;
for (int i = 0; i < clients.size(); ++i) {
if (i != currentClientIndex && connectionAttempts[i]->connected) {
if (currentClientIndex >= 0 && !clients[i]->canReplace(clients[currentClientIndex])) {
TraceEvent(SevWarn, "DuplicateClientVersion")
.detail("Keeping", clients[currentClientIndex]->libPath)
.detail("KeptClientProtocolVersion", clients[currentClientIndex]->protocolVersion.version())
.detail("Disabling", clients[i]->libPath)
.detail("DisabledClientProtocolVersion", clients[i]->protocolVersion.version());
connectionAttempts[i]->connected = false; // Permanently disable this client in favor of the current one
clients[i]->failed = true;
MultiVersionApi::api->updateSupportedVersions();
return;
Optional<ProtocolVersion> expected = dbProtocolVersion;
ThreadFuture<ProtocolVersion> f = versionMonitorDb->getServerProtocol(dbProtocolVersion);
Reference<DatabaseState> self = Reference<DatabaseState>::addRef(this);
return mapThreadFuture<ProtocolVersion, Void>(f, [self, expected](ErrorOr<ProtocolVersion> cv) {
if (cv.isError()) {
if (cv.getError().code() == error_code_operation_cancelled) {
return ErrorOr<Void>(cv.getError());
}
newIndex = i;
break;
TraceEvent("ErrorGettingClusterProtocolVersion")
.detail("ExpectedProtocolVersion", expected)
.error(cv.getError());
}
ProtocolVersion clusterVersion =
!cv.isError() ? cv.get() : self->dbProtocolVersion.orDefault(currentProtocolVersion);
onMainThreadVoid([self, clusterVersion]() { self->protocolVersionChanged(clusterVersion); }, nullptr);
return ErrorOr<Void>(Void());
});
}
// Called when a change to the protocol version of the cluster has been detected.
// Must be called from the main thread
void MultiVersionDatabase::DatabaseState::protocolVersionChanged(ProtocolVersion protocolVersion) {
// If the protocol version changed but is still compatible, update our local version but keep the same connection
if (dbProtocolVersion.present() &&
protocolVersion.normalizedVersion() == dbProtocolVersion.get().normalizedVersion()) {
dbProtocolVersion = protocolVersion;
ASSERT(protocolVersionMonitor.isValid());
protocolVersionMonitor.cancel();
protocolVersionMonitor = monitorProtocolVersion();
}
// The protocol version has changed to a different, incompatible version
else {
TraceEvent("ProtocolVersionChanged")
.detail("NewProtocolVersion", protocolVersion)
.detail("OldProtocolVersion", dbProtocolVersion);
dbProtocolVersion = protocolVersion;
auto itr = clients.find(protocolVersion.normalizedVersion());
if (itr != clients.end()) {
auto& client = itr->second;
TraceEvent("CreatingDatabaseOnClient")
.detail("LibraryPath", client->libPath)
.detail("Failed", client->failed)
.detail("External", client->external);
Reference<IDatabase> newDb = client->api->createDatabase(clusterFilePath.c_str());
if (client->external && !MultiVersionApi::apiVersionAtLeast(610)) {
// Old API versions return a future when creating the database, so we need to wait for it
Reference<DatabaseState> self = Reference<DatabaseState>::addRef(this);
dbReady = mapThreadFuture<Void, Void>(
newDb.castTo<DLDatabase>()->onReady(), [self, newDb, client](ErrorOr<Void> ready) {
if (!ready.isError()) {
onMainThreadVoid([self, newDb, client]() { self->updateDatabase(newDb, client); }, nullptr);
} else {
onMainThreadVoid([self, client]() { self->updateDatabase(Reference<IDatabase>(), client); },
nullptr);
}
return ready;
});
} else {
updateDatabase(newDb, client);
}
} else {
// We don't have a client matching the current protocol
updateDatabase(Reference<IDatabase>(), Reference<ClientInfo>());
}
}
}
if (newIndex == -1) {
ASSERT_EQ(currentClientIndex, 0); // This can only happen for the local client, which we set as the current
// connection before we know it's connected
return;
}
// Replaces the active database connection with a new one. Must be called from the main thread.
void MultiVersionDatabase::DatabaseState::updateDatabase(Reference<IDatabase> newDb, Reference<ClientInfo> client) {
if (newDb) {
optionLock.enter();
for (auto option : options) {
try {
// In practice, this will set a deferred error instead of throwing. If that happens, the database
// will be unusable (attempts to use it will throw errors).
newDb->setOption(option.first, option.second.castTo<StringRef>());
} catch (Error& e) {
optionLock.leave();
// Restart connection for replaced client
auto newDb = connectionAttempts[newIndex]->candidateDatabase;
optionLock.enter();
for (auto option : options) {
try {
newDb->setOption(option.first,
option.second.castTo<StringRef>()); // In practice, this will set a deferred error instead
// of throwing. If that happens, the database will be
// unusable (attempts to use it will throw errors).
} catch (Error& e) {
optionLock.leave();
TraceEvent(SevError, "ClusterVersionChangeOptionError")
.error(e)
.detail("Option", option.first)
.detail("OptionValue", option.second)
.detail("LibPath", clients[newIndex]->libPath);
connectionAttempts[newIndex]->connected = false;
clients[newIndex]->failed = true;
MultiVersionApi::api->updateSupportedVersions();
return; // If we can't set all of the options on a cluster, we abandon the client
// If we can't set all of the options on a cluster, we abandon the client
TraceEvent(SevError, "ClusterVersionChangeOptionError")
.error(e)
.detail("Option", option.first)
.detail("OptionValue", option.second)
.detail("LibPath", client->libPath);
client->failed = true;
MultiVersionApi::api->updateSupportedVersions();
newDb = Reference<IDatabase>();
break;
}
}
}
db = newDb;
optionLock.leave();
db = newDb;
optionLock.leave();
if (dbProtocolVersion.get().hasStableInterfaces() && db) {
versionMonitorDb = db;
} else {
// For older clients that don't have an API to get the protocol version, we have to monitor it locally
versionMonitorDb = MultiVersionApi::api->getLocalClient()->api->createDatabase(clusterFilePath.c_str());
}
} else {
// We don't have a database connection, so use the local client to monitor the protocol version
db = Reference<IDatabase>();
versionMonitorDb = MultiVersionApi::api->getLocalClient()->api->createDatabase(clusterFilePath.c_str());
}
dbVar->set(db);
if (currentClientIndex >= 0 && connectionAttempts[currentClientIndex]->connected) {
connectionAttempts[currentClientIndex]->connected = false;
connectionAttempts[currentClientIndex]->connect();
}
ASSERT(newIndex >= 0 && newIndex < clients.size());
currentClientIndex = newIndex;
ASSERT(protocolVersionMonitor.isValid());
protocolVersionMonitor.cancel();
protocolVersionMonitor = monitorProtocolVersion();
}
void MultiVersionDatabase::DatabaseState::addConnection(Reference<ClientInfo> client, std::string clusterFilePath) {
clients.push_back(client);
connectionAttempts.push_back(
makeReference<Connector>(Reference<DatabaseState>::addRef(this), client, clusterFilePath));
}
void MultiVersionDatabase::DatabaseState::startConnections() {
for (auto c : connectionAttempts) {
c->connect();
// Starts version monitors for old client versions that don't support connect packet monitoring (<= 5.0).
// Must be called from the main thread
void MultiVersionDatabase::DatabaseState::startLegacyVersionMonitors() {
for (auto itr = legacyVersionMonitors.begin(); itr != legacyVersionMonitors.end(); ++itr) {
while (itr != legacyVersionMonitors.end() && (*itr)->client->failed) {
(*itr)->close();
itr = legacyVersionMonitors.erase(itr);
}
if (itr != legacyVersionMonitors.end() &&
(!dbProtocolVersion.present() || (*itr)->client->protocolVersion != dbProtocolVersion.get())) {
(*itr)->startConnectionMonitor(Reference<DatabaseState>::addRef(this));
}
}
}
// Cleans up state for the legacy version monitors to break reference cycles
void MultiVersionDatabase::DatabaseState::close() {
Reference<DatabaseState> self = Reference<DatabaseState>::addRef(this);
onMainThreadVoid(
[self]() {
if (self->protocolVersionMonitor.isValid()) {
self->protocolVersionMonitor.cancel();
}
for (auto monitor : self->legacyVersionMonitors) {
monitor->close();
}
self->legacyVersionMonitors.clear();
},
nullptr);
}
// Starts the connection monitor by creating a database object at an old version.
// Must be called from the main thread
void MultiVersionDatabase::LegacyVersionMonitor::startConnectionMonitor(
Reference<MultiVersionDatabase::DatabaseState> dbState) {
if (!monitorRunning) {
monitorRunning = true;
auto itr = dbState->legacyDatabaseConnections.find(client->protocolVersion);
ASSERT(itr != dbState->legacyDatabaseConnections.end());
db = itr->second;
tr = Reference<ITransaction>();
TraceEvent("StartingLegacyVersionMonitor").detail("ProtocolVersion", client->protocolVersion);
Reference<LegacyVersionMonitor> self = Reference<LegacyVersionMonitor>::addRef(this);
versionMonitor =
mapThreadFuture<Void, Void>(db.castTo<DLDatabase>()->onReady(), [self, dbState](ErrorOr<Void> ready) {
onMainThreadVoid(
[self, ready, dbState]() {
if (ready.isError()) {
if (ready.getError().code() != error_code_operation_cancelled) {
TraceEvent(SevError, "FailedToOpenDatabaseOnClient")
.error(ready.getError())
.detail("LibPath", self->client->libPath);
self->client->failed = true;
MultiVersionApi::api->updateSupportedVersions();
}
} else {
self->runGrvProbe(dbState);
}
},
nullptr);
return ready;
});
}
}
// Runs a GRV probe on the cluster to determine if the client version is compatible with the cluster.
// Must be called from main thread
void MultiVersionDatabase::LegacyVersionMonitor::runGrvProbe(Reference<MultiVersionDatabase::DatabaseState> dbState) {
tr = db->createTransaction();
Reference<LegacyVersionMonitor> self = Reference<LegacyVersionMonitor>::addRef(this);
versionMonitor = mapThreadFuture<Version, Void>(tr->getReadVersion(), [self, dbState](ErrorOr<Version> v) {
// Any response to the read version request, including most errors, implies a working connection;
// only operation_cancelled is disregarded
if (!v.isError() || v.getError().code() != error_code_operation_cancelled) {
onMainThreadVoid(
[self, dbState]() {
self->monitorRunning = false;
dbState->protocolVersionChanged(self->client->protocolVersion);
},
nullptr);
}
return v.map<Void>([](Version v) { return Void(); });
});
}
void MultiVersionDatabase::LegacyVersionMonitor::close() {
if (versionMonitor.isValid()) {
versionMonitor.cancel();
}
}
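The GRV probe is the entire compatibility check: any reply to a read version request, error or not, proves this client can talk to the cluster. A minimal standalone sketch of the same idea (illustrative only; probeCompatibility is not part of this patch):

ThreadFuture<Void> probeCompatibility(Reference<IDatabase> db) {
// Any read version response (including most errors) implies a usable connection.
Reference<ITransaction> tr = db->createTransaction();
return mapThreadFuture<Version, Void>(tr->getReadVersion(), [tr](ErrorOr<Version> v) {
// Capturing tr keeps the transaction alive until the request resolves.
return v.map<Void>([](Version) { return Void(); });
});
}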
std::atomic_flag MultiVersionDatabase::externalClientsInitialized = ATOMIC_FLAG_INIT;
// MultiVersionApi
bool MultiVersionApi::apiVersionAtLeast(int minVersion) {
ASSERT_NE(MultiVersionApi::api->apiVersion, 0);
return MultiVersionApi::api->apiVersion >= minVersion || MultiVersionApi::api->apiVersion < 0;
@ -1181,10 +1323,6 @@ const char* MultiVersionApi::getClientVersion() {
return localClient->api->getClientVersion();
}
ThreadFuture<uint64_t> MultiVersionApi::getServerProtocol(const char* clusterFilePath) {
return api->localClient->api->getServerProtocol(clusterFilePath);
}
void validateOption(Optional<StringRef> value, bool canBePresent, bool canBeAbsent, bool canBeEmpty = true) {
ASSERT(canBePresent || canBeAbsent);
@ -1581,6 +1719,7 @@ void MultiVersionApi::addNetworkThreadCompletionHook(void (*hook)(void*), void*
}
}
// Creates an IDatabase object that represents a connection to the cluster
Reference<IDatabase> MultiVersionApi::createDatabase(const char* clusterFilePath) {
lock.enter();
if (!networkSetup) {
@ -1595,28 +1734,21 @@ Reference<IDatabase> MultiVersionApi::createDatabase(const char* clusterFilePath
int threadIdx = nextThread;
nextThread = (nextThread + 1) % threadCount;
lock.leave();
for (auto it : externalClients) {
TraceEvent("CreatingDatabaseOnExternalClient")
.detail("LibraryPath", it.first)
.detail("Failed", it.second[threadIdx]->failed);
}
Reference<IDatabase> localDb = localClient->api->createDatabase(clusterFilePath);
return Reference<IDatabase>(
new MultiVersionDatabase(this, threadIdx, clusterFile, Reference<IDatabase>(), localDb));
}
lock.leave();
ASSERT_LE(threadCount, 1);
Reference<IDatabase> localDb = localClient->api->createDatabase(clusterFilePath);
if (bypassMultiClientApi) {
return localDb;
} else {
for (auto it : externalClients) {
TraceEvent("CreatingDatabaseOnExternalClient")
.detail("LibraryPath", it.first)
.detail("Failed", it.second[0]->failed);
}
return Reference<IDatabase>(new MultiVersionDatabase(this, 0, clusterFile, Reference<IDatabase>(), localDb));
}
}
@ -1948,6 +2080,12 @@ ACTOR Future<Void> checkUndestroyedFutures(std::vector<ThreadSingleAssignmentVar
return Void();
}
// Common code for tests of single assignment vars. Tests both correctness and thread safety.
// T should be a class that has a static method with the following signature:
//
// static FutureInfo createThreadFuture(FutureInfo f);
//
// See AbortableTest for an example T type
template <class T>
THREAD_FUNC runSingleAssignmentVarTest(void* arg) {
noUnseed = true;
@ -1960,6 +2098,9 @@ THREAD_FUNC runSingleAssignmentVarTest(void* arg) {
tf.validate();
tf.future.extractPtr(); // leaks
for (auto t : tf.threads) {
waitThread(t);
}
}
for (int numRuns = 0; numRuns < 25; ++numRuns) {
@ -2030,12 +2171,14 @@ struct AbortableTest {
TEST_CASE("/fdbclient/multiversionclient/AbortableSingleAssignmentVar") {
state volatile bool done = false;
state THREAD_HANDLE thread = g_network->startThread(runSingleAssignmentVarTest<AbortableTest>, (void*)&done);
while (!done) {
wait(delay(1.0));
}
waitThread(thread);
return Void();
}
@ -2107,20 +2250,24 @@ TEST_CASE("/fdbclient/multiversionclient/DLSingleAssignmentVar") {
state volatile bool done = false;
MultiVersionApi::api->callbackOnMainThread = true;
state THREAD_HANDLE thread = g_network->startThread(runSingleAssignmentVarTest<DLTest>, (void*)&done);
while (!done) {
wait(delay(1.0));
}
waitThread(thread);
done = false;
MultiVersionApi::api->callbackOnMainThread = false;
thread = g_network->startThread(runSingleAssignmentVarTest<DLTest>, (void*)&done);
while (!done) {
wait(delay(1.0));
}
waitThread(thread);
return Void();
}
@ -2145,12 +2292,14 @@ struct MapTest {
TEST_CASE("/fdbclient/multiversionclient/MapSingleAssignmentVar") {
state volatile bool done = false;
state THREAD_HANDLE thread = g_network->startThread(runSingleAssignmentVarTest<MapTest>, (void*)&done);
while (!done) {
wait(delay(1.0));
}
waitThread(thread);
return Void();
}
@ -2182,11 +2331,13 @@ struct FlatMapTest {
TEST_CASE("/fdbclient/multiversionclient/FlatMapSingleAssignmentVar") {
state volatile bool done = false;
state THREAD_HANDLE thread = g_network->startThread(runSingleAssignmentVarTest<FlatMapTest>, (void*)&done);
while (!done) {
wait(delay(1.0));
}
waitThread(thread);
return Void();
}

View File

@ -28,6 +28,8 @@
#include "flow/ThreadHelper.actor.h"
// FdbCApi is used as a wrapper around the FoundationDB C API that gets loaded from an external client library.
// All of the required functions loaded from that external library are stored in function pointers in this struct.
struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
typedef struct future FDBFuture;
typedef struct cluster FDBCluster;
@ -55,7 +57,6 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
// Network
fdb_error_t (*selectApiVersion)(int runtimeVersion, int headerVersion);
const char* (*getClientVersion)();
FDBFuture* (*getServerProtocol)(const char* clusterFilePath);
fdb_error_t (*setNetworkOption)(FDBNetworkOptions::Option option, uint8_t const* value, int valueLength);
fdb_error_t (*setupNetwork)();
fdb_error_t (*runNetwork)();
@ -81,6 +82,7 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
uint8_t const* snapshotCommand,
int snapshotCommandLength);
double (*databaseGetMainThreadBusyness)(FDBDatabase* database);
FDBFuture* (*databaseGetServerProtocol)(FDBDatabase* database, uint64_t expectedVersion);
// Transaction
fdb_error_t (*transactionSetOption)(FDBTransaction* tr,
@ -185,6 +187,8 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
fdb_error_t (*futureGetCluster)(FDBFuture* f, FDBCluster** outCluster);
};
// An implementation of ITransaction that wraps a transaction object created on an externally loaded client library.
// All API calls to that transaction are routed through the external library.
class DLTransaction : public ITransaction, ThreadSafeReferenceCounted<DLTransaction> {
public:
DLTransaction(Reference<FdbCApi> api, FdbCApi::FDBTransaction* tr) : api(api), tr(tr) {}
@ -249,6 +253,8 @@ private:
FdbCApi::FDBTransaction* const tr;
};
// An implementation of IDatabase that wraps a database object created on an externally loaded client library.
// All API calls to that database are routed through the external library.
class DLDatabase : public IDatabase, ThreadSafeReferenceCounted<DLDatabase> {
public:
DLDatabase(Reference<FdbCApi> api, FdbCApi::FDBDatabase* db) : api(api), db(db), ready(Void()) {}
@ -265,6 +271,12 @@ public:
void setOption(FDBDatabaseOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
double getMainThreadBusyness() override;
// Returns the protocol version reported by the coordinator this client is connected to
// If an expected version is given, the future won't return until the protocol version is different than expected
// Note: this will never return if the server is running a protocol from FDB 5.0 or older
ThreadFuture<ProtocolVersion> getServerProtocol(
Optional<ProtocolVersion> expectedVersion = Optional<ProtocolVersion>()) override;
void addref() override { ThreadSafeReferenceCounted<DLDatabase>::addref(); }
void delref() override { ThreadSafeReferenceCounted<DLDatabase>::delref(); }
@ -279,13 +291,14 @@ private:
ThreadFuture<Void> ready;
};
// An implementation of IClientApi that re-issues API calls to the C API of an externally loaded client library.
// The DL prefix stands for "dynamic library".
class DLApi : public IClientApi {
public:
DLApi(std::string fdbCPath, bool unlinkOnLoad = false);
void selectApiVersion(int apiVersion) override;
const char* getClientVersion() override;
ThreadFuture<uint64_t> getServerProtocol(const char* clusterFilePath) override;
void setNetworkOption(FDBNetworkOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
void setupNetwork() override;
@ -312,6 +325,9 @@ private:
class MultiVersionDatabase;
// An implementation of ITransaction that wraps a transaction created either locally or through a dynamically loaded
// external client. When needed (e.g., on cluster version change), the MultiVersionTransaction can automatically replace
// its wrapped transaction with one from another client.
class MultiVersionTransaction : public ITransaction, ThreadSafeReferenceCounted<MultiVersionTransaction> {
public:
MultiVersionTransaction(Reference<MultiVersionDatabase> db,
@ -413,89 +429,147 @@ struct ClientInfo : ClientDesc, ThreadSafeReferenceCounted<ClientInfo> {
class MultiVersionApi;
// An implementation of IDatabase that wraps a database created either locally or through a dynamically loaded
// external client. The MultiVersionDatabase monitors the protocol version of the cluster and automatically
// replaces the wrapped database when the protocol version changes.
class MultiVersionDatabase final : public IDatabase, ThreadSafeReferenceCounted<MultiVersionDatabase> {
public:
MultiVersionDatabase(MultiVersionApi* api,
int threadIdx,
std::string clusterFilePath,
Reference<IDatabase> db,
Reference<IDatabase> versionMonitorDb,
bool openConnectors = true);
~MultiVersionDatabase() override;
Reference<ITransaction> createTransaction() override;
void setOption(FDBDatabaseOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
double getMainThreadBusyness() override;
// Returns the protocol version reported by the coordinator this client is connected to
// If an expected version is given, the future won't return until the protocol version is different than expected
// Note: this will never return if the server is running a protocol from FDB 5.0 or older
ThreadFuture<ProtocolVersion> getServerProtocol(
Optional<ProtocolVersion> expectedVersion = Optional<ProtocolVersion>()) override;
void addref() override { ThreadSafeReferenceCounted<MultiVersionDatabase>::addref(); }
void delref() override { ThreadSafeReferenceCounted<MultiVersionDatabase>::delref(); }
// Create a MultiVersionDatabase that wraps an already created IDatabase object
// For internal use in testing
static Reference<IDatabase> debugCreateFromExistingDatabase(Reference<IDatabase> db);
ThreadFuture<int64_t> rebootWorker(const StringRef& address, bool check, int duration) override;
ThreadFuture<Void> forceRecoveryWithDataLoss(const StringRef& dcid) override;
ThreadFuture<Void> createSnapshot(const StringRef& uid, const StringRef& snapshot_command) override;
private:
struct DatabaseState;
// private:
struct Connector : ThreadCallback, ThreadSafeReferenceCounted<Connector> {
Connector(Reference<DatabaseState> dbState, Reference<ClientInfo> client, std::string clusterFilePath)
: dbState(dbState), client(client), clusterFilePath(clusterFilePath), connected(false), cancelled(false) {}
void connect();
void cancel();
bool canFire(int notMadeActive) const override { return true; }
void fire(const Void& unused, int& userParam) override;
void error(const Error& e, int& userParam) override;
const Reference<ClientInfo> client;
const std::string clusterFilePath;
const Reference<DatabaseState> dbState;
ThreadFuture<Void> connectionFuture;
Reference<IDatabase> candidateDatabase;
Reference<ITransaction> tr;
bool connected;
bool cancelled;
};
struct LegacyVersionMonitor;
// A struct that manages the current connection state of the MultiVersionDatabase. This wraps the underlying
// IDatabase object that is currently interacting with the cluster.
struct DatabaseState : ThreadSafeReferenceCounted<DatabaseState> {
DatabaseState();
DatabaseState(std::string clusterFilePath, Reference<IDatabase> versionMonitorDb);
void stateChanged();
void addConnection(Reference<ClientInfo> client, std::string clusterFilePath);
void startConnections();
void cancelConnections();
// Replaces the active database connection with a new one. Must be called from the main thread.
void updateDatabase(Reference<IDatabase> newDb, Reference<ClientInfo> client);
// Called when a change to the protocol version of the cluster has been detected.
// Must be called from the main thread
void protocolVersionChanged(ProtocolVersion protocolVersion);
// Adds a client (local or externally loaded) that can be used to connect to the cluster
void addClient(Reference<ClientInfo> client);
// Watch the cluster protocol version for changes and update the database state when it does.
// Must be called from the main thread
ThreadFuture<Void> monitorProtocolVersion();
// Starts version monitors for old client versions that don't support connect packet monitoring (<= 5.0).
// Must be called from the main thread
void startLegacyVersionMonitors();
// Cleans up state for the legacy version monitors to break reference cycles
void close();
Reference<IDatabase> db;
const Reference<ThreadSafeAsyncVar<Reference<IDatabase>>> dbVar;
std::string clusterFilePath;
// Used to monitor the cluster protocol version. Will be the same as db unless we have either not connected
// yet or if the client version associated with db does not support protocol monitoring. In those cases,
// this will be a specially created local db.
Reference<IDatabase> versionMonitorDb;
ThreadFuture<Void> changed;
bool cancelled;
ThreadFuture<Void> dbReady;
ThreadFuture<Void> protocolVersionMonitor;
// Versions older than 6.1 do not benefit from having their database connections closed. Additionally,
// there are various issues that result in negative behavior in some cases if the connections are closed.
// Therefore, we leave them open.
std::map<ProtocolVersion, Reference<IDatabase>> legacyDatabaseConnections;
// Versions 5.0 and older do not support connection packet monitoring and require alternate techniques to
// determine the cluster version.
std::list<Reference<LegacyVersionMonitor>> legacyVersionMonitors;
Optional<ProtocolVersion> dbProtocolVersion;
// This maps a normalized protocol version to the client associated with it, so that protocol versions
// that differ only in compatible ways still resolve to the same client.
std::map<ProtocolVersion, Reference<ClientInfo>> clients;
std::vector<std::pair<FDBDatabaseOptions::Option, Optional<Standalone<StringRef>>>> options;
UniqueOrderedOptionList<FDBTransactionOptions> transactionDefaultOptions;
Mutex optionLock;
};
// A struct that enables monitoring whether the cluster is running an old version (<= 5.0) that doesn't support
// connect packet monitoring.
struct LegacyVersionMonitor : ThreadSafeReferenceCounted<LegacyVersionMonitor> {
LegacyVersionMonitor(Reference<ClientInfo> const& client) : client(client), monitorRunning(false) {}
// Terminates the version monitor to break reference cycles
void close();
// Starts the connection monitor by creating a database object at an old version.
// Must be called from the main thread
void startConnectionMonitor(Reference<DatabaseState> dbState);
// Runs a GRV probe on the cluster to determine if the client version is compatible with the cluster.
// Must be called from main thread
void runGrvProbe(Reference<DatabaseState> dbState);
Reference<ClientInfo> client;
Reference<IDatabase> db;
Reference<ITransaction> tr;
ThreadFuture<Void> versionMonitor;
bool monitorRunning;
};
const Reference<DatabaseState> dbState;
friend class MultiVersionTransaction;
// Clients must create a database object in order to initialize some of their state.
// This needs to be done only once, and this flag tracks whether that has happened.
static std::atomic_flag externalClientsInitialized;
};
// An implementation of IClientApi that can choose between multiple different client implementations either provided
// locally within the primary loaded fdb_c client or through any number of dynamically loaded clients.
//
// This functionality is used to provide support for multiple protocol versions simultaneously.
class MultiVersionApi : public IClientApi {
public:
void selectApiVersion(int apiVersion) override;
const char* getClientVersion() override;
ThreadFuture<uint64_t> getServerProtocol(const char* clusterFilePath) override;
void setNetworkOption(FDBNetworkOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
void setupNetwork() override;
@ -503,6 +577,7 @@ public:
void stopNetwork() override;
void addNetworkThreadCompletionHook(void (*hook)(void*), void* hookParameter) override;
// Creates an IDatabase object that represents a connection to the cluster
Reference<IDatabase> createDatabase(const char* clusterFilePath) override;
static MultiVersionApi* api;
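For context, an application opts into the multi-version machinery by loading extra client libraries through the C API before the network starts; a hedged sketch (the library path is hypothetical):

// Load an additional, older fdb_c library; the multi-version API will use it for
// clusters speaking that protocol. Must be called before fdb_setup_network().
const char* libPath = "/usr/lib/foundationdb/libfdb_c_6.2.so"; // hypothetical path
fdb_error_t err = fdb_network_set_option(
FDB_NET_OPTION_EXTERNAL_CLIENT_LIBRARY, (const uint8_t*)libPath, (int)strlen(libPath));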

View File

@ -32,10 +32,13 @@
#include "fdbrpc/FailureMonitor.h"
#include "fdbrpc/MultiInterface.h"
#include "fdbclient/ActorLineageProfiler.h"
#include "fdbclient/AnnotateActor.h"
#include "fdbclient/Atomic.h"
#include "fdbclient/ClusterInterface.h"
#include "fdbclient/CoordinationInterface.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/GlobalConfig.actor.h"
#include "fdbclient/JsonBuilder.h"
#include "fdbclient/KeyRangeMap.h"
#include "fdbclient/Knobs.h"
@ -47,6 +50,7 @@
#include "fdbclient/SpecialKeySpace.actor.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/TransactionLineage.h"
#include "fdbclient/versions.h"
#include "fdbrpc/LoadBalance.h"
#include "fdbrpc/Net2FileSystem.h"
@ -84,6 +88,8 @@ using std::pair;
namespace {
TransactionLineageCollector transactionLineageCollector;
template <class Interface, class Request>
Future<REPLY_TYPE(Request)> loadBalance(
DatabaseContext* ctx,
@ -505,12 +511,13 @@ ACTOR static Future<Void> clientStatusUpdateActor(DatabaseContext* cx) {
}
}
cx->clientStatusUpdater.outStatusQ.clear();
wait(GlobalConfig::globalConfig().onInitialized());
double sampleRate = GlobalConfig::globalConfig().get<double>(fdbClientInfoTxnSampleRate,
std::numeric_limits<double>::infinity());
double clientSamplingProbability =
std::isinf(sampleRate) ? CLIENT_KNOBS->CSI_SAMPLING_PROBABILITY : sampleRate;
int64_t sizeLimit = GlobalConfig::globalConfig().get<int64_t>(fdbClientInfoTxnSizeLimit, -1);
int64_t clientTxnInfoSizeLimit = sizeLimit == -1 ? CLIENT_KNOBS->CSI_SIZE_LIMIT : sizeLimit;
if (!trChunksQ.empty() && deterministicRandom()->random01() < clientSamplingProbability)
wait(delExcessClntTxnEntriesActor(&tr, clientTxnInfoSizeLimit));
@ -898,6 +905,7 @@ Future<Standalone<RangeResultRef>> HealthMetricsRangeImpl::getRange(ReadYourWrit
DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<ClusterConnectionFile>>> connectionFile,
Reference<AsyncVar<ClientDBInfo>> clientInfo,
Reference<AsyncVar<Optional<ClientLeaderRegInterface>>> coordinator,
Future<Void> clientInfoMonitor,
TaskPriority taskID,
LocalityData const& clientLocality,
@ -906,9 +914,10 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<ClusterConnectionF
bool internal,
int apiVersion,
bool switchable)
: connectionFile(connectionFile), clientInfo(clientInfo), coordinator(coordinator),
clientInfoMonitor(clientInfoMonitor), taskID(taskID), clientLocality(clientLocality),
enableLocalityLoadBalance(enableLocalityLoadBalance), lockAware(lockAware), apiVersion(apiVersion),
switchable(switchable), proxyProvisional(false), cc("TransactionMetrics"),
transactionReadVersions("ReadVersions", cc), transactionReadVersionsThrottled("ReadVersionsThrottled", cc),
transactionReadVersionsCompleted("ReadVersionsCompleted", cc),
transactionReadVersionBatches("ReadVersionBatches", cc),
@ -956,6 +965,10 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<ClusterConnectionF
getValueSubmitted.init(LiteralStringRef("NativeAPI.GetValueSubmitted"));
getValueCompleted.init(LiteralStringRef("NativeAPI.GetValueCompleted"));
GlobalConfig::create(this, clientInfo);
GlobalConfig::globalConfig().trigger(samplingFrequency, samplingProfilerUpdateFrequency);
GlobalConfig::globalConfig().trigger(samplingWindow, samplingProfilerUpdateWindow);
monitorProxiesInfoChange = monitorProxiesChange(clientInfo, &proxiesChangeTrigger);
clientStatusUpdater.actor = clientStatusUpdateActor(this);
cacheListMonitor = monitorCacheList(this);
@ -1017,6 +1030,10 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<ClusterConnectionF
std::make_unique<ConsistencyCheckImpl>(
singleKeyRange(LiteralStringRef("consistency_check_suspended"))
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
registerSpecialKeySpaceModule(
SpecialKeySpace::MODULE::GLOBALCONFIG,
SpecialKeySpace::IMPLTYPE::READWRITE,
std::make_unique<GlobalConfigImpl>(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::GLOBALCONFIG)));
registerSpecialKeySpaceModule(
SpecialKeySpace::MODULE::TRACING,
SpecialKeySpace::IMPLTYPE::READWRITE,
@ -1044,7 +1061,25 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<ClusterConnectionF
SpecialKeySpace::IMPLTYPE::READWRITE,
std::make_unique<ClientProfilingImpl>(
KeyRangeRef(LiteralStringRef("profiling/"), LiteralStringRef("profiling0"))
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
registerSpecialKeySpaceModule(
SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE,
std::make_unique<MaintenanceImpl>(
KeyRangeRef(LiteralStringRef("maintenance/"), LiteralStringRef("maintenance0"))
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
registerSpecialKeySpaceModule(
SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE,
std::make_unique<DataDistributionImpl>(
KeyRangeRef(LiteralStringRef("data_distribution/"), LiteralStringRef("data_distribution0"))
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
registerSpecialKeySpaceModule(
SpecialKeySpace::MODULE::ACTORLINEAGE,
SpecialKeySpace::IMPLTYPE::READONLY,
std::make_unique<ActorLineageImpl>(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::ACTORLINEAGE)));
registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::ACTOR_PROFILER_CONF,
SpecialKeySpace::IMPLTYPE::READWRITE,
std::make_unique<ActorProfilerConf>(SpecialKeySpace::getModuleRange(
SpecialKeySpace::MODULE::ACTOR_PROFILER_CONF)));
}
if (apiVersionAtLeast(630)) {
registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::TRANSACTION,
@ -1156,6 +1191,8 @@ DatabaseContext::DatabaseContext(const Error& err)
transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc), internal(false),
transactionTracingEnabled(true) {}
// Static constructor used by server processes to create a DatabaseContext
// For internal (fdbserver) use only
Database DatabaseContext::create(Reference<AsyncVar<ClientDBInfo>> clientInfo,
Future<Void> clientInfoMonitor,
LocalityData clientLocality,
@ -1166,6 +1203,7 @@ Database DatabaseContext::create(Reference<AsyncVar<ClientDBInfo>> clientInfo,
bool switchable) {
return Database(new DatabaseContext(Reference<AsyncVar<Reference<ClusterConnectionFile>>>(),
clientInfo,
makeReference<AsyncVar<Optional<ClientLeaderRegInterface>>>(),
clientInfoMonitor,
taskID,
clientLocality,
@ -1266,14 +1304,16 @@ Future<Void> DatabaseContext::onProxiesChanged() {
}
bool DatabaseContext::sampleReadTags() const {
double sampleRate = GlobalConfig::globalConfig().get(transactionTagSampleRate, CLIENT_KNOBS->READ_TAG_SAMPLE_RATE);
return sampleRate > 0 && deterministicRandom()->random01() <= sampleRate;
}
bool DatabaseContext::sampleOnCost(uint64_t cost) const {
double sampleCost =
GlobalConfig::globalConfig().get<double>(transactionTagSampleCost, CLIENT_KNOBS->COMMIT_SAMPLE_COST);
if (sampleCost <= 0)
return false;
return deterministicRandom()->random01() <= (double)cost / sampleCost;
}
int64_t extractIntOption(Optional<StringRef> value, int64_t minValue, int64_t maxValue) {
@ -1446,6 +1486,9 @@ void DatabaseContext::expireThrottles() {
extern IPAddress determinePublicIPAutomatically(ClusterConnectionString const& ccs);
// Creates a database object that represents a connection to a cluster
// This constructor uses a preallocated DatabaseContext that may have been created
// on another thread
Database Database::createDatabase(Reference<ClusterConnectionFile> connFile,
int apiVersion,
bool internal,
@ -1492,15 +1535,20 @@ Database Database::createDatabase(Reference<ClusterConnectionFile> connFile,
g_network->initTLS();
auto clientInfo = makeReference<AsyncVar<ClientDBInfo>>();
auto coordinator = makeReference<AsyncVar<Optional<ClientLeaderRegInterface>>>();
auto connectionFile = makeReference<AsyncVar<Reference<ClusterConnectionFile>>>();
connectionFile->set(connFile);
Future<Void> clientInfoMonitor = monitorProxies(connectionFile,
clientInfo,
coordinator,
networkOptions.supportedVersions,
StringRef(networkOptions.traceLogGroup));
DatabaseContext* db;
if (preallocatedDb) {
db = new (preallocatedDb) DatabaseContext(connectionFile,
clientInfo,
coordinator,
clientInfoMonitor,
TaskPriority::DefaultEndpoint,
clientLocality,
@ -1512,6 +1560,7 @@ Database Database::createDatabase(Reference<ClusterConnectionFile> connFile,
} else {
db = new DatabaseContext(connectionFile,
clientInfo,
coordinator,
clientInfoMonitor,
TaskPriority::DefaultEndpoint,
clientLocality,
@ -2477,8 +2526,10 @@ ACTOR Future<Version> watchValue(Future<Version> version,
cx->invalidateCache(key);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, info.taskID));
} else if (e.code() == error_code_watch_cancelled || e.code() == error_code_process_behind) {
// clang-format off
TEST(e.code() == error_code_watch_cancelled); // Too many watches on the storage server, poll for changes instead
TEST(e.code() == error_code_process_behind); // The storage servers are all behind
// clang-format on
wait(delay(CLIENT_KNOBS->WATCH_POLLING_TIME, info.taskID));
} else if (e.code() == error_code_timed_out) { // The storage server occasionally times out watches in case
// it was cancelled
@ -3051,6 +3102,7 @@ ACTOR Future<Standalone<RangeResultRef>> getRange(Database cx,
throw deterministicRandom()->randomChoice(
std::vector<Error>{ transaction_too_old(), future_version() });
}
state AnnotateActor annotation(currentLineage);
GetKeyValuesReply _rep =
wait(loadBalance(cx.getPtr(),
beginServer.second,
@ -4872,37 +4924,95 @@ Future<Standalone<StringRef>> Transaction::getVersionstamp() {
return versionstampPromise.getFuture();
}
// Gets the protocol version reported by a coordinator via the protocol info interface
ACTOR Future<ProtocolVersion> getCoordinatorProtocol(NetworkAddressList coordinatorAddresses) {
RequestStream<ProtocolInfoRequest> requestStream{ Endpoint{ { coordinatorAddresses }, WLTOKEN_PROTOCOL_INFO } };
ProtocolInfoReply reply = wait(retryBrokenPromise(requestStream, ProtocolInfoRequest{}));
return reply.version;
}
// Gets the protocol version reported by a coordinator in its connect packet
// If we are unable to get a version from the connect packet (e.g. because we lost connection with the peer), then this
// function will return with an unset result.
// If an expected version is given, this future won't return if the actual protocol version matches the expected version
ACTOR Future<Optional<ProtocolVersion>> getCoordinatorProtocolFromConnectPacket(
NetworkAddress coordinatorAddress,
Optional<ProtocolVersion> expectedVersion) {
state Reference<AsyncVar<Optional<ProtocolVersion>>> protocolVersion =
FlowTransport::transport().getPeerProtocolAsyncVar(coordinatorAddress);
loop {
if (protocolVersion->get().present() && protocolVersion->get() != expectedVersion) {
return protocolVersion->get();
}
Future<Void> change = protocolVersion->onChange();
if (!protocolVersion->get().present()) {
// If we still don't have any connection info after a timeout, retry sending the protocol version request
change = timeout(change, FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT, Void());
}
wait(change);
if (!protocolVersion->get().present()) {
return protocolVersion->get();
}
}
}
// Returns the protocol version reported by the given coordinator
// If an expected version is given, the future won't return until the protocol version is different than expected
ACTOR Future<ProtocolVersion> getClusterProtocolImpl(
Reference<AsyncVar<Optional<ClientLeaderRegInterface>>> coordinator,
Optional<ProtocolVersion> expectedVersion) {
state bool needToConnect = true;
state Future<ProtocolVersion> protocolVersion = Never();
loop {
if (!coordinator->get().present()) {
wait(coordinator->onChange());
} else {
Endpoint coordinatorEndpoint = coordinator->get().get().getLeader.getEndpoint();
if (needToConnect) {
// Even though we typically rely on the connect packet to get the protocol version, we need to send some
// request in order to start a connection. This protocol version request serves that purpose.
protocolVersion = getCoordinatorProtocol(coordinatorEndpoint.addresses);
needToConnect = false;
}
choose {
when(wait(coordinator->onChange())) { needToConnect = true; }
when(ProtocolVersion pv = wait(protocolVersion)) {
if (!expectedVersion.present() || expectedVersion.get() != pv) {
return pv;
}
protocolVersion = Never();
}
// Older versions of FDB don't have an endpoint to return the protocol version, so we get this info from
// the connect packet
when(Optional<ProtocolVersion> pv = wait(getCoordinatorProtocolFromConnectPacket(
coordinatorEndpoint.getPrimaryAddress(), expectedVersion))) {
if (pv.present()) {
return pv.get();
} else {
needToConnect = true;
}
}
}
}
}
}
// Returns the protocol version reported by the coordinator this client is currently connected to
// If an expected version is given, the future won't return until the protocol version is different than expected
// Note: this will never return if the server is running a protocol from FDB 5.0 or older
Future<ProtocolVersion> DatabaseContext::getClusterProtocol(Optional<ProtocolVersion> expectedVersion) {
return getClusterProtocolImpl(coordinator, expectedVersion);
}
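A sketch of the intended consumption pattern (watchClusterProtocol is a hypothetical helper, not part of this change): pass the last observed version as the expected version so each wait completes only on an actual change.

ACTOR Future<Void> watchClusterProtocol(Database db) {
state Optional<ProtocolVersion> lastSeen;
loop {
// Completes immediately the first time, then only when the reported version changes.
ProtocolVersion pv = wait(db->getClusterProtocol(lastSeen));
TraceEvent("ClusterProtocolObserved").detail("ProtocolVersion", pv.version());
lastSeen = pv;
}
}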
uint32_t Transaction::getSize() {
@ -5370,9 +5480,8 @@ void Transaction::checkDeferredError() {
Reference<TransactionLogInfo> Transaction::createTrLogInfoProbabilistically(const Database& cx) {
if (!cx->isError()) {
double clientSamplingProbability = GlobalConfig::globalConfig().get<double>(
fdbClientInfoTxnSampleRate, CLIENT_KNOBS->CSI_SAMPLING_PROBABILITY);
if (((networkOptions.logClientInfo.present() && networkOptions.logClientInfo.get()) || BUGGIFY) &&
deterministicRandom()->random01() < clientSamplingProbability &&
(!g_network->isSimulated() || !g_simulator.speedUpSimulation)) {

View File

@ -76,11 +76,15 @@ class Database {
public:
enum { API_VERSION_LATEST = -1 };
// Creates a database object that represents a connection to a cluster
// This constructor uses a preallocated DatabaseContext that may have been created
// on another thread
static Database createDatabase(Reference<ClusterConnectionFile> connFile,
int apiVersion,
bool internal = true,
LocalityData const& clientLocality = LocalityData(),
DatabaseContext* preallocatedDb = nullptr);
static Database createDatabase(std::string connFileName,
int apiVersion,
bool internal = true,
@ -400,8 +404,6 @@ ACTOR Future<Void> snapCreate(Database cx, Standalone<StringRef> snapCmd, UID sn
// Checks with Data Distributor that it is safe to mark all servers in exclusions as failed
ACTOR Future<bool> checkSafeExclusions(Database cx, vector<AddressExclusion> exclusions);
ACTOR Future<uint64_t> getCoordinatorProtocols(Reference<ClusterConnectionFile> f);
inline uint64_t getWriteOperationCost(uint64_t bytes) {
return bytes / std::max(1, CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR) + 1;
}
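A worked example of this formula, assuming WRITE_COST_BYTE_FACTOR takes its usual default of 16384 (the deployed knob value may differ):

// getWriteOperationCost(100)   == 100 / 16384 + 1   == 1
// getWriteOperationCost(40000) == 40000 / 16384 + 1 == 3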

View File

@ -0,0 +1,81 @@
/*
* ProcessInterface.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/AnnotateActor.h"
#include "fdbclient/FDBTypes.h"
#include "fdbrpc/fdbrpc.h"
constexpr UID WLTOKEN_PROCESS(-1, 11);
struct ProcessInterface {
constexpr static FileIdentifier file_identifier = 985636;
RequestStream<struct GetProcessInterfaceRequest> getInterface;
RequestStream<struct ActorLineageRequest> actorLineage;
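// Note: getInterface is not serialized below; clients address it directly through the
// well-known WLTOKEN_PROCESS endpoint (see the special key space implementation in this change).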
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, actorLineage);
}
};
struct GetProcessInterfaceRequest {
constexpr static FileIdentifier file_identifier = 7632546;
ReplyPromise<ProcessInterface> reply;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, reply);
}
};
// This type is used to send serialized sample data over the network.
struct SerializedSample {
constexpr static FileIdentifier file_identifier = 15785634;
double time;
std::unordered_map<WaitState, std::string> data;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, time, data);
}
};
struct ActorLineageReply {
constexpr static FileIdentifier file_identifier = 1887656;
std::vector<SerializedSample> samples;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, samples);
}
};
struct ActorLineageRequest {
constexpr static FileIdentifier file_identifier = 11654765;
WaitState waitStateStart, waitStateEnd;
time_t timeStart, timeEnd;
ReplyPromise<ActorLineageReply> reply;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, waitStateStart, waitStateEnd, timeStart, timeEnd, reply);
}
};
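To make the request flow concrete, a hedged sketch of pulling samples from one process (fetchSamples is illustrative; the actor lineage special key space implementation later in this change follows the same shape):

ACTOR Future<std::vector<SerializedSample>> fetchSamples(NetworkAddress host, time_t begin, time_t end) {
// Reach the process through its well-known endpoint, then request samples in a window.
state ProcessInterface process;
process.getInterface = RequestStream<GetProcessInterfaceRequest>(Endpoint({ host }, WLTOKEN_PROCESS));
ProcessInterface p = wait(retryBrokenPromise(process.getInterface, GetProcessInterfaceRequest{}));
process = p;
ActorLineageRequest req;
req.waitStateStart = WaitState::Disk; // lowest wait state
req.waitStateEnd = WaitState::Running; // highest wait state
req.timeStart = begin;
req.timeEnd = end;
ActorLineageReply reply = wait(process.actorLineage.getReply(req));
return reply.samples;
}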

View File

@ -47,6 +47,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"storage",
"transaction",
"resolution",
"stateless",
"commit_proxy",
"grv_proxy",
"master",
@ -155,6 +156,18 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"p95":0.0,
"p99":0.0,
"p99.9":0.0
},
"batch":{
"count":0,
"min":0.0,
"max":0.0,
"median":0.0,
"mean":0.0,
"p25":0.0,
"p90":0.0,
"p95":0.0,
"p99":0.0,
"p99.9":0.0
}
},
"read_latency_statistics":{
@ -181,6 +194,18 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"p99":0.0,
"p99.9":0.0
},
"commit_batching_window_size":{
"count":0,
"min":0.0,
"max":0.0,
"median":0.0,
"mean":0.0,
"p25":0.0,
"p90":0.0,
"p95":0.0,
"p99":0.0,
"p99.9":0.0
},
"grv_latency_bands":{
"$map": 1
},

View File

@ -21,7 +21,15 @@
#include "boost/lexical_cast.hpp"
#include "boost/algorithm/string.hpp"
#include <time.h>
#include <msgpack.hpp>
#include <exception>
#include "fdbclient/ActorLineageProfiler.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/ProcessInterface.h"
#include "fdbclient/GlobalConfig.actor.h"
#include "fdbclient/SpecialKeySpace.actor.h"
#include "flow/Arena.h"
#include "flow/UnitTest.h"
@ -64,8 +72,15 @@ std::unordered_map<SpecialKeySpace::MODULE, KeyRange> SpecialKeySpace::moduleToB
{ SpecialKeySpace::MODULE::ERRORMSG, singleKeyRange(LiteralStringRef("\xff\xff/error_message")) },
{ SpecialKeySpace::MODULE::CONFIGURATION,
KeyRangeRef(LiteralStringRef("\xff\xff/configuration/"), LiteralStringRef("\xff\xff/configuration0")) },
{ SpecialKeySpace::MODULE::GLOBALCONFIG,
KeyRangeRef(LiteralStringRef("\xff\xff/global_config/"), LiteralStringRef("\xff\xff/global_config0")) },
{ SpecialKeySpace::MODULE::TRACING,
KeyRangeRef(LiteralStringRef("\xff\xff/tracing/"), LiteralStringRef("\xff\xff/tracing0")) }
KeyRangeRef(LiteralStringRef("\xff\xff/tracing/"), LiteralStringRef("\xff\xff/tracing0")) },
{ SpecialKeySpace::MODULE::ACTORLINEAGE,
KeyRangeRef(LiteralStringRef("\xff\xff/actor_lineage/"), LiteralStringRef("\xff\xff/actor_lineage0")) },
{ SpecialKeySpace::MODULE::ACTOR_PROFILER_CONF,
KeyRangeRef(LiteralStringRef("\xff\xff/actor_profiler_conf/"),
LiteralStringRef("\xff\xff/actor_profiler_conf0")) }
};
std::unordered_map<std::string, KeyRange> SpecialKeySpace::managementApiCommandToRange = {
@ -87,9 +102,24 @@ std::unordered_map<std::string, KeyRange> SpecialKeySpace::managementApiCommandT
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
{ "profile",
KeyRangeRef(LiteralStringRef("profiling/"), LiteralStringRef("profiling0"))
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
{ "maintenance",
KeyRangeRef(LiteralStringRef("maintenance/"), LiteralStringRef("maintenance0"))
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
{ "datadistribution",
KeyRangeRef(LiteralStringRef("data_distribution/"), LiteralStringRef("data_distribution0"))
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }
};
std::unordered_map<std::string, KeyRange> SpecialKeySpace::actorLineageApiCommandToRange = {
{ "state",
KeyRangeRef(LiteralStringRef("state/"), LiteralStringRef("state0"))
.withPrefix(moduleToBoundary[MODULE::ACTORLINEAGE].begin) },
{ "time",
KeyRangeRef(LiteralStringRef("time/"), LiteralStringRef("time0"))
.withPrefix(moduleToBoundary[MODULE::ACTORLINEAGE].begin) }
};
std::set<std::string> SpecialKeySpace::options = { "excluded/force", "failed/force" };
std::set<std::string> SpecialKeySpace::tracingOptions = { kTracingTransactionIdKey, kTracingTokenKey };
@ -1369,10 +1399,132 @@ Future<Optional<std::string>> ConsistencyCheckImpl::commit(ReadYourWritesTransac
return Optional<std::string>();
}
GlobalConfigImpl::GlobalConfigImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
// Returns key-value pairs for each value stored in the global configuration
// framework within the range specified. The special-key-space getrange
// function should only be used for informational purposes. All values are
// returned as strings regardless of their true type.
Future<Standalone<RangeResultRef>> GlobalConfigImpl::getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr) const {
Standalone<RangeResultRef> result;
auto& globalConfig = GlobalConfig::globalConfig();
KeyRangeRef modified =
KeyRangeRef(kr.begin.removePrefix(getKeyRange().begin), kr.end.removePrefix(getKeyRange().begin));
std::map<KeyRef, Reference<ConfigValue>> values = globalConfig.get(modified);
for (const auto& [key, config] : values) {
Key prefixedKey = key.withPrefix(getKeyRange().begin);
if (config.isValid() && config->value.has_value()) {
if (config->value.type() == typeid(StringRef)) {
result.push_back_deep(result.arena(),
KeyValueRef(prefixedKey, std::any_cast<StringRef>(config->value).toString()));
} else if (config->value.type() == typeid(int64_t)) {
result.push_back_deep(result.arena(),
KeyValueRef(prefixedKey, std::to_string(std::any_cast<int64_t>(config->value))));
} else if (config->value.type() == typeid(bool)) {
result.push_back_deep(result.arena(),
KeyValueRef(prefixedKey, std::to_string(std::any_cast<bool>(config->value))));
} else if (config->value.type() == typeid(float)) {
result.push_back_deep(result.arena(),
KeyValueRef(prefixedKey, std::to_string(std::any_cast<float>(config->value))));
} else if (config->value.type() == typeid(double)) {
result.push_back_deep(result.arena(),
KeyValueRef(prefixedKey, std::to_string(std::any_cast<double>(config->value))));
} else {
ASSERT(false);
}
}
}
return result;
}
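As a usage sketch (dumpGlobalConfig is a hypothetical helper; the module boundary keys match the map added earlier in this change), a client can list every entry with an ordinary range read:

ACTOR Future<Void> dumpGlobalConfig(Database db) {
state ReadYourWritesTransaction tr(db);
// Values come back as printable strings, per the getRange implementation above.
Standalone<RangeResultRef> kvs = wait(tr.getRange(
KeyRangeRef(LiteralStringRef("\xff\xff/global_config/"), LiteralStringRef("\xff\xff/global_config0")),
CLIENT_KNOBS->TOO_MANY));
for (const auto& kv : kvs) {
printf("%s -> %s\n", printable(kv.key).c_str(), printable(kv.value).c_str());
}
return Void();
}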
// Marks the key for insertion into global configuration.
void GlobalConfigImpl::set(ReadYourWritesTransaction* ryw, const KeyRef& key, const ValueRef& value) {
ryw->getSpecialKeySpaceWriteMap().insert(key, std::make_pair(true, Optional<Value>(value)));
}
// Writes global configuration changes to durable memory. Also writes the
// changes made in the transaction to a recent history set, and updates the
// latest version which the global configuration was updated at.
ACTOR Future<Optional<std::string>> globalConfigCommitActor(GlobalConfigImpl* globalConfig,
ReadYourWritesTransaction* ryw) {
state Transaction& tr = ryw->getTransaction();
// History should only contain three most recent updates. If it currently
// has three items, remove the oldest to make room for a new item.
Standalone<RangeResultRef> history = wait(tr.getRange(globalConfigHistoryKeys, CLIENT_KNOBS->TOO_MANY));
constexpr int kGlobalConfigMaxHistorySize = 3;
if (history.size() > kGlobalConfigMaxHistorySize - 1) {
for (int i = 0; i < history.size() - (kGlobalConfigMaxHistorySize - 1); ++i) {
tr.clear(history[i].key);
}
}
VersionHistory vh{ 0 };
// Transform writes from the special-key-space (\xff\xff/global_config/) to
// the system key space (\xff/globalConfig/), and writes mutations to
// latest version history.
state RangeMap<Key, std::pair<bool, Optional<Value>>, KeyRangeRef>::Ranges ranges =
ryw->getSpecialKeySpaceWriteMap().containedRanges(specialKeys);
state RangeMap<Key, std::pair<bool, Optional<Value>>, KeyRangeRef>::iterator iter = ranges.begin();
while (iter != ranges.end()) {
std::pair<bool, Optional<Value>> entry = iter->value();
if (entry.first) {
if (entry.second.present() && iter->begin().startsWith(globalConfig->getKeyRange().begin)) {
Key bareKey = iter->begin().removePrefix(globalConfig->getKeyRange().begin);
vh.mutations.emplace_back_deep(vh.mutations.arena(),
MutationRef(MutationRef::SetValue, bareKey, entry.second.get()));
Key systemKey = bareKey.withPrefix(globalConfigKeysPrefix);
tr.set(systemKey, entry.second.get());
} else if (!entry.second.present() && iter->range().begin.startsWith(globalConfig->getKeyRange().begin) &&
iter->range().end.startsWith(globalConfig->getKeyRange().begin)) {
KeyRef bareRangeBegin = iter->range().begin.removePrefix(globalConfig->getKeyRange().begin);
KeyRef bareRangeEnd = iter->range().end.removePrefix(globalConfig->getKeyRange().begin);
vh.mutations.emplace_back_deep(vh.mutations.arena(),
MutationRef(MutationRef::ClearRange, bareRangeBegin, bareRangeEnd));
Key systemRangeBegin = bareRangeBegin.withPrefix(globalConfigKeysPrefix);
Key systemRangeEnd = bareRangeEnd.withPrefix(globalConfigKeysPrefix);
tr.clear(KeyRangeRef(systemRangeBegin, systemRangeEnd));
}
}
++iter;
}
// Record the mutations in this commit into the global configuration history.
Key historyKey = addVersionStampAtEnd(globalConfigHistoryPrefix);
ObjectWriter historyWriter(IncludeVersion());
historyWriter.serialize(vh);
tr.atomicOp(historyKey, historyWriter.toStringRef(), MutationRef::SetVersionstampedKey);
// Write version key to trigger update in cluster controller.
tr.atomicOp(globalConfigVersionKey,
LiteralStringRef("0123456789\x00\x00\x00\x00"), // versionstamp
MutationRef::SetVersionstampedValue);
return Optional<std::string>();
}
// Called when a transaction includes keys in the global configuration special-key-space range.
Future<Optional<std::string>> GlobalConfigImpl::commit(ReadYourWritesTransaction* ryw) {
return globalConfigCommitActor(this, ryw);
}
// Marks the range for deletion from global configuration.
void GlobalConfigImpl::clear(ReadYourWritesTransaction* ryw, const KeyRangeRef& range) {
ryw->getSpecialKeySpaceWriteMap().insert(range, std::make_pair(true, Optional<Value>()));
}
// Marks the key for deletion from global configuration.
void GlobalConfigImpl::clear(ReadYourWritesTransaction* ryw, const KeyRef& key) {
ryw->getSpecialKeySpaceWriteMap().insert(key, std::make_pair(true, Optional<Value>()));
}
TracingOptionsImpl::TracingOptionsImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
Future<Standalone<RangeResultRef>> TracingOptionsImpl::getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr) const {
Standalone<RangeResultRef> result;
for (const auto& option : SpecialKeySpace::getTracingOptions()) {
@ -1738,6 +1890,7 @@ ACTOR static Future<Standalone<RangeResultRef>> ClientProfilingGetRangeActor(Rea
return result;
}
// TODO : add limitation on set operation
Future<Standalone<RangeResultRef>> ClientProfilingImpl::getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr) const {
return ClientProfilingGetRangeActor(ryw, getKeyRange().begin, kr);
}
@ -1794,3 +1947,462 @@ void ClientProfilingImpl::clear(ReadYourWritesTransaction* ryw, const KeyRef& ke
"profile",
"Clear operation is forbidden for profile client. You can set it to default to disable profiling.");
}
ActorLineageImpl::ActorLineageImpl(KeyRangeRef kr) : SpecialKeyRangeReadImpl(kr) {}
void parse(StringRef& val, int& i) {
i = std::stoi(val.toString());
}
void parse(StringRef& val, double& d) {
d = std::stod(val.toString());
}
void parse(StringRef& val, WaitState& w) {
if (val == LiteralStringRef("disk")) {
w = WaitState::Disk;
} else if (val == LiteralStringRef("network")) {
w = WaitState::Network;
} else if (val == LiteralStringRef("running")) {
w = WaitState::Running;
} else {
throw std::range_error("failed to parse run state");
}
}
void parse(StringRef& val, time_t& t) {
struct tm tm = { 0 };
if (strptime(val.toString().c_str(), "%FT%T%z", &tm) == nullptr) {
throw std::invalid_argument("failed to parse ISO 8601 datetime");
}
long timezone = tm.tm_gmtoff;
t = timegm(&tm);
if (t == -1) {
throw std::runtime_error("failed to convert ISO 8601 datetime");
}
t -= timezone;
}
void parse(StringRef& val, NetworkAddress& a) {
auto address = NetworkAddress::parse(val.toString());
if (!address.isValid()) {
throw std::invalid_argument("invalid host");
}
a = address;
}
// Base case function for parsing function below.
template <typename T>
void parse(std::vector<StringRef>::iterator it, std::vector<StringRef>::iterator end, T& t1) {
if (it == end) {
return;
}
parse(*it, t1);
}
// Given an iterator into a vector of string tokens, an iterator to the end of
// the search space in the vector (exclusive), and a list of references to
// types, parses each token in the vector into the associated type according to
// the order of the arguments.
//
// For example, given the vector ["1", "1.5", "127.0.0.1:4000"] and the
// argument list int a, double b, NetworkAddress c, after this function returns
// each parameter passed in will hold the parsed value from the token list.
//
// The appropriate parsing function must be implemented for the type you wish
// to parse. See the existing parsing functions above, and add your own if
// necessary.
template <typename T, typename... Types>
void parse(std::vector<StringRef>::iterator it, std::vector<StringRef>::iterator end, T& t1, Types&... remaining) {
// Return as soon as all tokens have been parsed. This allows parameters
// passed at the end to act as optional parameters -- they will only be set
// if the value exists.
if (it == end) {
return;
}
try {
parse(*it, t1);
parse(++it, end, remaining...);
} catch (Error& e) {
throw e;
} catch (std::exception& e) {
throw e;
}
}
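For instance (a small sketch mirroring the comment above), parsing three tokens into an int, a double, and a NetworkAddress:

std::vector<StringRef> tokens = { "1"_sr, "1.5"_sr, "127.0.0.1:4000"_sr };
int a = 0;
double b = 0.0;
NetworkAddress c;
parse(tokens.begin(), tokens.end(), a, b, c);
// a == 1, b == 1.5, c == 127.0.0.1:4000; a missing trailing token would leave its argument untouched.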
ACTOR static Future<Standalone<RangeResultRef>> actorLineageGetRangeActor(ReadYourWritesTransaction* ryw,
KeyRef prefix,
KeyRangeRef kr) {
state Standalone<RangeResultRef> result;
// Set default values for all fields. The default will be used if the field
// is missing in the key.
state NetworkAddress host;
state WaitState waitStateStart = WaitState{ 0 };
state WaitState waitStateEnd = WaitState{ 2 };
state time_t timeStart = 0;
state time_t timeEnd = std::numeric_limits<time_t>::max();
state int seqStart = 0;
state int seqEnd = std::numeric_limits<int>::max();
state std::vector<StringRef> beginValues = kr.begin.removePrefix(prefix).splitAny("/"_sr);
state std::vector<StringRef> endValues = kr.end.removePrefix(prefix).splitAny("/"_sr);
// Require index (either "state" or "time") and address:port.
if (beginValues.size() < 2 || endValues.size() < 2) {
ryw->setSpecialKeySpaceErrorMsg("missing required parameters (index, host)");
throw special_keys_api_failure();
}
state NetworkAddress endRangeHost;
try {
if (SpecialKeySpace::getActorLineageApiCommandRange("state").contains(kr)) {
// For the range \xff\xff/actor_lineage/state/ip:port/wait-state/time/seq
parse(beginValues.begin() + 1, beginValues.end(), host, waitStateStart, timeStart, seqStart);
if (kr.begin != kr.end) {
parse(endValues.begin() + 1, endValues.end(), endRangeHost, waitStateEnd, timeEnd, seqEnd);
}
} else if (SpecialKeySpace::getActorLineageApiCommandRange("time").contains(kr)) {
// For the range \xff\xff/actor_lineage/time/ip:port/time/wait-state/seq
parse(beginValues.begin() + 1, beginValues.end(), host, timeStart, waitStateStart, seqStart);
if (kr.begin != kr.end) {
parse(endValues.begin() + 1, endValues.end(), endRangeHost, timeEnd, waitStateEnd, seqEnd);
}
} else {
ryw->setSpecialKeySpaceErrorMsg("invalid index in actor_lineage");
throw special_keys_api_failure();
}
} catch (Error& e) {
if (e.code() != special_keys_api_failure().code()) {
ryw->setSpecialKeySpaceErrorMsg("failed to parse key");
throw special_keys_api_failure();
} else {
throw e;
}
}
if (kr.begin != kr.end && host != endRangeHost) {
// The client doesn't know about all the hosts, so a get range covering
// multiple hosts has no way of knowing which IP:port combos to use.
ryw->setSpecialKeySpaceErrorMsg("the host must remain the same on both ends of the range");
throw special_keys_api_failure();
}
// Open endpoint to target process on each call. This can be optimized at
// some point...
state ProcessInterface process;
process.getInterface = RequestStream<GetProcessInterfaceRequest>(Endpoint({ host }, WLTOKEN_PROCESS));
ProcessInterface p = wait(retryBrokenPromise(process.getInterface, GetProcessInterfaceRequest{}));
process = p;
ActorLineageRequest actorLineageRequest;
actorLineageRequest.waitStateStart = waitStateStart;
actorLineageRequest.waitStateEnd = waitStateEnd;
actorLineageRequest.timeStart = timeStart;
actorLineageRequest.timeEnd = timeEnd;
ActorLineageReply reply = wait(process.actorLineage.getReply(actorLineageRequest));
time_t dt = 0;
int seq = -1;
for (const auto& sample : reply.samples) {
for (const auto& [waitState, data] : sample.data) {
time_t datetime = (time_t)sample.time;
seq = dt == datetime ? seq + 1 : 0;
dt = datetime;
if (seq < seqStart) {
continue;
} else if (seq >= seqEnd) {
break;
}
char buf[50];
struct tm* tm;
tm = localtime(&datetime);
size_t size = strftime(buf, 50, "%FT%T%z", tm);
std::string date(buf, size);
std::ostringstream streamKey;
if (SpecialKeySpace::getActorLineageApiCommandRange("state").contains(kr)) {
streamKey << SpecialKeySpace::getActorLineageApiCommandPrefix("state").toString() << host.toString()
<< "/" << to_string(waitState) << "/" << date;
} else if (SpecialKeySpace::getActorLineageApiCommandRange("time").contains(kr)) {
streamKey << SpecialKeySpace::getActorLineageApiCommandPrefix("time").toString() << host.toString()
<< "/" << date << "/" << to_string(waitState);
} else {
ASSERT(false);
}
streamKey << "/" << seq;
msgpack::object_handle oh = msgpack::unpack(data.data(), data.size());
msgpack::object deserialized = oh.get();
std::ostringstream stream;
stream << deserialized;
result.push_back_deep(result.arena(), KeyValueRef(streamKey.str(), stream.str()));
}
}
return result;
}
Future<Standalone<RangeResultRef>> ActorLineageImpl::getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr) const {
return actorLineageGetRangeActor(ryw, getKeyRange().begin, kr);
}
namespace {
std::string_view to_string_view(StringRef sr) {
return std::string_view(reinterpret_cast<const char*>(sr.begin()), sr.size());
}
} // namespace
ActorProfilerConf::ActorProfilerConf(KeyRangeRef kr)
: SpecialKeyRangeRWImpl(kr), config(ProfilerConfig::instance().getConfig()) {}
Future<Standalone<RangeResultRef>> ActorProfilerConf::getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr) const {
Standalone<RangeResultRef> res;
std::string_view begin(to_string_view(kr.begin.removePrefix(range.begin))),
end(to_string_view(kr.end.removePrefix(range.begin)));
for (auto& p : config) {
if (p.first > end) {
break;
} else if (p.first > begin) {
KeyValueRef kv;
kv.key = StringRef(res.arena(), p.first);
kv.value = StringRef(res.arena(), p.second);
res.push_back(res.arena(), kv);
}
}
return res;
}
void ActorProfilerConf::set(ReadYourWritesTransaction* ryw, const KeyRef& key, const ValueRef& value) {
config[key.removePrefix(range.begin).toString()] = value.toString();
didWrite = true;
}
void ActorProfilerConf::clear(ReadYourWritesTransaction* ryw, const KeyRangeRef& kr) {
std::string begin(kr.begin.removePrefix(range.begin).toString()), end(kr.end.removePrefix(range.begin).toString());
auto first = config.lower_bound(begin);
if (first == config.end()) {
// nothing to clear
return;
}
didWrite = true;
auto last = config.upper_bound(end);
config.erase(first, last);
}
void ActorProfilerConf::clear(ReadYourWritesTransaction* ryw, const KeyRef& key) {
std::string k = key.removePrefix(range.begin).toString();
auto iter = config.find(k);
if (iter != config.end()) {
config.erase(iter);
}
didWrite = true;
}
Future<Optional<std::string>> ActorProfilerConf::commit(ReadYourWritesTransaction* ryw) {
Optional<std::string> res{};
try {
if (didWrite) {
ProfilerConfig::instance().reset(config);
}
return res;
} catch (ConfigError& err) {
return Optional<std::string>{ err.description };
}
}
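// Usage sketch for this module (the prefix below is a hypothetical placeholder
// for the range this impl is registered with): writes are staged in `config`
// and applied to the ProfilerConfig singleton on commit.
//
//     tr.setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
//     tr.set(LiteralStringRef("\xff\xff/<actor_profiler_prefix>/some_option"), LiteralStringRef("value"));
//     wait(tr.commit()); // invokes ActorProfilerConf::commit -> ProfilerConfig::instance().reset(config)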
MaintenanceImpl::MaintenanceImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
// Used to read the healthyZoneKey
// If the key is present and its stored expiration version is still larger than the current read version,
// we calculate the remaining time (truncated to an integer, the same as fdbcli) and return it as the value
// If the zoneId is the special one `ignoreSSFailuresZoneString`,
// the value will be 0 (same as fdbcli)
ACTOR static Future<Standalone<RangeResultRef>> MaintenanceGetRangeActor(ReadYourWritesTransaction* ryw,
KeyRef prefix,
KeyRangeRef kr) {
state Standalone<RangeResultRef> result;
// zoneId
ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
Optional<Value> val = wait(ryw->getTransaction().get(healthyZoneKey));
if (val.present()) {
auto healthyZone = decodeHealthyZoneValue(val.get());
if ((healthyZone.first == ignoreSSFailuresZoneString) ||
(healthyZone.second > ryw->getTransaction().getReadVersion().get())) {
Key zone_key = healthyZone.first.withPrefix(prefix);
double seconds = healthyZone.first == ignoreSSFailuresZoneString
? 0
: (healthyZone.second - ryw->getTransaction().getReadVersion().get()) /
CLIENT_KNOBS->CORE_VERSIONSPERSECOND;
if (kr.contains(zone_key)) {
result.push_back_deep(result.arena(),
KeyValueRef(zone_key, Value(boost::lexical_cast<std::string>(seconds))));
}
}
}
return rywGetRange(ryw, kr, result);
}
Future<Standalone<RangeResultRef>> MaintenanceImpl::getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr) const {
return MaintenanceGetRangeActor(ryw, getKeyRange().begin, kr);
}
// Commit the change to healthyZoneKey
// We do not allow more than one zone to be put into maintenance in the same transaction
// In addition, if the current zoneId is 'ignoreSSFailuresZoneString',
// data distribution has been disabled for storage server failures;
// in that case only clearing this specific key is allowed, and any other operation returns an error
ACTOR static Future<Optional<std::string>> maintenanceCommitActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) {
// read
ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->getTransaction().setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
Optional<Value> val = wait(ryw->getTransaction().get(healthyZoneKey));
Optional<std::pair<Key, Version>> healthyZone =
val.present() ? decodeHealthyZoneValue(val.get()) : Optional<std::pair<Key, Version>>();
state RangeMap<Key, std::pair<bool, Optional<Value>>, KeyRangeRef>::Ranges ranges =
ryw->getSpecialKeySpaceWriteMap().containedRanges(kr);
Key zoneId;
double seconds;
bool isSet = false;
// Since maintenance allows only one zone at a time,
// if a transaction has set operations on more than one zone key,
// the commit will return an error
for (auto iter = ranges.begin(); iter != ranges.end(); ++iter) {
if (!iter->value().first)
continue;
if (iter->value().second.present()) {
if (isSet)
return Optional<std::string>(ManagementAPIError::toJsonString(
false, "maintenance", "Multiple zones given for maintenance, only one allowed at the same time"));
isSet = true;
zoneId = iter->begin().removePrefix(kr.begin);
seconds = boost::lexical_cast<double>(iter->value().second.get().toString());
} else {
// if we already have a set operation, all clear operations are meaningless, so skip them
if (!isSet && healthyZone.present() && iter.range().contains(healthyZone.get().first.withPrefix(kr.begin)))
ryw->getTransaction().clear(healthyZoneKey);
}
}
if (isSet) {
if (healthyZone.present() && healthyZone.get().first == ignoreSSFailuresZoneString) {
std::string msg = "Maintenance mode cannot be used while data distribution is disabled for storage "
"server failures.";
return Optional<std::string>(ManagementAPIError::toJsonString(false, "maintenance", msg));
} else if (seconds < 0) {
std::string msg =
"The specified maintenance time " + boost::lexical_cast<std::string>(seconds) + " is a negative value";
return Optional<std::string>(ManagementAPIError::toJsonString(false, "maintenance", msg));
} else {
TraceEvent(SevDebug, "SKSMaintenanceSet").detail("ZoneId", zoneId.toString());
ryw->getTransaction().set(healthyZoneKey,
healthyZoneValue(zoneId,
ryw->getTransaction().getReadVersion().get() +
(seconds * CLIENT_KNOBS->CORE_VERSIONSPERSECOND)));
}
}
return Optional<std::string>();
}
Future<Optional<std::string>> MaintenanceImpl::commit(ReadYourWritesTransaction* ryw) {
return maintenanceCommitActor(ryw, getKeyRange());
}
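// Usage sketch (the maintenance range is assumed to live under
// \xff\xff/management/maintenance/, mirroring `fdbcli> maintenance on zone1 120`):
//
//     tr.setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
//     tr.set(LiteralStringRef("\xff\xff/management/maintenance/zone1"), LiteralStringRef("120"));
//     wait(tr.commit()); // runs maintenanceCommitActor, which sets healthyZoneKey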
DataDistributionImpl::DataDistributionImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
// Read the system keys dataDistributionModeKey and rebalanceDDIgnoreKey
ACTOR static Future<Standalone<RangeResultRef>> DataDistributionGetRangeActor(ReadYourWritesTransaction* ryw,
KeyRef prefix,
KeyRangeRef kr) {
state Standalone<RangeResultRef> result;
// dataDistributionModeKey
state Key modeKey = LiteralStringRef("mode").withPrefix(prefix);
if (kr.contains(modeKey)) {
auto entry = ryw->getSpecialKeySpaceWriteMap()[modeKey];
if (ryw->readYourWritesDisabled() || !entry.first) {
Optional<Value> f = wait(ryw->getTransaction().get(dataDistributionModeKey));
int mode = -1;
if (f.present()) {
mode = BinaryReader::fromStringRef<int>(f.get(), Unversioned());
}
result.push_back_deep(result.arena(), KeyValueRef(modeKey, Value(boost::lexical_cast<std::string>(mode))));
}
}
// rebalanceDDIgnoreKey
state Key rebalanceIgnoredKey = LiteralStringRef("rebalance_ignored").withPrefix(prefix);
if (kr.contains(rebalanceIgnoredKey)) {
auto entry = ryw->getSpecialKeySpaceWriteMap()[rebalanceIgnoredKey];
if (ryw->readYourWritesDisabled() || !entry.first) {
Optional<Value> f = wait(ryw->getTransaction().get(rebalanceDDIgnoreKey));
if (f.present()) {
result.push_back_deep(result.arena(), KeyValueRef(rebalanceIgnoredKey, Value()));
}
}
}
return rywGetRange(ryw, kr, result);
}
Future<Standalone<RangeResultRef>> DataDistributionImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr) const {
return DataDistributionGetRangeActor(ryw, getKeyRange().begin, kr);
}
Future<Optional<std::string>> DataDistributionImpl::commit(ReadYourWritesTransaction* ryw) {
// There are two valid keys in the range:
// <prefix>/mode -> dataDistributionModeKey, whose value may only be set to "0" (disable) or "1" (enable)
// <prefix>/rebalance_ignored -> rebalanceDDIgnoreKey, whose value is unused and must be empty
Optional<std::string> msg;
KeyRangeRef kr = getKeyRange();
Key modeKey = LiteralStringRef("mode").withPrefix(kr.begin);
Key rebalanceIgnoredKey = LiteralStringRef("rebalance_ignored").withPrefix(kr.begin);
auto ranges = ryw->getSpecialKeySpaceWriteMap().containedRanges(kr);
for (auto iter = ranges.begin(); iter != ranges.end(); ++iter) {
if (!iter->value().first)
continue;
if (iter->value().second.present()) {
if (iter->range() == singleKeyRange(modeKey)) {
try {
int mode = boost::lexical_cast<int>(iter->value().second.get().toString());
Value modeVal = BinaryWriter::toValue(mode, Unversioned());
if (mode == 0 || mode == 1)
ryw->getTransaction().set(dataDistributionModeKey, modeVal);
else
msg = ManagementAPIError::toJsonString(false,
"datadistribution",
"Please set the value of the data_distribution/mode to "
"0(disable) or 1(enable), other values are not allowed");
} catch (boost::bad_lexical_cast& e) {
msg = ManagementAPIError::toJsonString(false,
"datadistribution",
"Invalid datadistribution mode(int): " +
iter->value().second.get().toString());
}
} else if (iter->range() == singleKeyRange(rebalanceIgnoredKey)) {
if (iter->value().second.get().size())
msg =
ManagementAPIError::toJsonString(false,
"datadistribution",
"Value is unused for the data_distribution/rebalance_ignored "
"key, please set it to an empty value");
else
ryw->getTransaction().set(rebalanceDDIgnoreKey, LiteralStringRef("on"));
} else {
msg = ManagementAPIError::toJsonString(
false,
"datadistribution",
"Changing invalid keys, please read the documentation to check valid keys in the range");
}
} else {
// clear
if (iter->range().contains(modeKey))
ryw->getTransaction().clear(dataDistributionModeKey);
else if (iter->range().contains(rebalanceIgnoredKey))
ryw->getTransaction().clear(rebalanceDDIgnoreKey);
}
}
return msg;
}
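// Usage sketch (prefix assumed to be \xff\xff/management/data_distribution/):
// disable data distribution by writing "0" to the mode key.
//
//     tr.setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
//     tr.set(LiteralStringRef("\xff\xff/management/data_distribution/mode"), LiteralStringRef("0"));
//     wait(tr.commit());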

View File

@ -142,10 +142,13 @@ public:
class SpecialKeySpace {
public:
enum class MODULE {
ACTORLINEAGE, // Sampling data
ACTOR_PROFILER_CONF, // profiler configuration
CLUSTERFILEPATH,
CONFIGURATION, // Configuration of the cluster
CONNECTIONSTRING,
ERRORMSG, // A single key space contains a json string which describes the last error in special-key-space
GLOBALCONFIG, // Global configuration options synchronized to all nodes
MANAGEMENT, // Management-API
METRICS, // data-distribution metrics
TESTONLY, // only used by correctness tests
@ -198,6 +201,12 @@ public:
static KeyRef getManagementApiCommandPrefix(const std::string& command) {
return managementApiCommandToRange.at(command).begin;
}
static KeyRangeRef getActorLineageApiCommandRange(const std::string& command) {
return actorLineageApiCommandToRange.at(command);
}
static KeyRef getActorLineageApiCommandPrefix(const std::string& command) {
return actorLineageApiCommandToRange.at(command).begin;
}
static Key getManagementApiCommandOptionSpecialKey(const std::string& command, const std::string& option);
static const std::set<std::string>& getManagementApiOptionsSet() { return options; }
static const std::set<std::string>& getTracingOptions() { return tracingOptions; }
@ -226,6 +235,7 @@ private:
static std::unordered_map<SpecialKeySpace::MODULE, KeyRange> moduleToBoundary;
static std::unordered_map<std::string, KeyRange>
managementApiCommandToRange; // management command to its special keys' range
static std::unordered_map<std::string, KeyRange> actorLineageApiCommandToRange;
static std::set<std::string> options; // "<command>/<option>"
static std::set<std::string> tracingOptions;
@ -336,6 +346,16 @@ public:
Future<Optional<std::string>> commit(ReadYourWritesTransaction* ryw) override;
};
class GlobalConfigImpl : public SpecialKeyRangeRWImpl {
public:
explicit GlobalConfigImpl(KeyRangeRef kr);
Future<Standalone<RangeResultRef>> getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr) const override;
void set(ReadYourWritesTransaction* ryw, const KeyRef& key, const ValueRef& value) override;
Future<Optional<std::string>> commit(ReadYourWritesTransaction* ryw) override;
void clear(ReadYourWritesTransaction* ryw, const KeyRangeRef& range) override;
void clear(ReadYourWritesTransaction* ryw, const KeyRef& key) override;
};
class TracingOptionsImpl : public SpecialKeyRangeRWImpl {
public:
explicit TracingOptionsImpl(KeyRangeRef kr);
@ -377,5 +397,38 @@ public:
void clear(ReadYourWritesTransaction* ryw, const KeyRef& key) override;
};
class ActorLineageImpl : public SpecialKeyRangeReadImpl {
public:
explicit ActorLineageImpl(KeyRangeRef kr);
Future<Standalone<RangeResultRef>> getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr) const override;
};
class ActorProfilerConf : public SpecialKeyRangeRWImpl {
bool didWrite = false;
std::map<std::string, std::string> config;
public:
explicit ActorProfilerConf(KeyRangeRef kr);
Future<Standalone<RangeResultRef>> getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr) const override;
void set(ReadYourWritesTransaction* ryw, const KeyRef& key, const ValueRef& value) override;
void clear(ReadYourWritesTransaction* ryw, const KeyRangeRef& range) override;
void clear(ReadYourWritesTransaction* ryw, const KeyRef& key) override;
Future<Optional<std::string>> commit(ReadYourWritesTransaction* ryw) override;
};
class MaintenanceImpl : public SpecialKeyRangeRWImpl {
public:
explicit MaintenanceImpl(KeyRangeRef kr);
Future<Standalone<RangeResultRef>> getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr) const override;
Future<Optional<std::string>> commit(ReadYourWritesTransaction* ryw) override;
};
class DataDistributionImpl : public SpecialKeyRangeRWImpl {
public:
explicit DataDistributionImpl(KeyRangeRef kr);
Future<Standalone<RangeResultRef>> getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr) const override;
Future<Optional<std::string>> commit(ReadYourWritesTransaction* ryw) override;
};
#include "flow/unactorcompiler.h"
#endif

View File

@ -632,7 +632,18 @@ std::string encodeFailedServersKey(AddressExclusion const& addr) {
return failedServersPrefix.toString() + addr.toString();
}
const KeyRangeRef workerListKeys(LiteralStringRef("\xff/worker/"), LiteralStringRef("\xff/worker0"));
// const KeyRangeRef globalConfigKeys( LiteralStringRef("\xff/globalConfig/"), LiteralStringRef("\xff/globalConfig0") );
// const KeyRef globalConfigPrefix = globalConfigKeys.begin;
const KeyRangeRef globalConfigDataKeys( LiteralStringRef("\xff/globalConfig/k/"), LiteralStringRef("\xff/globalConfig/k0") );
const KeyRef globalConfigKeysPrefix = globalConfigDataKeys.begin;
const KeyRangeRef globalConfigHistoryKeys( LiteralStringRef("\xff/globalConfig/h/"), LiteralStringRef("\xff/globalConfig/h0") );
const KeyRef globalConfigHistoryPrefix = globalConfigHistoryKeys.begin;
const KeyRef globalConfigVersionKey = LiteralStringRef("\xff/globalConfig/v");
const KeyRangeRef workerListKeys( LiteralStringRef("\xff/worker/"), LiteralStringRef("\xff/worker0") );
const KeyRef workerListPrefix = workerListKeys.begin;
const Key workerListKeyFor(StringRef processID) {
@ -748,8 +759,7 @@ const KeyRef tagThrottleCountKey = LiteralStringRef("\xff\x02/throttledTags/manu
// Client status info prefix
const KeyRangeRef fdbClientInfoPrefixRange(LiteralStringRef("\xff\x02/fdbClientInfo/"),
LiteralStringRef("\xff\x02/fdbClientInfo0"));
const KeyRef fdbClientInfoTxnSampleRate = LiteralStringRef("\xff\x02/fdbClientInfo/client_txn_sample_rate/");
const KeyRef fdbClientInfoTxnSizeLimit = LiteralStringRef("\xff\x02/fdbClientInfo/client_txn_size_limit/");
// See remaining fields in GlobalConfig.actor.h
// ConsistencyCheck settings
const KeyRef fdbShouldConsistencyCheckBeSuspended = LiteralStringRef("\xff\x02/ConsistencyCheck/Suspend");

View File

@ -230,6 +230,30 @@ extern const KeyRef failedServersVersionKey; // The value of this key shall be c
const AddressExclusion decodeFailedServersKey(KeyRef const& key); // where key.startsWith(failedServersPrefix)
std::string encodeFailedServersKey(AddressExclusion const&);
// "\xff/globalConfig/[[option]]" := "value"
// An umbrella prefix for global configuration data synchronized to all nodes.
// extern const KeyRangeRef globalConfigData;
// extern const KeyRef globalConfigDataPrefix;
// "\xff/globalConfig/k/[[key]]" := "value"
// Key-value pairs that have been set. The range this keyspace represents
// contains all globally configured options.
extern const KeyRangeRef globalConfigDataKeys;
extern const KeyRef globalConfigKeysPrefix;
// "\xff/globalConfig/h/[[version]]" := "value"
// Maps a commit version to a list of mutations made to the global
// configuration at that commit. Shipped to nodes periodically. In general,
// clients should not write to keys in this keyspace; it will be written
// automatically when updating global configuration keys.
extern const KeyRangeRef globalConfigHistoryKeys;
extern const KeyRef globalConfigHistoryPrefix;
// "\xff/globalConfig/v" := "version"
// Read-only key which returns the commit version of the most recent mutation
// made to the global configuration keyspace.
extern const KeyRef globalConfigVersionKey;
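// Read sketch (value decoding elided; the stored value encodes the commit
// version of the most recent global-configuration mutation):
//
//     Optional<Value> v = wait(tr.get(globalConfigVersionKey));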
// "\xff/workers/[[processID]]" := ""
// Asynchronously updated by the cluster controller, this is a list of fdbserver processes that have joined the cluster
// and are currently (recently) available
@ -355,8 +379,6 @@ extern const KeyRangeRef applyMutationsKeyVersionCountRange;
// FdbClient Info prefix
extern const KeyRangeRef fdbClientInfoPrefixRange;
extern const KeyRef fdbClientInfoTxnSampleRate;
extern const KeyRef fdbClientInfoTxnSizeLimit;
// Consistency Check settings
extern const KeyRef fdbShouldConsistencyCheckBeSuspended;

View File

@ -97,6 +97,15 @@ double ThreadSafeDatabase::getMainThreadBusyness() {
return g_network->networkInfo.metrics.networkBusyness;
}
// Returns the protocol version reported by the coordinator this client is connected to
// If an expected version is given, the future won't return until the protocol version is different than expected
// Note: this will never return if the server is running a protocol from FDB 5.0 or older
ThreadFuture<ProtocolVersion> ThreadSafeDatabase::getServerProtocol(Optional<ProtocolVersion> expectedVersion) {
DatabaseContext* db = this->db;
return onMainThread(
[db, expectedVersion]() -> Future<ProtocolVersion> { return db->getClusterProtocol(expectedVersion); });
}
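// Usage sketch: wait for the cluster protocol to change relative to the
// locally compiled protocol version (`db` is assumed to be a
// Reference<ThreadSafeDatabase>):
//
//     ThreadFuture<ProtocolVersion> f = db->getServerProtocol(currentProtocolVersion);
//     // f becomes ready only once the coordinator reports a different version.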
ThreadSafeDatabase::ThreadSafeDatabase(std::string connFilename, int apiVersion) {
ClusterConnectionFile* connFile =
new ClusterConnectionFile(ClusterConnectionFile::lookupClusterFileName(connFilename).first);
@ -407,16 +416,6 @@ const char* ThreadSafeApi::getClientVersion() {
return clientVersion.c_str();
}
// Wait until a quorum of coordinators with the same protocol version are available, and then return that protocol
// version.
ThreadFuture<uint64_t> ThreadSafeApi::getServerProtocol(const char* clusterFilePath) {
return onMainThread([clusterFilePath = std::string(clusterFilePath)]() -> Future<uint64_t> {
auto [clusterFile, isDefault] = ClusterConnectionFile::lookupClusterFileName(clusterFilePath);
Reference<ClusterConnectionFile> f = Reference<ClusterConnectionFile>(new ClusterConnectionFile(clusterFile));
return getCoordinatorProtocols(f);
});
}
void ThreadSafeApi::setNetworkOption(FDBNetworkOptions::Option option, Optional<StringRef> value) {
if (option == FDBNetworkOptions::EXTERNAL_CLIENT_TRANSPORT_ID) {
if (value.present()) {

View File

@ -27,6 +27,8 @@
#include "fdbclient/ClusterInterface.h"
#include "fdbclient/IClientApi.h"
// An implementation of IDatabase that serializes operations onto the network thread and interacts with the lower-level
// client APIs exposed by NativeAPI and ReadYourWrites.
class ThreadSafeDatabase : public IDatabase, public ThreadSafeReferenceCounted<ThreadSafeDatabase> {
public:
~ThreadSafeDatabase() override;
@ -37,9 +39,15 @@ public:
void setOption(FDBDatabaseOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
double getMainThreadBusyness() override;
ThreadFuture<Void>
onConnected(); // Returns after a majority of coordination servers are available and have reported a leader. The
// cluster file therefore is valid, but the database might be unavailable.
// Returns the protocol version reported by the coordinator this client is connected to
// If an expected version is given, the future won't return until the protocol version is different than expected
// Note: this will never return if the server is running a protocol from FDB 5.0 or older
ThreadFuture<ProtocolVersion> getServerProtocol(
Optional<ProtocolVersion> expectedVersion = Optional<ProtocolVersion>()) override;
// Returns after a majority of coordination servers are available and have reported a leader. The
// cluster file therefore is valid, but the database might be unavailable.
ThreadFuture<Void> onConnected();
void addref() override { ThreadSafeReferenceCounted<ThreadSafeDatabase>::addref(); }
void delref() override { ThreadSafeReferenceCounted<ThreadSafeDatabase>::delref(); }
@ -58,6 +66,8 @@ public: // Internal use only
DatabaseContext* unsafeGetPtr() const { return db; }
};
// An implementation of ITransaction that serializes operations onto the network thread and interacts with the
// lower-level client APIs exposed by NativeAPI and ReadYourWrites.
class ThreadSafeTransaction : public ITransaction, ThreadSafeReferenceCounted<ThreadSafeTransaction>, NonCopyable {
public:
explicit ThreadSafeTransaction(DatabaseContext* cx);
@ -135,11 +145,12 @@ private:
ReadYourWritesTransaction* tr;
};
// An implementation of IClientApi that serializes operations onto the network thread and interacts with the lower-level
// client APIs exposed by NativeAPI and ReadYourWrites.
class ThreadSafeApi : public IClientApi, ThreadSafeReferenceCounted<ThreadSafeApi> {
public:
void selectApiVersion(int apiVersion) override;
const char* getClientVersion() override;
ThreadFuture<uint64_t> getServerProtocol(const char* clusterFilePath) override;
void setNetworkOption(FDBNetworkOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
void setupNetwork() override;

View File

@ -0,0 +1,25 @@
/*
* TransactionLineage.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/TransactionLineage.h"
namespace {
TransactionLineageCollector transactionLineageCollector;
}

View File

@ -0,0 +1,128 @@
/*
* TransactionLineage.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "fdbclient/ActorLineageProfiler.h"
struct TransactionLineage : LineageProperties<TransactionLineage> {
enum class Operation {
Unset,
GetValue,
GetKey,
GetKeyValues,
WatchValue,
GetConsistentReadVersion,
Commit,
GetKeyServersLocations
};
static constexpr std::string_view name = "Transaction"sv;
uint64_t txID;
Operation operation = Operation::Unset;
bool isSet(uint64_t TransactionLineage::*member) const { return this->*member > 0; }
bool isSet(Operation TransactionLineage::*member) const { return this->*member != Operation::Unset; }
};
struct TransactionLineageCollector : IALPCollector<TransactionLineage> {
using Operation = TransactionLineage::Operation;
std::optional<std::any> collect(ActorLineage* lineage) {
std::map<std::string_view, std::any> res;
auto txID = lineage->get(&TransactionLineage::txID);
if (txID.has_value()) {
res["ID"sv] = txID.value();
}
auto operation = lineage->get(&TransactionLineage::operation);
if (operation.has_value()) {
switch (operation.value()) {
case Operation::Unset:
res["operation"sv] = "Unset"sv;
break;
case Operation::GetValue:
res["operation"sv] = "GetValue"sv;
break;
case Operation::GetKey:
res["operation"sv] = "GetKey"sv;
break;
case Operation::GetKeyValues:
res["operation"sv] = "GetKeyValues"sv;
break;
case Operation::WatchValue:
res["operation"sv] = "WatchValue"sv;
break;
case Operation::GetConsistentReadVersion:
res["operation"sv] = "GetConsistentReadVersion"sv;
break;
case Operation::Commit:
res["operation"sv] = "Commit"sv;
break;
case Operation::GetKeyServersLocations:
res["operation"sv] = "GetKeyServersLocations"sv;
break;
}
}
if (res.empty()) {
return std::optional<std::any>{};
} else {
return res;
}
}
};
template <class T, class V>
class ScopedLineage {
V before;
V T::*member;
bool valid = true;
public:
ScopedLineage(V T::*member, V const& value) : member(member) {
auto& val = currentLineage->modify(member);
before = val;
val = value;
}
~ScopedLineage() {
if (!valid) {
return;
}
currentLineage->modify(member) = before;
}
ScopedLineage(ScopedLineage<T, V>&& o) : before(std::move(o.before)), member(o.member), valid(o.valid) {
o.release();
}
ScopedLineage& operator=(ScopedLineage<T, V>&& o) {
if (valid) {
currentLineage->modify(member) = before;
}
before = std::move(o.before);
member = o.member;
valid = o.valid;
o.release();
return *this;
}
ScopedLineage(const ScopedLineage<T, V>&) = delete;
ScopedLineage& operator=(const ScopedLineage<T, V>&) = delete;
void release() { valid = false; }
};
template <class T, class V>
ScopedLineage<T, V> make_scoped_lineage(V T::*member, V const& value) {
return ScopedLineage<T, V>(member, value);
}
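// Usage sketch: tag the current actor lineage with the operation in flight;
// the previous value is restored when `scope` is destroyed.
//
//     auto scope = make_scoped_lineage(&TransactionLineage::operation,
//                                      TransactionLineage::Operation::GetValue);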

View File

@ -20,7 +20,20 @@
#include "fdbclient/Tuple.h"
static size_t find_string_terminator(const StringRef data, size_t offset) {
// TODO: Many functions copied from bindings/flow/Tuple.cpp. Merge at some point.
static float bigEndianFloat(float orig) {
int32_t big = *(int32_t*)&orig;
big = bigEndian32(big);
return *(float*)&big;
}
static double bigEndianDouble(double orig) {
int64_t big = *(int64_t*)&orig;
big = bigEndian64(big);
return *(double*)&big;
}
static size_t findStringTerminator(const StringRef data, size_t offset) {
size_t i = offset;
while (i < data.size() - 1 && !(data[i] == '\x00' && data[i + 1] != (uint8_t)'\xff')) {
i += (data[i] == '\x00' ? 2 : 1);
@ -29,6 +42,20 @@ static size_t find_string_terminator(const StringRef data, size_t offset) {
return i;
}
// If encoding and the sign bit is 1 (the number is negative), flip all the bits.
// If decoding and the sign bit is 0 (the number is negative), flip all the bits.
// Otherwise, the number is positive, so flip the sign bit.
static void adjustFloatingPoint(uint8_t* bytes, size_t size, bool encode) {
if ((encode && ((uint8_t)(bytes[0] & 0x80) != (uint8_t)0x00)) ||
(!encode && ((uint8_t)(bytes[0] & 0x80) != (uint8_t)0x80))) {
for (size_t i = 0; i < size; i++) {
bytes[i] ^= (uint8_t)0xff;
}
} else {
bytes[0] ^= (uint8_t)0x80;
}
}
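// Worked example: +1.0f encodes as big-endian 0x3F800000; its sign bit is 0, so
// only the sign bit is flipped -> 0xBF800000. -1.0f is 0xBF800000; its sign bit
// is 1, so every bit is flipped -> 0x407FFFFF. Encoded floats therefore sort
// correctly under unsigned lexicographic byte comparison.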
Tuple::Tuple(StringRef const& str, bool exclude_incomplete) {
data.append(data.arena(), str.begin(), str.size());
@ -37,9 +64,15 @@ Tuple::Tuple(StringRef const& str, bool exclude_incomplete) {
offsets.push_back(i);
if (data[i] == '\x01' || data[i] == '\x02') {
i = find_string_terminator(str, i + 1) + 1;
i = findStringTerminator(str, i + 1) + 1;
} else if (data[i] >= '\x0c' && data[i] <= '\x1c') {
i += abs(data[i] - '\x14') + 1;
} else if (data[i] == 0x20) {
i += sizeof(float) + 1;
} else if (data[i] == 0x21) {
i += sizeof(double) + 1;
} else if (data[i] == 0x26 || data[i] == 0x27) {
i += 1;
} else if (data[i] == '\x00') {
i += 1;
} else {
@ -113,6 +146,39 @@ Tuple& Tuple::append(int64_t value) {
return *this;
}
Tuple& Tuple::appendBool(bool value) {
offsets.push_back(data.size());
if (value) {
data.push_back(data.arena(), 0x27);
} else {
data.push_back(data.arena(), 0x26);
}
return *this;
}
Tuple& Tuple::appendFloat(float value) {
offsets.push_back(data.size());
float swap = bigEndianFloat(value);
uint8_t* bytes = (uint8_t*)&swap;
adjustFloatingPoint(bytes, sizeof(float), true);
data.push_back(data.arena(), 0x20);
data.append(data.arena(), bytes, sizeof(float));
return *this;
}
Tuple& Tuple::appendDouble(double value) {
offsets.push_back(data.size());
double swap = value;
swap = bigEndianDouble(swap);
uint8_t* bytes = (uint8_t*)&swap;
adjustFloatingPoint(bytes, sizeof(double), true);
data.push_back(data.arena(), 0x21);
data.append(data.arena(), bytes, sizeof(double));
return *this;
}
Tuple& Tuple::appendNull() {
offsets.push_back(data.size());
data.push_back(data.arena(), (uint8_t)'\x00');
@ -134,6 +200,12 @@ Tuple::ElementType Tuple::getType(size_t index) const {
return ElementType::UTF8;
} else if (code >= '\x0c' && code <= '\x1c') {
return ElementType::INT;
} else if (code == 0x20) {
return ElementType::FLOAT;
} else if (code == 0x21) {
return ElementType::DOUBLE;
} else if (code == 0x26 || code == 0x27) {
return ElementType::BOOL;
} else {
throw invalid_tuple_data_type();
}
@ -228,6 +300,60 @@ int64_t Tuple::getInt(size_t index, bool allow_incomplete) const {
return swap;
}
// TODO: Combine with bindings/flow/Tuple.*. This code is copied from there.
bool Tuple::getBool(size_t index) const {
if (index >= offsets.size()) {
throw invalid_tuple_index();
}
ASSERT_LT(offsets[index], data.size());
uint8_t code = data[offsets[index]];
if (code == 0x26) {
return false;
} else if (code == 0x27) {
return true;
} else {
throw invalid_tuple_data_type();
}
}
float Tuple::getFloat(size_t index) const {
if (index >= offsets.size()) {
throw invalid_tuple_index();
}
ASSERT_LT(offsets[index], data.size());
uint8_t code = data[offsets[index]];
if (code != 0x20) {
throw invalid_tuple_data_type();
}
float swap;
uint8_t* bytes = (uint8_t*)&swap;
ASSERT_LE(offsets[index] + 1 + sizeof(float), data.size());
swap = *(float*)(data.begin() + offsets[index] + 1);
adjustFloatingPoint(bytes, sizeof(float), false);
return bigEndianFloat(swap);
}
double Tuple::getDouble(size_t index) const {
if (index >= offsets.size()) {
throw invalid_tuple_index();
}
ASSERT_LT(offsets[index], data.size());
uint8_t code = data[offsets[index]];
if (code != 0x21) {
throw invalid_tuple_data_type();
}
double swap;
uint8_t* bytes = (uint8_t*)&swap;
ASSERT_LE(offsets[index] + 1 + sizeof(double), data.size());
swap = *(double*)(data.begin() + offsets[index] + 1);
adjustFloatingPoint(bytes, sizeof(double), false);
return bigEndianDouble(swap);
}
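// Round-trip sketch for the new element types:
//
//     Tuple t;
//     t.appendFloat(3.5f).appendDouble(-0.25).appendBool(true);
//     ASSERT(t.getType(0) == Tuple::FLOAT);
//     ASSERT(t.getFloat(0) == 3.5f);
//     ASSERT(t.getDouble(1) == -0.25);
//     ASSERT(t.getBool(2));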
KeyRange Tuple::range(Tuple const& tuple) const {
VectorRef<uint8_t> begin;
VectorRef<uint8_t> end;

View File

@ -38,6 +38,11 @@ struct Tuple {
Tuple& append(Tuple const& tuple);
Tuple& append(StringRef const& str, bool utf8 = false);
Tuple& append(int64_t);
// There are some ambiguous append calls in fdbclient, so to make it easier
// to add appends for floats and doubles, they are named differently for now.
Tuple& appendBool(bool);
Tuple& appendFloat(float);
Tuple& appendDouble(double);
Tuple& appendNull();
StringRef pack() const { return StringRef(data.begin(), data.size()); }
@ -47,7 +52,7 @@ struct Tuple {
return append(t);
}
enum ElementType { NULL_TYPE, INT, BYTES, UTF8 };
enum ElementType { NULL_TYPE, INT, BYTES, UTF8, BOOL, FLOAT, DOUBLE };
// this is number of elements, not length of data
size_t size() const { return offsets.size(); }
@ -55,6 +60,9 @@ struct Tuple {
ElementType getType(size_t index) const;
Standalone<StringRef> getString(size_t index) const;
int64_t getInt(size_t index, bool allow_incomplete = false) const;
bool getBool(size_t index) const;
float getFloat(size_t index) const;
double getDouble(size_t index) const;
KeyRange range(Tuple const& tuple = Tuple()) const;

View File

@ -210,7 +210,7 @@ description is not currently required but encouraged.
<Option name="check_writes_enable" code="50"
hidden="true" />
<Option name="read_your_writes_disable" code="51"
description="Reads performed by a transaction will not see any prior mutations that occured in that transaction, instead seeing the value which was in the database at the transaction's read version. This option may provide a small performance benefit for the client, but also disables a number of client-side optimizations which are beneficial for transactions which tend to read and write the same keys within a single transaction."/>
description="Reads performed by a transaction will not see any prior mutations that occured in that transaction, instead seeing the value which was in the database at the transaction's read version. This option may provide a small performance benefit for the client, but also disables a number of client-side optimizations which are beneficial for transactions which tend to read and write the same keys within a single transaction. It is an error to set this option after performing any reads or writes on the transaction."/>
<Option name="read_ahead_disable" code="52"
description="Deprecated" />
<Option name="durability_datacenter" code="110" />

View File

@ -242,7 +242,12 @@ public:
// result = map(result, [=](int r) mutable { KAIOLogBlockEvent(io, OpLogEntry::READY, r); return r; });
#endif
return success(result);
auto& actorLineageSet = IAsyncFileSystem::filesystem()->getActorLineageSet();
auto index = actorLineageSet.insert(currentLineage);
ASSERT(index != ActorLineageSet::npos);
Future<Void> res = success(result);
actorLineageSet.erase(index);
return res;
}
// TODO(alexmiller): Remove when we upgrade the dev docker image to >14.10
#ifndef FALLOC_FL_ZERO_RANGE

View File

@ -197,7 +197,7 @@ private:
this->file = file;
this->filename = filename;
this->diskParameters = diskParameters;
maxWriteDelay = 5.0;
maxWriteDelay = FLOW_KNOBS->NON_DURABLE_MAX_WRITE_DELAY;
hasBeenSynced = false;
killMode = (KillMode)deterministicRandom()->randomInt(1, 3);
@ -276,6 +276,20 @@ public:
Future<Void> deleteFuture = deleteFile(this);
if (!deleteFuture.isReady())
filesBeingDeleted[filename] = deleteFuture;
} else if (isSoleOwner()) {
// isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If so, we
// remove the file from the map to make sure it gets closed.
auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles;
auto iter = openFiles.find(filename);
// the file could've been renamed (DiskQueue does that for example). In that case the file won't be in the
// map anymore.
if (iter != openFiles.end()) {
// even if the filename exists, it doesn't mean that it references the same file. It could be that the
// file was renamed and later a file with the same name was opened.
if (iter->second.canGet() && iter->second.get().getPtr() == this) {
openFiles.erase(filename);
}
}
}
}
@ -434,7 +448,8 @@ private:
state TaskPriority currentTaskID = g_network->getCurrentTask();
wait(g_simulator.onMachine(currentProcess));
state double delayDuration = deterministicRandom()->random01() * self->maxWriteDelay;
state double delayDuration =
g_simulator.speedUpSimulation ? 0.0001 : (deterministicRandom()->random01() * self->maxWriteDelay);
state Standalone<StringRef> dataCopy(StringRef((uint8_t*)data, length));
state Future<bool> startSyncFuture = self->startSyncPromise.getFuture();
@ -448,7 +463,7 @@ private:
self->getModificationsAndInsert(offset, length, true, writeEnded);
self->minSizeAfterPendingModifications = std::max(self->minSizeAfterPendingModifications, offset + length);
if (BUGGIFY_WITH_PROB(0.001))
if (BUGGIFY_WITH_PROB(0.001) && !g_simulator.speedUpSimulation)
priorModifications.push_back(
delay(deterministicRandom()->random01() * FLOW_KNOBS->MAX_PRIOR_MODIFICATION_DELAY) ||
self->killed.getFuture());
@ -606,7 +621,8 @@ private:
state TaskPriority currentTaskID = g_network->getCurrentTask();
wait(g_simulator.onMachine(currentProcess));
state double delayDuration = deterministicRandom()->random01() * self->maxWriteDelay;
state double delayDuration =
g_simulator.speedUpSimulation ? 0.0001 : (deterministicRandom()->random01() * self->maxWriteDelay);
state Future<bool> startSyncFuture = self->startSyncPromise.getFuture();
try {

View File

@ -24,6 +24,7 @@
#include "flow/UnitTest.h"
#include "flow/DeterministicRandom.h"
#include "flow/IThreadPool.h"
#include "flow/WriteOnlySet.h"
#include "fdbrpc/fdbrpc.h"
#include "fdbrpc/IAsyncFile.h"
#include "flow/TLSConfig.actor.h"
@ -283,6 +284,9 @@ struct YieldMockNetwork final : INetwork, ReferenceCounted<YieldMockNetwork> {
static TLSConfig emptyConfig;
return emptyConfig;
}
ActorLineageSet& getActorLineageSet() override {
throw std::exception();
}
ProtocolVersion protocolVersion() override { return baseNetwork->protocolVersion(); }
};

View File

@ -334,7 +334,7 @@ ACTOR Future<Void> pingLatencyLogger(TransportData* self) {
}
TransportData::TransportData(uint64_t transportId)
: endpoints(/*wellKnownTokenCount*/ 11), endpointNotFoundReceiver(endpoints), pingReceiver(endpoints),
: endpoints(/*wellKnownTokenCount*/ 12), endpointNotFoundReceiver(endpoints), pingReceiver(endpoints),
warnAlwaysForLargePacket(true), lastIncompatibleMessage(0), transportId(transportId),
numIncompatibleConnections(0) {
degraded = makeReference<AsyncVar<bool>>(false);
@ -760,6 +760,13 @@ ACTOR Future<Void> connectionKeeper(Reference<Peer> self,
conn->close();
conn = Reference<IConnection>();
// Old versions will throw this error, and we don't want to forget their protocol versions.
// This means we can't tell the difference between an old protocol version and one we
// can no longer connect to.
if (e.code() != error_code_incompatible_protocol_version) {
self->protocolVersion->set(Optional<ProtocolVersion>());
}
}
// Clients might send more packets in response, which needs to go out on the next connection
@ -787,7 +794,8 @@ Peer::Peer(TransportData* transport, NetworkAddress const& destination)
incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0), lastDataPacketSentTime(now()),
pingLatencies(destination.isPublic() ? FLOW_KNOBS->PING_SAMPLE_AMOUNT : 1), lastLoggedBytesReceived(0),
bytesSent(0), lastLoggedBytesSent(0), lastLoggedTime(0.0), connectOutgoingCount(0), connectIncomingCount(0),
connectFailedCount(0), connectLatencies(destination.isPublic() ? FLOW_KNOBS->NETWORK_CONNECT_SAMPLE_AMOUNT : 1) {
connectFailedCount(0), connectLatencies(destination.isPublic() ? FLOW_KNOBS->NETWORK_CONNECT_SAMPLE_AMOUNT : 1),
protocolVersion(Reference<AsyncVar<Optional<ProtocolVersion>>>(new AsyncVar<Optional<ProtocolVersion>>())) {
IFailureMonitor::failureMonitor().setStatus(destination, FailureStatus(false));
}
@ -1103,12 +1111,12 @@ static int getNewBufferSize(const uint8_t* begin,
packetLen + sizeof(uint32_t) * (peerAddress.isTLS() ? 2 : 3));
}
// This actor exists whenever there is an open or opening connection, whether incoming or outgoing
// For incoming connections conn is set and peer is initially nullptr; for outgoing connections it is the reverse
ACTOR static Future<Void> connectionReader(TransportData* transport,
Reference<IConnection> conn,
Reference<Peer> peer,
Promise<Reference<Peer>> onConnected) {
// This actor exists whenever there is an open or opening connection, whether incoming or outgoing
// For incoming connections conn is set and peer is initially nullptr; for outgoing connections it is the reverse
state Arena arena;
state uint8_t* unprocessed_begin = nullptr;
@ -1206,7 +1214,11 @@ ACTOR static Future<Void> connectionReader(TransportData* transport,
now() + FLOW_KNOBS->CONNECTION_ID_TIMEOUT;
}
compatible = false;
if (!protocolVersion.hasMultiVersionClient()) {
if (!protocolVersion.hasInexpensiveMultiVersionClient()) {
if (peer) {
peer->protocolVersion->set(protocolVersion);
}
// Older versions expected us to hang up. It may work even if we don't hang up here, but
// it's safer to keep the old behavior.
throw incompatible_protocol_version();
@ -1256,6 +1268,7 @@ ACTOR static Future<Void> connectionReader(TransportData* transport,
onConnected.send(peer);
wait(delay(0)); // Check for cancellation
}
peer->protocolVersion->set(peerProtocolVersion);
}
}
@ -1669,6 +1682,16 @@ Reference<AsyncVar<bool>> FlowTransport::getDegraded() {
return self->degraded;
}
// Returns the protocol version of the peer at the specified address. The result is returned as an AsyncVar that
// can be used to monitor for changes of a peer's protocol. The protocol version will be unset in the event that
// there is no connection established to the peer.
//
// Note that this function does not establish a connection to the peer. In order to obtain a peer's protocol
// version, some other mechanism should be used to connect to that peer.
Reference<AsyncVar<Optional<ProtocolVersion>>> FlowTransport::getPeerProtocolAsyncVar(NetworkAddress addr) {
return self->peers.at(addr)->protocolVersion;
}
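// Monitoring sketch (a connection to `addr` must already exist; peers.at()
// throws if the peer is unknown):
//
//     state Reference<AsyncVar<Optional<ProtocolVersion>>> pv =
//         FlowTransport::transport().getPeerProtocolAsyncVar(addr);
//     loop {
//         if (pv->get().present()) { /* react to pv->get().get() */ }
//         wait(pv->onChange());
//     }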
void FlowTransport::resetConnection(NetworkAddress address) {
auto peer = self->getPeer(address);
if (peer) {

View File

@ -152,6 +152,9 @@ struct Peer : public ReferenceCounted<Peer> {
double lastLoggedTime;
int64_t lastLoggedBytesReceived;
int64_t lastLoggedBytesSent;
Reference<AsyncVar<Optional<ProtocolVersion>>> protocolVersion;
// Cleared every time stats are logged for this peer.
int connectOutgoingCount;
int connectIncomingCount;
@ -174,64 +177,64 @@ public:
FlowTransport(uint64_t transportId);
~FlowTransport();
static void createInstance(bool isClient, uint64_t transportId);
// Creates a new FlowTransport and makes FlowTransport::transport() return it. This uses g_network->global()
// variables, so it will be private to a simulation.
static void createInstance(bool isClient, uint64_t transportId);
static bool isClient() { return g_network->global(INetwork::enClientFailureMonitor) != nullptr; }
void initMetrics();
// Metrics must be initialized after FlowTransport::createInstance has been called
void initMetrics();
Future<Void> bind(NetworkAddress publicAddress, NetworkAddress listenAddress);
// Starts a server listening on the given listenAddress, and sets publicAddress to be the public
// address of this server. Returns only errors.
Future<Void> bind(NetworkAddress publicAddress, NetworkAddress listenAddress);
NetworkAddress getLocalAddress() const;
// Returns first local NetworkAddress.
NetworkAddress getLocalAddress() const;
NetworkAddressList getLocalAddresses() const;
// Returns all local NetworkAddress.
NetworkAddressList getLocalAddresses() const;
std::map<NetworkAddress, std::pair<uint64_t, double>>* getIncompatiblePeers();
// Returns the set of all peers that have attempted to connect, but have incompatible protocol versions
std::map<NetworkAddress, std::pair<uint64_t, double>>* getIncompatiblePeers();
Future<Void> onIncompatibleChanged();
// Returns when getIncompatiblePeers has at least one peer which is incompatible.
Future<Void> onIncompatibleChanged();
void addPeerReference(const Endpoint&, bool isStream);
// Signal that a peer connection is being used, even if no messages are currently being sent to the peer
void addPeerReference(const Endpoint&, bool isStream);
void removePeerReference(const Endpoint&, bool isStream);
// Signal that a peer connection is no longer being used
void removePeerReference(const Endpoint&, bool isStream);
void addEndpoint(Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID);
// Sets endpoint to be a new local endpoint which delivers messages to the given receiver
void addEndpoint(Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID);
void addEndpoints(std::vector<std::pair<struct FlowReceiver*, TaskPriority>> const& streams);
void removeEndpoint(const Endpoint&, NetworkMessageReceiver*);
// The given local endpoint no longer delivers messages to the given receiver or uses resources
void removeEndpoint(const Endpoint&, NetworkMessageReceiver*);
void addWellKnownEndpoint(Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID);
// Sets endpoint to a new local endpoint (without changing its token) which delivers messages to the given receiver
// Implementations may have limitations on when this function is called and what endpoint.token may be!
void addWellKnownEndpoint(Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID);
// sendReliable will keep trying to deliver the data to the destination until cancelReliable is called. It will
// retry sending if the connection is closed or the failure manager reports the destination become available (edge
// triggered).
ReliablePacket* sendReliable(ISerializeSource const& what, const Endpoint& destination);
// sendReliable will keep trying to deliver the data to the destination until cancelReliable is
// called. It will retry sending if the connection is closed or the failure manager reports
// the destination become available (edge triggered).
// Makes Packet "unreliable" (either the data or a connection close event will be delivered eventually). It can
// still be used safely to send a reply to a "reliable" request.
void cancelReliable(ReliablePacket*);
// Makes Packet "unreliable" (either the data or a connection close event will be delivered
// eventually). It can still be used safely to send a reply to a "reliable" request.
Reference<AsyncVar<bool>> getDegraded();
// This async var will be set to true when the process cannot connect to a public network address that the failure
// monitor thinks is healthy.
Reference<AsyncVar<bool>> getDegraded();
void resetConnection(NetworkAddress address);
// Forces the connection with this address to be reset
void resetConnection(NetworkAddress address);
Reference<Peer> sendUnreliable(ISerializeSource const& what,
const Endpoint& destination,
@ -239,6 +242,14 @@ public:
bool incompatibleOutgoingConnectionsPresent();
// Returns the protocol version of the peer at the specified address. The result is returned as an AsyncVar that
// can be used to monitor for changes of a peer's protocol. The protocol version will be unset in the event that
// there is no connection established to the peer.
//
// Note that this function does not establish a connection to the peer. In order to obtain a peer's protocol
// version, some other mechanism should be used to connect to that peer.
Reference<AsyncVar<Optional<ProtocolVersion>>> getPeerProtocolAsyncVar(NetworkAddress addr);
static FlowTransport& transport() {
return *static_cast<FlowTransport*>((void*)g_network->global(INetwork::enFlowTransport));
}

Some files were not shown because too many files have changed in this diff.