From 0bd6ac2a8ed5641dbd5d5300bd0d59b88b8af71d Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Wed, 30 Jun 2021 19:50:08 -0700 Subject: [PATCH 01/10] Disable recruitment determinism check for configurations with remote satellites --- fdbserver/ClusterController.actor.cpp | 155 +++++++++++++++----------- 1 file changed, 88 insertions(+), 67 deletions(-) diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index 142409deca..c54a50e7a8 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -1543,6 +1543,20 @@ public: return result; } + // Given datacenter ID, returns the primary and remote regions. + std::pair getPrimaryAndRemoteRegion(std::vector regions, Key dcId) { + RegionInfo region; + RegionInfo remoteRegion; + for (const auto& r : regions) { + if (r.dcId == dcId) { + region = r; + } else { + remoteRegion = r; + } + } + return std::make_pair(region, remoteRegion); + } + ErrorOr findWorkersForConfigurationFromDC(RecruitFromConfigurationRequest const& req, Optional dcId) { RecruitFromConfigurationReply result; @@ -1555,15 +1569,7 @@ public: primaryDC.insert(dcId); result.dcId = dcId; - RegionInfo region; - RegionInfo remoteRegion; - for (auto& r : req.configuration.regions) { - if (r.dcId == dcId.get()) { - region = r; - } else { - remoteRegion = r; - } - } + auto [region, remoteRegion] = getPrimaryAndRemoteRegion(req.configuration.regions, dcId.get()); if (req.recruitSeedServers) { auto primaryStorageServers = @@ -2008,67 +2014,82 @@ public: RecruitFromConfigurationReply findWorkersForConfiguration(RecruitFromConfigurationRequest const& req) { RecruitFromConfigurationReply rep = findWorkersForConfigurationDispatch(req); if (g_network->isSimulated()) { - RecruitFromConfigurationReply compare = findWorkersForConfigurationDispatch(req); + // FIXME: The logic to pick a satellite in a remote region is not + // deterministic and can therefore break this nondeterminism check. + // Since satellites will generally be in the primary region, + // disable the determinism check for remote region satellites. + bool remoteDCUsedAsSatellite = false; + if (req.configuration.regions.size() > 1) { + auto [region, remoteRegion] = getPrimaryAndRemoteRegion(req.configuration.regions, req.configuration.regions[0].dcId); + for (const auto& satellite : region.satellites) { + if (satellite.dcId == remoteRegion.dcId) { + remoteDCUsedAsSatellite = true; + } + } + } + if (!remoteDCUsedAsSatellite) { + RecruitFromConfigurationReply compare = findWorkersForConfigurationDispatch(req); - std::map>, int> firstUsed; - std::map>, int> secondUsed; - updateKnownIds(&firstUsed); - updateKnownIds(&secondUsed); + std::map>, int> firstUsed; + std::map>, int> secondUsed; + updateKnownIds(&firstUsed); + updateKnownIds(&secondUsed); - // auto mworker = id_worker.find(masterProcessId); - //TraceEvent("CompareAddressesMaster") - // .detail("Master", - // mworker != id_worker.end() ? mworker->second.details.interf.address() : NetworkAddress()); + // auto mworker = id_worker.find(masterProcessId); + //TraceEvent("CompareAddressesMaster") + // .detail("Master", + // mworker != id_worker.end() ? mworker->second.details.interf.address() : NetworkAddress()); - updateIdUsed(rep.tLogs, firstUsed); - updateIdUsed(compare.tLogs, secondUsed); - compareWorkers( - req.configuration, rep.tLogs, firstUsed, compare.tLogs, secondUsed, ProcessClass::TLog, "TLog"); - updateIdUsed(rep.satelliteTLogs, firstUsed); - updateIdUsed(compare.satelliteTLogs, secondUsed); - compareWorkers(req.configuration, - rep.satelliteTLogs, - firstUsed, - compare.satelliteTLogs, - secondUsed, - ProcessClass::TLog, - "Satellite"); - updateIdUsed(rep.commitProxies, firstUsed); - updateIdUsed(compare.commitProxies, secondUsed); - updateIdUsed(rep.grvProxies, firstUsed); - updateIdUsed(compare.grvProxies, secondUsed); - updateIdUsed(rep.resolvers, firstUsed); - updateIdUsed(compare.resolvers, secondUsed); - compareWorkers(req.configuration, - rep.commitProxies, - firstUsed, - compare.commitProxies, - secondUsed, - ProcessClass::CommitProxy, - "CommitProxy"); - compareWorkers(req.configuration, - rep.grvProxies, - firstUsed, - compare.grvProxies, - secondUsed, - ProcessClass::GrvProxy, - "GrvProxy"); - compareWorkers(req.configuration, - rep.resolvers, - firstUsed, - compare.resolvers, - secondUsed, - ProcessClass::Resolver, - "Resolver"); - updateIdUsed(rep.backupWorkers, firstUsed); - updateIdUsed(compare.backupWorkers, secondUsed); - compareWorkers(req.configuration, - rep.backupWorkers, - firstUsed, - compare.backupWorkers, - secondUsed, - ProcessClass::Backup, - "Backup"); + updateIdUsed(rep.tLogs, firstUsed); + updateIdUsed(compare.tLogs, secondUsed); + compareWorkers( + req.configuration, rep.tLogs, firstUsed, compare.tLogs, secondUsed, ProcessClass::TLog, "TLog"); + updateIdUsed(rep.satelliteTLogs, firstUsed); + updateIdUsed(compare.satelliteTLogs, secondUsed); + compareWorkers(req.configuration, + rep.satelliteTLogs, + firstUsed, + compare.satelliteTLogs, + secondUsed, + ProcessClass::TLog, + "Satellite"); + updateIdUsed(rep.commitProxies, firstUsed); + updateIdUsed(compare.commitProxies, secondUsed); + updateIdUsed(rep.grvProxies, firstUsed); + updateIdUsed(compare.grvProxies, secondUsed); + updateIdUsed(rep.resolvers, firstUsed); + updateIdUsed(compare.resolvers, secondUsed); + compareWorkers(req.configuration, + rep.commitProxies, + firstUsed, + compare.commitProxies, + secondUsed, + ProcessClass::CommitProxy, + "CommitProxy"); + compareWorkers(req.configuration, + rep.grvProxies, + firstUsed, + compare.grvProxies, + secondUsed, + ProcessClass::GrvProxy, + "GrvProxy"); + compareWorkers(req.configuration, + rep.resolvers, + firstUsed, + compare.resolvers, + secondUsed, + ProcessClass::Resolver, + "Resolver"); + updateIdUsed(rep.backupWorkers, firstUsed); + updateIdUsed(compare.backupWorkers, secondUsed); + compareWorkers(req.configuration, + rep.backupWorkers, + firstUsed, + compare.backupWorkers, + secondUsed, + ProcessClass::Backup, + "Backup"); + } } return rep; } From e828a498cd9d18560cda241ea700f63e0a0d50df Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Thu, 1 Jul 2021 11:01:52 -0700 Subject: [PATCH 02/10] Pass vector by const reference --- fdbserver/ClusterController.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index c54a50e7a8..cc563faff0 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -1544,7 +1544,7 @@ public: } // Given datacenter ID, returns the primary and remote regions. - std::pair getPrimaryAndRemoteRegion(std::vector regions, Key dcId) { + std::pair getPrimaryAndRemoteRegion(const std::vector& regions, Key dcId) { RegionInfo region; RegionInfo remoteRegion; for (const auto& r : regions) { From d1db0d40135ca4859183824778335c6230628ac6 Mon Sep 17 00:00:00 2001 From: Scott Fines Date: Tue, 9 Mar 2021 14:14:54 -0600 Subject: [PATCH 03/10] Making it possible to run a multiple-cluster Java integration test --- bindings/java/CMakeLists.txt | 31 ++++--- bindings/java/src/README.md | 17 +++- .../BasicMultiClientIntegrationTest.java | 69 ++++++++++++++++ .../com/apple/foundationdb/DirectoryTest.java | 2 - .../apple/foundationdb/MultiClientHelper.java | 82 +++++++++++++++++++ bindings/java/src/tests.cmake | 2 + cmake/AddFdbTest.cmake | 33 +++++++- tests/TestRunner/tmp_cluster.py | 9 +- tests/TestRunner/tmp_multi_cluster.py | 75 +++++++++++++++++ 9 files changed, 302 insertions(+), 18 deletions(-) create mode 100644 bindings/java/src/integration/com/apple/foundationdb/BasicMultiClientIntegrationTest.java create mode 100644 bindings/java/src/integration/com/apple/foundationdb/MultiClientHelper.java create mode 100755 tests/TestRunner/tmp_multi_cluster.py diff --git a/bindings/java/CMakeLists.txt b/bindings/java/CMakeLists.txt index 2da8639b8d..bb9bc6359e 100644 --- a/bindings/java/CMakeLists.txt +++ b/bindings/java/CMakeLists.txt @@ -301,7 +301,7 @@ if(NOT OPEN_FOR_IDE) if(RUN_JUNIT_TESTS) # Sets up the JUnit testing structure to run through ctest # - # To add a new junit test, add the class to the JAVA_JUNIT_TESTS variable in `src/tests.cmake`. Note that if you run a Suite, + # To add a new junit test, add the class to the JAVA_JUNIT_TESTS variable in `src/tests.cmake`. Note that if you run a Suite, # ctest will NOT display underlying details of the suite itself, so it's best to avoid junit suites in general. Also, # if you need a different runner other than JUnitCore, you'll have to modify this so be aware. # @@ -309,8 +309,8 @@ if(NOT OPEN_FOR_IDE) # # ctest . # - # from the ${BUILD_DIR}/bindings/java subdirectory. - # + # from the ${BUILD_DIR}/bindings/java subdirectory. + # # Note: if you are running from ${BUILD_DIR}, additional tests of the native logic will be run. To avoid these, use # # ctest . -R java-unit @@ -318,15 +318,15 @@ if(NOT OPEN_FOR_IDE) # ctest has lots of flexible command options, so be sure to refer to its documentation if you want to do something specific(documentation # can be found at https://cmake.org/cmake/help/v3.19/manual/ctest.1.html) - add_jar(fdb-junit SOURCES ${JAVA_JUNIT_TESTS} ${JUNIT_RESOURCES} INCLUDE_JARS fdb-java - ${CMAKE_BINARY_DIR}/packages/junit-jupiter-api-5.7.1.jar + add_jar(fdb-junit SOURCES ${JAVA_JUNIT_TESTS} ${JUNIT_RESOURCES} INCLUDE_JARS fdb-java + ${CMAKE_BINARY_DIR}/packages/junit-jupiter-api-5.7.1.jar ${CMAKE_BINARY_DIR}/packages/junit-jupiter-engine-5.7.1.jar ${CMAKE_BINARY_DIR}/packages/junit-jupiter-params-5.7.1.jar ${CMAKE_BINARY_DIR}/packages/opentest4j-1.2.0.jar ${CMAKE_BINARY_DIR}/packages/apiguardian-api-1.1.1.jar ) get_property(junit_jar_path TARGET fdb-junit PROPERTY JAR_FILE) - + add_test(NAME java-unit COMMAND ${Java_JAVA_EXECUTABLE} -classpath "${target_jar}:${junit_jar_path}:${JUNIT_CLASSPATH}" @@ -339,12 +339,12 @@ if(NOT OPEN_FOR_IDE) if(RUN_JAVA_INTEGRATION_TESTS) # Set up the integration tests. These tests generally require a running database server to function properly. Most tests # should be written such that they can be run in parallel with other integration tests (e.g. try to use a unique key range for each test - # whenever possible), because it's a reasonable assumption that a single server will be shared among multiple tests, and might do so + # whenever possible), because it's a reasonable assumption that a single server will be shared among multiple tests, and might do so # concurrently. # # Integration tests are run through ctest the same way as unit tests, but their label is prefixed with the entry 'integration-'. - # Note that most java integration tests will fail if they can't quickly connect to a running FDB instance(depending on how the test is written, anyway). - # However, if you want to explicitly skip them, you can run + # Note that most java integration tests will fail if they can't quickly connect to a running FDB instance(depending on how the test is written, anyway). + # However, if you want to explicitly skip them, you can run # # `ctest -E integration` # @@ -361,8 +361,8 @@ if(NOT OPEN_FOR_IDE) # empty, consider generating a random prefix for the keys you write, use # the directory layer with a unique path, etc.) # - add_jar(fdb-integration SOURCES ${JAVA_INTEGRATION_TESTS} ${JAVA_INTEGRATION_RESOURCES} INCLUDE_JARS fdb-java - ${CMAKE_BINARY_DIR}/packages/junit-jupiter-api-5.7.1.jar + add_jar(fdb-integration SOURCES ${JAVA_INTEGRATION_TESTS} ${JAVA_INTEGRATION_RESOURCES} INCLUDE_JARS fdb-java + ${CMAKE_BINARY_DIR}/packages/junit-jupiter-api-5.7.1.jar ${CMAKE_BINARY_DIR}/packages/junit-jupiter-engine-5.7.1.jar ${CMAKE_BINARY_DIR}/packages/junit-jupiter-params-5.7.1.jar ${CMAKE_BINARY_DIR}/packages/opentest4j-1.2.0.jar @@ -375,7 +375,14 @@ if(NOT OPEN_FOR_IDE) COMMAND ${Java_JAVA_EXECUTABLE} -classpath "${target_jar}:${integration_jar_path}:${JUNIT_CLASSPATH}" -Djava.library.path=${CMAKE_BINARY_DIR}/lib - org.junit.platform.console.ConsoleLauncher "--details=summary" "--class-path=${integration_jar_path}" "--scan-classpath" "--disable-banner" + org.junit.platform.console.ConsoleLauncher "--details=summary" "--class-path=${integration_jar_path}" "--scan-classpath" "--disable-banner" "-T MultiClient" + ) + + add_multi_fdbclient_test(NAME java-multi-integration + COMMAND ${Java_JAVA_EXECUTABLE} + -classpath "${target_jar}:${integration_jar_path}:${JUNIT_CLASSPATH}" + -Djava.library.path=${CMAKE_BINARY_DIR}/lib + org.junit.platform.console.ConsoleLauncher "--details=summary" "--class-path=${integration_jar_path}" "--scan-classpath" "--disable-banner" "-t MultiClient" ) endif() diff --git a/bindings/java/src/README.md b/bindings/java/src/README.md index 6fedba6368..6bd377b85a 100644 --- a/bindings/java/src/README.md +++ b/bindings/java/src/README.md @@ -22,4 +22,19 @@ To skip integration tests, execute `ctest -E integration` from `${BUILD_DIR}/bin To run _only_ integration tests, run `ctest -R integration` from `${BUILD_DIR}/bindings/java`. There are lots of other useful `ctest` commands, which we don't need to get into here. For more information, -see the [https://cmake.org/cmake/help/v3.19/manual/ctest.1.html](ctest documentation). \ No newline at end of file +see the [https://cmake.org/cmake/help/v3.19/manual/ctest.1.html](ctest documentation). + +### Multi-Client tests +Multi-Client tests are integration tests that can only be executed when multiple clusters are running. To write a multi-client +test, do the following: + +1. Tag all tests that require multiple clients with `@Tag("MultiClient")` +2. Ensure that your tests have the `MultiClientHelper` extension present, and Registered as an extension +3. Ensure that your test class is in the the JAVA_INTEGRATION_TESTS list in `test.cmake` + +( see `BasicMultiClientIntegrationTest` for a good reference example) + +It is important to note that it requires significant time to start and stop 3 separate clusters; if the underying test takes a long time to run, +ctest will time out and kill the test. When that happens, there is no guarantee that the FDB clusters will be properly stopped! It is thus +in your best interest to ensure that all tests run in a relatively small amount of time, or have a longer timeout attached. + diff --git a/bindings/java/src/integration/com/apple/foundationdb/BasicMultiClientIntegrationTest.java b/bindings/java/src/integration/com/apple/foundationdb/BasicMultiClientIntegrationTest.java new file mode 100644 index 0000000000..2b02a3656f --- /dev/null +++ b/bindings/java/src/integration/com/apple/foundationdb/BasicMultiClientIntegrationTest.java @@ -0,0 +1,69 @@ +/* + * BasicMultiClientIntegrationTest + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.apple.foundationdb; + +import java.util.Collection; +import java.util.Random; + +import com.apple.foundationdb.tuple.Tuple; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; + +/** + * Simple class to test multi-client logic. + * + * Note that all Multi-client-only tests _must_ be tagged with "MultiClient", which will ensure that they are excluded + * from non-multi-threaded tests. + */ +public class BasicMultiClientIntegrationTest { + @RegisterExtension public static final MultiClientHelper clientHelper = new MultiClientHelper(); + + @Test + @Tag("MultiClient") + void testMultiClientWritesAndReadsData() throws Exception { + FDB fdb = FDB.selectAPIVersion(630); + fdb.options().setKnob("min_trace_severity=5"); + + Collection dbs = clientHelper.openDatabases(fdb); // the clientHelper will close the databases for us + System.out.print("Starting tests."); + Random rand = new Random(); + for (int counter = 0; counter < 25; ++counter) { + for (Database db : dbs) { + String key = Integer.toString(rand.nextInt(100000000)); + String val = Integer.toString(rand.nextInt(100000000)); + + db.run(tr -> { + tr.set(Tuple.from(key).pack(), Tuple.from(val).pack()); + return null; + }); + + String fetchedVal = db.run(tr -> { + byte[] result = tr.get(Tuple.from(key).pack()).join(); + return Tuple.fromBytes(result).getString(0); + }); + Assertions.assertEquals(val, fetchedVal, "Wrong result!"); + } + Thread.sleep(200); + } + } +} diff --git a/bindings/java/src/integration/com/apple/foundationdb/DirectoryTest.java b/bindings/java/src/integration/com/apple/foundationdb/DirectoryTest.java index ddddd20ad1..519405e0f4 100644 --- a/bindings/java/src/integration/com/apple/foundationdb/DirectoryTest.java +++ b/bindings/java/src/integration/com/apple/foundationdb/DirectoryTest.java @@ -19,8 +19,6 @@ */ package com.apple.foundationdb; -import java.nio.file.Path; -import java.nio.file.Paths; import java.util.ArrayList; import java.util.Arrays; import java.util.List; diff --git a/bindings/java/src/integration/com/apple/foundationdb/MultiClientHelper.java b/bindings/java/src/integration/com/apple/foundationdb/MultiClientHelper.java new file mode 100644 index 0000000000..671163955f --- /dev/null +++ b/bindings/java/src/integration/com/apple/foundationdb/MultiClientHelper.java @@ -0,0 +1,82 @@ +/* + * MultiClientHelper.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.apple.foundationdb; + +import java.util.ArrayList; +import java.util.Collection; +import org.junit.jupiter.api.extension.AfterEachCallback; +import org.junit.jupiter.api.extension.BeforeAllCallback; +import org.junit.jupiter.api.extension.ExtensionContext; + +/** + * Callback to help define a multi-client scenario and ensure that + * the clients can be configured properly. + */ +public class MultiClientHelper implements BeforeAllCallback,AfterEachCallback{ + private String[] clusterFiles; + private Collection openDatabases; + + public static String[] readClusterFromEnv() { + /* + * Reads the cluster file lists from the ENV variable + * FDB_CLUSTERS. + */ + String clusterFilesProp = System.getenv("FDB_CLUSTERS"); + if (clusterFilesProp == null) { + throw new IllegalStateException("Missing FDB cluster connection file names"); + } + + return clusterFilesProp.split(";"); + } + + Collection openDatabases(FDB fdb){ + if(openDatabases!=null){ + return openDatabases; + } + if(clusterFiles==null){ + clusterFiles = readClusterFromEnv(); + } + Collection dbs = new ArrayList(); + for (String arg : clusterFiles) { + System.out.printf("Opening Cluster: %s\n", arg); + dbs.add(fdb.open(arg)); + } + + this.openDatabases = dbs; + return dbs; + } + + @Override + public void beforeAll(ExtensionContext arg0) throws Exception { + clusterFiles = readClusterFromEnv(); + } + + @Override + public void afterEach(ExtensionContext arg0) throws Exception { + //close any databases that have been opened + if(openDatabases!=null){ + for(Database db : openDatabases){ + db.close(); + } + } + openDatabases = null; + } + +} diff --git a/bindings/java/src/tests.cmake b/bindings/java/src/tests.cmake index 8bed62ecb8..bf3131eeb3 100644 --- a/bindings/java/src/tests.cmake +++ b/bindings/java/src/tests.cmake @@ -48,12 +48,14 @@ set(JUNIT_RESOURCES set(JAVA_INTEGRATION_TESTS src/integration/com/apple/foundationdb/DirectoryTest.java src/integration/com/apple/foundationdb/RangeQueryIntegrationTest.java + src/integration/com/apple/foundationdb/BasicMultiClientIntegrationTest.java ) # Resources that are used in integration testing, but are not explicitly test files (JUnit rules, # utility classes, and so forth) set(JAVA_INTEGRATION_RESOURCES src/integration/com/apple/foundationdb/RequiresDatabase.java + src/integration/com/apple/foundationdb/MultiClientHelper.java ) diff --git a/cmake/AddFdbTest.cmake b/cmake/AddFdbTest.cmake index 7e0502c52e..3824edc9b2 100644 --- a/cmake/AddFdbTest.cmake +++ b/cmake/AddFdbTest.cmake @@ -378,6 +378,7 @@ function(package_bindingtester) add_dependencies(bindingtester copy_bindingtester_binaries) endfunction() +# Creates a single cluster before running the specified command (usually a ctest test) function(add_fdbclient_test) set(options DISABLED ENABLED) set(oneValueArgs NAME) @@ -401,7 +402,37 @@ function(add_fdbclient_test) --build-dir ${CMAKE_BINARY_DIR} -- ${T_COMMAND}) - set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT 60) + set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT 60) +endfunction() + +# Creates 3 distinct clusters before running the specified command. +# This is useful for testing features that require multiple clusters (like the +# multi-cluster FDB client) +function(add_multi_fdbclient_test) + set(options DISABLED ENABLED) + set(oneValueArgs NAME) + set(multiValueArgs COMMAND) + cmake_parse_arguments(T "${options}" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}") + if(OPEN_FOR_IDE) + return() + endif() + if(NOT T_ENABLED AND T_DISABLED) + return() + endif() + if(NOT T_NAME) + message(FATAL_ERROR "NAME is a required argument for add_multi_fdbclient_test") + endif() + if(NOT T_COMMAND) + message(FATAL_ERROR "COMMAND is a required argument for add_multi_fdbclient_test") + endif() + message(STATUS "Adding Client test ${T_NAME}") + add_test(NAME "${T_NAME}" + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/tmp_multi_cluster.py + --build-dir ${CMAKE_BINARY_DIR} + --clusters 3 + -- + ${T_COMMAND}) + set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT 60) endfunction() function(add_java_test) diff --git a/tests/TestRunner/tmp_cluster.py b/tests/TestRunner/tmp_cluster.py index c34fc6a85e..f8ae4ef813 100755 --- a/tests/TestRunner/tmp_cluster.py +++ b/tests/TestRunner/tmp_cluster.py @@ -11,7 +11,7 @@ from random import choice from pathlib import Path class TempCluster: - def __init__(self, build_dir: str): + def __init__(self, build_dir: str,port: str = None): self.build_dir = Path(build_dir).resolve() assert self.build_dir.exists(), "{} does not exist".format(build_dir) assert self.build_dir.is_dir(), "{} is not a directory".format(build_dir) @@ -22,7 +22,8 @@ class TempCluster: self.cluster = LocalCluster(tmp_dir, self.build_dir.joinpath('bin', 'fdbserver'), self.build_dir.joinpath('bin', 'fdbmonitor'), - self.build_dir.joinpath('bin', 'fdbcli')) + self.build_dir.joinpath('bin', 'fdbcli'), + port = port) self.log = self.cluster.log self.etc = self.cluster.etc self.data = self.cluster.data @@ -37,6 +38,10 @@ class TempCluster: self.cluster.__exit__(xc_type, exc_value, traceback) shutil.rmtree(self.tmp_dir) + def close(self): + self.cluster.__exit__(None,None,None) + shutil.rmtree(self.tmp_dir) + if __name__ == '__main__': parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter, diff --git a/tests/TestRunner/tmp_multi_cluster.py b/tests/TestRunner/tmp_multi_cluster.py new file mode 100755 index 0000000000..2edcef29de --- /dev/null +++ b/tests/TestRunner/tmp_multi_cluster.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 + +# +# tmp_multi_cluster.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import subprocess +import sys +import shutil +from pathlib import Path +from argparse import ArgumentParser, RawDescriptionHelpFormatter +from tmp_cluster import TempCluster + + +if __name__ == '__main__': + parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter,description=""" + This script automatically configures N temporary local clusters on the machine and then + calls a command while these clusters are running. As soon as the command returns, all + configured clusters are killed and all generated data is deleted. + + The purpose of this is to support testing a set of integration tests using multiple clusters + (i.e. using the Multi-threaded client). + """) + parser.add_argument('--build-dir','-b',metavar='BUILD_DIRECTORY',help='FDB build director',required=True) + parser.add_argument('--clusters','-c',metavar='NUM_CLUSTERS',type=int,help='The number of clusters to run',required=True) + parser.add_argument('cmd', metavar='COMMAND',nargs='+',help='The command to run') + args = parser.parse_args() + errcode = 1 + + #spawn all the clusters + base_dir = args.build_dir + num_clusters = args.clusters + + build_dir=Path(base_dir) + bin_dir=build_dir.joinpath('bin') + + clusters = [] + for c in range(1,num_clusters+1): + # now start the cluster up + local_c = TempCluster(args.build_dir, port="{}501".format(c)) + + local_c.__enter__() + clusters.append(local_c) + + # all clusters should be running now, so run the subcommand + # TODO (bfines): pass through the proper ENV commands so that the client can find everything + cluster_paths = ';'.join([str(cluster.etc.joinpath('fdb.cluster')) for cluster in clusters]) + print(cluster_paths) + env = dict(**os.environ) + env['FDB_CLUSTERS'] = env.get('FDB_CLUSTERS',cluster_paths) + errcode = subprocess.run(args.cmd,stdout=sys.stdout,stderr=sys.stderr,env=env).returncode + + # shutdown all the running clusters + for tc in clusters: + tc.close() + + sys.exit(errcode) + \ No newline at end of file From fbc4f47882cf2662a09222d65f8140905b4205f3 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Sat, 10 Jul 2021 16:55:30 -0700 Subject: [PATCH 04/10] Add LowLatencySingleClog.toml test --- fdbrpc/sim2.actor.cpp | 1 + fdbserver/CMakeLists.txt | 3 +- fdbserver/tester.actor.cpp | 9 +++ .../workloads/ClogSingleConnection.actor.cpp | 71 +++++++++++++++++++ fdbserver/workloads/workloads.actor.h | 5 +- tests/CMakeLists.txt | 1 + tests/fast/LowLatencySingleClog.toml | 20 ++++++ 7 files changed, 106 insertions(+), 4 deletions(-) create mode 100644 fdbserver/workloads/ClogSingleConnection.actor.cpp create mode 100644 tests/fast/LowLatencySingleClog.toml diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 9243bda4a1..73231bf8a9 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -1950,6 +1950,7 @@ public: g_clogging.clogRecvFor(ip, seconds); } void clogPair(const IPAddress& from, const IPAddress& to, double seconds) override { + TraceEvent("CloggingPair").detail("From", from).detail("To", to).detail("Seconds", seconds); g_clogging.clogPairFor(from, to, seconds); } std::vector getAllProcesses() const override { diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index 0f7d5dc860..485761ebd7 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -146,7 +146,7 @@ set(FDBSERVER_SRCS workloads/BackgroundSelectors.actor.cpp workloads/BackupCorrectness.actor.cpp workloads/BackupAndParallelRestoreCorrectness.actor.cpp - workloads/ParallelRestore.actor.cpp + workloads/ClogSingleConnection.actor.cpp workloads/BackupToBlob.actor.cpp workloads/BackupToDBAbort.actor.cpp workloads/BackupToDBCorrectness.actor.cpp @@ -197,6 +197,7 @@ set(FDBSERVER_SRCS workloads/MemoryKeyValueStore.h workloads/MemoryLifetime.actor.cpp workloads/MetricLogging.actor.cpp + workloads/ParallelRestore.actor.cpp workloads/Performance.actor.cpp workloads/Ping.actor.cpp workloads/PopulateTPCC.actor.cpp diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index a65d02cbc2..30a0bf5e81 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -234,6 +234,15 @@ vector getOption(VectorRef options, Key key, vector options, Key key) { + for (const auto& option : options) { + if (option.key == key) { + return true; + } + } + return false; +} + // returns unconsumed options Standalone> checkAllOptionsConsumed(VectorRef options) { static StringRef nothing = LiteralStringRef(""); diff --git a/fdbserver/workloads/ClogSingleConnection.actor.cpp b/fdbserver/workloads/ClogSingleConnection.actor.cpp new file mode 100644 index 0000000000..24e0b6e32f --- /dev/null +++ b/fdbserver/workloads/ClogSingleConnection.actor.cpp @@ -0,0 +1,71 @@ +/* + * ClogSingleConnection.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/NativeAPI.actor.h" +#include "fdbserver/TesterInterface.actor.h" +#include "fdbserver/workloads/workloads.actor.h" +#include "fdbrpc/simulator.h" +#include "flow/actorcompiler.h" // This must be the last #include. + +class ClogSingleConnectionWorkload : public TestWorkload { + double delaySeconds; + Optional clogDuration; // If empty, clog forever + +public: + ClogSingleConnectionWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { + auto minDelay = getOption(options, "minDelay"_sr, 0.0); + auto maxDelay = getOption(options, "maxDelay"_sr, 10.0); + ASSERT_LE(minDelay, maxDelay); + delaySeconds = minDelay + deterministicRandom()->random01() * (maxDelay - minDelay); + if (hasOption(options, "clogDuration"_sr)) { + clogDuration = getOption(options, "clogDuration"_sr, ""); + } + } + + std::string description() const override { + return g_network->isSimulated() ? "ClogSingleConnection" : "NoClogging"; + } + Future setup(Database const& cx) override { return Void(); } + + Future start(Database const& cx) override { + if (g_network->isSimulated() && clientId == 0) { + return map(delay(delaySeconds), [this](Void _) { + clogRandomPair(); + return Void(); + }); + } else { + return Void(); + } + } + + Future check(Database const& cx) override { return true; } + + void getMetrics(std::vector& m) override {} + + void clogRandomPair() { + auto m1 = deterministicRandom()->randomChoice(g_simulator.getAllProcesses()); + auto m2 = deterministicRandom()->randomChoice(g_simulator.getAllProcesses()); + if (m1->address.ip != m2->address.ip) { + g_simulator.clogPair(m1->address.ip, m2->address.ip, clogDuration.orDefault(10000)); + } + } +}; + +WorkloadFactory ClogSingleConnectionWorkloadFactory("ClogSingleConnection"); diff --git a/fdbserver/workloads/workloads.actor.h b/fdbserver/workloads/workloads.actor.h index fa3fb62571..d47b981409 100644 --- a/fdbserver/workloads/workloads.actor.h +++ b/fdbserver/workloads/workloads.actor.h @@ -43,6 +43,7 @@ bool getOption(VectorRef options, Key key, bool defaultValue); vector getOption(VectorRef options, Key key, vector defaultValue); // comma-separated strings +bool hasOption(VectorRef options, Key key); struct WorkloadContext { Standalone> options; @@ -53,9 +54,7 @@ struct WorkloadContext { WorkloadContext(); WorkloadContext(const WorkloadContext&); ~WorkloadContext(); - -private: - void operator=(const WorkloadContext&); + WorkloadContext& operator=(const WorkloadContext&) = delete; }; struct TestWorkload : NonCopyable, WorkloadContext { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 913b39413b..ba58b3bf27 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -134,6 +134,7 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES fast/LocalRatekeeper.toml) add_fdb_test(TEST_FILES fast/LongStackWriteDuringRead.toml) add_fdb_test(TEST_FILES fast/LowLatency.toml) + add_fdb_test(TEST_FILES fast/LowLatencySingleClog.toml) add_fdb_test(TEST_FILES fast/MemoryLifetime.toml) add_fdb_test(TEST_FILES fast/MoveKeysCycle.toml) add_fdb_test(TEST_FILES fast/ProtocolVersion.toml) diff --git a/tests/fast/LowLatencySingleClog.toml b/tests/fast/LowLatencySingleClog.toml new file mode 100644 index 0000000000..cbfe6682e3 --- /dev/null +++ b/tests/fast/LowLatencySingleClog.toml @@ -0,0 +1,20 @@ +[configuration] +buggify = false +minimumReplication = 2 + +[[test]] +testTitle = 'Clogged' +connectionFailuresDisableDuration = 100000 + + [[test.workload]] + testName = 'Cycle' + transactionsPerSecond = 1000.0 + testDuration = 30.0 + expectedRate = 0 + + [[test.workload]] + testName = 'LowLatency' + testDuration = 30.0 + + [[test.workload]] + testName = 'ClogSingleConnection' From 2abdbff11f28bf2071706cca2310dfc3b3c382c9 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 12 Jul 2021 03:09:11 +0000 Subject: [PATCH 05/10] add Knobs --- fdbclient/ManagementAPI.actor.cpp | 2 +- fdbclient/ServerKnobs.cpp | 1 + fdbserver/tester.actor.cpp | 8 ++++++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index 447aad68bf..38f79e8ac1 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -143,7 +143,7 @@ std::map configForToken(std::string const& mode) { } if (key == "perpetual_storage_wiggle" && isInteger(value)) { - int ppWiggle = atoi(value.c_str()); + int ppWiggle = std::stoi(value); if (ppWiggle >= 2 || ppWiggle < 0) { printf("Error: Only 0 and 1 are valid values of perpetual_storage_wiggle at present.\n"); return out; diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 83dbe0af37..3496d34572 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -255,6 +255,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DD_TEAMS_INFO_PRINT_YIELD_COUNT, 100 ); if( randomize && BUGGIFY ) DD_TEAMS_INFO_PRINT_YIELD_COUNT = deterministicRandom()->random01() * 1000 + 1; init( DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY, 120 ); if( randomize && BUGGIFY ) DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY = 5; init( DD_STORAGE_WIGGLE_PAUSE_THRESHOLD, 1 ); if( randomize && BUGGIFY ) DD_STORAGE_WIGGLE_PAUSE_THRESHOLD = 10; + init( DD_STORAGE_WIGGLE_STUCK_THRESHOLD, 50 ); // TeamRemover init( TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index a65d02cbc2..7318ece807 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -1049,8 +1049,7 @@ std::map> testSpecGlobalKey [](const std::string& value) { TraceEvent("TestParserTest").detail("ParsedStorageEngineExcludeTypes", ""); } }, { "maxTLogVersion", [](const std::string& value) { TraceEvent("TestParserTest").detail("ParsedMaxTLogVersion", ""); } }, - { "disableTss", - [](const std::string& value) { TraceEvent("TestParserTest").detail("ParsedDisableTSS", ""); } } + { "disableTss", [](const std::string& value) { TraceEvent("TestParserTest").detail("ParsedDisableTSS", ""); } } }; std::map> testSpecTestKeys = { @@ -1178,6 +1177,11 @@ std::mapphases = TestWorkload::CHECK; } }, + { "restorePerpetualWiggleSetting", + [](const std::string& value, TestSpec* spec) { + if (value == "false") + spec->restorePerpetualWiggleSetting = false; + } }, }; vector readTests(ifstream& ifs) { From 501dc339a948afe38d9052190d10cba418c2545b Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 12 Jul 2021 03:36:10 +0000 Subject: [PATCH 06/10] relax perpetual wiggle pause condition; add trace log; correct perpetual wiggle priority setting --- fdbclient/ServerKnobs.cpp | 2 +- fdbclient/ServerKnobs.h | 1 + fdbserver/DataDistribution.actor.cpp | 36 ++++++++++++++++++++-------- 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 3496d34572..994f367292 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -255,7 +255,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DD_TEAMS_INFO_PRINT_YIELD_COUNT, 100 ); if( randomize && BUGGIFY ) DD_TEAMS_INFO_PRINT_YIELD_COUNT = deterministicRandom()->random01() * 1000 + 1; init( DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY, 120 ); if( randomize && BUGGIFY ) DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY = 5; init( DD_STORAGE_WIGGLE_PAUSE_THRESHOLD, 1 ); if( randomize && BUGGIFY ) DD_STORAGE_WIGGLE_PAUSE_THRESHOLD = 10; - init( DD_STORAGE_WIGGLE_STUCK_THRESHOLD, 50 ); + init( DD_STORAGE_WIGGLE_STUCK_THRESHOLD, 50 ); // TeamRemover init( TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index 9a80f7a151..79600d49bb 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -208,6 +208,7 @@ public: int DD_TEAMS_INFO_PRINT_YIELD_COUNT; int DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY; int DD_STORAGE_WIGGLE_PAUSE_THRESHOLD; // How many unhealthy relocations are ongoing will pause storage wiggle + int DD_STORAGE_WIGGLE_STUCK_THRESHOLD; // How many times bestTeamStuck accumulate will pause storage wiggle // TeamRemover to remove redundant teams bool TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER; // disable the machineTeamRemover actor diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 3829d11d2f..6b882bb1e1 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -656,7 +656,7 @@ struct DDTeamCollection : ReferenceCounted { int optimalTeamCount; AsyncVar zeroOptimalTeams; - bool bestTeamStuck = false; + int bestTeamKeepStuckCount = 0; bool isTssRecruiting; // If tss recruiting is waiting on a pair, don't consider DD recruiting for the purposes of QuietDB @@ -1011,12 +1011,12 @@ struct DDTeamCollection : ReferenceCounted { // Log BestTeamStuck reason when we have healthy teams but they do not have healthy free space if (randomTeams.empty() && !self->zeroHealthyTeams->get()) { - self->bestTeamStuck = true; + self->bestTeamKeepStuckCount++; if (g_network->isSimulated()) { TraceEvent(SevWarn, "GetTeamReturnEmpty").detail("HealthyTeams", self->healthyTeamCount); } } else { - self->bestTeamStuck = false; + self->bestTeamKeepStuckCount = 0; } for (int i = 0; i < randomTeams.size(); i++) { @@ -2833,7 +2833,7 @@ struct DDTeamCollection : ReferenceCounted { std::vector> moveFutures; if (this->pid2server_info.count(pid) != 0) { for (auto& info : this->pid2server_info[pid]) { - AddressExclusion addr(info->lastKnownInterface.address().ip); + AddressExclusion addr(info->lastKnownInterface.address().ip, info->lastKnownInterface.address().port); if (this->excludedServers.count(addr) && this->excludedServers.get(addr) != DDTeamCollection::Status::NONE) { continue; // don't overwrite the value set by actor trackExcludedServer @@ -3509,7 +3509,7 @@ ACTOR Future teamTracker(DDTeamCollection* self, Reference tea bool anyUndesired = false; bool anyWrongConfiguration = false; bool anyWigglingServer = false; - int serversLeft = 0; + int serversLeft = 0, serverUndesired = 0, serverWrongConf = 0, serverWiggling = 0; for (const UID& uid : team->getServerIDs()) { change.push_back(self->server_status.onChange(uid)); @@ -3519,12 +3519,15 @@ ACTOR Future teamTracker(DDTeamCollection* self, Reference tea } if (status.isUndesired) { anyUndesired = true; + serverUndesired++; } if (status.isWrongConfiguration) { anyWrongConfiguration = true; + serverWrongConf++; } if (status.isWiggling) { anyWigglingServer = true; + serverWiggling++; } } @@ -3646,6 +3649,10 @@ ACTOR Future teamTracker(DDTeamCollection* self, Reference tea team->setPriority(SERVER_KNOBS->PRIORITY_TEAM_2_LEFT); else team->setPriority(SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY); + } else if (!badTeam && anyWigglingServer && serverWiggling == serverWrongConf && + serverWiggling == serverUndesired) { + // the wrong configured and undesired server is the wiggling server + team->setPriority(SERVER_KNOBS->PRIORITY_PERPETUAL_STORAGE_WIGGLE); } else if (badTeam || anyWrongConfiguration) { if (redundantTeam) { team->setPriority(SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT); @@ -3654,8 +3661,6 @@ ACTOR Future teamTracker(DDTeamCollection* self, Reference tea } } else if (anyUndesired) { team->setPriority(SERVER_KNOBS->PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER); - } else if (anyWigglingServer) { - team->setPriority(SERVER_KNOBS->PRIORITY_PERPETUAL_STORAGE_WIGGLE); } else { team->setPriority(SERVER_KNOBS->PRIORITY_TEAM_HEALTHY); } @@ -3972,7 +3977,7 @@ ACTOR Future perpetualStorageWiggleIterator(AsyncVar* stopSignal, wait(delayJittered(SERVER_KNOBS->PERPETUAL_WIGGLE_DELAY)); // there must not have other teams to place wiggled data takeRest = teamCollection->server_info.size() <= teamCollection->configuration.storageTeamSize || - teamCollection->machine_info.size() < teamCollection->configuration.storageTeamSize; + teamCollection->machine_info.size() < teamCollection->configuration.storageTeamSize; } wait(updateNextWigglingStoragePID(teamCollection)); } @@ -4020,10 +4025,12 @@ ACTOR Future clusterHealthCheckForPerpetualWiggle(DDTeamCollection* self, // b. healthy teams are not enough // c. the overall disk space is not enough if (count >= SERVER_KNOBS->DD_STORAGE_WIGGLE_PAUSE_THRESHOLD || self->healthyTeamCount <= *extraTeamCount || - self->bestTeamStuck) { + self->bestTeamKeepStuckCount > SERVER_KNOBS->DD_STORAGE_WIGGLE_STUCK_THRESHOLD) { // if we pause wiggle not because the reason a, increase extraTeamCount. This helps avoid oscillation // between pause and non-pause status. - if ((self->healthyTeamCount <= *extraTeamCount || self->bestTeamStuck) && !self->pauseWiggle->get()) { + if ((self->healthyTeamCount <= *extraTeamCount || + self->bestTeamKeepStuckCount > SERVER_KNOBS->DD_STORAGE_WIGGLE_PAUSE_THRESHOLD) && + !self->pauseWiggle->get()) { *extraTeamCount = std::min(*extraTeamCount + pausePenalty, (int)self->teams.size()); pausePenalty = std::min(pausePenalty * 2, (int)self->teams.size()); } @@ -4060,6 +4067,7 @@ ACTOR Future perpetualStorageWiggler(AsyncVar* stopSignal, self->includeStorageServersForWiggle(); TraceEvent("PerpetualStorageWigglePause", self->distributorId) .detail("ProcessId", pid) + .detail("BestTeamKeepStuckCount", self->bestTeamKeepStuckCount) .detail("ExtraHealthyTeamCount", extraTeamCount) .detail("HealthyTeamCount", self->healthyTeamCount) .detail("StorageCount", movingCount); @@ -4566,6 +4574,10 @@ ACTOR Future storageServerTracker( DDTeamCollection::Status worstStatus = self->excludedServers.get(worstAddr); if (worstStatus == DDTeamCollection::Status::WIGGLING && invalidWiggleServer(worstAddr, self, server)) { + TraceEvent(SevInfo, "InvalidWiggleServer", self->distributorId) + .detail("Address", worstAddr.toString()) + .detail("ProcessId", server->lastKnownInterface.locality.processId()) + .detail("ValidWigglingId", self->wigglingPid.present()); self->excludedServers.set(worstAddr, DDTeamCollection::Status::NONE); worstStatus = DDTeamCollection::Status::NONE; } @@ -4586,6 +4598,10 @@ ACTOR Future storageServerTracker( DDTeamCollection::Status testStatus = self->excludedServers.get(testAddr); if (testStatus == DDTeamCollection::Status::WIGGLING && invalidWiggleServer(testAddr, self, server)) { + TraceEvent(SevInfo, "InvalidWiggleServer", self->distributorId) + .detail("Address", testAddr.toString()) + .detail("ProcessId", server->lastKnownInterface.locality.processId()) + .detail("ValidWigglingId", self->wigglingPid.present()); self->excludedServers.set(testAddr, DDTeamCollection::Status::NONE); testStatus = DDTeamCollection::Status::NONE; } From 4059215180e0d73a7ff8e18d378ece7a98ea4bdc Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Mon, 12 Jul 2021 16:07:56 -0700 Subject: [PATCH 07/10] Value should not be present for ENABLE_RUN_LOOP_PROFILING This fixes an unintentional change from 79ff07a0717fed9ec653641c3eb1e1c1488a2f8. I reviewed other options changed in that commit and this was the only one of its kind. --- fdbclient/NativeAPI.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 8843f7cbda..f588f31418 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -1956,7 +1956,7 @@ void setNetworkOption(FDBNetworkOptions::Option option, Optional valu break; } case FDBNetworkOptions::ENABLE_RUN_LOOP_PROFILING: // Same as ENABLE_SLOW_TASK_PROFILING - validateOptionValuePresent(value); + validateOptionValueNotPresent(value); networkOptions.runLoopProfilingEnabled = true; break; case FDBNetworkOptions::DISTRIBUTED_CLIENT_TRACER: { From 4ce6b8eea1c2ab65721275a39fda7b72ac7551f8 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Sun, 11 Jul 2021 11:56:06 -0700 Subject: [PATCH 08/10] Disable LowLatencySingleClog test --- tests/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ba58b3bf27..1feb231389 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -134,7 +134,8 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES fast/LocalRatekeeper.toml) add_fdb_test(TEST_FILES fast/LongStackWriteDuringRead.toml) add_fdb_test(TEST_FILES fast/LowLatency.toml) - add_fdb_test(TEST_FILES fast/LowLatencySingleClog.toml) + # TODO: Fix failures and reenable this test: + add_fdb_test(TEST_FILES fast/LowLatencySingleClog.toml IGNORE) add_fdb_test(TEST_FILES fast/MemoryLifetime.toml) add_fdb_test(TEST_FILES fast/MoveKeysCycle.toml) add_fdb_test(TEST_FILES fast/ProtocolVersion.toml) From 4b58dbf68e452c1720e92c7635882f570511e2b9 Mon Sep 17 00:00:00 2001 From: Pierre Zemb Date: Tue, 13 Jul 2021 14:34:24 +0200 Subject: [PATCH 09/10] fix compile commands for IDE --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 96181a1111..8ff0c2c704 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -201,9 +201,9 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/fdbclient/BuildFlags.h.in ${CMAKE_CUR if (CMAKE_EXPORT_COMPILE_COMMANDS AND WITH_PYTHON) add_custom_command( OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/compile_commands.json - COMMAND $ contrib/gen_compile_db.py + COMMAND $ ${CMAKE_CURRENT_SOURCE_DIR}/contrib/gen_compile_db.py ARGS -b ${CMAKE_CURRENT_BINARY_DIR} -s ${CMAKE_CURRENT_SOURCE_DIR} -o ${CMAKE_CURRENT_SOURCE_DIR}/compile_commands.json ${CMAKE_CURRENT_BINARY_DIR}/compile_commands.json - DEPENDS contrib/gen_compile_db.py ${CMAKE_CURRENT_BINARY_DIR}/compile_commands.json + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/gen_compile_db.py ${CMAKE_CURRENT_BINARY_DIR}/compile_commands.json COMMENT "Build compile commands for IDE" ) add_custom_target(processed_compile_commands ALL DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/compile_commands.json ${CMAKE_CURRENT_BINARY_DIR}/compile_commands.json) From c49a09567701d56ba888dcf179d4d727172ebceb Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 13 Jul 2021 09:15:54 -0400 Subject: [PATCH 10/10] Remove some misleading wording --- fdbmonitor/fdbmonitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbmonitor/fdbmonitor.cpp b/fdbmonitor/fdbmonitor.cpp index 3f413cd47d..b0853cf5e8 100644 --- a/fdbmonitor/fdbmonitor.cpp +++ b/fdbmonitor/fdbmonitor.cpp @@ -836,7 +836,7 @@ void load_conf(const char* confpath, uid_t& uid, gid_t& gid, sigset_t* mask, fdb for (auto i : id_pid) { if (!loadedConf || ini.GetSectionSize(id_command[i.first]->ssection.c_str()) == -1) { - /* Server on this port no longer configured; deconfigure it and kill it if required */ + /* Process no longer configured; deconfigure it and kill it if required */ log_msg(SevInfo, "Deconfigured %s\n", id_command[i.first]->ssection.c_str()); id_command[i.first]->deconfigured = true;