foundationdb/fdbrpc/sim2.actor.cpp

2645 lines
101 KiB
C++
Raw Normal View History

2017-05-26 04:48:44 +08:00
/*
* sim2.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
2022-03-22 04:36:23 +08:00
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
2017-05-26 04:48:44 +08:00
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
2017-05-26 04:48:44 +08:00
* http://www.apache.org/licenses/LICENSE-2.0
*
2017-05-26 04:48:44 +08:00
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cinttypes>
2020-08-07 04:06:50 +08:00
#include <memory>
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
#include <string>
2022-03-05 02:42:45 +08:00
#include "contrib/fmt-8.1.1/include/fmt/format.h"
#include "fdbrpc/simulator.h"
#include "flow/Arena.h"
2021-01-27 05:41:37 +08:00
#define BOOST_SYSTEM_NO_LIB
#define BOOST_DATE_TIME_NO_LIB
#define BOOST_REGEX_NO_LIB
#include "fdbrpc/SimExternalConnection.h"
2020-08-07 04:06:50 +08:00
#include "flow/ActorCollection.h"
#include "flow/IRandom.h"
2017-05-26 04:48:44 +08:00
#include "flow/IThreadPool.h"
#include "flow/ProtocolVersion.h"
#include "flow/Util.h"
#include "flow/WriteOnlySet.h"
#include "fdbrpc/IAsyncFile.h"
#include "fdbrpc/AsyncFileCached.actor.h"
2021-01-23 13:47:36 +08:00
#include "fdbrpc/AsyncFileEncrypted.h"
#include "fdbrpc/AsyncFileNonDurable.actor.h"
2021-07-20 02:13:57 +08:00
#include "fdbrpc/AsyncFileChaos.actor.h"
2020-01-14 10:28:40 +08:00
#include "flow/crc32c.h"
#include "fdbrpc/TraceFileIO.h"
2017-05-26 04:48:44 +08:00
#include "flow/FaultInjection.h"
2020-08-07 04:06:50 +08:00
#include "flow/flow.h"
#include "flow/genericactors.actor.h"
2017-05-26 04:48:44 +08:00
#include "flow/network.h"
#include "flow/TLSConfig.actor.h"
#include "fdbrpc/Net2FileSystem.h"
2017-05-26 04:48:44 +08:00
#include "fdbrpc/Replication.h"
#include "fdbrpc/ReplicationUtils.h"
#include "fdbrpc/AsyncFileWriteChecker.h"
2021-07-24 07:28:20 +08:00
#include "flow/FaultInjection.h"
#include "flow/actorcompiler.h" // This must be the last #include.
2017-05-26 04:48:44 +08:00
bool simulator_should_inject_fault(const char* context, const char* file, int line, int error_code) {
2021-07-24 07:28:20 +08:00
if (!g_network->isSimulated() || !faultInjectionActivated)
return false;
2017-05-26 04:48:44 +08:00
auto p = g_simulator.getCurrentProcess();
if (p->fault_injection_p2 && deterministicRandom()->random01() < p->fault_injection_p2 &&
!g_simulator.speedUpSimulation) {
2019-04-10 02:16:45 +08:00
uint32_t h1 = line + (p->fault_injection_r >> 32);
2017-05-26 04:48:44 +08:00
if (h1 < p->fault_injection_p1 * std::numeric_limits<uint32_t>::max()) {
TEST(true); // A fault was injected
TEST(error_code == error_code_io_timeout); // An io timeout was injected
TEST(error_code == error_code_io_error); // An io error was injected
TEST(error_code == error_code_platform_error); // A platform error was injected.
TraceEvent(SevWarn, "FaultInjected")
.detail("Context", context)
.detail("File", file)
.detail("Line", line)
.detail("ErrorCode", error_code);
if (error_code == error_code_io_timeout) {
g_network->setGlobal(INetwork::enASIOTimedOut, (flowGlobalType) true);
}
2017-05-26 04:48:44 +08:00
return true;
}
}
return false;
}
void ISimulator::displayWorkers() const {
std::map<std::string, std::vector<ISimulator::ProcessInfo*>> machineMap;
// Create a map of machine Id
for (auto processInfo : getAllProcesses()) {
std::string dataHall = processInfo->locality.dataHallId().present()
? processInfo->locality.dataHallId().get().printable()
: "[unset]";
std::string machineId = processInfo->locality.machineId().present()
? processInfo->locality.machineId().get().printable()
: "[unset]";
machineMap[format("%-8s %s", dataHall.c_str(), machineId.c_str())].push_back(processInfo);
}
printf("DataHall MachineId\n");
printf(" Address Name Class Excluded Failed Rebooting Cleared Role "
" DataFolder\n");
for (auto& machineRecord : machineMap) {
printf("\n%s\n", machineRecord.first.c_str());
for (auto& processInfo : machineRecord.second) {
printf(" %9s %-10s%-13s%-8s %-6s %-9s %-8s %-48s %-40s\n",
processInfo->address.toString().c_str(),
processInfo->name,
processInfo->startingClass.toString().c_str(),
(processInfo->isExcluded() ? "True" : "False"),
(processInfo->failed ? "True" : "False"),
(processInfo->rebooting ? "True" : "False"),
(processInfo->isCleared() ? "True" : "False"),
getRoles(processInfo->address).c_str(),
processInfo->dataFolder);
}
}
return;
}
2017-05-26 04:48:44 +08:00
// File-scope counter, initialised to zero.
// NOTE(review): presumably counts currently open simulated resources; its readers and writers are
// outside this part of the file — confirm before relying on that.
int openCount = 0;
struct SimClogging {
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
double getSendDelay(NetworkAddress from, NetworkAddress to, bool stableConnection = false) const {
// stable connection here means it's a local connection between processes on the same machine
// we expect it to have much lower latency
return (stableConnection ? 0.1 : 1.0) * halfLatency();
}
2017-05-26 04:48:44 +08:00
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
double getRecvDelay(NetworkAddress from, NetworkAddress to, bool stableConnection = false) {
auto pair = std::make_pair(from.ip, to.ip);
2017-05-26 04:48:44 +08:00
double tnow = now();
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
double t = tnow + (stableConnection ? 0.1 : 1.0) * halfLatency();
if (!g_simulator.speedUpSimulation && !stableConnection)
t += clogPairLatency[pair];
2017-05-26 04:48:44 +08:00
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
if (!g_simulator.speedUpSimulation && !stableConnection && clogPairUntil.count(pair))
t = std::max(t, clogPairUntil[pair]);
2017-05-26 04:48:44 +08:00
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
if (!g_simulator.speedUpSimulation && !stableConnection && clogRecvUntil.count(to.ip))
t = std::max(t, clogRecvUntil[to.ip]);
2017-05-26 04:48:44 +08:00
return t - tnow;
}
void clogPairFor(const IPAddress& from, const IPAddress& to, double t) {
auto& u = clogPairUntil[std::make_pair(from, to)];
2017-05-26 04:48:44 +08:00
u = std::max(u, now() + t);
}
void clogSendFor(const IPAddress& from, double t) {
2017-05-26 04:48:44 +08:00
auto& u = clogSendUntil[from];
u = std::max(u, now() + t);
}
void clogRecvFor(const IPAddress& from, double t) {
2017-05-26 04:48:44 +08:00
auto& u = clogRecvUntil[from];
u = std::max(u, now() + t);
}
double setPairLatencyIfNotSet(const IPAddress& from, const IPAddress& to, double t) {
auto i = clogPairLatency.find(std::make_pair(from, to));
2017-05-26 04:48:44 +08:00
if (i == clogPairLatency.end())
i = clogPairLatency.insert(std::make_pair(std::make_pair(from, to), t)).first;
2017-05-26 04:48:44 +08:00
return i->second;
}
2017-05-26 04:48:44 +08:00
private:
std::map<IPAddress, double> clogSendUntil, clogRecvUntil;
std::map<std::pair<IPAddress, IPAddress>, double> clogPairUntil;
std::map<std::pair<IPAddress, IPAddress>, double> clogPairLatency;
double halfLatency() const {
double a = deterministicRandom()->random01();
2017-05-26 04:48:44 +08:00
const double pFast = 0.999;
if (a <= pFast || g_simulator.speedUpSimulation) {
2017-05-26 04:48:44 +08:00
a = a / pFast;
return 0.5 * (FLOW_KNOBS->MIN_NETWORK_LATENCY * (1 - a) +
FLOW_KNOBS->FAST_NETWORK_LATENCY / pFast * a); // 0.5ms average
2017-05-26 04:48:44 +08:00
} else {
a = (a - pFast) / (1 - pFast); // uniform 0-1 again
return 0.5 * (FLOW_KNOBS->MIN_NETWORK_LATENCY * (1 - a) +
FLOW_KNOBS->SLOW_NETWORK_LATENCY * a); // long tail up to X ms
2017-05-26 04:48:44 +08:00
}
}
};
// File-scope SimClogging instance; default-constructed, so all of its clog tables start empty.
SimClogging g_clogging;
struct Sim2Conn final : IConnection, ReferenceCounted<Sim2Conn> {
Sim2Conn(ISimulator::ProcessInfo* process)
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
: opened(false), closedByCaller(false), stableConnection(false), process(process),
dbgid(deterministicRandom()->randomUniqueID()), stopReceive(Never()) {
2017-05-26 04:48:44 +08:00
pipes = sender(this) && receiver(this);
}
// connect() is called on a pair of connections immediately after creation; logically it is part of the constructor
// and no other method may be called previously!
void connect(Reference<Sim2Conn> peer, NetworkAddress peerEndpoint) {
2017-05-26 04:48:44 +08:00
this->peer = peer;
this->peerProcess = peer->process;
this->peerId = peer->dbgid;
this->peerEndpoint = peerEndpoint;
// Every one-way connection gets a random permanent latency and a random send buffer for the duration of the
// connection
auto latency =
g_clogging.setPairLatencyIfNotSet(peerProcess->address.ip,
process->address.ip,
FLOW_KNOBS->MAX_CLOGGING_LATENCY * deterministicRandom()->random01());
sendBufSize = std::max<double>(deterministicRandom()->randomInt(0, 5000000), 25e6 * (latency + .002));
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
// options like clogging or bitsflip are disabled for stable connections
stableConnection = std::any_of(process->childs.begin(),
process->childs.end(),
[&](ISimulator::ProcessInfo* child) { return child && child == peerProcess; }) ||
std::any_of(peerProcess->childs.begin(),
peerProcess->childs.end(),
[&](ISimulator::ProcessInfo* child) { return child && child == process; });
TraceEvent("Sim2Connection")
.detail("SendBufSize", sendBufSize)
.detail("Latency", latency)
.detail("StableConnection", stableConnection);
2017-05-26 04:48:44 +08:00
}
// A connection that was ever marked opened must have been close()d by its caller before destruction.
~Sim2Conn() {
	ASSERT_ABORT(!opened || closedByCaller);
}
2017-05-26 04:48:44 +08:00
// Forwards to the ReferenceCounted<Sim2Conn> base implementation.
void addref() override {
	ReferenceCounted<Sim2Conn>::addref();
}
// Forwards to the ReferenceCounted<Sim2Conn> base implementation.
void delref() override {
	ReferenceCounted<Sim2Conn>::delref();
}
// Caller-initiated close: remembers that the caller closed the connection (the destructor asserts on
// this flag) and then runs the shared teardown in closeInternal().
void close() override {
	closedByCaller = true;
	closeInternal();
}
2017-05-26 04:48:44 +08:00
// Simulated accept-side handshake: completes after delay() of a random value in [0, 0.01).
Future<Void> acceptHandshake() override {
	const double handshakeDelay = 0.01 * deterministicRandom()->random01();
	return delay(handshakeDelay);
}
// Simulated connect-side handshake: completes after delay() of a random value in [0, 0.01).
Future<Void> connectHandshake() override {
	const double handshakeDelay = 0.01 * deterministicRandom()->random01();
	return delay(handshakeDelay);
}
// Returns the future produced by whenWritable() for this connection.
Future<Void> onWritable() override {
	return whenWritable(this);
}
// Returns the future produced by whenReadable() for this connection.
Future<Void> onReadable() override {
	return whenReadable(this);
}
2017-05-26 04:48:44 +08:00
// True when there is no peer connection any more, or when the peer's process is flagged as failed.
bool isPeerGone() const {
	if (!peer)
		return true; // peerProcess is only consulted while a peer reference exists
	return peerProcess->failed;
}
2017-05-26 04:48:44 +08:00
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
// Reports the flag computed in connect(): true when the two endpoints are a parent/child process pair.
bool isStableConnection() const override {
	return stableConnection;
}
2017-05-26 04:48:44 +08:00
// Invoked on this endpoint by the other endpoint's closeInternal().
// Starts trackLeakedConnection() for this endpoint and re-arms stopReceive with a delay(1.0) future.
// The two statements are kept in this order on purpose: both may schedule simulated work.
void peerClosed() {
	leakedConnectionTracker = trackLeakedConnection(this);
	stopReceive = delay(1.0);
}
// Reads as many bytes as possible from the read buffer into [begin,end) and returns the number of bytes read (might
// be 0) (or may throw an error if the connection dies)
int read(uint8_t* begin, uint8_t* end) override {
2017-05-26 04:48:44 +08:00
rollRandomClose();
int64_t avail = receivedBytes.get() - readBytes.get(); // SOMEDAY: random?
int toRead = std::min<int64_t>(end - begin, avail);
ASSERT(toRead >= 0 && toRead <= recvBuf.size() && toRead <= end - begin);
for (int i = 0; i < toRead; i++)
2017-05-26 04:48:44 +08:00
begin[i] = recvBuf[i];
recvBuf.erase(recvBuf.begin(), recvBuf.begin() + toRead);
readBytes.set(readBytes.get() + toRead);
2017-05-26 04:48:44 +08:00
return toRead;
}
// Writes as many bytes as possible from the given SendBuffer chain into the write buffer and returns the number of
// bytes written (might be 0) (or may throw an error if the connection dies)
int write(SendBuffer const* buffer, int limit) override {
2017-05-26 04:48:44 +08:00
rollRandomClose();
ASSERT(limit > 0);
int toSend = 0;
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
if (BUGGIFY && !stableConnection) {
2017-05-26 04:48:44 +08:00
toSend = std::min(limit, buffer->bytes_written - buffer->bytes_sent);
} else {
for (auto p = buffer; p; p = p->next) {
2017-05-26 04:48:44 +08:00
toSend += p->bytes_written - p->bytes_sent;
if (toSend >= limit) {
if (toSend > limit)
2017-05-26 04:48:44 +08:00
toSend = limit;
break;
}
}
}
ASSERT(toSend);
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
if (BUGGIFY && !stableConnection)
toSend = std::min(toSend, deterministicRandom()->randomInt(0, 1000));
2017-05-26 04:48:44 +08:00
if (!peer)
return toSend;
toSend = std::min(toSend, peer->availableSendBufferForPeer());
ASSERT(toSend >= 0);
2017-05-26 04:48:44 +08:00
int leftToSend = toSend;
for (auto p = buffer; p && leftToSend > 0; p = p->next) {
2017-05-26 04:48:44 +08:00
int ts = std::min(leftToSend, p->bytes_written - p->bytes_sent);
peer->recvBuf.insert(peer->recvBuf.end(), p->data() + p->bytes_sent, p->data() + p->bytes_sent + ts);
2017-05-26 04:48:44 +08:00
leftToSend -= ts;
}
ASSERT(leftToSend == 0);
peer->writtenBytes.set(peer->writtenBytes.get() + toSend);
2017-05-26 04:48:44 +08:00
return toSend;
}
// Returns the network address and port of the other end of the connection. In the case of an incoming connection,
// this may not be an address we can connect to!
NetworkAddress getPeerAddress() const override {
	return peerEndpoint;
}
// Returns the random id drawn for this endpoint in the constructor.
UID getDebugID() const override {
	return dbgid;
}
2017-05-26 04:48:44 +08:00
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
bool opened, closedByCaller, stableConnection;
2017-05-26 04:48:44 +08:00
private:
ISimulator::ProcessInfo *process, *peerProcess;
2017-05-26 04:48:44 +08:00
UID dbgid, peerId;
NetworkAddress peerEndpoint;
std::deque<uint8_t> recvBuf; // Includes bytes written but not yet received!
2017-05-26 04:48:44 +08:00
AsyncVar<int64_t> readBytes, // bytes already pulled from recvBuf (location of the beginning of recvBuf)
receivedBytes, sentBytes,
writtenBytes; // location of the end of recvBuf ( == recvBuf.size() + readBytes.get() )
2017-05-26 04:48:44 +08:00
Reference<Sim2Conn> peer;
int sendBufSize;
Future<Void> leakedConnectionTracker;
Future<Void> pipes;
Future<Void> stopReceive;
2017-05-26 04:48:44 +08:00
// Space left for the peer to write into: sendBufSize minus the bytes that have been written but are
// not yet marked as received. The result can be negative; write() asserts on it after clamping.
int availableSendBufferForPeer() const {
	const int64_t notYetReceived = writtenBytes.get() - receivedBytes.get();
	return sendBufSize - notYetReceived;
} // SOMEDAY: acknowledgedBytes instead of receivedBytes
2017-05-26 04:48:44 +08:00
// Closes this side of the connection. If a peer is still attached it is told via
// peerClosed() and stopReceive is armed with delay(1.0); in every case the leak
// tracker is cancelled and the peer reference is released.
void closeInternal() {
    if (peer.isValid()) {
        peer->peerClosed();
        stopReceive = delay(1.0);
    }
    leakedConnectionTracker.cancel();
    peer.clear();
}
// Runs forever: every time writtenBytes changes, waits a random delay of up to 2ms
// and then publishes the current writtenBytes value as sentBytes.
ACTOR static Future<Void> sender(Sim2Conn* self) {
    loop {
        wait(self->writtenBytes.onChange()); // takes place on peer!
        ASSERT(g_simulator.getCurrentProcess() == self->peerProcess);

        const double sendJitter = .002 * deterministicRandom()->random01();
        wait(delay(sendJitter));

        const int64_t written = self->writtenBytes.get();
        self->sentBytes.set(written); // or possibly just some sometimes...
    }
}
ACTOR static Future<Void> receiver(Sim2Conn* self) {
2017-05-26 04:48:44 +08:00
loop {
if (self->sentBytes.get() != self->receivedBytes.get())
wait(g_simulator.onProcess(self->peerProcess));
while (self->sentBytes.get() == self->receivedBytes.get())
wait(self->sentBytes.onChange());
ASSERT(g_simulator.getCurrentProcess() == self->peerProcess);
state int64_t pos =
deterministicRandom()->random01() < .5
? self->sentBytes.get()
: deterministicRandom()->randomInt64(self->receivedBytes.get(), self->sentBytes.get() + 1);
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
wait(delay(g_clogging.getSendDelay(
self->process->address, self->peerProcess->address, self->isStableConnection())));
wait(g_simulator.onProcess(self->process));
ASSERT(g_simulator.getCurrentProcess() == self->process);
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
wait(delay(g_clogging.getRecvDelay(
self->process->address, self->peerProcess->address, self->isStableConnection())));
ASSERT(g_simulator.getCurrentProcess() == self->process);
if (self->stopReceive.isReady()) {
wait(Future<Void>(Never()));
}
self->receivedBytes.set(pos);
wait(Future<Void>(Void())); // Prior notification can delete self and cancel this actor
ASSERT(g_simulator.getCurrentProcess() == self->process);
2017-05-26 04:48:44 +08:00
}
}
// Completes as soon as readBytes differs from receivedBytes, i.e. there are received
// bytes that have not been read yet. After every receivedBytes change it calls
// rollRandomClose(), which may throw.
ACTOR static Future<Void> whenReadable(Sim2Conn* self) {
    try {
        loop {
            if (self->readBytes.get() == self->receivedBytes.get()) {
                // Nothing unread: wait for more data to be received.
                wait(self->receivedBytes.onChange());
                self->rollRandomClose();
            } else {
                ASSERT(g_simulator.getCurrentProcess() == self->process);
                return Void();
            }
        }
    } catch (Error& e) {
        // Errors must surface on the owning process as well.
        ASSERT(g_simulator.getCurrentProcess() == self->process);
        throw;
    }
}
// Completes when the peer reference is gone or the peer reports free send-buffer space.
// Otherwise waits for the peer's receivedBytes to change (a broken_promise from that
// wait is swallowed), returns to self->process and checks again.
ACTOR static Future<Void> whenWritable(Sim2Conn* self) {
    try {
        loop {
            if (!self->peer.isValid()) {
                return Void();
            }

            if (self->peer->availableSendBufferForPeer() > 0) {
                ASSERT(g_simulator.getCurrentProcess() == self->process);
                return Void();
            }

            try {
                wait(self->peer->receivedBytes.onChange());
                ASSERT(g_simulator.getCurrentProcess() == self->peerProcess);
            } catch (Error& e) {
                const bool brokenPromise = e.code() == error_code_broken_promise;
                if (!brokenPromise) {
                    throw;
                }
            }

            wait(g_simulator.onProcess(self->process));
        }
    } catch (Error& e) {
        ASSERT(g_simulator.getCurrentProcess() == self->process);
        throw;
    }
}
void rollRandomClose() {
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
// make sure connections between parenta and their childs are not closed
if (!stableConnection &&
now() - g_simulator.lastConnectionFailure > g_simulator.connectionFailuresDisableDuration &&
deterministicRandom()->random01() < .00001) {
g_simulator.lastConnectionFailure = now();
double a = deterministicRandom()->random01(), b = deterministicRandom()->random01();
TEST(true); // Simulated connection failure
TraceEvent("ConnectionFailure", dbgid)
.detail("MyAddr", process->address)
.detail("PeerAddr", peerProcess->address)
.detail("PeerIsValid", peer.isValid())
.detail("SendClosed", a > .33)
.detail("RecvClosed", a < .66)
.detail("Explicit", b < .3);
if (a < .66 && peer)
peer->closeInternal();
if (a > .33)
closeInternal();
// At the moment, we occasionally notice the connection failed immediately. In principle, this could happen
// but only after a delay.
2017-05-26 04:48:44 +08:00
if (b < .3)
throw connection_failed();
}
}
// Watchdog for connections that are never closed: unless cancelled first (closeInternal()
// cancels leakedConnectionTracker), it waits out a knob-derived timeout on the owning
// process and then logs a SevError "LeakedConnection" event.
ACTOR static Future<Void> trackLeakedConnection(Sim2Conn* self) {
    wait(g_simulator.onProcess(self->process));

    double leakTimeout;
    if (self->process->address.isPublic()) {
        // NOTE(review): CONNECTION_MONITOR_IDLE_TIMEOUT is multiplied by itself here, while the
        // non-public branch uses it once - confirm the square is intended.
        leakTimeout = FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * 1.5 +
                      FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME * 2.1 + FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT;
    } else {
        leakTimeout = FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * 1.5;
    }
    wait(delay(leakTimeout));

    TraceEvent(SevError, "LeakedConnection", self->dbgid)
        .error(connection_leaked())
        .detail("MyAddr", self->process->address)
        .detail("IsPublic", self->process->address.isPublic())
        .detail("PeerAddr", self->peerEndpoint)
        .detail("PeerId", self->peerId)
        .detail("Opened", self->opened);
    return Void();
}
};
#include <fcntl.h>
#include <sys/stat.h>
int sf_open(const char* filename, int flags, int convFlags, int mode);
2017-05-26 04:48:44 +08:00
#if defined(_WIN32)
#include <io.h>
#define O_CLOEXEC 0
2017-05-26 04:48:44 +08:00
#elif defined(__unixish__)
#define _open ::open
#define _read ::read
#define _write ::write
#define _close ::close
#define _lseeki64 ::lseek
#define _commit ::fsync
#define _chsize ::ftruncate
#define O_BINARY 0
// Unix implementation of sf_open: opens `filename` with the already converted flags
// (`convFlags`) and `mode`. The unconverted `flags` argument is not used in this branch.
// Returns whatever _open returns.
int sf_open(const char* filename, int flags, int convFlags, int mode) {
    (void)flags;
    const int fd = _open(filename, convFlags, mode);
    return fd;
}
#else
#error How do i open a file on a new platform?
#endif
class SimpleFile : public IAsyncFile, public ReferenceCounted<SimpleFile> {
public:
static void init() {}
static bool should_poll() { return false; }
2020-11-07 15:50:55 +08:00
ACTOR static Future<Reference<IAsyncFile>> open(
std::string filename,
int flags,
int mode,
2020-11-07 15:50:55 +08:00
Reference<DiskParameters> diskParameters = makeReference<DiskParameters>(25000, 150000000),
bool delayOnWrite = true) {
2017-05-26 04:48:44 +08:00
state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess();
state TaskPriority currentTaskID = g_network->getCurrentTask();
2017-05-26 04:48:44 +08:00
if (++openCount >= 3000) {
TraceEvent(SevError, "TooManyFiles").log();
2017-05-26 04:48:44 +08:00
ASSERT(false);
}
if (openCount == 2000) {
TraceEvent(SevWarnAlways, "DisableConnectionFailures_TooManyFiles").log();
2017-05-26 04:48:44 +08:00
g_simulator.speedUpSimulation = true;
g_simulator.connectionFailuresDisableDuration = 1e6;
2017-05-26 04:48:44 +08:00
}
// Filesystems on average these days seem to start to have limits of around 255 characters for a
// filename. We add ".part" below, so we need to stay under 250.
ASSERT(basename(filename).size() < 250);
wait(g_simulator.onMachine(currentProcess));
2017-05-26 04:48:44 +08:00
try {
wait(delay(FLOW_KNOBS->MIN_OPEN_TIME +
deterministicRandom()->random01() * (FLOW_KNOBS->MAX_OPEN_TIME - FLOW_KNOBS->MIN_OPEN_TIME)));
2017-05-26 04:48:44 +08:00
std::string open_filename = filename;
if (flags & OPEN_ATOMIC_WRITE_AND_CREATE) {
ASSERT((flags & OPEN_CREATE) && (flags & OPEN_READWRITE) && !(flags & OPEN_EXCLUSIVE));
2017-05-26 04:48:44 +08:00
open_filename = filename + ".part";
}
int h = sf_open(open_filename.c_str(), flags, flagConversion(flags), mode);
if (h == -1) {
2017-05-26 04:48:44 +08:00
bool notFound = errno == ENOENT;
Error e = notFound ? file_not_found() : io_error();
TraceEvent(notFound ? SevWarn : SevWarnAlways, "FileOpenError")
.error(e)
.GetLastError()
.detail("File", filename)
.detail("Flags", flags);
2017-05-26 04:48:44 +08:00
throw e;
}
platform::makeTemporary(open_filename.c_str());
SimpleFile* simpleFile = new SimpleFile(h, diskParameters, delayOnWrite, filename, open_filename, flags);
state Reference<IAsyncFile> file = Reference<IAsyncFile>(simpleFile);
wait(g_simulator.onProcess(currentProcess, currentTaskID));
2017-05-26 04:48:44 +08:00
return file;
} catch (Error& e) {
2017-05-26 04:48:44 +08:00
state Error err = e;
wait(g_simulator.onProcess(currentProcess, currentTaskID));
2017-05-26 04:48:44 +08:00
throw err;
}
}
void addref() override { ReferenceCounted<SimpleFile>::addref(); }
void delref() override { ReferenceCounted<SimpleFile>::delref(); }
2017-05-26 04:48:44 +08:00
2020-08-20 08:32:11 +08:00
int64_t debugFD() const override { return (int64_t)h; }
2017-05-26 04:48:44 +08:00
Future<int> read(void* data, int length, int64_t offset) override { return read_impl(this, data, length, offset); }
2017-05-26 04:48:44 +08:00
Future<Void> write(void const* data, int length, int64_t offset) override {
return write_impl(this, StringRef((const uint8_t*)data, length), offset);
2017-05-26 04:48:44 +08:00
}
Future<Void> truncate(int64_t size) override { return truncate_impl(this, size); }
2017-05-26 04:48:44 +08:00
Future<Void> sync() override { return sync_impl(this); }
2017-05-26 04:48:44 +08:00
2020-08-20 08:32:11 +08:00
Future<int64_t> size() const override { return size_impl(this); }
2017-05-26 04:48:44 +08:00
std::string getFilename() const override { return actualFilename; }
2017-05-26 04:48:44 +08:00
2021-03-25 06:27:35 +08:00
~SimpleFile() override {
_close(h);
--openCount;
}
2017-05-26 04:48:44 +08:00
private:
int h;
// Performance parameters of simulated disk
2017-05-26 04:48:44 +08:00
Reference<DiskParameters> diskParameters;
std::string filename, actualFilename;
int flags;
UID dbgId;
// If true, then writes/truncates will be preceded by a delay (like other operations). If false, then they will not
// This is to support AsyncFileNonDurable, which issues its own delays for writes and truncates
2017-05-26 04:48:44 +08:00
bool delayOnWrite;
SimpleFile(int h,
Reference<DiskParameters> diskParameters,
bool delayOnWrite,
const std::string& filename,
const std::string& actualFilename,
int flags)
: h(h), diskParameters(diskParameters), filename(filename), actualFilename(actualFilename), flags(flags),
dbgId(deterministicRandom()->randomUniqueID()), delayOnWrite(delayOnWrite) {}
2017-05-26 04:48:44 +08:00
static int flagConversion(int flags) {
int outFlags = O_BINARY | O_CLOEXEC;
if (flags & OPEN_READWRITE)
outFlags |= O_RDWR;
if (flags & OPEN_CREATE)
outFlags |= O_CREAT;
if (flags & OPEN_READONLY)
outFlags |= O_RDONLY;
if (flags & OPEN_EXCLUSIVE)
outFlags |= O_EXCL;
if (flags & OPEN_ATOMIC_WRITE_AND_CREATE)
outFlags |= O_TRUNC;
2017-05-26 04:48:44 +08:00
return outFlags;
}
ACTOR static Future<int> read_impl(SimpleFile* self, void* data, int length, int64_t offset) {
ASSERT((self->flags & IAsyncFile::OPEN_NO_AIO) != 0 ||
((uintptr_t)data % 4096 == 0 && length % 4096 == 0 && offset % 4096 == 0)); // Required by KAIO.
state UID opId = deterministicRandom()->randomUniqueID();
2017-05-26 04:48:44 +08:00
if (randLog)
fmt::print(randLog,
"SFR1 {0} {1} {2} {3} {4}\n",
self->dbgId.shortString(),
self->filename,
opId.shortString(),
length,
offset);
2017-05-26 04:48:44 +08:00
wait(waitUntilDiskReady(self->diskParameters, length));
2017-05-26 04:48:44 +08:00
if (_lseeki64(self->h, offset, SEEK_SET) == -1) {
2017-05-26 04:48:44 +08:00
TraceEvent(SevWarn, "SimpleFileIOError").detail("Location", 1);
throw io_error();
}
unsigned int read_bytes = 0;
if ((read_bytes = _read(self->h, data, (unsigned int)length)) == -1) {
2017-05-26 04:48:44 +08:00
TraceEvent(SevWarn, "SimpleFileIOError").detail("Location", 2);
throw io_error();
}
if (randLog) {
uint32_t a = crc32c_append(0, (const uint8_t*)data, read_bytes);
fprintf(randLog,
"SFR2 %s %s %s %d %d\n",
self->dbgId.shortString().c_str(),
self->filename.c_str(),
opId.shortString().c_str(),
read_bytes,
a);
2017-05-26 04:48:44 +08:00
}
debugFileCheck("SimpleFileRead", self->filename, data, offset, length);
2020-11-11 05:01:07 +08:00
INJECT_FAULT(io_timeout, "SimpleFile::read"); // SimpleFile::read io_timeout injected
INJECT_FAULT(io_error, "SimpleFile::read"); // SimpleFile::read io_error injected
2017-05-26 04:48:44 +08:00
return read_bytes;
}
ACTOR static Future<Void> write_impl(SimpleFile* self, StringRef data, int64_t offset) {
state UID opId = deterministicRandom()->randomUniqueID();
2017-05-26 04:48:44 +08:00
if (randLog) {
uint32_t a = crc32c_append(0, data.begin(), data.size());
fmt::print(randLog,
"SFW1 {0} {1} {2} {3} {4} {5}\n",
self->dbgId.shortString(),
self->filename,
opId.shortString(),
a,
data.size(),
offset);
2017-05-26 04:48:44 +08:00
}
if (self->delayOnWrite)
wait(waitUntilDiskReady(self->diskParameters, data.size()));
2017-05-26 04:48:44 +08:00
if (_lseeki64(self->h, offset, SEEK_SET) == -1) {
2017-05-26 04:48:44 +08:00
TraceEvent(SevWarn, "SimpleFileIOError").detail("Location", 3);
throw io_error();
}
unsigned int write_bytes = 0;
if ((write_bytes = _write(self->h, (void*)data.begin(), data.size())) == -1) {
2017-05-26 04:48:44 +08:00
TraceEvent(SevWarn, "SimpleFileIOError").detail("Location", 4);
throw io_error();
}
if (write_bytes != data.size()) {
2017-05-26 04:48:44 +08:00
TraceEvent(SevWarn, "SimpleFileIOError").detail("Location", 5);
throw io_error();
}
if (randLog) {
fprintf(randLog,
"SFW2 %s %s %s\n",
self->dbgId.shortString().c_str(),
self->filename.c_str(),
opId.shortString().c_str());
2017-05-26 04:48:44 +08:00
}
debugFileCheck("SimpleFileWrite", self->filename, (void*)data.begin(), offset, data.size());
2020-11-11 05:01:07 +08:00
INJECT_FAULT(io_timeout, "SimpleFile::write"); // SimpleFile::write inject io_timeout
INJECT_FAULT(io_error, "SimpleFile::write"); // SimpleFile::write inject io_error
2017-05-26 04:48:44 +08:00
return Void();
}
ACTOR static Future<Void> truncate_impl(SimpleFile* self, int64_t size) {
state UID opId = deterministicRandom()->randomUniqueID();
2017-05-26 04:48:44 +08:00
if (randLog)
fmt::print(
randLog, "SFT1 {0} {1} {2} {3}\n", self->dbgId.shortString(), self->filename, opId.shortString(), size);
2017-05-26 04:48:44 +08:00
// KAIO will return EINVAL, as len==0 is an error.
2021-03-08 17:49:29 +08:00
if ((self->flags & IAsyncFile::OPEN_NO_AIO) == 0 && size == 0) {
throw io_error();
}
if (self->delayOnWrite)
wait(waitUntilDiskReady(self->diskParameters, 0));
2017-05-26 04:48:44 +08:00
if (_chsize(self->h, (long)size) == -1) {
TraceEvent(SevWarn, "SimpleFileIOError")
.detail("Location", 6)
.detail("Filename", self->filename)
.detail("Size", size)
.detail("Fd", self->h)
.GetLastError();
2017-05-26 04:48:44 +08:00
throw io_error();
}
if (randLog)
fprintf(randLog,
"SFT2 %s %s %s\n",
self->dbgId.shortString().c_str(),
self->filename.c_str(),
opId.shortString().c_str());
INJECT_FAULT(io_timeout, "SimpleFile::truncate"); // SimpleFile::truncate inject io_timeout
INJECT_FAULT(io_error, "SimpleFile::truncate"); // SimpleFile::truncate inject io_error
2017-05-26 04:48:44 +08:00
return Void();
}
// Simulated sync does not actually do anything besides wait a random amount of time
ACTOR static Future<Void> sync_impl(SimpleFile* self) {
state UID opId = deterministicRandom()->randomUniqueID();
2017-05-26 04:48:44 +08:00
if (randLog)
fprintf(randLog,
"SFC1 %s %s %s\n",
self->dbgId.shortString().c_str(),
self->filename.c_str(),
opId.shortString().c_str());
2017-05-26 04:48:44 +08:00
if (self->delayOnWrite)
wait(waitUntilDiskReady(self->diskParameters, 0, true));
2017-05-26 04:48:44 +08:00
if (self->flags & OPEN_ATOMIC_WRITE_AND_CREATE) {
self->flags &= ~OPEN_ATOMIC_WRITE_AND_CREATE;
auto& machineCache = g_simulator.getCurrentProcess()->machine->openFiles;
std::string sourceFilename = self->filename + ".part";
if (machineCache.count(sourceFilename)) {
TraceEvent("SimpleFileRename")
.detail("From", sourceFilename)
.detail("To", self->filename)
.detail("SourceCount", machineCache.count(sourceFilename))
.detail("FileCount", machineCache.count(self->filename));
renameFile(sourceFilename.c_str(), self->filename.c_str());
2017-05-26 04:48:44 +08:00
machineCache[self->filename] = machineCache[sourceFilename];
machineCache.erase(sourceFilename);
self->actualFilename = self->filename;
}
}
if (randLog)
fprintf(randLog,
"SFC2 %s %s %s\n",
self->dbgId.shortString().c_str(),
self->filename.c_str(),
opId.shortString().c_str());
INJECT_FAULT(io_timeout, "SimpleFile::sync"); // SimpleFile::sync inject io_timeout
INJECT_FAULT(io_error, "SimpleFile::sync"); // SimpleFile::sync inject io_errot
2017-05-26 04:48:44 +08:00
return Void();
}
2020-08-20 08:32:11 +08:00
ACTOR static Future<int64_t> size_impl(SimpleFile const* self) {
state UID opId = deterministicRandom()->randomUniqueID();
2017-05-26 04:48:44 +08:00
if (randLog)
fprintf(randLog,
"SFS1 %s %s %s\n",
self->dbgId.shortString().c_str(),
self->filename.c_str(),
opId.shortString().c_str());
2017-05-26 04:48:44 +08:00
wait(waitUntilDiskReady(self->diskParameters, 0));
2017-05-26 04:48:44 +08:00
int64_t pos = _lseeki64(self->h, 0L, SEEK_END);
if (pos == -1) {
2017-05-26 04:48:44 +08:00
TraceEvent(SevWarn, "SimpleFileIOError").detail("Location", 8);
throw io_error();
}
if (randLog)
fmt::print(
randLog, "SFS2 {0} {1} {2} {3}\n", self->dbgId.shortString(), self->filename, opId.shortString(), pos);
INJECT_FAULT(io_error, "SimpleFile::size"); // SimpleFile::size inject io_error
2017-05-26 04:48:44 +08:00
return pos;
}
};
// Simulated disk-space bookkeeping. All members start at zero, so a default-constructed
// instance never carries indeterminate values.
struct SimDiskSpace {
    int64_t totalSpace = 0;
    int64_t baseFreeSpace = 0; // The original free space of the disk + deltas from simulated external modifications
    double lastUpdate = 0.0;
};
void doReboot(ISimulator::ProcessInfo* const& p, ISimulator::KillType const& kt);
2017-05-26 04:48:44 +08:00
// Simulated listening socket. Other processes hand connections to it through
// incomingConnection(); accept() returns them one at a time.
struct Sim2Listener final : IListener, ReferenceCounted<Sim2Listener> {
    explicit Sim2Listener(ISimulator::ProcessInfo* process, const NetworkAddress& listenAddr)
      : process(process), address(listenAddr) {}

    // Queues `conn` for accept() after `seconds` of delay on the listener's process.
    void incomingConnection(double seconds, Reference<IConnection> conn) { // Called by another process!
        incoming(Reference<Sim2Listener>::addRef(this), seconds, conn);
    }

    void addref() override {
        ReferenceCounted<Sim2Listener>::addref();
    }
    void delref() override {
        ReferenceCounted<Sim2Listener>::delref();
    }

    Future<Reference<IConnection>> accept() override {
        return popOne(nextConnection.getFuture());
    }

    NetworkAddress getListenAddress() const override {
        return address;
    }

private:
    ISimulator::ProcessInfo* process;
    PromiseStream<Reference<IConnection>> nextConnection;

    // Switches to the listener's process, waits `seconds` and then publishes the connection.
    // If the peer is already gone, the connection is dropped half of the time.
    ACTOR static void incoming(Reference<Sim2Listener> self, double seconds, Reference<IConnection> conn) {
        wait(g_simulator.onProcess(self->process));
        wait(delay(seconds));

        Sim2Conn* simConn = static_cast<Sim2Conn*>(conn.getPtr());
        if (simConn->isPeerGone()) {
            // The coin is only flipped when the peer is gone.
            if (deterministicRandom()->random01() < 0.5) {
                return;
            }
        }

        TraceEvent("Sim2IncomingConn", conn->getDebugID())
            .detail("ListenAddress", self->getListenAddress())
            .detail("PeerAddress", conn->getPeerAddress());
        self->nextConnection.send(conn);
    }

    // Takes the next queued connection and marks it as opened.
    ACTOR static Future<Reference<IConnection>> popOne(FutureStream<Reference<IConnection>> conns) {
        Reference<IConnection> accepted = waitNext(conns);
        static_cast<Sim2Conn*>(accepted.getPtr())->opened = true;
        return accepted;
    }

    NetworkAddress address;
};
#define g_sim2 ((Sim2&)g_simulator)
class Sim2 final : public ISimulator, public INetworkConnections {
2017-05-26 04:48:44 +08:00
public:
// Implement INetwork interface
// Everything actually network related is delegated to the Sim2Net class; Sim2 is only concerned with simulating
// machines and time
// Returns the simulator's `time` member.
double now() const override {
    return time;
}
2017-05-26 04:48:44 +08:00
// timer() can be up to 0.1 seconds ahead of now()
// Each call advances timerTime by a random fraction of half the remaining distance
// to (time + 0.1) and returns the new value.
double timer() override {
    const double headroom = time + 0.1 - timerTime;
    timerTime += deterministicRandom()->random01() * headroom / 2.0;
    return timerTime;
}
2020-11-06 01:03:26 +08:00
// Delegates to timer().
double timer_monotonic() override {
    return timer();
}
// Schedules a (possibly buggified) delay on the current process.
// `taskID` must lie within [TaskPriority::Min, TaskPriority::Max].
Future<class Void> delay(double seconds, TaskPriority taskID) override {
    ASSERT(taskID >= TaskPriority::Min && taskID <= TaskPriority::Max);
    return delay(seconds, taskID, currentProcess, /*ordered=*/false);
}
// Same as delay() on the current process, but passes ordered=true, which disables the
// random extra delay.
// `taskID` must lie within [TaskPriority::Min, TaskPriority::Max].
Future<class Void> orderedDelay(double seconds, TaskPriority taskID) override {
    ASSERT(taskID >= TaskPriority::Min && taskID <= TaskPriority::Max);
    return delay(seconds, taskID, currentProcess, /*ordered=*/true);
}
// Queues a task for `machine` that becomes due `seconds` from now (negative values down
// to -0.0001 are clamped to 0) and returns the future handed to that task.
// For unordered delays on the current, healthy process, a random extra delay of up to
// MAX_BUGGIFIED_DELAY is added in a quarter of the calls.
Future<class Void> delay(double seconds, TaskPriority taskID, ProcessInfo* machine, bool ordered = false) {
    ASSERT(seconds >= -0.0001);
    seconds = std::max(0.0, seconds);

    // Evaluated first so that the random roll below is only drawn when buggification is possible.
    const bool buggifyAllowed = !ordered && !currentProcess->rebooting && machine == currentProcess &&
                                !currentProcess->shutdownSignal.isSet() && FLOW_KNOBS->MAX_BUGGIFIED_DELAY > 0;
    // FIXME: why doesn't this work when we are changing machines?
    if (buggifyAllowed && deterministicRandom()->random01() < 0.25) {
        seconds += FLOW_KNOBS->MAX_BUGGIFIED_DELAY * pow(deterministicRandom()->random01(), 1000.0);
    }

    Future<Void> f;
    mutex.enter();
    tasks.push(Task(time + seconds, taskID, taskCount++, machine, f));
    mutex.leave();

    return f;
}
// Waits for the current process' shutdownSignal and then sets the current task
// priority to `taskID`.
ACTOR static Future<Void> checkShutdown(Sim2* self, TaskPriority taskID) {
    auto shutdownFuture = self->getCurrentProcess()->shutdownSignal.getFuture();
    wait(success(shutdownFuture));

    self->setCurrentTask(taskID);
    return Void();
}
// Yields if check_yield() says so; otherwise only switches the current task priority.
// TaskPriority::DefaultYield means "keep the current priority".
Future<class Void> yield(TaskPriority taskID) override {
    if (taskID == TaskPriority::DefaultYield) {
        taskID = currentTaskID;
    }

    if (!check_yield(taskID)) {
        setCurrentTask(taskID);
        return Void();
    }

    // We want to check that yielders can handle actual time elapsing (it sometimes will outside simulation),
    // but don't want to prevent instantaneous shutdown of "rebooted" machines.
    const double pause = getCurrentProcess()->rebooting ? 0 : .001;
    return delay(pause, taskID) || checkShutdown(this, taskID);
}
// Decides whether the caller should yield. Once `yielded` is set it stays set until something
// else resets it (runLoop does so after each task).
bool check_yield(TaskPriority taskID) override {
    if (!yielded) {
        if (--yield_limit <= 0) {
            // If yield returns false *too* many times in a row, there could be a stack overflow, since we
            // can't deterministically check stack size as the real network does
            yield_limit = deterministicRandom()->randomInt(1, 150);
            yielded = true;
        } else {
            yielded = BUGGIFY_WITH_PROB(0.01);
        }
    }
    return yielded;
}
// Returns the priority currently stored in currentTaskID.
TaskPriority getCurrentTask() const override {
    return currentTaskID;
}
// Sets the taskID/priority of the current task, without yielding
void setCurrentTask(TaskPriority taskID) override {
    currentTaskID = taskID;
}
// Opens a simulated connection from the current process to `toAddr`.
//
//   toAddr - address of the process to connect to.
//   host   - must be empty (asserted).
//
// If no process is registered at `toAddr` yet, the call defers to waitForProcessAndConnect().
// Otherwise two Sim2Conn endpoints are created and cross-connected, the peer's listener is
// handed the incoming connection with a latency of 0.5 * random01(), and the caller's endpoint
// is returned through onConnect() after an independently drawn 0.5 * random01() delay.
//
// NOTE: the deterministicRandom() calls are made in the same order as before; presumably that
// order matters for seed-reproducible simulation runs, so do not reorder them.
Future<Reference<IConnection>> connect(NetworkAddress toAddr, const std::string& host) override {
    ASSERT(host.empty());
    if (!addressMap.count(toAddr)) {
        return waitForProcessAndConnect(toAddr, this);
    }

    auto peerp = getProcessByAddress(toAddr);
    auto myc = makeReference<Sim2Conn>(getCurrentProcess());
    auto peerc = makeReference<Sim2Conn>(peerp);
    myc->connect(peerc, toAddr);

    // Build the address the peer sees for us: our own IP offset by a random amount in [0, 256).
    IPAddress localIp;
    if (getCurrentProcess()->address.ip.isV6()) {
        IPAddress::IPAddressStore store = getCurrentProcess()->address.ip.toV6();
        // Add the offset to the last 16-bit group (bytes 14-15, native byte order). The bytes are
        // copied in and out through unsigned char pointers instead of writing through a
        // uint16_t* alias of the store, which would break strict-aliasing/alignment rules.
        unsigned char* storeBytes = reinterpret_cast<unsigned char*>(store.data());
        uint16_t lastGroup = 0;
        unsigned char* groupBytes = reinterpret_cast<unsigned char*>(&lastGroup);
        groupBytes[0] = storeBytes[14];
        groupBytes[1] = storeBytes[15];
        lastGroup += deterministicRandom()->randomInt(0, 256);
        storeBytes[14] = groupBytes[0];
        storeBytes[15] = groupBytes[1];
        localIp = IPAddress(store);
    } else {
        localIp = IPAddress(getCurrentProcess()->address.ip.toV4() + deterministicRandom()->randomInt(0, 256));
    }

    // The peer-side endpoint gets a random source port in [40000, 60000) and inherits TLS-ness
    // from the target address.
    peerc->connect(myc,
                   NetworkAddress(localIp, deterministicRandom()->randomInt(40000, 60000), false, toAddr.isTLS()));

    static_cast<Sim2Listener*>(peerp->getListener(toAddr).getPtr())
        ->incomingConnection(0.5 * deterministicRandom()->random01(), Reference<IConnection>(peerc));
    return onConnect(::delay(0.5 * deterministicRandom()->random01()), myc);
}
// Delegates to SimExternalConnection::connect(); `host` is not consulted here.
Future<Reference<IConnection>> connectExternal(NetworkAddress toAddr, const std::string& host) override {
    Future<Reference<IConnection>> external = SimExternalConnection::connect(toAddr);
    return external;
}
// UDP socket factories. Only declared here; the definitions are not part of this class body.
// One overload takes a NetworkAddress, the other only an IPv6 flag (defaulting to false).
Future<Reference<IUDPSocket>> createUDPSocket(NetworkAddress toAddr) override;
Future<Reference<IUDPSocket>> createUDPSocket(bool isV6 = false) override;
// Add a <hostname, vector<NetworkAddress>> pair to mock DNS in simulation.
void addMockTCPEndpoint(const std::string& host,
const std::string& service,
const std::vector<NetworkAddress>& addresses) override {
2022-04-02 15:22:54 +08:00
mockDNS.add(host, service, addresses);
}
void removeMockTCPEndpoint(const std::string& host, const std::string& service) override {
2022-04-02 15:22:54 +08:00
mockDNS.remove(host, service);
}
2022-01-15 11:02:35 +08:00
// Convert hostnameToAddresses from/to string. The format is:
// hostname1,host1Address1,host1Address2;hostname2,host2Address1,host2Address2...
2022-04-02 15:22:54 +08:00
void parseMockDNSFromString(const std::string& s) override { mockDNS = DNSCache::parseFromString(s); }
2022-01-15 11:02:35 +08:00
// Serialises the mock DNS table via DNSCache::toString().
std::string convertMockDNSToString() override {
    return mockDNS.toString();
}
// Resolves (host, service) to addresses. A mock DNS entry wins when present; otherwise the
// request goes to SimExternalConnection::resolveTCPEndpoint(), which is given dnsCache.
Future<std::vector<NetworkAddress>> resolveTCPEndpoint(const std::string& host,
                                                       const std::string& service) override {
    // If a <hostname, vector<NetworkAddress>> pair was injected to mock DNS, use it.
    Optional<std::vector<NetworkAddress>> mocked = mockDNS.find(host, service);
    if (!mocked.present()) {
        return SimExternalConnection::resolveTCPEndpoint(host, service, &dnsCache);
    }
    return mocked.get();
}
// Resolves (host, service) to addresses, consulting in order: the mock DNS table, the DNS
// cache, and finally SimExternalConnection::resolveTCPEndpoint().
Future<std::vector<NetworkAddress>> resolveTCPEndpointWithDNSCache(const std::string& host,
                                                                   const std::string& service) override {
    // If a <hostname, vector<NetworkAddress>> pair was injected to mock DNS, use it.
    Optional<std::vector<NetworkAddress>> mocked = mockDNS.find(host, service);
    if (mocked.present()) {
        return mocked.get();
    }

    Optional<std::vector<NetworkAddress>> cached = dnsCache.find(host, service);
    if (!cached.present()) {
        return SimExternalConnection::resolveTCPEndpoint(host, service, &dnsCache);
    }
    return cached.get();
}
std::vector<NetworkAddress> resolveTCPEndpointBlocking(const std::string& host,
const std::string& service) override {
// If a <hostname, vector<NetworkAddress>> pair was injected to mock DNS, use it.
2022-04-02 15:22:54 +08:00
Optional<std::vector<NetworkAddress>> mock = mockDNS.find(host, service);
if (mock.present()) {
return mock.get();
}
2022-04-04 08:33:18 +08:00
return SimExternalConnection::resolveTCPEndpointBlocking(host, service, &dnsCache);
}
std::vector<NetworkAddress> resolveTCPEndpointBlockingWithDNSCache(const std::string& host,
const std::string& service) override {
// If a <hostname, vector<NetworkAddress>> pair was injected to mock DNS, use it.
Optional<std::vector<NetworkAddress>> mock = mockDNS.find(host, service);
if (mock.present()) {
return mock.get();
}
Optional<std::vector<NetworkAddress>> cache = dnsCache.find(host, service);
if (cache.present()) {
return cache.get();
}
return SimExternalConnection::resolveTCPEndpointBlocking(host, service, &dnsCache);
}
// Completes a connect() once `ready` fires. If the peer endpoint is already gone, the
// connection reference is dropped and the actor either throws connection_failed() or waits
// forever, as selected by FLOW_KNOBS->SIM_CONNECT_ERROR_MODE (1: always throw, 2: throw when
// a random draw exceeds 0.5, anything else: never complete). Otherwise the connection is
// marked opened and returned.
ACTOR static Future<Reference<IConnection>> onConnect(Future<Void> ready, Reference<Sim2Conn> conn) {
    wait(ready);
    if (conn->isPeerGone()) {
        conn.clear();
        const auto errorMode = FLOW_KNOBS->SIM_CONNECT_ERROR_MODE;
        const bool failNow = errorMode == 1 || (errorMode == 2 && deterministicRandom()->random01() > 0.5);
        if (failNow) {
            throw connection_failed();
        }
        wait(Never());
    }
    conn->opened = true;
    return conn;
}
// Returns the current process' listener for localAddr; asserts that one exists.
Reference<IListener> listen(NetworkAddress localAddr) override {
    Reference<IListener> found = getCurrentProcess()->getListener(localAddr);
    ASSERT(found);
    return found;
}
// Polls, with a random pause of 0.1 * random01() between attempts, until g_sim2.addressMap
// contains toAddr, then connects through `self` and returns that connection.
ACTOR static Future<Reference<IConnection>> waitForProcessAndConnect(NetworkAddress toAddr,
                                                                     INetworkConnections* self) {
    // We have to be able to connect to processes that don't yet exist, so we do some silly polling
    loop {
        wait(::delay(0.1 * deterministicRandom()->random01()));
        if (g_sim2.addressMap.count(toAddr) > 0) {
            Reference<IConnection> established = wait(self->connect(toAddr));
            return established;
        }
    }
}
// Always hands out the same default-constructed TLSConfig instance.
const TLSConfig& getTLSConfig() const override {
    static TLSConfig defaultConstructedConfig;
    return defaultConstructedConfig;
}
// Forwards the question to the wrapped net2 network.
bool checkRunnable() override {
    return net2->checkRunnable();
}
#if defined(ENABLE_SAMPLING)
// Only compiled when ENABLE_SAMPLING is defined: exposes the actorLineageSet member.
ActorLineageSet& getActorLineageSet() override {
    return actorLineageSet;
}
#endif
// Raises the isStopped flag, which runLoop() checks before starting each task.
void stop() override {
    isStopped = true;
}
// Registers fn in stopCallbacks; runLoop() invokes these, in registration order, after its
// loop exits.
void addStopCallback(std::function<void()> fn) override {
    stopCallbacks.emplace_back(std::move(fn));
}
// This network implementation always reports itself as simulated.
bool isSimulated() const override {
    return true;
}
struct SimThreadArgs {
THREAD_FUNC_RETURN (*func)(void*);
void* arg;
2017-05-26 04:48:44 +08:00
ISimulator::ProcessInfo* currentProcess;
2017-05-26 04:48:44 +08:00
SimThreadArgs(THREAD_FUNC_RETURN (*func)(void*), void* arg) : func(func), arg(arg) {
2017-05-26 04:48:44 +08:00
ASSERT(g_network->isSimulated());
currentProcess = g_simulator.getCurrentProcess();
}
};
// Starts a new thread, making sure to set any thread local state
// `arg` must point to a heap-allocated SimThreadArgs; it is deleted here after the wrapped
// function returns.
THREAD_FUNC simStartThread(void* arg) {
    SimThreadArgs* const bundle = static_cast<SimThreadArgs*>(arg);
    ISimulator::currentProcess = bundle->currentProcess;
    bundle->func(bundle->arg);
    delete bundle;
    THREAD_RETURN;
}
// Starts a real thread through ::startThread(), routed via simStartThread(). The heap-allocated
// SimThreadArgs passed along is deleted by simStartThread().
THREAD_HANDLE startThread(THREAD_FUNC_RETURN (*func)(void*), void* arg, int stackSize, const char* name) override {
    SimThreadArgs* const bundle = new SimThreadArgs(func, arg);
    return ::startThread(simStartThread, bundle, stackSize, name);
}
void getDiskBytes(std::string const& directory, int64_t& free, int64_t& total) override {
ProcessInfo* proc = getCurrentProcess();
SimDiskSpace& diskSpace = diskSpaceMap[proc->address.ip];
2017-05-26 04:48:44 +08:00
int64_t totalFileSize = 0;
int numFiles = 0;
// Get the size of all files we've created on the server and subtract them from the free space
for (auto file = proc->machine->openFiles.begin(); file != proc->machine->openFiles.end(); ++file) {
if (file->second.get().isReady()) {
totalFileSize += ((AsyncFileNonDurable*)file->second.get().get().getPtr())->approximateSize;
2017-05-26 04:48:44 +08:00
}
numFiles++;
}
if (diskSpace.totalSpace == 0) {
diskSpace.totalSpace = 5e9 + deterministicRandom()->random01() * 100e9; // Total space between 5GB and 105GB
diskSpace.baseFreeSpace = std::min<int64_t>(
diskSpace.totalSpace,
std::max(5e9, (deterministicRandom()->random01() * (1 - .075) + .075) * diskSpace.totalSpace) +
totalFileSize); // Minimum 5GB or 7.5% total disk space, whichever is higher
2017-05-26 04:48:44 +08:00
TraceEvent("Sim2DiskSpaceInitialization")
.detail("TotalSpace", diskSpace.totalSpace)
.detail("BaseFreeSpace", diskSpace.baseFreeSpace)
.detail("TotalFileSize", totalFileSize)
.detail("NumFiles", numFiles);
} else {
int64_t maxDelta = std::min(5.0, (now() - diskSpace.lastUpdate)) *
(BUGGIFY ? 10e6 : 1e6); // External processes modifying the disk
int64_t delta = -maxDelta + deterministicRandom()->random01() * maxDelta * 2;
diskSpace.baseFreeSpace = std::min<int64_t>(
diskSpace.totalSpace, std::max<int64_t>(diskSpace.baseFreeSpace + delta, totalFileSize));
2017-05-26 04:48:44 +08:00
}
diskSpace.lastUpdate = now();
total = diskSpace.totalSpace;
free = std::max<int64_t>(0, diskSpace.baseFreeSpace - totalFileSize);
if (free == 0)
TraceEvent(SevWarnAlways, "Sim2NoFreeSpace")
.detail("TotalSpace", diskSpace.totalSpace)
.detail("BaseFreeSpace", diskSpace.baseFreeSpace)
.detail("TotalFileSize", totalFileSize)
.detail("NumFiles", numFiles);
2017-05-26 04:48:44 +08:00
}
// True when addr has the same IP as the current process; ports are not compared.
bool isAddressOnThisHost(NetworkAddress const& addr) const override {
    const auto& ownIp = getCurrentProcess()->address.ip;
    return addr.ip == ownIp;
}
ACTOR static Future<Void> deleteFileImpl(Sim2* self, std::string filename, bool mustBeDurable) {
2017-05-26 04:48:44 +08:00
// This is a _rudimentary_ simulation of the untrustworthiness of non-durable deletes and the possibility of
// rebooting during a durable one. It isn't perfect: for example, on real filesystems testing
// for the existence of a non-durably deleted file BEFORE a reboot will show that it apparently doesn't exist.
if (g_simulator.getCurrentProcess()->machine->openFiles.count(filename)) {
g_simulator.getCurrentProcess()->machine->openFiles.erase(filename);
g_simulator.getCurrentProcess()->machine->deletingOrClosingFiles.insert(filename);
}
if (mustBeDurable || deterministicRandom()->random01() < 0.5) {
state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess();
state TaskPriority currentTaskID = g_network->getCurrentTask();
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
TraceEvent(SevDebug, "Sim2DeleteFileImpl")
.detail("CurrentProcess", currentProcess->toString())
.detail("Filename", filename)
.detail("Durable", mustBeDurable);
wait(g_simulator.onMachine(currentProcess));
try {
wait(::delay(0.05 * deterministicRandom()->random01()));
if (!currentProcess->rebooting) {
auto f = IAsyncFileSystem::filesystem(self->net2)->deleteFile(filename, false);
ASSERT(f.isReady());
wait(::delay(0.05 * deterministicRandom()->random01()));
TEST(true); // Simulated durable delete
}
wait(g_simulator.onProcess(currentProcess, currentTaskID));
return Void();
} catch (Error& e) {
state Error err = e;
wait(g_simulator.onProcess(currentProcess, currentTaskID));
throw err;
2017-05-26 04:48:44 +08:00
}
} else {
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
TraceEvent(SevDebug, "Sim2DeleteFileImplNonDurable")
.detail("Filename", filename)
.detail("Durable", mustBeDurable);
TEST(true); // Simulated non-durable delete
2017-05-26 04:48:44 +08:00
return Void();
}
}
// Main simulator loop: repeatedly pops the top task and executes it until isStopped is set,
// then restores the process that was current on entry and runs the registered stop callbacks.
// An empty task queue while not stopped is treated as a fatal error.
static void runLoop(Sim2* self) {
    ISimulator::ProcessInfo* const processOnEntry = self->currentProcess;

    while (!self->isStopped) {
        self->mutex.enter();
        if (self->tasks.empty()) {
            self->mutex.leave();
            ASSERT(false);
        }
        // if (!randLog/* && now() >= 32.0*/)
        //     randLog = fopen("randLog.txt", "wt");
        Task next = std::move(self->tasks.top()); // Unfortunately still a copy under gcc where .top() returns const&
        self->currentTaskID = next.taskID;
        self->tasks.pop();
        self->mutex.leave();

        self->execTask(next);
        self->yielded = false;
    }

    self->currentProcess = processOnEntry;
    for (auto& callback : self->stopCallbacks) {
        callback();
    }
}
// Implement ISimulator interface
// run() drives the simulation by entering runLoop() on this instance.
void run() override {
    runLoop(this);
}
ProcessInfo* newProcess(const char* name,
IPAddress ip,
uint16_t port,
bool sslEnabled,
uint16_t listenPerProcess,
LocalityData locality,
ProcessClass startingClass,
const char* dataFolder,
const char* coordinationFolder,
ProtocolVersion protocol) override {
ASSERT(locality.machineId().present());
MachineInfo& machine = machines[locality.machineId().get()];
if (!machine.machineId.present())
machine.machineId = locality.machineId();
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
if (port == 0 && std::string(name) == "remote flow process") {
port = machine.getRandomPort();
}
for (int i = 0; i < machine.processes.size(); i++) {
if (machine.processes[i]->locality.machineId() !=
locality.machineId()) { // SOMEDAY: compute ip from locality to avoid this check
TraceEvent("Sim2Mismatch")
.detail("IP", format("%s", ip.toString().c_str()))
.detail("MachineId", locality.machineId())
.detail("NewName", name)
.detail("ExistingMachineId", machine.processes[i]->locality.machineId())
.detail("ExistingName", machine.processes[i]->name);
ASSERT(false);
2017-05-26 04:48:44 +08:00
}
ASSERT(machine.processes[i]->address.port != port);
2017-05-26 04:48:44 +08:00
}
// This is for async operations on non-durable files.
// These files must live on after process kills for sim purposes.
if (machine.machineProcess == 0) {
2017-05-26 04:48:44 +08:00
NetworkAddress machineAddress(ip, 0, false, false);
machine.machineProcess =
new ProcessInfo("Machine", locality, startingClass, { machineAddress }, this, "", "");
2017-05-26 04:48:44 +08:00
machine.machineProcess->machine = &machine;
}
NetworkAddressList addresses;
2020-02-11 06:00:21 +08:00
addresses.address = NetworkAddress(ip, port, true, sslEnabled);
2020-03-13 01:18:53 +08:00
if (listenPerProcess == 2) { // listenPerProcess is only 1 or 2
addresses.secondaryAddress = NetworkAddress(ip, port + 1, true, false);
}
ProcessInfo* m =
new ProcessInfo(name, locality, startingClass, addresses, this, dataFolder, coordinationFolder);
for (int processPort = port; processPort < port + listenPerProcess; ++processPort) {
2020-02-11 06:00:21 +08:00
NetworkAddress address(ip, processPort, true, sslEnabled && processPort == port);
m->listenerMap[address] = Reference<IListener>(new Sim2Listener(m, address));
addressMap[address] = m;
}
2017-05-26 04:48:44 +08:00
m->machine = &machine;
machine.processes.push_back(m);
currentlyRebootingProcesses.erase(addresses.address);
m->excluded = g_simulator.isExcluded(NetworkAddress(ip, port, true, false));
m->cleared = g_simulator.isCleared(addresses.address);
m->protocolVersion = protocol;
2017-05-26 04:48:44 +08:00
m->setGlobal(enTDMetrics, (flowGlobalType)&m->tdmetrics);
2021-07-29 07:03:37 +08:00
if (FLOW_KNOBS->ENABLE_CHAOS_FEATURES) {
m->setGlobal(enChaosMetrics, (flowGlobalType)&m->chaosMetrics);
}
m->setGlobal(enNetworkConnections, (flowGlobalType)m->network);
m->setGlobal(enASIOTimedOut, (flowGlobalType) false);
2017-05-26 04:48:44 +08:00
TraceEvent("NewMachine")
.detail("Name", name)
.detail("Address", m->address)
.detail("MachineId", m->locality.machineId())
.detail("Excluded", m->excluded)
.detail("Cleared", m->cleared);
2017-05-26 04:48:44 +08:00
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
if (std::string(name) == "remote flow process") {
protectedAddresses.insert(m->address);
TraceEvent(SevDebug, "NewFlowProcessProtected").detail("Address", m->address);
}
2017-05-26 04:48:44 +08:00
// FIXME: Sometimes, connections to/from this process will explicitly close
return m;
}
bool isAvailable() const override {
2018-07-17 06:56:43 +08:00
std::vector<ProcessInfo*> processesLeft, processesDead;
for (auto processInfo : getAllProcesses()) {
if (processInfo->isAvailableClass()) {
2018-07-17 06:56:43 +08:00
if (processInfo->isExcluded() || processInfo->isCleared() || !processInfo->isAvailable()) {
processesDead.push_back(processInfo);
2018-07-17 06:56:43 +08:00
} else {
processesLeft.push_back(processInfo);
2018-07-17 06:56:43 +08:00
}
}
}
2020-08-19 05:43:17 +08:00
return canKillProcesses(processesLeft, processesDead, KillInstantly, nullptr);
}
2017-05-26 04:48:44 +08:00
// Decides whether the datacenter `dcId` counts as dead. An absent dcId is never dead.
// Processes of an available class in that datacenter are split into dead (excluded, cleared,
// or not available) and left; the datacenter is dead when the dead set satisfies the tLog
// policy check, the remote tLog policy (only with usableRegions > 1), or the storage policy.
bool datacenterDead(Optional<Standalone<StringRef>> dcId) const override {
    if (!dcId.present()) {
        return false;
    }

    LocalityGroup primaryProcessesLeft, primaryProcessesDead;
    std::vector<LocalityData> primaryLocalitiesDead, primaryLocalitiesLeft;

    for (auto processInfo : getAllProcesses()) {
        if (!processInfo->isAvailableClass() || !(processInfo->locality.dcId() == dcId)) {
            continue;
        }
        const bool unusable =
            processInfo->isExcluded() || processInfo->isCleared() || !processInfo->isAvailable();
        if (unusable) {
            primaryProcessesDead.add(processInfo->locality);
            primaryLocalitiesDead.push_back(processInfo->locality);
        } else {
            primaryProcessesLeft.add(processInfo->locality);
            primaryLocalitiesLeft.push_back(processInfo->locality);
        }
    }

    std::vector<LocalityData> badCombo;

    bool primaryTLogsDead;
    if (tLogWriteAntiQuorum) {
        // With an anti-quorum configured, check combinations of the remaining localities.
        primaryTLogsDead = !validateAllCombinations(
            badCombo, primaryProcessesDead, tLogPolicy, primaryLocalitiesLeft, tLogWriteAntiQuorum, false);
    } else {
        primaryTLogsDead = primaryProcessesDead.validate(tLogPolicy);
    }

    if (usableRegions > 1 && remoteTLogPolicy && !primaryTLogsDead) {
        primaryTLogsDead = primaryProcessesDead.validate(remoteTLogPolicy);
    }

    if (primaryTLogsDead) {
        return true;
    }
    return primaryProcessesDead.validate(storagePolicy);
}
// Determines whether the cluster can survive with `availableProcesses` still running and
// `deadProcesses` gone.
//
// Only KillInstantly, InjectFaults, FailDisk, RebootAndDelete and RebootProcessAndDelete are
// evaluated; every other kill type is reported as survivable and passed through unchanged.
// For the evaluated kill types the processes are grouped by region (primary, remote and their
// satellites) and checked against the replication policies:
//   - if the dead processes alone fulfill a policy, the kill is changed to Reboot;
//   - otherwise, for kill types below RebootAndDelete, if the remaining processes cannot fulfill
//     the policies, or fewer distinct zones remain than the coordinator quorum needs, the kill is
//     changed to RebootAndDelete.
// `newKillType`, when non-null, receives the resulting kill type. Returns false whenever the kill
// type had to be changed, true otherwise.
bool canKillProcesses(std::vector<ProcessInfo*> const& availableProcesses,
                      std::vector<ProcessInfo*> const& deadProcesses,
                      KillType kt,
                      KillType* newKillType) const override {
    const bool policyChecked = (kt == KillInstantly) || (kt == InjectFaults) || (kt == FailDisk) ||
                               (kt == RebootAndDelete) || (kt == RebootProcessAndDelete);
    if (!policyChecked) {
        if (newKillType)
            *newKillType = kt;
        return true;
    }

    // Surviving and dead processes of one region, both as a LocalityGroup and as a plain list.
    struct RegionTally {
        LocalityGroup left, dead;
        std::vector<LocalityData> leftLocalities, deadLocalities;
        void recordLeft(LocalityData const& loc) {
            left.add(loc);
            leftLocalities.push_back(loc);
        }
        void recordDead(LocalityData const& loc) {
            dead.add(loc);
            deadLocalities.push_back(loc);
        }
    };
    RegionTally primary, primarySatellite, remote, remoteSatellite;
    std::vector<LocalityData> badCombo;
    std::set<Optional<Standalone<StringRef>>> uniqueMachines;
    int nQuorum = ((desiredCoordinators + 1) / 2) * 2 - 1;

    // Maps a datacenter id to the tally of the region it belongs to (nullptr if it belongs to none).
    auto listed = [](auto const& dcIds, auto const& dc) {
        return std::find(dcIds.begin(), dcIds.end(), dc) != dcIds.end();
    };
    auto tallyFor = [&](auto const& dc) -> RegionTally* {
        if (dc == primaryDcId)
            return &primary;
        if (dc == remoteDcId)
            return &remote;
        if (listed(primarySatelliteDcIds, dc))
            return &primarySatellite;
        if (listed(remoteSatelliteDcIds, dc))
            return &remoteSatellite;
        return nullptr;
    };

    if (!primaryDcId.present()) {
        // No primary datacenter configured: every process is counted towards the primary tally.
        for (ProcessInfo* proc : availableProcesses) {
            primary.recordLeft(proc->locality);
            uniqueMachines.insert(proc->locality.zoneId());
        }
        for (ProcessInfo* proc : deadProcesses) {
            primary.recordDead(proc->locality);
        }
    } else {
        for (ProcessInfo* proc : availableProcesses) {
            uniqueMachines.insert(proc->locality.zoneId());
            if (RegionTally* tally = tallyFor(proc->locality.dcId())) {
                tally->recordLeft(proc->locality);
            }
        }
        for (ProcessInfo* proc : deadProcesses) {
            if (RegionTally* tally = tallyFor(proc->locality.dcId())) {
                tally->recordDead(proc->locality);
            }
        }
    }

    // True when the dead processes fulfill `policy`; with a non-zero anti-quorum every combination
    // of that many surviving localities is taken into account via validateAllCombinations().
    auto policyLost = [&](LocalityGroup& dead,
                          std::vector<LocalityData>& leftLocalities,
                          auto antiQuorum,
                          auto const& policy) -> bool {
        return antiQuorum ? !validateAllCombinations(badCombo, dead, policy, leftLocalities, antiQuorum, false)
                          : dead.validate(policy);
    };
    // tLog check for a full region, including the remote tLog policy when several regions are usable.
    auto regionTLogsLost = [&](RegionTally& region) -> bool {
        bool lost = policyLost(region.dead, region.leftLocalities, tLogWriteAntiQuorum, tLogPolicy);
        if (usableRegions > 1 && remoteTLogPolicy && !lost) {
            lost = region.dead.validate(remoteTLogPolicy);
        }
        return lost;
    };
    // True when the survivors of a region (and, if given, of its satellite) miss any required policy.
    auto regionShort = [&](RegionTally& region, RegionTally* satellite) -> bool {
        return !region.left.validate(tLogPolicy) ||
               (usableRegions > 1 && !region.left.validate(remoteTLogPolicy)) ||
               !region.left.validate(storagePolicy) ||
               (satellite != nullptr && !satellite->left.validate(satelliteTLogPolicy));
    };

    bool tooManyDead = false;
    bool notEnoughLeft = false;
    const bool primaryTLogsDead = regionTLogsLost(primary);
    if (!primaryDcId.present()) {
        tooManyDead = primaryTLogsDead || primary.dead.validate(storagePolicy);
        notEnoughLeft = !primary.left.validate(tLogPolicy) || !primary.left.validate(storagePolicy);
    } else {
        const bool remoteTLogsDead = regionTLogsLost(remote);
        if (!hasSatelliteReplication) {
            if (usableRegions > 1) {
                // With two usable regions, storage is only lost if both regions lost it.
                tooManyDead = primaryTLogsDead || remoteTLogsDead ||
                              (primary.dead.validate(storagePolicy) && remote.dead.validate(storagePolicy));
            } else {
                tooManyDead = primaryTLogsDead || remoteTLogsDead || primary.dead.validate(storagePolicy) ||
                              remote.dead.validate(storagePolicy);
            }
            notEnoughLeft = regionShort(primary, nullptr) || regionShort(remote, nullptr);
        } else {
            const bool primarySatelliteTLogsDead = policyLost(primarySatellite.dead,
                                                              primarySatellite.leftLocalities,
                                                              satelliteTLogWriteAntiQuorumFallback,
                                                              satelliteTLogPolicyFallback);
            const bool remoteSatelliteTLogsDead = policyLost(remoteSatellite.dead,
                                                             remoteSatellite.leftLocalities,
                                                             satelliteTLogWriteAntiQuorumFallback,
                                                             satelliteTLogPolicyFallback);
            notEnoughLeft = regionShort(primary, &primarySatellite) || regionShort(remote, &remoteSatellite);
            if (usableRegions > 1 && allowLogSetKills) {
                tooManyDead = (primaryTLogsDead && primarySatelliteTLogsDead) ||
                              (remoteTLogsDead && remoteSatelliteTLogsDead) ||
                              (primaryTLogsDead && remoteTLogsDead) ||
                              (primary.dead.validate(storagePolicy) && remote.dead.validate(storagePolicy));
            } else {
                tooManyDead = primaryTLogsDead || remoteTLogsDead || primary.dead.validate(storagePolicy) ||
                              remote.dead.validate(storagePolicy);
            }
        }
    }

    bool canSurvive = true;
    KillType newKt = kt;
    if (tooManyDead) {
        // Reboot if dead machines do fulfill policies
        newKt = Reboot;
        canSurvive = false;
        TraceEvent("KillChanged")
            .detail("KillType", kt)
            .detail("NewKillType", newKt)
            .detail("TLogPolicy", tLogPolicy->info())
            .detail("Reason", "Too many dead processes that cannot satisfy tLogPolicy.");
    } else if ((kt < RebootAndDelete) && notEnoughLeft) {
        // Reboot and Delete if remaining machines do NOT fulfill policies
        newKt = RebootAndDelete;
        canSurvive = false;
        TraceEvent("KillChanged")
            .detail("KillType", kt)
            .detail("NewKillType", newKt)
            .detail("TLogPolicy", tLogPolicy->info())
            .detail("Reason", "Not enough tLog left to satisfy tLogPolicy.");
    } else if ((kt < RebootAndDelete) && (nQuorum > uniqueMachines.size())) {
        newKt = RebootAndDelete;
        canSurvive = false;
        TraceEvent("KillChanged")
            .detail("KillType", kt)
            .detail("NewKillType", newKt)
            .detail("StoragePolicy", storagePolicy->info())
            .detail("Quorum", nQuorum)
            .detail("Machines", uniqueMachines.size())
            .detail("Reason", "Not enough unique machines to perform auto configuration of coordinators.");
    } else {
        TraceEvent("CanSurviveKills")
            .detail("KillType", kt)
            .detail("TLogPolicy", tLogPolicy->info())
            .detail("StoragePolicy", storagePolicy->info())
            .detail("Quorum", nQuorum)
            .detail("Machines", uniqueMachines.size());
    }

    if (newKillType)
        *newKillType = newKt;
    return canSurvive;
}
void destroyProcess(ISimulator::ProcessInfo* p) override {
TraceEvent("ProcessDestroyed")
.detail("Name", p->name)
.detail("Address", p->address)
.detail("MachineId", p->locality.machineId());
2017-05-26 04:48:44 +08:00
currentlyRebootingProcesses.insert(std::pair<NetworkAddress, ProcessInfo*>(p->address, p));
std::vector<ProcessInfo*>& processes = machines[p->locality.machineId().get()].processes;
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
machines[p->locality.machineId().get()].removeRemotePort(p->address.port);
if (p != processes.back()) {
auto it = std::find(processes.begin(), processes.end(), p);
std::swap(*it, processes.back());
2017-05-26 04:48:44 +08:00
}
processes.pop_back();
killProcess_internal(p, KillInstantly);
2017-05-26 04:48:44 +08:00
}
void killProcess_internal(ProcessInfo* machine, KillType kt) {
TEST(true); // Simulated machine was killed with any kill type
TEST(kt == KillInstantly); // Simulated machine was killed instantly
TEST(kt == InjectFaults); // Simulated machine was killed with faults
TEST(kt == FailDisk); // Simulated machine was killed with a failed disk
2017-05-26 04:48:44 +08:00
if (kt == KillInstantly) {
TraceEvent(SevWarn, "FailMachine")
.detail("Name", machine->name)
.detail("Address", machine->address)
.detail("ZoneId", machine->locality.zoneId())
.detail("Process", machine->toString())
.detail("Rebooting", machine->rebooting)
.detail("Protected", protectedAddresses.count(machine->address))
.backtrace();
2017-05-26 04:48:44 +08:00
// This will remove all the "tracked" messages that came from the machine being killed
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
if (std::string(machine->name) != "remote flow process")
latestEventCache.clear();
2017-05-26 04:48:44 +08:00
machine->failed = true;
} else if (kt == InjectFaults) {
TraceEvent(SevWarn, "FaultMachine")
.detail("Name", machine->name)
.detail("Address", machine->address)
.detail("ZoneId", machine->locality.zoneId())
.detail("Process", machine->toString())
.detail("Rebooting", machine->rebooting)
.detail("Protected", protectedAddresses.count(machine->address))
.backtrace();
2017-05-26 04:48:44 +08:00
should_inject_fault = simulator_should_inject_fault;
machine->fault_injection_r = deterministicRandom()->randomUniqueID().first();
2017-05-26 04:48:44 +08:00
machine->fault_injection_p1 = 0.1;
machine->fault_injection_p2 = deterministicRandom()->random01();
} else if (kt == FailDisk) {
TraceEvent(SevWarn, "FailDiskMachine")
.detail("Name", machine->name)
.detail("Address", machine->address)
.detail("ZoneId", machine->locality.zoneId())
.detail("Process", machine->toString())
.detail("Rebooting", machine->rebooting)
.detail("Protected", protectedAddresses.count(machine->address))
.backtrace();
machine->failedDisk = true;
2017-05-26 04:48:44 +08:00
} else {
ASSERT(false);
2017-05-26 04:48:44 +08:00
}
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
ASSERT(!protectedAddresses.count(machine->address) || machine->rebooting ||
std::string(machine->name) == "remote flow process");
2017-05-26 04:48:44 +08:00
}
// Reboots a single process with the given kill type.
// A RebootProcessAndDelete aimed at a protected address is downgraded to RebootProcess (and traced).
void rebootProcess(ProcessInfo* process, KillType kt) override {
    const bool deleteOnProtected = (kt == RebootProcessAndDelete) && protectedAddresses.count(process->address);
    KillType effectiveKt = kt;
    if (deleteOnProtected) {
        TraceEvent("RebootChanged")
            .detail("ZoneId", process->locality.describeZone())
            .detail("KillType", RebootProcess)
            .detail("OrigKillType", kt)
            .detail("Reason", "Protected process");
        effectiveKt = RebootProcess;
    }
    doReboot(process, effectiveKt);
}
// Reboots (RebootProcess) processes located in zone `zoneId` that are not already rebooting.
// With `allProcesses` set, every such process is rebooted; otherwise exactly one of them is picked
// with deterministicRandom() (nothing happens if there is no candidate).
void rebootProcess(Optional<Standalone<StringRef>> zoneId, bool allProcesses) override {
    auto candidates = getAllProcesses();

    if (allProcesses) {
        for (auto& candidate : candidates) {
            if (candidate->locality.zoneId() == zoneId && !candidate->rebooting) {
                doReboot(candidate, RebootProcess);
            }
        }
        return;
    }

    // Filter in place with swapAndPop; after a removal the same index holds a not-yet-inspected
    // entry, so the index only advances when the current entry is kept.
    int idx = 0;
    while (idx < candidates.size()) {
        if (candidates[idx]->locality.zoneId() != zoneId || candidates[idx]->rebooting) {
            swapAndPop(&candidates, idx);
        } else {
            idx++;
        }
    }
    if (candidates.size()) {
        doReboot(deterministicRandom()->randomChoice(candidates), RebootProcess);
    }
}
// Kills a single process. The attempt is always traced; only kill types below RebootAndDelete are
// actually carried out (via killProcess_internal), all others are ignored.
void killProcess(ProcessInfo* machine, KillType kt) override {
    TraceEvent("AttemptingKillProcess").detail("ProcessInfo", machine->toString());
    if (kt >= RebootAndDelete) {
        return;
    }
    killProcess_internal(machine, kt);
}
// Kills every process on the machine that hosts `address`. Kill types at or above RebootAndDelete
// are ignored.
// NOTE(review): addressMap[address] is dereferenced unconditionally; this presumably relies on
// callers only passing addresses that are registered in addressMap - confirm.
void killInterface(NetworkAddress address, KillType kt) override {
    if (kt >= RebootAndDelete) {
        return;
    }
    std::vector<ProcessInfo*>& victims = machines[addressMap[address]->locality.machineId()].processes;
    for (ProcessInfo* victim : victims) {
        killProcess_internal(victim, kt);
    }
}
// Kills every machine that has at least one process in zone `zoneId` by calling killMachine() once
// per distinct machine id. Returns true if at least one of those calls returned true. `ktFinal` is
// handed to each killMachine() call, so it ends up holding whatever the last call wrote.
bool killZone(Optional<Standalone<StringRef>> zoneId, KillType kt, bool forceKill, KillType* ktFinal) override {
    std::set<Optional<Standalone<StringRef>>> machinesInZone;
    for (ProcessInfo* process : getAllProcesses()) {
        if (process->locality.zoneId() == zoneId) {
            machinesInZone.insert(process->locality.machineId());
        }
    }

    bool anyKilled = false;
    for (auto& targetMachine : machinesInZone) {
        // Always attempt the kill, regardless of earlier results.
        const bool killed = killMachine(targetMachine, kt, forceKill, ktFinal);
        anyKilled = anyKilled || killed;
    }
    return anyKilled;
}
// Attempts to kill (or reboot) all processes of machine `machineId`.
//
// Steps, in order:
//   1. Unforced kills are refused while speedUpSimulation is set.
//   2. If any process on the machine has a protected address, the kill type becomes Reboot.
//   3. The kill is refused if no process on the machine is currently not rebooting.
//   4. Unless `forceKill` is set, destructive kill types are checked with canKillProcesses(), which
//      may rewrite the kill type.
//   5. The kill is refused unless the number of non-rebooting processes equals processesPerMachine.
//   6. Non-tester processes are killed (kill types below RebootAndDelete) or rebooted (Reboot and
//      RebootAndDelete).
// `ktFinal`, when non-null, receives the kill type that was applied, or None if the kill was
// refused. Returns true if the kill went ahead.
bool killMachine(Optional<Standalone<StringRef>> machineId,
                 KillType kt,
                 bool forceKill,
                 KillType* ktFinal) override {
    // Common exit for every refused kill.
    auto refuseKill = [&]() {
        if (ktFinal)
            *ktFinal = None;
        return false;
    };

    TEST(true); // Trying to killing a machine
    TEST(kt == KillInstantly); // Trying to kill instantly
    TEST(kt == InjectFaults); // Trying to kill by injecting faults
    if (speedUpSimulation && !forceKill) {
        TraceEvent(SevWarn, "AbortedKill")
            .detail("MachineId", machineId)
            .detail("Reason", "Unforced kill within speedy simulation.")
            .backtrace();
        return refuseKill();
    }

    // The kill type as requested by the caller; `kt` itself may be rewritten below.
    const KillType originalKt = kt;
    auto& targetMachine = machines[machineId];

    // Reboot if any of the processes are protected and count the number of processes not rebooting
    int processesOnMachine = 0;
    for (auto& process : targetMachine.processes) {
        if (protectedAddresses.count(process->address))
            kt = Reboot;
        if (!process->rebooting)
            processesOnMachine++;
    }

    // Do nothing, if no processes to kill
    if (processesOnMachine == 0) {
        TraceEvent(SevWarn, "AbortedKill")
            .detail("MachineId", machineId)
            .detail("Reason", "The target had no processes running.")
            .detail("Processes", processesOnMachine)
            .detail("ProcessesPerMachine", processesPerMachine)
            .backtrace();
        return refuseKill();
    }

    // Check if machine can be removed, if requested
    const bool destructiveKill = (kt == KillInstantly) || (kt == InjectFaults) || (kt == FailDisk) ||
                                 (kt == RebootAndDelete) || (kt == RebootProcessAndDelete);
    if (!forceKill && destructiveKill) {
        std::vector<ProcessInfo*> processesLeft, processesDead;
        int protectedWorker = 0, unavailable = 0, excluded = 0, cleared = 0;

        for (ProcessInfo* candidate : getAllProcesses()) {
            if (!candidate->isAvailableClass()) {
                continue;
            }
            if (candidate->isExcluded()) {
                processesDead.push_back(candidate);
                excluded++;
            } else if (candidate->isCleared()) {
                processesDead.push_back(candidate);
                cleared++;
            } else if (!candidate->isAvailable()) {
                processesDead.push_back(candidate);
                unavailable++;
            } else if (protectedAddresses.count(candidate->address)) {
                // Protected processes count as survivors.
                processesLeft.push_back(candidate);
                protectedWorker++;
            } else if (candidate->locality.machineId() != machineId) {
                processesLeft.push_back(candidate);
            } else {
                // Healthy, unprotected process on the target machine: it would die.
                processesDead.push_back(candidate);
            }
        }

        // Traces the summary event plus one event per surviving and per dead process.
        auto traceOutcome = [&](const char* summaryEvent, const char* survivorEvent, const char* victimEvent) {
            TraceEvent(summaryEvent)
                .detail("MachineId", machineId)
                .detail("KillType", kt)
                .detail("ProcessesLeft", processesLeft.size())
                .detail("ProcessesDead", processesDead.size())
                .detail("TotalProcesses", machines.size())
                .detail("ProcessesPerMachine", processesPerMachine)
                .detail("TLogPolicy", tLogPolicy->info())
                .detail("StoragePolicy", storagePolicy->info());
            for (ProcessInfo* survivor : processesLeft) {
                TraceEvent(survivorEvent)
                    .detail("MachineId", machineId)
                    .detail("KillType", kt)
                    .detail("ProcessesLeft", processesLeft.size())
                    .detail("ProcessesDead", processesDead.size())
                    .detail("SurvivingProcess", survivor->toString());
            }
            for (ProcessInfo* victim : processesDead) {
                TraceEvent(victimEvent)
                    .detail("MachineId", machineId)
                    .detail("KillType", kt)
                    .detail("ProcessesLeft", processesLeft.size())
                    .detail("ProcessesDead", processesDead.size())
                    .detail("VictimProcess", victim->toString());
            }
        };

        // canKillProcesses() writes the kill type it settled on back into `kt`.
        if (!canKillProcesses(processesLeft, processesDead, kt, &kt)) {
            TraceEvent("ChangedKillMachine")
                .detail("MachineId", machineId)
                .detail("KillType", kt)
                .detail("OrigKillType", originalKt)
                .detail("ProcessesLeft", processesLeft.size())
                .detail("ProcessesDead", processesDead.size())
                .detail("TotalProcesses", machines.size())
                .detail("ProcessesPerMachine", processesPerMachine)
                .detail("Protected", protectedWorker)
                .detail("Unavailable", unavailable)
                .detail("Excluded", excluded)
                .detail("Cleared", cleared)
                .detail("ProtectedTotal", protectedAddresses.size())
                .detail("TLogPolicy", tLogPolicy->info())
                .detail("StoragePolicy", storagePolicy->info());
        } else if ((kt == KillInstantly) || (kt == InjectFaults) || (kt == FailDisk)) {
            traceOutcome("DeadMachine", "DeadMachineSurvivors", "DeadMachineVictims");
        } else {
            traceOutcome("ClearMachine", "ClearMachineSurvivors", "ClearMachineVictims");
        }
    }

    TEST(originalKt != kt); // Kill type was changed from requested to reboot.

    // Check if any processes on machine are rebooting
    if (processesOnMachine != processesPerMachine && kt >= RebootAndDelete) {
        TEST(true); // Attempted reboot, but the target did not have all of its processes running
        TraceEvent(SevWarn, "AbortedKill")
            .detail("KillType", kt)
            .detail("MachineId", machineId)
            .detail("Reason", "Machine processes does not match number of processes per machine")
            .detail("Processes", processesOnMachine)
            .detail("ProcessesPerMachine", processesPerMachine)
            .backtrace();
        return refuseKill();
    }

    // Check if any processes on machine are rebooting
    if (processesOnMachine != processesPerMachine) {
        TEST(true); // Attempted reboot and kill, but the target did not have all of its processes running
        TraceEvent(SevWarn, "AbortedKill")
            .detail("KillType", kt)
            .detail("MachineId", machineId)
            .detail("Reason", "Machine processes does not match number of processes per machine")
            .detail("Processes", processesOnMachine)
            .detail("ProcessesPerMachine", processesPerMachine)
            .backtrace();
        return refuseKill();
    }

    TraceEvent("KillMachine")
        .detail("MachineId", machineId)
        .detail("Kt", kt)
        .detail("KtOrig", originalKt)
        .detail("KillableMachines", processesOnMachine)
        .detail("ProcessPerMachine", processesPerMachine)
        .detail("KillChanged", kt != originalKt);

    const bool killOutright = kt < RebootAndDelete;
    const bool rebootOnly = (kt == Reboot || kt == RebootAndDelete);
    if (killOutright) {
        // Fault-style kills are also applied to the machine-level process, when there is one.
        if ((kt == InjectFaults || kt == FailDisk) && targetMachine.machineProcess != nullptr)
            killProcess_internal(targetMachine.machineProcess, kt);
    }
    if (killOutright || rebootOnly) {
        for (auto& process : targetMachine.processes) {
            TraceEvent("KillMachineProcess")
                .detail("KillType", kt)
                .detail("Process", process->toString())
                .detail("StartingClass", process->startingClass.toString())
                .detail("Failed", process->failed)
                .detail("Excluded", process->excluded)
                .detail("Cleared", process->cleared)
                .detail("Rebooting", process->rebooting);
            // Processes that started as testers are left alone.
            if (process->startingClass != ProcessClass::TesterClass) {
                if (killOutright)
                    killProcess_internal(process, kt);
                else
                    doReboot(process, kt);
            }
        }
    }

    TEST(kt == RebootAndDelete); // Resulted in a reboot and delete
    TEST(kt == Reboot); // Resulted in a reboot
    TEST(kt == KillInstantly); // Resulted in an instant kill
    TEST(kt == InjectFaults); // Resulted in a kill by injecting faults

    if (ktFinal)
        *ktFinal = kt;
    return true;
}
// Kills (or reboots) every machine that hosts at least one process of datacenter `dcId`.
//   1. kt is downgraded to Reboot if any process of the datacenter has a protected address.
//   2. Unless forceKill is set, destructive kill types are vetted through canKillProcesses(), which may
//      rewrite kt.
//   3. Every machine of the datacenter is handed to killMachine(); for non-forced requests each machine
//      is skipped with probability 0.01.
// The minimum kill type reported by killMachine() is written to *ktFinal when ktFinal is non-null.
// Returns true when that minimum equals the (possibly downgraded) kt.
bool killDataCenter(Optional<Standalone<StringRef>> dcId, KillType kt, bool forceKill, KillType* ktFinal) override {
    auto ktOrig = kt;
    auto everyProcess = getAllProcesses();
    std::map<Optional<Standalone<StringRef>>, int> machinesInDc; // machine id -> number of DC processes on it
    int processCountInDc = 0;

    // Switch to a reboot, if anything protected on machine
    for (auto& candidate : everyProcess) {
        auto candidateDc = candidate->locality.dcId();
        auto processMachineId = candidate->locality.machineId();
        ASSERT(processMachineId.present());

        // Only processes that belong to the requested datacenter are of interest.
        if (!candidateDc.present() || !(candidateDc == dcId)) {
            continue;
        }

        if ((kt != Reboot) && (protectedAddresses.count(candidate->address))) {
            kt = Reboot;
            TraceEvent(SevWarn, "DcKillChanged")
                .detail("DataCenter", dcId)
                .detail("KillType", kt)
                .detail("OrigKillType", ktOrig)
                .detail("Reason", "Datacenter has protected process")
                .detail("ProcessAddress", candidate->address)
                .detail("Failed", candidate->failed)
                .detail("Rebooting", candidate->rebooting)
                .detail("Excluded", candidate->excluded)
                .detail("Cleared", candidate->cleared)
                .detail("Process", candidate->toString());
        }
        machinesInDc[processMachineId.get()]++;
        processCountInDc++;
    }

    // Check if machine can be removed, if requested
    const bool destructiveKill = (kt == KillInstantly) || (kt == InjectFaults) || (kt == FailDisk) ||
                                 (kt == RebootAndDelete) || (kt == RebootProcessAndDelete);
    if (!forceKill && destructiveKill) {
        std::vector<ProcessInfo*> survivors, victims;
        for (auto proc : getAllProcesses()) {
            if (!proc->isAvailableClass()) {
                continue;
            }
            // A process that is already excluded, cleared or unavailable counts as dead. Otherwise it
            // survives only if it is protected or lives on a machine outside of this datacenter.
            const bool alreadyDown = proc->isExcluded() || proc->isCleared() || !proc->isAvailable();
            const bool staysUp =
                !alreadyDown && (protectedAddresses.count(proc->address) ||
                                 machinesInDc.find(proc->locality.machineId()) == machinesInDc.end());
            (staysUp ? survivors : victims).push_back(proc);
        }

        if (canKillProcesses(survivors, victims, kt, &kt)) {
            TraceEvent("DeadDataCenter")
                .detail("DataCenter", dcId)
                .detail("KillType", kt)
                .detail("DcZones", machinesInDc.size())
                .detail("DcProcesses", processCountInDc)
                .detail("ProcessesDead", victims.size())
                .detail("ProcessesLeft", survivors.size())
                .detail("TLogPolicy", tLogPolicy->info())
                .detail("StoragePolicy", storagePolicy->info());
            for (auto survivor : survivors) {
                TraceEvent("DeadDcSurvivors")
                    .detail("MachineId", survivor->locality.machineId())
                    .detail("KillType", kt)
                    .detail("ProcessesLeft", survivors.size())
                    .detail("ProcessesDead", victims.size())
                    .detail("SurvivingProcess", survivor->toString());
            }
            for (auto victim : victims) {
                TraceEvent("DeadDcVictims")
                    .detail("MachineId", victim->locality.machineId())
                    .detail("KillType", kt)
                    .detail("ProcessesLeft", survivors.size())
                    .detail("ProcessesDead", victims.size())
                    .detail("VictimProcess", victim->toString());
            }
        } else {
            TraceEvent(SevWarn, "DcKillChanged")
                .detail("DataCenter", dcId)
                .detail("KillType", kt)
                .detail("OrigKillType", ktOrig);
        }
    }

    KillType ktResult, ktMin = kt;
    for (auto& dcMachine : machinesInDc) {
        // The random draw always happens first so the random stream does not depend on forceKill.
        const bool attemptKill = deterministicRandom()->random01() < 0.99 || forceKill;
        if (!attemptKill) {
            continue;
        }
        killMachine(dcMachine.first, kt, true, &ktResult);
        if (ktResult != kt) {
            TraceEvent(SevWarn, "KillDCFail")
                .detail("Zone", dcMachine.first)
                .detail("KillType", kt)
                .detail("KillTypeResult", ktResult)
                .detail("KillTypeOrig", ktOrig);
            ASSERT(ktResult == None);
        }
        ktMin = std::min<KillType>(ktResult, ktMin);
    }

    TraceEvent("KillDataCenter")
        .detail("DcZones", machinesInDc.size())
        .detail("DcProcesses", processCountInDc)
        .detail("DCID", dcId)
        .detail("KillType", kt)
        .detail("KillTypeOrig", ktOrig)
        .detail("KillTypeMin", ktMin)
        .detail("KilledDC", kt == ktMin);

    TEST(kt != ktMin); // DataCenter kill was rejected by killMachine
    TEST((kt == ktMin) && (kt == RebootAndDelete)); // Datacenter kill Resulted in a reboot and delete
    TEST((kt == ktMin) && (kt == Reboot)); // Datacenter kill Resulted in a reboot
    TEST((kt == ktMin) && (kt == KillInstantly)); // Datacenter kill Resulted in an instant kill
    TEST((kt == ktMin) && (kt == InjectFaults)); // Datacenter kill Resulted in a kill by injecting faults
    TEST((kt == ktMin) && (kt != ktOrig)); // Datacenter Kill request was downgraded
    TEST((kt == ktMin) && (kt == ktOrig)); // Datacenter kill - Requested kill was done

    if (ktFinal) {
        *ktFinal = ktMin;
    }
    return (kt == ktMin);
}
void clogInterface(const IPAddress& ip, double seconds, ClogMode mode = ClogDefault) override {
2017-05-26 04:48:44 +08:00
if (mode == ClogDefault) {
double a = deterministicRandom()->random01();
if (a < 0.3)
mode = ClogSend;
else if (a < 0.6)
mode = ClogReceive;
else
mode = ClogAll;
2017-05-26 04:48:44 +08:00
}
TraceEvent("ClogInterface")
.detail("IP", ip.toString())
.detail("Delay", seconds)
.detail("Queue",
mode == ClogSend ? "Send"
: mode == ClogReceive ? "Receive"
: "All");
2017-05-26 04:48:44 +08:00
if (mode == ClogSend || mode == ClogAll)
g_clogging.clogSendFor(ip, seconds);
if (mode == ClogReceive || mode == ClogAll)
g_clogging.clogRecvFor(ip, seconds);
2017-05-26 04:48:44 +08:00
}
void clogPair(const IPAddress& from, const IPAddress& to, double seconds) override {
2021-07-11 07:55:30 +08:00
TraceEvent("CloggingPair").detail("From", from).detail("To", to).detail("Seconds", seconds);
g_clogging.clogPairFor(from, to, seconds);
2017-05-26 04:48:44 +08:00
}
std::vector<ProcessInfo*> getAllProcesses() const override {
2017-05-26 04:48:44 +08:00
std::vector<ProcessInfo*> processes;
for (auto& c : machines) {
processes.insert(processes.end(), c.second.processes.begin(), c.second.processes.end());
}
for (auto& c : currentlyRebootingProcesses) {
processes.push_back(c.second);
}
2017-05-26 04:48:44 +08:00
return processes;
}
// Looks up the process registered for `address`. The lookup key is rebuilt from the IP, port and
// TLS flag of `address` with the third constructor argument forced to true.
// Asserts that such a process is registered.
ProcessInfo* getProcessByAddress(NetworkAddress const& address) override {
    NetworkAddress normalizedAddress(address.ip, address.port, true, address.isTLS());
    ASSERT(addressMap.count(normalizedAddress));

    // NOTE: the address stored on the returned process may not equal normalizedAddress
    ProcessInfo* owner = addressMap[normalizedAddress];
    return owner;
}
// Returns the machine record of the process registered under `address`.
// Asserts that the address is known: indexing addressMap with operator[] would silently insert a
// null entry for an unknown address and then dereference it.
// The machine itself is still fetched with operator[], i.e. a record is created if none exists yet.
MachineInfo* getMachineByNetworkAddress(NetworkAddress const& address) override {
    auto registered = addressMap.find(address);
    ASSERT(registered != addressMap.end() && registered->second != nullptr);
    return &machines[registered->second->locality.machineId()];
}
// Returns the machine record stored under `machineId`. Because the map is indexed with operator[],
// a default-constructed record is created when the id has not been seen before.
MachineInfo* getMachineById(Optional<Standalone<StringRef>> const& machineId) override {
    MachineInfo& record = machines[machineId];
    return &record;
}
// Removes the machine `machineId` from the simulation. Every process on the machine must already be
// marked failed; the machine-level process, if any, is killed instantly before the record is erased.
void destroyMachine(Optional<Standalone<StringRef>> const& machineId) override {
    MachineInfo& doomed = machines[machineId];

    for (auto process : doomed.processes) {
        ASSERT(process->failed);
    }

    if (doomed.machineProcess != nullptr) {
        killProcess_internal(doomed.machineProcess, KillInstantly);
    }

    machines.erase(machineId);
}
Sim2(bool printSimTime)
: time(0.0), timerTime(0.0), currentTaskID(TaskPriority::Zero), taskCount(0), yielded(false), yield_limit(0),
printSimTime(printSimTime) {
2020-08-19 05:43:17 +08:00
// Not letting currentProcess be nullptr eliminates some annoying special cases
currentProcess =
new ProcessInfo("NoMachine",
LocalityData(Optional<Standalone<StringRef>>(), StringRef(), StringRef(), StringRef()),
ProcessClass(),
{ NetworkAddress() },
this,
"",
"");
g_network = net2 = newNet2(TLSConfig(), false, true);
g_network->addStopCallback(Net2FileSystem::stop);
2017-05-26 04:48:44 +08:00
Net2FileSystem::newFileSystem();
check_yield(TaskPriority::Zero);
2017-05-26 04:48:44 +08:00
}
// Implementation
struct Task {
TaskPriority taskID;
2017-05-26 04:48:44 +08:00
double time;
uint64_t stable;
ProcessInfo* machine;
Promise<Void> action;
Task(double time, TaskPriority taskID, uint64_t stable, ProcessInfo* machine, Promise<Void>&& action)
: taskID(taskID), time(time), stable(stable), machine(machine), action(std::move(action)) {}
Task(double time, TaskPriority taskID, uint64_t stable, ProcessInfo* machine, Future<Void>& future)
: taskID(taskID), time(time), stable(stable), machine(machine) {
future = action.getFuture();
}
2020-06-10 08:33:41 +08:00
Task(Task&& rhs) noexcept
: taskID(rhs.taskID), time(rhs.time), stable(rhs.stable), machine(rhs.machine),
2020-06-10 08:33:41 +08:00
action(std::move(rhs.action)) {}
void operator=(Task const& rhs) {
taskID = rhs.taskID;
time = rhs.time;
stable = rhs.stable;
machine = rhs.machine;
action = rhs.action;
}
Task(Task const& rhs)
: taskID(rhs.taskID), time(rhs.time), stable(rhs.stable), machine(rhs.machine), action(rhs.action) {}
2020-06-10 08:33:41 +08:00
void operator=(Task&& rhs) noexcept {
time = rhs.time;
taskID = rhs.taskID;
stable = rhs.stable;
machine = rhs.machine;
action = std::move(rhs.action);
}
2017-05-26 04:48:44 +08:00
bool operator<(Task const& rhs) const {
2017-05-26 04:48:44 +08:00
// Ordering is reversed for priority_queue
if (time != rhs.time)
return time > rhs.time;
2017-05-26 04:48:44 +08:00
return stable > rhs.stable;
}
};
void execTask(struct Task& t) {
if (t.machine->failed) {
t.action.send(Never());
} else {
2017-05-26 04:48:44 +08:00
mutex.enter();
if (printSimTime && (int)this->time < (int)t.time) {
printf("Time: %d\n", (int)t.time);
}
this->time = t.time;
this->timerTime = std::max(this->timerTime, this->time);
2017-05-26 04:48:44 +08:00
mutex.leave();
this->currentProcess = t.machine;
try {
t.action.send(Void());
ASSERT(this->currentProcess == t.machine);
2017-05-26 04:48:44 +08:00
} catch (Error& e) {
TraceEvent(SevError, "UnhandledSimulationEventError").errorUnsuppressed(e);
2017-05-26 04:48:44 +08:00
killProcess(t.machine, KillInstantly);
}
if (randLog)
fmt::print(randLog,
"T {0} {1} {2} {3}\n",
this->time,
int(deterministicRandom()->peek() % 10000),
t.machine ? t.machine->name : "none",
t.stable);
2017-05-26 04:48:44 +08:00
}
}
// Schedules `signal` to be fulfilled on the main thread at the current simulated time, on behalf of
// the current process.
//   signal - promise to fulfil when the task runs
//   taskID - priority of the task; must lie within [TaskPriority::Min, TaskPriority::Max]
void onMainThread(Promise<Void>&& signal, TaskPriority taskID) override {
    // This is presumably coming from either a "fake" thread pool thread, i.e. it is actually on this thread
    // or a thread created with g_network->startThread
    ASSERT(getCurrentProcess());
    // Validate the priority before taking the mutex: a failing check inside the critical section
    // would leave the mutex held if the assertion throws.
    ASSERT(taskID >= TaskPriority::Min && taskID <= TaskPriority::Max);

    mutex.enter();
    tasks.push(Task(time, taskID, taskCount++, getCurrentProcess(), std::move(signal)));
    mutex.leave();
}
// The answer is delegated to the wrapped net2 network.
bool isOnMainThread() const override {
    return net2->isOnMainThread();
}
// Returns a future produced by a zero-length delay scheduled on `process` with priority `taskID`.
Future<Void> onProcess(ISimulator::ProcessInfo* process, TaskPriority taskID) override {
    Future<Void> switched = delay(0, taskID, process);
    return switched;
}
// Like onProcess(), but targets the machine-level process of the machine that hosts `process`.
// Completes immediately when `process` is not attached to any machine.
Future<Void> onMachine(ISimulator::ProcessInfo* process, TaskPriority taskID) override {
    // Compare against nullptr rather than the integer literal 0.
    if (process->machine == nullptr)
        return Void();
    return delay(0, taskID, process->machine->machineProcess);
}
2017-05-26 04:48:44 +08:00
2021-10-20 08:37:03 +08:00
// Reports the protocol version recorded on the current process.
ProtocolVersion protocolVersion() const override {
    return getCurrentProcess()->protocolVersion;
}
// Simulated clock. execTask() sets it to the time of the task being run.
// time is guarded by ISimulator::mutex. It is not necessary to guard reads on the main thread because
// time should only be modified from the main thread.
double time;
// Never decreases: execTask() updates it to max(timerTime, time).
double timerTime;
// Initialised to TaskPriority::Zero by the constructor.
TaskPriority currentTaskID;
2017-05-26 04:48:44 +08:00
// Incremented for every task pushed by onMainThread(); the value becomes the task's `stable`
// tie-breaker.
// taskCount is guarded by ISimulator::mutex
2017-05-26 04:48:44 +08:00
uint64_t taskCount;
// Machine id -> machine record.
std::map<Optional<Standalone<StringRef>>, MachineInfo> machines;
2017-05-26 04:48:44 +08:00
// Network address -> process registered for that address.
std::map<NetworkAddress, ProcessInfo*> addressMap;
// NOTE(review): presumably signals when a process's files are gone — not used in this part of the file.
std::map<ProcessInfo*, Promise<Void>> filesDeadMap;
// Pending tasks. Task::operator< is reversed, so the task with the smallest time (then the smallest
// `stable`) is on top.
// tasks is guarded by ISimulator::mutex
2017-05-26 04:48:44 +08:00
std::priority_queue<Task, std::vector<Task>> tasks;
std::vector<std::function<void()>> stopCallbacks;
// Sim2Net network;
// Underlying network created with newNet2() in the constructor.
INetwork* net2;
2017-05-26 04:48:44 +08:00
// Map from machine IP -> machine disk space info
std::map<IPAddress, SimDiskSpace> diskSpaceMap;
2017-05-26 04:48:44 +08:00
// Whether or not yield has returned true during the current iteration of the run loop
2017-05-26 04:48:44 +08:00
bool yielded;
int yield_limit; // how many more times yield may return false before next returning true
// When set, execTask() prints "Time: <seconds>" whenever the simulated time enters a later whole second.
bool printSimTime;
private:
2022-04-02 15:22:54 +08:00
DNSCache mockDNS;
// Only present when the build defines ENABLE_SAMPLING.
#ifdef ENABLE_SAMPLING
ActorLineageSet actorLineageSet;
#endif
2017-05-26 04:48:44 +08:00
};
2020-08-07 04:06:50 +08:00
// Simulated UDP socket.
// On construction the socket registers its local address in g_sim2.addressMap and in the owning
// process's boundUDPSockets; close() removes both registrations. Datagrams are delivered by appending
// them to the receiving socket's recvBuffer after a small random delay.
class UDPSimSocket : public IUDPSocket, ReferenceCounted<UDPSimSocket> {
// Payload of one datagram.
using Packet = std::shared_ptr<std::vector<uint8_t>>;
// Random id, returned by getDebugID().
UID id;
// Process that was current when the socket was constructed.
ISimulator::ProcessInfo* process;
// Fixed peer used by send(); may be absent, in which case only sendTo() is usable.
Optional<NetworkAddress> peerAddress;
// Lazily resolved by send() and cached.
Optional<ISimulator::ProcessInfo*> peerProcess;
Optional<Reference<UDPSimSocket>> peerSocket;
// Holds in-flight send actors and the peer-cleanup actor; cleared in the destructor.
ActorCollection actors;
// Fulfilled by the destructor (close() itself does not fulfil it).
Promise<Void> closed;
// Received datagrams in arrival order, each paired with the sender's local address.
std::deque<std::pair<NetworkAddress, Packet>> recvBuffer;
// Incremented for every datagram appended to recvBuffer; receivers wait on its change.
AsyncVar<int64_t> writtenPackets;
NetworkAddress _localAddress;
// Returns true with probability 0.000001; used by send() to simulate packet loss.
bool randomDropPacket() {
auto res = deterministicRandom()->random01() < .000001;
TEST(res); // UDP packet drop
return res;
}
bool isClosed() const { return closed.getFuture().isReady(); }
Future<Void> onClosed() const { return closed.getFuture(); }
// Waits until the cached peer socket reports closed, then forgets it.
ACTOR static Future<Void> cleanupPeerSocket(UDPSimSocket* self) {
wait(self->peerSocket.get()->onClosed());
self->peerSocket.reset();
return Void();
}
// Delivers the bytes [begin, end) to peerSocket.
// The bytes are copied into a packet before the first wait, so the caller's buffer is not read
// after this actor has been started.
ACTOR static Future<Void> send(UDPSimSocket* self,
Reference<UDPSimSocket> peerSocket,
uint8_t const* begin,
2020-11-14 06:37:03 +08:00
uint8_t const* end) {
2020-08-07 04:06:50 +08:00
state Packet packet(std::make_shared<std::vector<uint8_t>>());
2020-08-12 05:34:59 +08:00
packet->resize(end - begin);
2020-08-07 04:06:50 +08:00
std::copy(begin, end, packet->begin());
// Random delivery latency in [0, 2ms).
wait(delay(.002 * deterministicRandom()->random01()));
2020-08-07 04:06:50 +08:00
peerSocket->recvBuffer.emplace_back(self->_localAddress, std::move(packet));
// Wake up receivers waiting in receiveFrom().
peerSocket->writtenPackets.set(peerSocket->writtenPackets.get() + 1);
2020-08-07 04:06:50 +08:00
return Void();
}
// Slow path of receiveFrom(): waits for the next datagram, then copies it into [begin, end).
// Writes the sender's address to *sender when sender is non-null and returns the datagram size.
// Asserts that the datagram fits into the supplied buffer.
ACTOR static Future<int> receiveFrom(UDPSimSocket* self, uint8_t* begin, uint8_t* end, NetworkAddress* sender) {
state TaskPriority currentTaskID = g_sim2.getCurrentTask();
wait(self->writtenPackets.onChange());
// Switch back to the process that owns this socket, at the caller's priority.
wait(g_sim2.onProcess(self->process, currentTaskID));
// NOTE(review): assumes recvBuffer is non-empty here; if several receivers wait concurrently an
// earlier one could presumably have drained it — confirm.
auto packet = self->recvBuffer.front().second;
int sz = packet->size();
ASSERT(sz <= end - begin);
if (sender) {
*sender = self->recvBuffer.front().first;
}
std::copy(packet->begin(), packet->end(), begin);
self->recvBuffer.pop_front();
return sz;
}
public:
// Creates a socket bound to localAddress on the current process. Asserts that the process has no
// socket bound to that address yet.
UDPSimSocket(NetworkAddress const& localAddress, Optional<NetworkAddress> const& peerAddress)
: id(deterministicRandom()->randomUniqueID()), process(g_simulator.getCurrentProcess()), peerAddress(peerAddress),
actors(false), _localAddress(localAddress) {
2020-08-12 05:34:59 +08:00
g_sim2.addressMap.emplace(_localAddress, process);
ASSERT(process->boundUDPSockets.find(localAddress) == process->boundUDPSockets.end());
2020-08-07 04:06:50 +08:00
// NOTE(review): registers the raw `this` here, whereas bind() registers
// Reference<UDPSimSocket>::addRef(this) — confirm the ownership difference is intended.
process->boundUDPSockets.emplace(localAddress, this);
}
// Unregisters the socket and fulfils `closed` unless that already happened, then cancels all actors.
~UDPSimSocket() override {
2020-08-07 04:06:50 +08:00
if (!closed.getFuture().isReady()) {
close();
closed.send(Void());
}
actors.clear(true);
}
2020-08-12 05:34:59 +08:00
// Removes the local address from the process's bound sockets and from the simulator's address map.
void close() override {
process->boundUDPSockets.erase(_localAddress);
g_sim2.addressMap.erase(_localAddress);
}
2020-08-07 04:06:50 +08:00
UID getDebugID() const override { return id; }
void addref() override { ReferenceCounted<UDPSimSocket>::addref(); }
void delref() override { ReferenceCounted<UDPSimSocket>::delref(); }
// Sends [begin, end) to the fixed peer address (which must be present).
// The returned future yields the byte count after delay(0.0) whether or not the datagram is
// delivered: it is silently dropped when the peer address is unknown, when no socket is bound to
// it, or when randomDropPacket() fires.
Future<int> send(uint8_t const* begin, uint8_t const* end) override {
int sz = int(end - begin);
auto res = fmap([sz](Void) { return sz; }, delay(0.0));
2020-08-07 04:06:50 +08:00
ASSERT(sz <= IUDPSocket::MAX_PACKET_SIZE);
ASSERT(peerAddress.present());
// Resolve and cache the peer's process on first use.
if (!peerProcess.present()) {
auto iter = g_sim2.addressMap.find(peerAddress.get());
if (iter == g_sim2.addressMap.end()) {
return res;
}
peerProcess = iter->second;
}
// Resolve (or re-resolve after the cached one closed) the peer's socket.
if (!peerSocket.present() || peerSocket.get()->isClosed()) {
peerSocket.reset();
auto iter = peerProcess.get()->boundUDPSockets.find(peerAddress.get());
if (iter == peerProcess.get()->boundUDPSockets.end()) {
// Builds a second result future instead of returning `res`; it yields the same value.
return fmap([sz](Void) { return sz; }, delay(0.0));
2020-08-07 04:06:50 +08:00
}
peerSocket = iter->second.castTo<UDPSimSocket>();
// the notation of leaking connections doesn't make much sense in the context of UDP
// so we simply handle those in the simulator
actors.add(cleanupPeerSocket(this));
}
if (randomDropPacket()) {
return res;
}
actors.add(send(this, peerSocket.get(), begin, end));
return res;
}
// Sends [begin, end) to `peer`. Unlike send(), nothing is cached and randomDropPacket() is not
// consulted; the datagram is silently dropped when `peer` is unknown or has no bound socket.
Future<int> sendTo(uint8_t const* begin, uint8_t const* end, NetworkAddress const& peer) override {
int sz = int(end - begin);
auto res = fmap([sz](Void) { return sz; }, delay(0.0));
2020-08-07 04:06:50 +08:00
ASSERT(sz <= MAX_PACKET_SIZE);
ISimulator::ProcessInfo* peerProcess = nullptr;
Reference<UDPSimSocket> peerSocket;
{
auto iter = g_sim2.addressMap.find(peer);
if (iter == g_sim2.addressMap.end()) {
return res;
}
peerProcess = iter->second;
}
{
auto iter = peerProcess->boundUDPSockets.find(peer);
if (iter == peerProcess->boundUDPSockets.end()) {
return res;
}
peerSocket = iter->second.castTo<UDPSimSocket>();
}
actors.add(send(this, peerSocket, begin, end));
return res;
}
// Same as receiveFrom() without reporting the sender.
Future<int> receive(uint8_t* begin, uint8_t* end) override { return receiveFrom(begin, end, nullptr); }
2020-08-07 04:06:50 +08:00
// Receives one datagram into [begin, end). If one is already buffered it is copied out right away
// and the size is returned after delay(0.0); otherwise the actor overload waits for the next one.
Future<int> receiveFrom(uint8_t* begin, uint8_t* end, NetworkAddress* sender) override {
if (!recvBuffer.empty()) {
auto buf = recvBuffer.front().second;
if (sender) {
*sender = recvBuffer.front().first;
}
int sz = buf->size();
ASSERT(sz <= end - begin);
std::copy(buf->begin(), buf->end(), begin);
auto res = fmap([sz](Void) { return sz; }, delay(0.0));
2020-08-07 04:06:50 +08:00
recvBuffer.pop_front();
return res;
}
return receiveFrom(this, begin, end, sender);
}
// Re-registers the socket under `addr`: drops the registrations for the old local address and
// adds them for the new one.
void bind(NetworkAddress const& addr) override {
2020-08-12 05:34:59 +08:00
g_sim2.addressMap.erase(_localAddress);
2020-08-07 04:06:50 +08:00
process->boundUDPSockets.erase(_localAddress);
process->boundUDPSockets.emplace(addr, Reference<UDPSimSocket>::addRef(this));
_localAddress = addr;
2020-08-12 05:34:59 +08:00
g_sim2.addressMap.emplace(_localAddress, process);
2020-08-07 04:06:50 +08:00
}
NetworkAddress localAddress() const override { return _localAddress; }
2020-11-17 03:40:06 +08:00
// There is no OS socket behind a simulated socket; always returns 0.
boost::asio::ip::udp::socket::native_handle_type native_handle() override { return 0; }
2020-08-07 04:06:50 +08:00
};
// Creates a simulated UDP socket on the current process with `toAddr` as its fixed peer.
// The local IP is the process IP plus a random offset in [0, 256); the local port is drawn from
// [40000, 60000) and redrawn until the process has no socket bound to the resulting address.
Future<Reference<IUDPSocket>> Sim2::createUDPSocket(NetworkAddress toAddr) {
    auto process = g_simulator.getCurrentProcess();
    NetworkAddress localAddress;

    if (!process->address.ip.isV6()) {
        localAddress.ip = IPAddress(process->address.ip.toV4() + deterministicRandom()->randomInt(0, 256));
    } else {
        // Perturb the last 16-bit group of the IPv6 address.
        IPAddress::IPAddressStore store = process->address.ip.toV6();
        uint16_t* ipParts = (uint16_t*)store.data();
        ipParts[7] += deterministicRandom()->randomInt(0, 256);
        localAddress.ip = IPAddress(store);
    }

    // Draw a port, and keep drawing while the address is already bound on this process.
    do {
        localAddress.port = deterministicRandom()->randomInt(40000, 60000);
    } while (process->boundUDPSockets.count(localAddress) != 0);

    return Reference<IUDPSocket>(new UDPSimSocket(localAddress, toAddr));
}
// Creates an unconnected simulated UDP socket on the current process.
//   isV6 - selects the address family; the process's primary address is used when its family
//          matches, otherwise the secondary address (which must then exist and match).
// The local IP is the chosen address plus a random offset in [0, 256); the local port is drawn from
// [40000, 60000).
Future<Reference<IUDPSocket>> Sim2::createUDPSocket(bool isV6) {
    NetworkAddress localAddress;
    auto process = g_simulator.getCurrentProcess();
    if (process->address.ip.isV6() == isV6) {
        localAddress = process->address;
    } else {
        ASSERT(process->addresses.secondaryAddress.present() &&
               process->addresses.secondaryAddress.get().isV6() == isV6);
        localAddress = process->addresses.secondaryAddress.get();
    }
    if (localAddress.ip.isV6()) {
        // Perturb the last 16-bit group of the IPv6 address.
        IPAddress::IPAddressStore store = localAddress.ip.toV6();
        uint16_t* ipParts = (uint16_t*)store.data();
        ipParts[7] += deterministicRandom()->randomInt(0, 256);
        localAddress.ip = IPAddress(store);
    } else {
        localAddress.ip = IPAddress(localAddress.ip.toV4() + deterministicRandom()->randomInt(0, 256));
    }
    localAddress.port = deterministicRandom()->randomInt(40000, 60000);
    // The UDPSimSocket constructor asserts that the address is not bound yet, so redraw the port on
    // a collision, exactly like the createUDPSocket(NetworkAddress) overload does. When there is no
    // collision the sequence of random draws is unchanged.
    while (process->boundUDPSockets.find(localAddress) != process->boundUDPSockets.end()) {
        localAddress.port = deterministicRandom()->randomInt(40000, 60000);
    }
    return Reference<IUDPSocket>(new UDPSimSocket(localAddress, Optional<NetworkAddress>{}));
}
// Installs a new Sim2 instance as both the global simulator and the global network. No network may
// be installed yet. With probability one half connection failures stay enabled (duration 0),
// otherwise they are disabled for 1e6 simulated seconds.
void startNewSimulator(bool printSimTime) {
    ASSERT(!g_network);

    g_pSimulator = new Sim2(printSimTime);
    g_network = g_pSimulator;

    double disableDuration = 1e6;
    if (deterministicRandom()->random01() < 0.5) {
        disableDuration = 0;
    }
    g_simulator.connectionFailuresDisableDuration = disableDuration;
}
ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
2020-06-24 01:07:39 +08:00
TraceEvent("RebootingProcessAttempt")
.detail("ZoneId", p->locality.zoneId())
.detail("KillType", kt)
.detail("Process", p->toString())
.detail("StartingClass", p->startingClass.toString())
.detail("Failed", p->failed)
.detail("Excluded", p->excluded)
.detail("Cleared", p->cleared)
.detail("Rebooting", p->rebooting)
.detail("TaskPriorityDefaultDelay", TaskPriority::DefaultDelay);
wait(g_sim2.delay(0, TaskPriority::DefaultDelay, p)); // Switch to the machine in question
2017-05-26 04:48:44 +08:00
try {
ASSERT(kt == ISimulator::RebootProcess || kt == ISimulator::Reboot || kt == ISimulator::RebootAndDelete ||
kt == ISimulator::RebootProcessAndDelete);
2017-05-26 04:48:44 +08:00
TEST(kt == ISimulator::RebootProcess); // Simulated process rebooted
TEST(kt == ISimulator::Reboot); // Simulated machine rebooted
TEST(kt == ISimulator::RebootAndDelete); // Simulated machine rebooted with data and coordination state deletion
TEST(
kt ==
ISimulator::RebootProcessAndDelete); // Simulated process rebooted with data and coordination state deletion
2017-05-26 04:48:44 +08:00
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
if (p->rebooting || !p->isReliable()) {
TraceEvent(SevDebug, "DoRebootFailed")
.detail("Rebooting", p->rebooting)
.detail("Reliable", p->isReliable());
return;
} else if (std::string(p->name) == "remote flow process") {
TraceEvent(SevDebug, "DoRebootFailed").detail("Name", p->name).detail("Address", p->address);
return;
} else if (p->getChilds().size()) {
TraceEvent(SevDebug, "DoRebootFailedOnParentProcess").detail("Address", p->address);
2017-05-26 04:48:44 +08:00
return;
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
}
2020-06-24 01:07:39 +08:00
TraceEvent("RebootingProcess")
.detail("KillType", kt)
.detail("Address", p->address)
.detail("ZoneId", p->locality.zoneId())
.detail("DataHall", p->locality.dataHallId())
.detail("Locality", p->locality.toString())
.detail("Failed", p->failed)
.detail("Excluded", p->excluded)
.detail("Cleared", p->cleared)
.backtrace();
2017-05-26 04:48:44 +08:00
p->rebooting = true;
if ((kt == ISimulator::RebootAndDelete) || (kt == ISimulator::RebootProcessAndDelete)) {
p->cleared = true;
g_simulator.clearAddress(p->address);
}
p->shutdownSignal.send(kt);
2017-05-26 04:48:44 +08:00
} catch (Error& e) {
TraceEvent(SevError, "RebootError").error(e);
p->shutdownSignal.sendError(e); // ?
2017-05-26 04:48:44 +08:00
throw; // goes nowhere!
}
}
// Simulates delays for performing operations on disk.
//   diskParameters - shared per-file disk state; nextOperation is advanced by this call
//   size           - number of bytes of the operation
//   sync           - selects the latency model: 5ms plus up to 10ms (up to 1s under BUGGIFY) when
//                    true, otherwise up to ten IOPS periods
// Returns Never() when the current process has a failed disk, and a fixed 0.1ms delay when
// connection failures are disabled for more than 1e4 seconds.
Future<Void> waitUntilDiskReady(Reference<DiskParameters> diskParameters, int64_t size, bool sync) {
    if (g_simulator.getCurrentProcess()->failedDisk) {
        return Never();
    }
    if (g_simulator.connectionFailuresDisableDuration > 1e4)
        return delay(0.0001);

    // The disk cannot start an operation in the past.
    if (diskParameters->nextOperation < now())
        diskParameters->nextOperation = now();

    // One IOPS period plus the transfer time. `size` is converted to double first so the transfer
    // time is never truncated by integer division (it would be whenever bandwidth has an integer
    // type and size < bandwidth). NOTE(review): the type of bandwidth is not visible here — confirm.
    diskParameters->nextOperation +=
        (1.0 / diskParameters->iops) + (static_cast<double>(size) / diskParameters->bandwidth);

    double randomLatency;
    if (sync) {
        randomLatency = .005 + deterministicRandom()->random01() * (BUGGIFY ? 1.0 : .010);
    } else
        randomLatency = 10 * deterministicRandom()->random01() / diskParameters->iops;

    return delayUntil(diskParameters->nextOperation + randomLatency);
}
#if defined(_WIN32)
/* Opening with FILE_SHARE_DELETE lets simulation actually work on windows - previously renames were always failing.
FIXME: Use an actual platform abstraction for this stuff! Is there any reason we can't use underlying net2 for
example? */
2017-05-26 04:48:44 +08:00
#include <Windows.h>
int sf_open(const char* filename, int flags, int convFlags, int mode) {
HANDLE wh = CreateFile(filename,
GENERIC_READ | ((flags & IAsyncFile::OPEN_READWRITE) ? GENERIC_WRITE : 0),
FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
nullptr,
(flags & IAsyncFile::OPEN_EXCLUSIVE) ? CREATE_NEW
: (flags & IAsyncFile::OPEN_CREATE) ? OPEN_ALWAYS
: OPEN_EXISTING,
FILE_ATTRIBUTE_NORMAL,
nullptr);
2017-05-26 04:48:44 +08:00
int h = -1;
if (wh != INVALID_HANDLE_VALUE)
h = _open_osfhandle((intptr_t)wh, convFlags);
else
errno = GetLastError() == ERROR_FILE_NOT_FOUND ? ENOENT : EFAULT;
2017-05-26 04:48:44 +08:00
return h;
}
#endif
// Opens a file for asynchronous I/O
2020-12-28 12:43:47 +08:00
Future<Reference<class IAsyncFile>> Sim2FileSystem::open(const std::string& filename, int64_t flags, int64_t mode) {
ASSERT((flags & IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE) || !(flags & IAsyncFile::OPEN_CREATE) ||
StringRef(filename).endsWith(
LiteralStringRef(".fdb-lock"))); // We don't use "ordinary" non-atomic file creation right now except for
// folder locking, and we don't have code to simulate its unsafeness.
2017-05-26 04:48:44 +08:00
if ((flags & IAsyncFile::OPEN_EXCLUSIVE))
ASSERT(flags & IAsyncFile::OPEN_CREATE);
2017-05-26 04:48:44 +08:00
if (flags & IAsyncFile::OPEN_UNCACHED) {
auto& machineCache = g_simulator.getCurrentProcess()->machine->openFiles;
std::string actualFilename = filename;
if (flags & IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE) {
actualFilename = filename + ".part";
auto partFile = machineCache.find(actualFilename);
if (partFile != machineCache.end()) {
Future<Reference<IAsyncFile>> f = AsyncFileDetachable::open(partFile->second.get());
if (FLOW_KNOBS->PAGE_WRITE_CHECKSUM_HISTORY > 0)
f = map(f, [=](Reference<IAsyncFile> r) {
return Reference<IAsyncFile>(new AsyncFileWriteChecker(r));
});
return f;
2017-05-26 04:48:44 +08:00
}
}
Future<Reference<IAsyncFile>> f;
auto itr = machineCache.find(actualFilename);
if (itr == machineCache.end()) {
2020-07-13 09:30:02 +08:00
// Simulated disk parameters are shared by the AsyncFileNonDurable and the underlying SimpleFile.
// This way, they can both keep up with the time to start the next operation
2020-11-07 15:50:55 +08:00
auto diskParameters =
makeReference<DiskParameters>(FLOW_KNOBS->SIM_DISK_IOPS, FLOW_KNOBS->SIM_DISK_BANDWIDTH);
f = AsyncFileNonDurable::open(filename,
actualFilename,
SimpleFile::open(filename, flags, mode, diskParameters, false),
2021-03-08 17:49:29 +08:00
diskParameters,
(flags & IAsyncFile::OPEN_NO_AIO) == 0);
machineCache[actualFilename] = UnsafeWeakFutureReference<IAsyncFile>(f);
} else {
f = itr->second.get();
}
f = AsyncFileDetachable::open(f);
if (FLOW_KNOBS->PAGE_WRITE_CHECKSUM_HISTORY > 0)
f = map(f, [=](Reference<IAsyncFile> r) { return Reference<IAsyncFile>(new AsyncFileWriteChecker(r)); });
if (FLOW_KNOBS->ENABLE_CHAOS_FEATURES)
2021-07-20 02:13:57 +08:00
f = map(f, [=](Reference<IAsyncFile> r) { return Reference<IAsyncFile>(new AsyncFileChaos(r)); });
2021-07-10 12:42:42 +08:00
#if ENCRYPTION_ENABLED
2021-01-23 13:47:36 +08:00
if (flags & IAsyncFile::OPEN_ENCRYPTED)
f = map(f, [flags](Reference<IAsyncFile> r) {
2021-06-28 09:55:57 +08:00
auto mode = flags & IAsyncFile::OPEN_READWRITE ? AsyncFileEncrypted::Mode::APPEND_ONLY
: AsyncFileEncrypted::Mode::READ_ONLY;
return Reference<IAsyncFile>(new AsyncFileEncrypted(r, mode));
2021-01-23 13:47:36 +08:00
});
2021-07-10 12:42:42 +08:00
#endif // ENCRYPTION_ENABLED
return f;
} else
2017-05-26 04:48:44 +08:00
return AsyncFileCached::open(filename, flags, mode);
}
// Deletes the given file. If mustBeDurable, returns only when the file is guaranteed to be deleted even after a power
// failure.
2020-12-28 12:43:47 +08:00
Future<Void> Sim2FileSystem::deleteFile(const std::string& filename, bool mustBeDurable) {
return Sim2::deleteFileImpl(&g_sim2, filename, mustBeDurable);
2017-05-26 04:48:44 +08:00
}
2021-03-03 08:34:30 +08:00
// Renames `from` to `to` with a random pause of up to half a second both before and after the actual rename.
ACTOR Future<Void> renameFileImpl(std::string from, std::string to) {
	double pauseBefore = 0.5 * deterministicRandom()->random01();
	wait(delay(pauseBefore));

	::renameFile(from, to);

	double pauseAfter = 0.5 * deterministicRandom()->random01();
	wait(delay(pauseAfter));

	return Void();
}
// Renames a file; the work, including its random delays, is done by the renameFileImpl actor.
Future<Void> Sim2FileSystem::renameFile(std::string const& from, std::string const& to) {
	Future<Void> renamed = renameFileImpl(from, to);
	return renamed;
}
2020-12-28 12:43:47 +08:00
// Reports the last write time recorded for `filename`.
//
// The times live in a function-local static map keyed by file name. A file that has no entry yet gets one with a
// value of zero, and under BUGGIFY the entry is occasionally (1% of calls) bumped to the current time.
Future<std::time_t> Sim2FileSystem::lastWriteTime(const std::string& filename) {
	// TODO: update this map upon file writes.
	static std::map<std::string, double> lastWriteByFile;

	double& recorded = lastWriteByFile[filename];
	if (BUGGIFY && deterministicRandom()->random01() < 0.01) {
		recorded = now();
	}
	return recorded;
}
#ifdef ENABLE_SAMPLING
// Gives callers access to actorLineageSet; only compiled when ENABLE_SAMPLING is defined.
ActorLineageSet& Sim2FileSystem::getActorLineageSet() {
	ActorLineageSet& lineages = actorLineageSet;
	return lineages;
}
#endif
void Sim2FileSystem::newFileSystem() {
2017-05-26 04:48:44 +08:00
g_network->setGlobal(INetwork::enFileSystem, (flowGlobalType) new Sim2FileSystem());
}