foundationdb/fdbrpc/simulator.h

543 lines
19 KiB
C
Raw Normal View History

2017-05-26 04:48:44 +08:00
/*
* simulator.h
*
* This source file is part of the FoundationDB open source project
*
2022-03-22 04:36:23 +08:00
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
2017-05-26 04:48:44 +08:00
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
2017-05-26 04:48:44 +08:00
* http://www.apache.org/licenses/LICENSE-2.0
*
2017-05-26 04:48:44 +08:00
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FLOW_SIMULATOR_H
#define FLOW_SIMULATOR_H
#include "flow/ProtocolVersion.h"
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
#include <algorithm>
#include <string>
2017-05-26 04:48:44 +08:00
#pragma once
#include "flow/flow.h"
#include "flow/Histogram.h"
#include "fdbrpc/FailureMonitor.h"
#include "fdbrpc/Locality.h"
#include "fdbrpc/IAsyncFile.h"
2017-05-26 04:48:44 +08:00
#include "flow/TDMetric.actor.h"
#include <random>
#include "fdbrpc/ReplicationPolicy.h"
// Mode argument accepted by ISimulator::clogInterface(); ClogDefault is that parameter's default value.
enum ClogMode { ClogDefault, ClogAll, ClogSend, ClogReceive };
class ISimulator : public INetwork {
public:
// Order matters!
// The enumerators keep their implicit values (KillInstantly == 0 through None == 7), so
// reordering or inserting entries changes how any two KillType values compare.
enum KillType {
KillInstantly,
InjectFaults,
FailDisk,
RebootAndDelete,
RebootProcessAndDelete,
Reboot,
RebootProcess,
None
};
2017-05-26 04:48:44 +08:00
2021-03-06 03:28:15 +08:00
// Order matters! all modes >= 2 are fault injection modes
// (with the implicit values below, that is EnabledAddDelay == 2 and EnabledDropMutations == 3).
enum TSSMode { Disabled, EnabledNormal, EnabledAddDelay, EnabledDropMutations };
// Type of the backupAgents and drAgents members of ISimulator.
enum class BackupAgentType { NoBackupAgents, WaitForType, BackupToFile, BackupToDB };
2017-05-26 04:48:44 +08:00
// Subclasses may subclass ProcessInfo as well
struct MachineInfo;
struct ProcessInfo : NonCopyable {
const char* name;
const char* coordinationFolder;
const char* dataFolder;
MachineInfo* machine;
NetworkAddressList addresses;
2017-05-26 04:48:44 +08:00
NetworkAddress address;
LocalityData locality;
2017-05-26 04:48:44 +08:00
ProcessClass startingClass;
TDMetricCollection tdmetrics;
2021-07-29 07:03:37 +08:00
ChaosMetrics chaosMetrics;
HistogramRegistry histograms;
std::map<NetworkAddress, Reference<IListener>> listenerMap;
2020-08-07 04:06:50 +08:00
std::map<NetworkAddress, Reference<IUDPSocket>> boundUDPSockets;
2017-05-26 04:48:44 +08:00
bool failed;
bool excluded;
bool cleared;
2017-05-26 04:48:44 +08:00
bool rebooting;
std::vector<flowGlobalType> globals;
INetworkConnections* network;
2017-05-26 04:48:44 +08:00
uint64_t fault_injection_r;
double fault_injection_p1, fault_injection_p2;
bool failedDisk;
2017-05-26 04:48:44 +08:00
UID uid;
ProtocolVersion protocolVersion;
2022-03-15 23:57:26 +08:00
bool excludeFromRestarts = false;
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
std::vector<ProcessInfo*> childs;
ProcessInfo(const char* name,
LocalityData locality,
ProcessClass startingClass,
NetworkAddressList addresses,
INetworkConnections* net,
const char* dataFolder,
const char* coordinationFolder)
2021-07-23 13:48:27 +08:00
: name(name), coordinationFolder(coordinationFolder), dataFolder(dataFolder), machine(nullptr),
addresses(addresses), address(addresses.address), locality(locality), startingClass(startingClass),
failed(false), excluded(false), cleared(false), rebooting(false), network(net), fault_injection_r(0),
fault_injection_p1(0), fault_injection_p2(0), failedDisk(false) {
uid = deterministicRandom()->randomUniqueID();
}
2017-05-26 04:48:44 +08:00
Future<KillType> onShutdown() { return shutdownSignal.getFuture(); }
bool isReliable() const {
return !failed && fault_injection_p1 == 0 && fault_injection_p2 == 0 && !failedDisk &&
(!machine || (machine->machineProcess->fault_injection_p1 == 0 &&
machine->machineProcess->fault_injection_p2 == 0));
}
bool isAvailable() const { return !isExcluded() && isReliable(); }
bool isExcluded() const { return excluded; }
bool isCleared() const { return cleared; }
2020-11-23 11:29:27 +08:00
std::string getReliableInfo() const {
std::stringstream ss;
ss << "failed:" << failed << " fault_injection_p1:" << fault_injection_p1
<< " fault_injection_p2:" << fault_injection_p2;
return ss.str();
}
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
std::vector<ProcessInfo*> const& getChilds() const { return childs; }
2017-05-26 04:48:44 +08:00
// Return true if the class type is suitable for stateful roles, such as tLog and StorageServer.
bool isAvailableClass() const {
switch (startingClass._class) {
case ProcessClass::UnsetClass:
return true;
case ProcessClass::StorageClass:
return true;
case ProcessClass::TransactionClass:
return true;
case ProcessClass::ResolutionClass:
return false;
case ProcessClass::CommitProxyClass:
return false;
case ProcessClass::GrvProxyClass:
return false;
case ProcessClass::MasterClass:
return false;
case ProcessClass::TesterClass:
return false;
case ProcessClass::StatelessClass:
return false;
case ProcessClass::LogClass:
return true;
case ProcessClass::LogRouterClass:
return false;
case ProcessClass::ClusterControllerClass:
return false;
case ProcessClass::DataDistributorClass:
return false;
case ProcessClass::RatekeeperClass:
return false;
2021-09-15 23:35:58 +08:00
case ProcessClass::BlobManagerClass:
return false;
case ProcessClass::StorageCacheClass:
return false;
case ProcessClass::BackupClass:
return false;
case ProcessClass::EncryptKeyProxyClass:
return false;
default:
return false;
}
}
Reference<IListener> getListener(const NetworkAddress& addr) const {
auto listener = listenerMap.find(addr);
ASSERT(listener != listenerMap.end());
return listener->second;
}
inline flowGlobalType global(int id) const { return (globals.size() > id) ? globals[id] : nullptr; };
inline void setGlobal(size_t id, flowGlobalType v) {
globals.resize(std::max(globals.size(), id + 1));
globals[id] = v;
};
2017-05-26 04:48:44 +08:00
std::string toString() const {
return format(
"name: %s address: %s zone: %s datahall: %s class: %s excluded: %d cleared: %d",
name,
formatIpPort(addresses.address.ip, addresses.address.port).c_str(),
(locality.zoneId().present() ? locality.zoneId().get().printable().c_str() : "[unset]"),
(locality.dataHallId().present() ? locality.dataHallId().get().printable().c_str() : "[unset]"),
startingClass.toString().c_str(),
excluded,
cleared);
2019-01-09 23:41:02 +08:00
}
2017-05-26 04:48:44 +08:00
// Members not for external use
Promise<KillType> shutdownSignal;
};
// A set of data associated with a simulated machine
2017-05-26 04:48:44 +08:00
struct MachineInfo {
ProcessInfo* machineProcess;
std::vector<ProcessInfo*> processes;
// A map from filename to file handle for all open files on a machine
std::map<std::string, UnsafeWeakFutureReference<IAsyncFile>> openFiles;
std::set<std::string> deletingFiles;
2017-05-26 04:48:44 +08:00
std::set<std::string> closingFiles;
Optional<Standalone<StringRef>> machineId;
2017-05-26 04:48:44 +08:00
Remote ikvs debugging (#6465) * initial structure for remote IKVS server * moved struct to .h file, added new files to CMakeList * happy path implementation, connection error when testing * saved minor local change * changed tracing to debug * fixed onClosed and getError being called before init is finished * fix spawn process bug, now use absolute path * added server knob to set ikvs process port number * added server knob for remote/local kv store * implement simulator remote process spawning * fixed bug for simulator timeout * commit all changes * removed print lines in trace * added FlowProcess implementation by Markus * initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child * temporary fix for process factory throwing segfault on create * specify public address in command * change remote kv store knob to false for jenkins build * made port 0 open random unused port * change remote store knob to true for benchmark * set listening port to randomly opened port * added print lines for jenkins run open kv store timeout debug * removed most tracing and print lines * removed tutorial changes * update handleIOErrors error handling to handle remote-ikvs cases * Push all debugging changes * A version where worker bug exists * A version where restarting tests fail * Use both the name and the port to determine the child process * Remove unnecessary update on local address * Disable remote-kvs for DiskFailureCycle test * A version where restarting stuck * A version where most restarting tests green * Reset connection with child process explicitly * Remove change on unnecessary files * Unify flags from _ to - * fix merging unexpected changes * fix trac.error to .errorUnsuppressed * Add license header * Remove unnecessary header in FlowProcess.actor.cpp * Fix Windows build * Fix Windows build, add missing ; * Fix a stupid bug caused by code dropped by code merging * Disable remote kvs by default * Pass the conn_file path to the flow process, though 
not needed, but the buildNetwork is difficult to tune * serialization change on readrange * Update traces * Refactor the RemoteIKVS interface * Format files * Update sim2 interface to not clog connections between parent and child processes in simulation * Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled * Add comments, format files * Change method name from isBuggifyDisabled to isStableConnection; Decrease(0.1x) latency for stable connections * Commit the IConnection interface change, forgot in previous commit * Fix the issue that onClosed request is cancelled by ActorCollection * Enable the remote kv store knob * Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process * Fix the bug where one process starts storage server more than once * Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally * Remove unreachable code path and add comments * Clang format the code * Fix a simple wait error * Clang format after merging the main branch * Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false * Disable remote kvs for PhysicalShardMove which is for RocksDB * Cleanup #include orders, remove debugging traces * Revert the reorder in fdbserver.actor.cpp, which fails the gcc build Co-authored-by: “Lincoln <“lincoln.xiao@snowflake.com”>
2022-04-01 08:08:59 +08:00
const uint16_t remotePortStart;
std::vector<uint16_t> usedRemotePorts;
MachineInfo() : machineProcess(nullptr), remotePortStart(1000) {}
short getRandomPort() {
for (uint16_t i = remotePortStart; i < 60000; i++) {
if (std::find(usedRemotePorts.begin(), usedRemotePorts.end(), i) == usedRemotePorts.end()) {
TraceEvent(SevDebug, "RandomPortOpened").detail("PortNum", i);
usedRemotePorts.push_back(i);
return i;
}
}
UNREACHABLE();
}
void removeRemotePort(uint16_t port) {
if (port < remotePortStart)
return;
auto pos = std::find(usedRemotePorts.begin(), usedRemotePorts.end(), port);
if (pos != usedRemotePorts.end()) {
usedRemotePorts.erase(pos);
}
}
2017-05-26 04:48:44 +08:00
};
// Resolves an endpoint to a process through the endpoint's primary address.
ProcessInfo* getProcess(Endpoint const& endpoint) {
    auto const& primaryAddress = endpoint.getPrimaryAddress();
    return getProcessByAddress(primaryAddress);
}

// Accessors for the static currentProcess pointer.
ProcessInfo* getCurrentProcess() {
    return currentProcess;
}
ProcessInfo const* getCurrentProcess() const {
    return currentProcess;
}
2020-07-13 09:30:02 +08:00
// onProcess: wait for the process to be scheduled by the runloop; a task will be created for the process.
virtual Future<Void> onProcess(ISimulator::ProcessInfo* process, TaskPriority taskID = TaskPriority::Zero) = 0;
// NOTE(review): presumably the machine-level counterpart of onProcess — confirm against the implementation.
virtual Future<Void> onMachine(ISimulator::ProcessInfo* process, TaskPriority taskID = TaskPriority::Zero) = 0;
// Creates a new simulated process. The returned pointer's ownership is not visible from this
// header; see destroyProcess().
virtual ProcessInfo* newProcess(const char* name,
IPAddress ip,
uint16_t port,
bool sslEnabled,
uint16_t listenPerProcess,
LocalityData locality,
ProcessClass startingClass,
const char* dataFolder,
const char* coordinationFolder,
ProtocolVersion protocol) = 0;
// Kill / reboot primitives. All are pure virtual; the effect of each KillType is defined by
// the concrete simulator, not by this interface.
virtual void killProcess(ProcessInfo* machine, KillType) = 0;
virtual void rebootProcess(Optional<Standalone<StringRef>> zoneId, bool allProcesses) = 0;
virtual void rebootProcess(ProcessInfo* process, KillType kt) = 0;
virtual void killInterface(NetworkAddress address, KillType) = 0;
// The three functions below share one signature shape: forceKill defaults to false and
// ktFinal is an optional out-parameter that defaults to nullptr.
// NOTE(review): presumably ktFinal receives the kill type actually applied — confirm.
virtual bool killMachine(Optional<Standalone<StringRef>> machineId,
KillType kt,
bool forceKill = false,
KillType* ktFinal = nullptr) = 0;
virtual bool killZone(Optional<Standalone<StringRef>> zoneId,
KillType kt,
bool forceKill = false,
KillType* ktFinal = nullptr) = 0;
virtual bool killDataCenter(Optional<Standalone<StringRef>> dcId,
KillType kt,
bool forceKill = false,
KillType* ktFinal = nullptr) = 0;
// virtual KillType getMachineKillState( UID zoneID ) = 0;
virtual bool canKillProcesses(std::vector<ProcessInfo*> const& availableProcesses,
std::vector<ProcessInfo*> const& deadProcesses,
KillType kt,
KillType* newKillType) const = 0;
virtual bool isAvailable() const = 0;
virtual bool datacenterDead(Optional<Standalone<StringRef>> dcId) const = 0;
// Virtual but not pure: this one is only declared here, so its definition lives elsewhere.
virtual void displayWorkers() const;
2021-10-20 08:37:03 +08:00
ProtocolVersion protocolVersion() const override = 0;
2020-11-23 11:29:27 +08:00
// Increments the reference count of `role` at `address` (creating the entries on first use)
// and traces the resulting role count and reference count.
void addRole(NetworkAddress const& address, std::string const& role) {
    auto& rolesAtAddress = roleAddresses[address];
    int& refCount = rolesAtAddress[role];
    ++refCount;

    TraceEvent("RoleAdd")
        .detail("Address", address)
        .detail("Role", role)
        .detail("NumRoles", rolesAtAddress.size())
        .detail("Value", refCount);
}
2020-11-23 11:29:27 +08:00
// Drops one reference to `role` at `address`. The role entry is erased when its count reaches
// zero, and the address entry is erased once it has no roles left. Unknown addresses or roles
// are only reported with a SevWarn trace.
void removeRole(NetworkAddress const& address, std::string const& role) {
    auto addressEntry = roleAddresses.find(address);
    if (addressEntry == roleAddresses.end()) {
        TraceEvent(SevWarn, "RoleRemove")
            .detail("Address", address)
            .detail("Role", role)
            .detail("Result", "Address Missing");
        return;
    }

    auto& roles = addressEntry->second;
    auto roleEntry = roles.find(role);
    if (roleEntry == roles.end()) {
        TraceEvent(SevWarn, "RoleRemove")
            .detail("Address", address)
            .detail("Role", role)
            .detail("Result", "Role Missing");
        return;
    }

    // More than one reference outstanding: just decrement.
    if (roleEntry->second > 1) {
        roleEntry->second--;
        TraceEvent("RoleRemove")
            .detail("Address", address)
            .detail("Role", role)
            .detail("NumRoles", roles.size())
            .detail("Value", roleEntry->second)
            .detail("Result", "Decremented Role");
        return;
    }

    // Last reference to this role.
    roles.erase(roleEntry);
    if (!roles.empty()) {
        TraceEvent("RoleRemove")
            .detail("Address", address)
            .detail("Role", role)
            .detail("NumRoles", roles.size())
            .detail("Value", 0)
            .detail("Result", "Removed Role");
        return;
    }

    // That was also the last role at this address. `roles` dangles after the erase and is not used again.
    roleAddresses.erase(addressEntry);
    TraceEvent("RoleRemove")
        .detail("Address", address)
        .detail("Role", role)
        .detail("NumRoles", 0)
        .detail("Value", 0)
        .detail("Result", "Removed Address");
}
2020-11-23 11:29:27 +08:00
// Builds a space-separated list of the roles recorded for `address`. A role held more than
// once is rendered as "<role>-<count>". The "Worker" role is left out unless skipWorkers is
// false. Returns "[unset]" when nothing is listed.
std::string getRoles(NetworkAddress const& address, bool skipWorkers = true) const {
    std::string roleText;

    auto found = roleAddresses.find(address);
    if (found != roleAddresses.end()) {
        for (auto& entry : found->second) {
            if (skipWorkers && entry.first == "Worker")
                continue;
            roleText += entry.first;
            if (entry.second > 1)
                roleText += format("-%d ", entry.second);
            else
                roleText += " ";
        }
    }

    return roleText.empty() ? std::string("[unset]") : roleText;
}
2017-05-26 04:48:44 +08:00
2020-11-23 11:29:27 +08:00
// Bumps the counter kept for `address` in clearedAddresses and traces the new value.
void clearAddress(NetworkAddress const& address) {
    int& timesCleared = clearedAddresses[address];
    ++timesCleared;
    TraceEvent("ClearAddress").detail("Address", address).detail("Value", timesCleared);
}
2020-11-23 11:29:27 +08:00
// True when `address` has an entry in clearedAddresses.
bool isCleared(NetworkAddress const& address) const {
    return clearedAddresses.count(address) != 0;
}
2020-11-23 11:29:27 +08:00
// Bumps the exclusion counter of `address` and traces the new value.
void excludeAddress(NetworkAddress const& address) {
    int& timesExcluded = excludedAddresses[address];
    ++timesExcluded;
    TraceEvent("ExcludeAddress").detail("Address", address).detail("Value", timesExcluded);
}
2020-11-23 11:29:27 +08:00
// Undoes one excludeAddress() call: decrements the counter, or erases the entry when this was
// the last exclusion. An address that is not excluded only produces a SevWarn trace.
void includeAddress(NetworkAddress const& address) {
    auto entry = excludedAddresses.find(address);
    if (entry == excludedAddresses.end()) {
        TraceEvent(SevWarn, "IncludeAddress").detail("Address", address).detail("Result", "Missing");
        return;
    }

    if (entry->second > 1) {
        entry->second--;
        TraceEvent("IncludeAddress")
            .detail("Address", address)
            .detail("Value", entry->second)
            .detail("Result", "Decremented");
        return;
    }

    excludedAddresses.erase(entry);
    TraceEvent("IncludeAddress").detail("Address", address).detail("Value", 0).detail("Result", "Removed");
}
2020-11-23 11:29:27 +08:00
// Traces how many addresses are currently excluded, then forgets all of them.
void includeAllAddresses() {
    const auto excludedTotal = excludedAddresses.size();
    TraceEvent("IncludeAddressAll").detail("AddressTotal", excludedTotal);
    excludedAddresses.clear();
}
2020-11-23 11:29:27 +08:00
// True when `address` has an entry in excludedAddresses.
bool isExcluded(NetworkAddress const& address) const {
    return excludedAddresses.count(address) != 0;
}
2020-11-23 11:29:27 +08:00
void disableSwapToMachine(Optional<Standalone<StringRef>> zoneId) { swapsDisabled.insert(zoneId); }
void enableSwapToMachine(Optional<Standalone<StringRef>> zoneId) {
2017-05-26 04:48:44 +08:00
swapsDisabled.erase(zoneId);
allSwapsDisabled = false;
}
2020-11-23 11:29:27 +08:00
bool canSwapToMachine(Optional<Standalone<StringRef>> zoneId) const {
return swapsDisabled.count(zoneId) == 0 && !allSwapsDisabled && !extraDB;
2017-05-26 04:48:44 +08:00
}
2020-11-23 11:29:27 +08:00
void enableSwapsToAll() {
2017-05-26 04:48:44 +08:00
swapsDisabled.clear();
allSwapsDisabled = false;
}
2020-11-23 11:29:27 +08:00
void disableSwapsToAll() {
2017-05-26 04:48:44 +08:00
swapsDisabled.clear();
allSwapsDisabled = true;
}
// Network clogging hooks; `seconds` is the duration argument and `mode` defaults to ClogDefault.
// NOTE(review): the exact effect of each ClogMode is defined by the implementation — not visible here.
virtual void clogInterface(const IPAddress& ip, double seconds, ClogMode mode = ClogDefault) = 0;
virtual void clogPair(const IPAddress& from, const IPAddress& to, double seconds) = 0;
// Process enumeration and lookup; getProcess() delegates to getProcessByAddress().
virtual std::vector<ProcessInfo*> getAllProcesses() const = 0;
virtual ProcessInfo* getProcessByAddress(NetworkAddress const& address) = 0;
2017-05-26 04:48:44 +08:00
// Machine lookup by network address or by machine id.
virtual MachineInfo* getMachineByNetworkAddress(NetworkAddress const& address) = 0;
virtual MachineInfo* getMachineById(Optional<Standalone<StringRef>> const& machineId) = 0;
// run() is overridden with an empty body in this class.
void run() override {}
// Teardown counterparts of newProcess(); pure virtual, so semantics are implementation-defined.
virtual void destroyProcess(ProcessInfo* p) = 0;
virtual void destroyMachine(Optional<Standalone<StringRef>> const& machineId) = 0;
2017-05-26 04:48:44 +08:00
int desiredCoordinators;
int physicalDatacenters;
int processesPerMachine;
int listenersPerProcess;
2017-05-26 04:48:44 +08:00
std::set<NetworkAddress> protectedAddresses;
std::map<NetworkAddress, ProcessInfo*> currentlyRebootingProcesses;
std::unique_ptr<class ClusterConnectionString> extraDB;
Reference<IReplicationPolicy> storagePolicy;
Reference<IReplicationPolicy> tLogPolicy;
int32_t tLogWriteAntiQuorum;
Optional<Standalone<StringRef>> primaryDcId;
Reference<IReplicationPolicy> remoteTLogPolicy;
int32_t usableRegions;
std::string disablePrimary;
std::string disableRemote;
std::string originalRegions;
std::string startingDisabledConfiguration;
bool allowLogSetKills;
Optional<Standalone<StringRef>> remoteDcId;
bool hasSatelliteReplication;
Reference<IReplicationPolicy> satelliteTLogPolicy;
Reference<IReplicationPolicy> satelliteTLogPolicyFallback;
int32_t satelliteTLogWriteAntiQuorum;
int32_t satelliteTLogWriteAntiQuorumFallback;
std::vector<Optional<Standalone<StringRef>>> primarySatelliteDcIds;
std::vector<Optional<Standalone<StringRef>>> remoteSatelliteDcIds;
2021-03-06 03:28:15 +08:00
TSSMode tssMode;
std::map<NetworkAddress, bool> corruptWorkerMap;
2021-08-07 14:18:10 +08:00
ConfigDBType configDBType;
2017-05-26 04:48:44 +08:00
// Used by workloads that perform reconfigurations
2017-05-26 04:48:44 +08:00
int testerCount;
std::string connectionString;
bool isStopped;
double lastConnectionFailure;
double connectionFailuresDisableDuration;
2017-05-26 04:48:44 +08:00
bool speedUpSimulation;
BackupAgentType backupAgents;
BackupAgentType drAgents;
bool restarted = false;
2017-05-26 04:48:44 +08:00
bool hasDiffProtocolProcess; // true if simulator is testing a process with a different version
bool setDiffProtocol; // true if a process with a different protocol version has been started
bool allowStorageMigrationTypeChange = false;
flowGlobalType global(int id) const final { return getCurrentProcess()->global(id); };
void setGlobal(size_t id, flowGlobalType v) final { getCurrentProcess()->setGlobal(id, v); };
2017-05-26 04:48:44 +08:00
// Records `time` under `desc` in disabledMap, overwriting any earlier value.
void disableFor(const std::string& desc, double time) {
    double& slot = disabledMap[desc];
    slot = time;
}

// Returns the value recorded for `desc` by disableFor(), or 0 when there is none.
double checkDisabled(const std::string& desc) const {
    auto found = disabledMap.find(desc);
    return found == disabledMap.end() ? 0.0 : found->second;
}
2019-03-14 12:27:23 +08:00
2017-05-26 04:48:44 +08:00
static thread_local ProcessInfo* currentProcess;
// Looks up the current process's address in corruptWorkerMap and returns the stored flag;
// addresses without an entry yield false. Dereferences currentProcess unconditionally.
bool checkInjectedCorruption() {
    auto found = corruptWorkerMap.find(currentProcess->address);
    return found != corruptWorkerMap.end() && found->second;
}
ISimulator();
virtual ~ISimulator();
protected:
2017-05-26 04:48:44 +08:00
Mutex mutex;
private:
std::set<Optional<Standalone<StringRef>>> swapsDisabled;
std::map<NetworkAddress, int> excludedAddresses;
std::map<NetworkAddress, int> clearedAddresses;
std::map<NetworkAddress, std::map<std::string, int>> roleAddresses;
std::map<std::string, double> disabledMap;
2017-05-26 04:48:44 +08:00
bool allSwapsDisabled;
};
// Quickly make existing code work that expects g_simulator to be of class type (not a pointer)
// g_simulator expands to (*g_pSimulator), so every use dereferences the pointer at that point.
extern ISimulator* g_pSimulator;
#define g_simulator (*g_pSimulator)
// NOTE(review): the meaning of printSimTime is not visible in this header — see the definition.
void startNewSimulator(bool printSimTime);
2017-05-26 04:48:44 +08:00
// Parameters used to simulate disk performance
2017-05-26 04:48:44 +08:00
struct DiskParameters : ReferenceCounted<DiskParameters> {
    // Starts at 0 for every newly constructed instance.
    double nextOperation = 0;
    int64_t iops;
    int64_t bandwidth;

    // Stores the given iops and bandwidth values unchanged.
    DiskParameters(int64_t iops, int64_t bandwidth) : iops(iops), bandwidth(bandwidth) {}
};
// Simulates delays for performing operations on disk
2017-05-26 04:48:44 +08:00
extern Future<Void> waitUntilDiskReady(Reference<DiskParameters> parameters, int64_t size, bool sync = false);
class Sim2FileSystem : public IAsyncFileSystem {
public:
// Opens a file for asynchronous I/O
2020-12-28 12:43:47 +08:00
Future<Reference<class IAsyncFile>> open(const std::string& filename, int64_t flags, int64_t mode) override;
2017-05-26 04:48:44 +08:00
// Deletes the given file. If mustBeDurable, returns only when the file is guaranteed to be deleted even after a
// power failure.
2020-12-28 12:43:47 +08:00
Future<Void> deleteFile(const std::string& filename, bool mustBeDurable) override;
2020-12-28 12:43:47 +08:00
Future<std::time_t> lastWriteTime(const std::string& filename) override;
2017-05-26 04:48:44 +08:00
#ifdef ENABLE_SAMPLING
ActorLineageSet& getActorLineageSet() override;
#endif
Future<Void> renameFile(std::string const& from, std::string const& to) override;
2017-05-26 04:48:44 +08:00
Sim2FileSystem() {}
~Sim2FileSystem() override {}
2017-05-26 04:48:44 +08:00
static void newFileSystem();
#ifdef ENABLE_SAMPLING
ActorLineageSet actorLineageSet;
#endif
2017-05-26 04:48:44 +08:00
};
#endif