A version that passes 100K simulations
This commit is contained in:
parent dcfbb55ad5
commit 3056702f2b
@@ -686,14 +686,14 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
     init( REPLACE_INTERFACE_CHECK_DELAY, 5.0 );
     init( COORDINATOR_REGISTER_INTERVAL, 5.0 );
     init( CLIENT_REGISTER_INTERVAL, 600.0 );
-    init( CC_ENABLE_WORKER_HEALTH_MONITOR, true );
-    init( CC_WORKER_HEALTH_CHECKING_INTERVAL, 30.0 );
+    init( CC_ENABLE_WORKER_HEALTH_MONITOR, false );
+    init( CC_WORKER_HEALTH_CHECKING_INTERVAL, 60.0 );
     init( CC_DEGRADED_LINK_EXPIRATION_INTERVAL, 300.0 );
-    init( CC_MIN_DEGRADATION_INTERVAL, 60.0 );
+    init( CC_MIN_DEGRADATION_INTERVAL, 120.0 );
     init( ENCRYPT_KEY_PROXY_FAILURE_TIME, 0.1 ); if ( isSimulated ) ENCRYPT_KEY_PROXY_FAILURE_TIME = 1.0 + deterministicRandom()->random01();
     init( CC_DEGRADED_PEER_DEGREE_TO_EXCLUDE, 3 );
     init( CC_MAX_EXCLUSION_DUE_TO_HEALTH, 2 );
-    init( CC_HEALTH_TRIGGER_RECOVERY, true );
+    init( CC_HEALTH_TRIGGER_RECOVERY, false );
     init( CC_TRACKING_HEALTH_RECOVERY_INTERVAL, 3600.0 );
     init( CC_MAX_HEALTH_RECOVERY_COUNT, 5 );
     init( CC_HEALTH_TRIGGER_FAILOVER, false );
@@ -974,7 +974,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
     init( MIN_DELAY_CC_WORST_FIT_CANDIDACY_SECONDS, 10.0 );
     init( MAX_DELAY_CC_WORST_FIT_CANDIDACY_SECONDS, 30.0 );
     init( DBINFO_FAILED_DELAY, 1.0 );
-    init( ENABLE_WORKER_HEALTH_MONITOR, true ); if ( randomize && BUGGIFY ) ENABLE_WORKER_HEALTH_MONITOR = true;
+    init( ENABLE_WORKER_HEALTH_MONITOR, false ); if ( randomize && BUGGIFY ) ENABLE_WORKER_HEALTH_MONITOR = true;
     init( WORKER_HEALTH_MONITOR_INTERVAL, 60.0 );
     init( PEER_LATENCY_CHECK_MIN_POPULATION, 30 );
     init( PEER_LATENCY_DEGRADATION_PERCENTILE, 0.50 );
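The knob hunks above turn the worker-health-monitor defaults off while keeping the trailing "if ( randomize && BUGGIFY )" clause, so simulation can still randomly re-enable the feature. A minimal standalone sketch of that default-plus-BUGGIFY pattern follows; it is plain C++ for illustration only, not the actual knob macro machinery, and the struct and parameter names here are assumptions:

// Sketch of the default-plus-BUGGIFY knob pattern shown in the diff above.
struct HealthMonitorKnobsSketch {
    bool ENABLE_WORKER_HEALTH_MONITOR = false;
    double WORKER_HEALTH_MONITOR_INTERVAL = 60.0; // seconds between health checks

    void initialize(bool randomize, bool buggifyEnabled) {
        // Mirrors: init( ENABLE_WORKER_HEALTH_MONITOR, false );
        //          if ( randomize && BUGGIFY ) ENABLE_WORKER_HEALTH_MONITOR = true;
        ENABLE_WORKER_HEALTH_MONITOR = false;
        if (randomize && buggifyEnabled) {
            ENABLE_WORKER_HEALTH_MONITOR = true; // simulation may still exercise the monitor
        }
        WORKER_HEALTH_MONITOR_INTERVAL = 60.0;
    }
};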
@@ -129,7 +129,6 @@ bool ClusterControllerData::transactionSystemContainsDegradedServers() {
        }
    }

    /*
    if (recoveryData->recoveryState < RecoveryState::ACCEPTING_COMMITS) {
        for (const auto& tlog : recoveryData->recruitment.tLogs) {
            if (tlog.addresses().contains(server)) {
@@ -141,7 +140,7 @@ bool ClusterControllerData::transactionSystemContainsDegradedServers() {
                return true;
            }
        }
    }*/
    }

    for (const auto& proxy : dbi.client.grvProxies) {
        if (proxy.addresses().contains(server)) {
@@ -758,6 +757,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
 }
 
 ACTOR Future<Void> doCheckOutstandingRequests(ClusterControllerData* self) {
+    TraceEvent("ZZZZZdoCheckOutstandingRequestsCall").log();
     try {
         wait(delay(SERVER_KNOBS->CHECK_OUTSTANDING_INTERVAL));
         while (now() - self->lastRecruitTime < SERVER_KNOBS->SINGLETON_RECRUIT_BME_DELAY ||
@@ -788,6 +788,7 @@ ACTOR Future<Void> doCheckOutstandingRequests(ClusterControllerData* self) {
             TraceEvent(SevError, "CheckOutstandingError").error(e);
         }
     }
+    TraceEvent("ZZZZZdoCheckOutstandingRequestsReturn").log();
     return Void();
 }
 
@@ -808,6 +809,7 @@ ACTOR Future<Void> doCheckOutstandingRemoteRequests(ClusterControllerData* self)
 }
 
 void checkOutstandingRequests(ClusterControllerData* self) {
+    TraceEvent("ZZZZZCallCheckOutstandingRequests").log();
     if (self->outstandingRemoteRequestChecker.isReady()) {
         self->outstandingRemoteRequestChecker = doCheckOutstandingRemoteRequests(self);
     }
@@ -2848,11 +2850,13 @@ ACTOR Future<Void> workerHealthMonitor(ClusterControllerData* self) {
 
     // Compare `self->degradationInfo` with `self->excludedDegradedServers` and remove those that have
     // recovered.
+    bool hasRecoveredServer = false;
     for (auto it = self->excludedDegradedServers.begin(); it != self->excludedDegradedServers.end();) {
         if (self->degradationInfo.degradedServers.find(*it) == self->degradationInfo.degradedServers.end() &&
             self->degradationInfo.disconnectedServers.find(*it) ==
                 self->degradationInfo.disconnectedServers.end()) {
             self->excludedDegradedServers.erase(it++);
+            hasRecoveredServer = true;
         } else {
             ++it;
         }
@@ -2911,6 +2915,10 @@ ACTOR Future<Void> workerHealthMonitor(ClusterControllerData* self) {
         }
     }
 
+    if (hasRecoveredServer) {
+        checkOutstandingRequests(self);
+    }
+
     wait(delay(SERVER_KNOBS->CC_WORKER_HEALTH_CHECKING_INTERVAL));
 } catch (Error& e) {
     TraceEvent(SevWarnAlways, "ClusterControllerHealthMonitorError").error(e);
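The two workerHealthMonitor hunks above add recovery tracking: entries in excludedDegradedServers that are no longer degraded or disconnected get erased, and if anything recovered the controller re-runs checkOutstandingRequests. A standalone sketch of that erase-while-iterating step, using std::set stand-ins (the container and function names are placeholders, not the FDB types):

#include <set>
#include <string>

// Drop entries from `excluded` that appear in neither `degraded` nor `disconnected`;
// return true if anything was removed, i.e. at least one server recovered.
bool removeRecoveredServers(std::set<std::string>& excluded,
                            const std::set<std::string>& degraded,
                            const std::set<std::string>& disconnected) {
    bool recovered = false;
    for (auto it = excluded.begin(); it != excluded.end();) {
        if (degraded.count(*it) == 0 && disconnected.count(*it) == 0) {
            it = excluded.erase(it); // erase() returns the next valid iterator
            recovered = true;
        } else {
            ++it;
        }
    }
    return recovered;
}

A caller would re-check outstanding recruitment requests when this returns true, mirroring the "if (hasRecoveredServer) checkOutstandingRequests(self);" addition above.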
@@ -3097,7 +3105,7 @@ ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
 
     for (auto const& [id, worker] : self.id_worker) {
         if ((req.flags & GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY) &&
-            self.db.config.isExcludedServer(worker.details.interf.addresses())) {
+            (self.db.config.isExcludedServer(worker.details.interf.addresses()) || self.isExcludedDegradedServer(worker.details.interf.addresses()))) {
             continue;
         }
 
@@ -1496,6 +1496,9 @@ public:
 
     for (auto& it : id_worker) {
         auto fitness = it.second.details.processClass.machineClassFitness(role);
+        if (role == ProcessClass::ClusterRole::Master) {
+            TraceEvent("ZZZZZZGetMasterRoleFitness").detail("Worker", it.second.details.interf.address()).detail("Fitness", fitness);
+        }
         if (conf.isExcludedServer(it.second.details.interf.addresses()) ||
             isExcludedDegradedServer(it.second.details.interf.addresses())) {
             fitness = std::max(fitness, ProcessClass::ExcludeFit);
@@ -2388,6 +2391,7 @@ public:
     // This function returns true when the cluster controller determines it is worth forcing
     // a cluster recovery in order to change the recruited processes in the transaction subsystem.
     bool betterMasterExists() {
+        TraceEvent("ZZZZZCallingBetterMasterExist").log();
         const ServerDBInfo dbi = db.serverInfo->get();
 
         if (dbi.recoveryState < RecoveryState::ACCEPTING_COMMITS) {
@@ -2577,6 +2581,7 @@ public:
         ;
         return false;
     }
+    TraceEvent("ZZZZZZMasterFit").detail("Old", oldMasterFit).detail("New", newMasterFit);
     if (oldMasterFit > newMasterFit || (dbi.master.locality.processId() == clusterControllerProcessId &&
                                         mworker.worker.interf.locality.processId() != clusterControllerProcessId)) {
         TraceEvent("BetterMasterExists", id)
@@ -129,14 +129,14 @@ struct ClogTlogWorkload : TestWorkload {
     }
 
     ACTOR static Future<Void> excludeFailedLog(ClogTlogWorkload* self, Database cx) {
-        state Future<Void> timeout = delay(300);
+        state Future<Void> timeout = delay(30);
 
         loop choose {
             when(wait(self->dbInfo->onChange())) {
                 if (self->dbInfo->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS) {
                     return Void();
                 }
-                timeout = delay(300);
+                timeout = delay(30);
             }
             when(wait(timeout)) {
                 // recovery state hasn't changed in 30s, exclude the failed tlog
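The excludeFailedLog change above shortens the exclusion timeout to 30 seconds and resets it on every recovery-state change, so the clogged tlog is excluded only after 30 seconds with no recovery progress. A plain C++ sketch of that deadline-reset pattern (the type and member names are illustrative, not the Flow futures used in the workload):

#include <chrono>

// Each observed recovery-state change pushes the exclusion deadline 30s out;
// exclusion triggers only when the deadline passes with no progress in between.
struct ExclusionTimerSketch {
    using Clock = std::chrono::steady_clock;
    Clock::time_point deadline = Clock::now() + std::chrono::seconds(30);

    void onRecoveryStateChange() { deadline = Clock::now() + std::chrono::seconds(30); }
    bool shouldExcludeNow() const { return Clock::now() >= deadline; }
};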
@@ -157,7 +157,6 @@ struct ClogTlogWorkload : TestWorkload {
             self->useDisconnection = true;
         }
 
-        state double workloadEnd = now() + self->testDuration - 10;
         // Let cycle workload issue some transactions.
         wait(delay(20.0));
 
@@ -166,6 +165,7 @@ struct ClogTlogWorkload : TestWorkload {
         }
 
         double startTime = now();
+        state double workloadEnd = now() + self->testDuration - 10;
         TraceEvent("ClogTlog").detail("StartTime", startTime).detail("EndTime", workloadEnd);
 
         // Clog and wait for recovery to happen
@@ -174,8 +174,14 @@ struct ClogTlogWorkload : TestWorkload {
             wait(self->dbInfo->onChange());
         }
 
+        state bool useGrayFailureToRecover = false;
+        if (deterministicRandom()->coinflip() && self->useDisconnection) {
+            TraceEvent("ClogTlogUseGrayFailreToRecover").log();
+            useGrayFailureToRecover = true;
+        }
+
         // start exclusion and wait for fully recovery
+        // state Future<Void> excludeLog = excludeFailedLog(self, cx);
+        state Future<Void> excludeLog = useGrayFailureToRecover ? Never() : excludeFailedLog(self, cx);
         state Future<Void> onChange = self->dbInfo->onChange();
         loop choose {
             when(wait(onChange)) {
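The hunk above lets the workload randomly rely on the gray failure path instead of manual exclusion: when useGrayFailureToRecover is set, the exclusion future becomes Never(), so recovery has to come from the cluster controller's health monitor. A small sketch of that choice (plain C++ with placeholder names, not the workload's Flow code):

#include <random>

enum class RecoveryPath { ManualExclusion, GrayFailureMonitor };

// With probability 1/2, and only when disconnections are being injected, skip the
// manual tlog exclusion and require the gray-failure monitor to drive recovery.
RecoveryPath chooseRecoveryPath(std::mt19937& rng, bool useDisconnection) {
    std::bernoulli_distribution coinflip(0.5);
    if (coinflip(rng) && useDisconnection) {
        return RecoveryPath::GrayFailureMonitor; // exclusion future becomes Never()
    }
    return RecoveryPath::ManualExclusion; // excludeFailedLog(...) runs after the 30s stall
}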
@@ -1394,6 +1394,7 @@ struct ConsistencyCheckWorkload : TestWorkload {
     std::map<NetworkAddress, WorkerDetails> nonExcludedWorkerProcessMap;
     std::map<Optional<Key>, std::vector<ProcessClass::ClassType>> dcToNonExcludedClassTypes;
     for (const auto& worker : nonExcludedWorkers) {
+        TraceEvent("ZZZZZZNonExcludedWorkerClass").detail("Worker", worker.interf.address()).detail("Class", worker.processClass.classType());
         nonExcludedWorkerProcessMap[worker.interf.address()] = worker;
         Optional<Key> dc = worker.interf.locality.dcId();
         if (!dcToNonExcludedClassTypes.count(dc))
@@ -5,10 +5,16 @@ machineCount = 20
commitProxyCount = 4
config = 'triple'
desiredTLogCount = 6
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4, 5]

[[knobs]]
enable_worker_health_monitor = true
cc_enable_worker_health_monitor = true
cc_worker_health_checking_interval = 15
cc_min_degradation_interval = 30
cc_health_trigger_recovery = true
peer_latency_degradation_threshold = 1

[[test]]
testTitle = 'ClogTlog'