Add a knob to guard the gray failure detection during TLog recovery
parent 8d7f2e84ed
commit 83992d61ec
@@ -1002,6 +1002,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
 	init( PEER_TIMEOUT_PERCENTAGE_DEGRADATION_THRESHOLD, 0.1 );
 	init( PEER_DEGRADATION_CONNECTION_FAILURE_COUNT, 5 );
 	init( WORKER_HEALTH_REPORT_RECENT_DESTROYED_PEER, true );
+	init( GRAY_FAILURE_ENABLE_TLOG_RECOVERY_MONITORING, true );
 	init( STORAGE_SERVER_REBOOT_ON_IO_TIMEOUT, false ); if ( randomize && BUGGIFY ) STORAGE_SERVER_REBOOT_ON_IO_TIMEOUT = true;
 	init( STORAGE_DISK_CLEANUP_MAX_RETRIES, 10 );
 	init( STORAGE_DISK_CLEANUP_RETRY_INTERVAL, isSimulated ? 2 : 30 );
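An operational note, not part of the diff: a boolean server knob like this can typically be overridden at startup without rebuilding, either on the fdbserver command line or via a knob_... line in foundationdb.conf. The lowercase spelling below follows the usual knob-name convention and is an assumption for illustration, not something this commit adds:

	fdbserver ... --knob_gray_failure_enable_tlog_recovery_monitoring=false

Setting it to false turns the recovery-time gray failure monitoring off; the default of true preserves the existing behavior.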

@@ -996,6 +996,8 @@ public:
 	bool WORKER_HEALTH_REPORT_RECENT_DESTROYED_PEER; // When enabled, the worker's health monitor also report any recent
 	                                                 // destroyed peers who are part of the transaction system to
 	                                                 // cluster controller.
+	bool GRAY_FAILURE_ENABLE_TLOG_RECOVERY_MONITORING; // When enabled, health monitor will try to detect any gray
+	                                                    // failure during tlog recovery.
 	bool STORAGE_SERVER_REBOOT_ON_IO_TIMEOUT; // When enabled, storage server's worker will crash on io_timeout error;
 	                                          // this allows fdbmonitor to restart the worker and recreate the same SS.
 	                                          // When SS can be temporarily throttled by infrastructure, e.g, k8s,

@@ -135,7 +135,8 @@ bool ClusterControllerData::transactionSystemContainsDegradedServers() {
 		}
 	}
 
-	if (recoveryData.isValid() && recoveryData->recoveryState < RecoveryState::ACCEPTING_COMMITS) {
+	if (SERVER_KNOBS->GRAY_FAILURE_ENABLE_TLOG_RECOVERY_MONITORING && recoveryData.isValid() &&
+	    recoveryData->recoveryState < RecoveryState::ACCEPTING_COMMITS) {
 		// During recovery, TLogs may not be able to pull data from previous generation TLogs due to gray
 		// failures. In this case, we rely on the latest recruitment information and see if any newly recruited
 		// TLogs are degraded.
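The guard puts the knob check first in the condition, so when monitoring is disabled the recovery-state branch is never entered at all. Below is a minimal, self-contained C++ sketch of that semantics; it is not the FoundationDB source, and the RecoveryState values plus the newlyRecruitedTLogDegraded field are stand-ins invented for illustration:

	#include <memory>

	enum class RecoveryState { READING_CSTATE, ACCEPTING_COMMITS, FULLY_RECOVERED };

	struct Knobs {
		bool GRAY_FAILURE_ENABLE_TLOG_RECOVERY_MONITORING = true;
	};

	struct RecoveryData {
		RecoveryState recoveryState = RecoveryState::READING_CSTATE;
		bool newlyRecruitedTLogDegraded = false; // hypothetical stand-in for the recruitment-based check
	};

	// Mirrors the guarded condition above: the knob short-circuits first, so a
	// disabled knob never consults the recovery state.
	bool recoveryContainsDegradedTLogs(const Knobs& knobs,
	                                   const std::shared_ptr<RecoveryData>& recoveryData) {
		if (knobs.GRAY_FAILURE_ENABLE_TLOG_RECOVERY_MONITORING && recoveryData != nullptr &&
		    recoveryData->recoveryState < RecoveryState::ACCEPTING_COMMITS) {
			return recoveryData->newlyRecruitedTLogDegraded;
		}
		return false; // steady-state degradation checks (elided here) would follow
	}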

@@ -3629,16 +3629,20 @@ ACTOR Future<Void> tLogStart(TLogData* self, InitializeTLogRequest req, Locality
 			std::vector<Tag> tags;
 			tags.push_back(logData->remoteTag);
 
-			// Force gray failure monitoring during recovery.
-			self->enablePrimaryTxnSystemHealthCheck->set(true);
+			if (SERVER_KNOBS->GRAY_FAILURE_ENABLE_TLOG_RECOVERY_MONITORING) {
+				// Force gray failure monitoring during recovery.
+				self->enablePrimaryTxnSystemHealthCheck->set(true);
+			}
 			wait(pullAsyncData(self, logData, tags, logData->unrecoveredBefore, recoverAt, true) ||
 			     logData->removed || logData->stopCommit.onTrigger());
 			self->enablePrimaryTxnSystemHealthCheck->set(false);
 		} else if (!req.recoverTags.empty()) {
 			ASSERT(logData->unrecoveredBefore > req.knownCommittedVersion);
 
-			// Force gray failure monitoring during recovery.
-			self->enablePrimaryTxnSystemHealthCheck->set(true);
+			if (SERVER_KNOBS->GRAY_FAILURE_ENABLE_TLOG_RECOVERY_MONITORING) {
+				// Force gray failure monitoring during recovery.
+				self->enablePrimaryTxnSystemHealthCheck->set(true);
+			}
 			wait(pullAsyncData(
 			         self, logData, req.recoverTags, req.knownCommittedVersion + 1, recoverAt, false) ||
 			     logData->removed || logData->stopCommit.onTrigger());
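One detail of the tLogStart change worth noting: only the set(true) calls are gated by the knob, while the set(false) after each wait() stays unconditional. That should be harmless; assuming enablePrimaryTxnSystemHealthCheck behaves like FDB's usual AsyncVar, where set only notifies waiters on an actual value change, clearing a variable that was never raised is a no-op, so the cleanup path needs no knob check of its own.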