Add a knob to guard the gray failure detection during TLog recovery

This commit is contained in:
Zhe Wu 2023-08-29 14:49:39 -07:00
parent 8d7f2e84ed
commit 83992d61ec
4 changed files with 13 additions and 5 deletions

View File

@ -1002,6 +1002,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( PEER_TIMEOUT_PERCENTAGE_DEGRADATION_THRESHOLD, 0.1 );
init( PEER_DEGRADATION_CONNECTION_FAILURE_COUNT, 5 );
init( WORKER_HEALTH_REPORT_RECENT_DESTROYED_PEER, true );
init( GRAY_FAILURE_ENABLE_TLOG_RECOVERY_MONITORING, true );
init( STORAGE_SERVER_REBOOT_ON_IO_TIMEOUT, false ); if ( randomize && BUGGIFY ) STORAGE_SERVER_REBOOT_ON_IO_TIMEOUT = true;
init( STORAGE_DISK_CLEANUP_MAX_RETRIES, 10 );
init( STORAGE_DISK_CLEANUP_RETRY_INTERVAL, isSimulated ? 2 : 30 );

View File

@ -996,6 +996,8 @@ public:
bool WORKER_HEALTH_REPORT_RECENT_DESTROYED_PEER; // When enabled, the worker's health monitor also reports any
// recently destroyed peers that are part of the transaction
// system to the cluster controller.
bool GRAY_FAILURE_ENABLE_TLOG_RECOVERY_MONITORING; // When enabled, the health monitor also tries to detect gray
// failures that affect TLogs during the recovery process.
bool STORAGE_SERVER_REBOOT_ON_IO_TIMEOUT; // When enabled, storage server's worker will crash on io_timeout error;
// this allows fdbmonitor to restart the worker and recreate the same SS.
// When SS can be temporarily throttled by infrastructure, e.g, k8s,

View File

@ -135,7 +135,8 @@ bool ClusterControllerData::transactionSystemContainsDegradedServers() {
}
}
if (recoveryData.isValid() && recoveryData->recoveryState < RecoveryState::ACCEPTING_COMMITS) {
if (SERVER_KNOBS->GRAY_FAILURE_ENABLE_TLOG_RECOVERY_MONITORING && recoveryData.isValid() &&
recoveryData->recoveryState < RecoveryState::ACCEPTING_COMMITS) {
// During recovery, TLogs may not be able to pull data from previous generation TLogs due to gray
// failures. In this case, we rely on the latest recruitment information and see if any newly recruited
// TLogs are degraded.

View File

@ -3629,16 +3629,20 @@ ACTOR Future<Void> tLogStart(TLogData* self, InitializeTLogRequest req, Locality
std::vector<Tag> tags;
tags.push_back(logData->remoteTag);
// Force gray failure monitoring during recovery.
self->enablePrimaryTxnSystemHealthCheck->set(true);
if (SERVER_KNOBS->GRAY_FAILURE_ENABLE_TLOG_RECOVERY_MONITORING) {
// Force gray failure monitoring during recovery.
self->enablePrimaryTxnSystemHealthCheck->set(true);
}
wait(pullAsyncData(self, logData, tags, logData->unrecoveredBefore, recoverAt, true) ||
logData->removed || logData->stopCommit.onTrigger());
self->enablePrimaryTxnSystemHealthCheck->set(false);
} else if (!req.recoverTags.empty()) {
ASSERT(logData->unrecoveredBefore > req.knownCommittedVersion);
// Force gray failure monitoring during recovery.
self->enablePrimaryTxnSystemHealthCheck->set(true);
if (SERVER_KNOBS->GRAY_FAILURE_ENABLE_TLOG_RECOVERY_MONITORING) {
// Force gray failure monitoring during recovery.
self->enablePrimaryTxnSystemHealthCheck->set(true);
}
wait(pullAsyncData(
self, logData, req.recoverTags, req.knownCommittedVersion + 1, recoverAt, false) ||
logData->removed || logData->stopCommit.onTrigger());