From d6d347f665cfa1b77bd376217a7d8c9a8ad6ba63 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Fri, 13 Mar 2020 10:31:59 -0700 Subject: [PATCH 1/2] treat a tlog which takes a long time to create its disk queue as failed --- fdbserver/Knobs.cpp | 1 + fdbserver/Knobs.h | 1 + fdbserver/OldTLogServer_6_0.actor.cpp | 12 +++++++++++- fdbserver/TLogServer.actor.cpp | 12 +++++++++++- 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index d3b229f689..fd1184b519 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -82,6 +82,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs, bool isSimula init( TLOG_DEGRADED_DURATION, 5.0 ); init( TLOG_IGNORE_POP_AUTO_ENABLE_DELAY, 300.0 ); init( TXS_POPPED_MAX_DELAY, 1.0 ); if ( randomize && BUGGIFY ) TXS_POPPED_MAX_DELAY = deterministicRandom()->random01(); + init( TLOG_MAX_CREATE_DURATION, 10.0 ); // disk snapshot max timeout, to be put in TLog, storage and coordinator nodes init( SNAP_CREATE_MAX_TIMEOUT, 300.0 ); diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 2473f78e0f..73401f1ae4 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -84,6 +84,7 @@ public: int DISK_QUEUE_MAX_TRUNCATE_BYTES; // A truncate larger than this will cause the file to be replaced instead. double TLOG_DEGRADED_DURATION; double TXS_POPPED_MAX_DELAY; + double TLOG_MAX_CREATE_DURATION; // Data distribution queue double HEALTH_POLL_TIME; diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index 07a047d5a8..23f81745fa 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -2332,7 +2332,17 @@ ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ if(restoreFromDisk) { wait( restorePersistentState( &self, locality, oldLog, recovered, tlogRequests ) ); } else { - wait( checkEmptyQueue(&self) && checkRecovered(&self) ); + choose { + when( wait( checkEmptyQueue(&self) && checkRecovered(&self) ) ) {} + when( wait( lowPriorityDelay(SERVER_KNOBS->TLOG_MAX_CREATE_DURATION) ) ) { + Error err = io_timeout(); + if(g_network->isSimulated()) { + err = err.asInjectedFault(); + } + TraceEvent(SevError, "TLogInitializeFilesTimeout", tlogId).error(err); + throw err; + } + } } //Disk errors need a chance to kill this actor. diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 85eb10461d..6d474d6e5b 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -2766,7 +2766,17 @@ ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ if(restoreFromDisk) { wait( restorePersistentState( &self, locality, oldLog, recovered, tlogRequests ) ); } else { - wait( checkEmptyQueue(&self) && checkRecovered(&self) ); + choose { + when( wait( checkEmptyQueue(&self) && checkRecovered(&self) ) ) {} + when( wait( lowPriorityDelay(SERVER_KNOBS->TLOG_MAX_CREATE_DURATION) ) ) { + Error err = io_timeout(); + if(g_network->isSimulated()) { + err = err.asInjectedFault(); + } + TraceEvent(SevError, "TLogInitializeFilesTimeout", tlogId).error(err); + throw err; + } + } } //Disk errors need a chance to kill this actor. From ea98c7a40a9d6c82f09174f34d8a4ebaa7deb9cb Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Mon, 16 Mar 2020 11:38:14 -0700 Subject: [PATCH 2/2] added additional timeout on initPersistentState --- fdbserver/OldTLogServer_6_0.actor.cpp | 16 +++------------- fdbserver/TLogServer.actor.cpp | 16 +++------------- flow/genericactors.actor.h | 17 ++++++++++++++++- 3 files changed, 22 insertions(+), 27 deletions(-) diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index 23f81745fa..564bb96370 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -1416,7 +1416,7 @@ ACTOR Future initPersistentState( TLogData* self, Reference logDa // PERSIST: Initial setup of persistentData for a brand new tLog for a new database state IKeyValueStore *storage = self->persistentData; - wait(storage->init()); + wait( ioTimeoutError( storage->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION ) ); storage->set( persistFormat ); storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistCurrentVersionKeys.begin), BinaryWriter::toValue(logData->version.get(), Unversioned()) ) ); storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistKnownCommittedVersionKeys.begin), BinaryWriter::toValue(logData->knownCommittedVersion, Unversioned()) ) ); @@ -1432,7 +1432,7 @@ ACTOR Future initPersistentState( TLogData* self, Reference logDa } TraceEvent("TLogInitCommit", logData->logId); - wait( self->persistentData->commit() ); + wait( ioTimeoutError( self->persistentData->commit(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION ) ); return Void(); } @@ -2332,17 +2332,7 @@ ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ if(restoreFromDisk) { wait( restorePersistentState( &self, locality, oldLog, recovered, tlogRequests ) ); } else { - choose { - when( wait( checkEmptyQueue(&self) && checkRecovered(&self) ) ) {} - when( wait( lowPriorityDelay(SERVER_KNOBS->TLOG_MAX_CREATE_DURATION) ) ) { - Error err = io_timeout(); - if(g_network->isSimulated()) { - err = err.asInjectedFault(); - } - TraceEvent(SevError, "TLogInitializeFilesTimeout", tlogId).error(err); - throw err; - } - } + wait( ioTimeoutError( checkEmptyQueue(&self) && checkRecovered(&self), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION ) ); } //Disk errors need a chance to kill this actor. diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 6d474d6e5b..c090a9dcae 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1807,7 +1807,7 @@ ACTOR Future initPersistentState( TLogData* self, Reference logDa // PERSIST: Initial setup of persistentData for a brand new tLog for a new database state IKeyValueStore *storage = self->persistentData; - wait(storage->init()); + wait( ioTimeoutError( storage->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION ) ); storage->set( persistFormat ); storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistCurrentVersionKeys.begin), BinaryWriter::toValue(logData->version.get(), Unversioned()) ) ); storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistKnownCommittedVersionKeys.begin), BinaryWriter::toValue(logData->knownCommittedVersion, Unversioned()) ) ); @@ -1824,7 +1824,7 @@ ACTOR Future initPersistentState( TLogData* self, Reference logDa } TraceEvent("TLogInitCommit", logData->logId); - wait( self->persistentData->commit() ); + wait( ioTimeoutError( self->persistentData->commit(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION ) ); return Void(); } @@ -2766,17 +2766,7 @@ ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ if(restoreFromDisk) { wait( restorePersistentState( &self, locality, oldLog, recovered, tlogRequests ) ); } else { - choose { - when( wait( checkEmptyQueue(&self) && checkRecovered(&self) ) ) {} - when( wait( lowPriorityDelay(SERVER_KNOBS->TLOG_MAX_CREATE_DURATION) ) ) { - Error err = io_timeout(); - if(g_network->isSimulated()) { - err = err.asInjectedFault(); - } - TraceEvent(SevError, "TLogInitializeFilesTimeout", tlogId).error(err); - throw err; - } - } + wait( ioTimeoutError( checkEmptyQueue(&self) && checkRecovered(&self), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION ) ); } //Disk errors need a chance to kill this actor. diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 14db6428ab..b1978bbbc4 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -199,7 +199,6 @@ Future timeoutError( Future what, double time, TaskPriority taskID = TaskP } } - ACTOR template Future delayed( Future what, double time = 0.0, TaskPriority taskID = TaskPriority::DefaultDelay ) { try { @@ -866,6 +865,22 @@ Future timeoutWarningCollector( FutureStream const& input, double co Future quorumEqualsTrue( std::vector> const& futures, int const& required ); Future lowPriorityDelay( double const& waitTime ); +ACTOR template +Future ioTimeoutError( Future what, double time ) { + Future end = lowPriorityDelay( time ); + choose { + when( T t = wait( what ) ) { return t; } + when( wait( end ) ) { + Error err = io_timeout(); + if(g_network->isSimulated()) { + err = err.asInjectedFault(); + } + TraceEvent(SevError, "IoTimeoutError").error(err); + throw err; + } + } +} + ACTOR template Future streamHelper( PromiseStream output, PromiseStream errors, Future input ) { try {