Merge pull request #2242 from alexmiller-apple/fix-10min-stall-again

Fix the 10min multi-region recovery stall again
This commit is contained in:
Evan Tschannen 2020-01-23 17:53:02 -08:00 committed by GitHub
commit 6c0b934dda
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 69 additions and 59 deletions

View File

@ -348,14 +348,14 @@ ACTOR Future<Void> logRouterPeekMessages( LogRouterData* self, TLogPeekRequest r
// The peek cursor and this comparison need to agree about the maximum number of in-flight requests.
while(trackerData.sequence_version.size() && seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) {
if(seqBegin->second.canBeSet()) {
seqBegin->second.sendError(timed_out());
seqBegin->second.sendError(operation_obsolete());
}
trackerData.sequence_version.erase(seqBegin);
seqBegin = trackerData.sequence_version.begin();
}
if(trackerData.sequence_version.size() && sequence < seqBegin->first) {
throw timed_out();
throw operation_obsolete();
}
trackerData.lastUpdate = now();
@ -364,8 +364,8 @@ ACTOR Future<Void> logRouterPeekMessages( LogRouterData* self, TLogPeekRequest r
req.onlySpilled = prevPeekData.second;
wait(yield());
} catch( Error &e ) {
if(e.code() == error_code_timed_out) {
req.reply.sendError(timed_out());
if(e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
return Void();
} else {
throw;
@ -425,15 +425,15 @@ ACTOR Future<Void> logRouterPeekMessages( LogRouterData* self, TLogPeekRequest r
trackerData.lastUpdate = now();
auto& sequenceData = trackerData.sequence_version[sequence+1];
if(trackerData.sequence_version.size() && sequence+1 < trackerData.sequence_version.begin()->first) {
req.reply.sendError(timed_out());
req.reply.sendError(operation_obsolete());
if(!sequenceData.isSet())
sequenceData.sendError(timed_out());
sequenceData.sendError(operation_obsolete());
return Void();
}
if(sequenceData.isSet()) {
if(sequenceData.getFuture().get().first != reply.end) {
TEST(true); //tlog peek second attempt ended at a different version
req.reply.sendError(timed_out());
req.reply.sendError(operation_obsolete());
return Void();
}
} else {

View File

@ -151,11 +151,8 @@ ACTOR Future<Void> serverPeekParallelGetMore( ILogSystem::ServerPeekCursor* self
self->futureResults.push_back( brokenPromiseToNever( self->interf->get().interf().peekMessages.getReply(TLogPeekRequest(self->messageVersion.version,self->tag,self->returnIfBlocked, self->onlySpilled, std::make_pair(self->randomID, self->sequence++)), taskID) ) );
}
if (self->sequence == std::numeric_limits<decltype(self->sequence)>::max()) {
throw timed_out();
throw operation_obsolete();
}
} else if (self->futureResults.size() == 1) {
self->randomID = deterministicRandom()->randomUniqueID();
self->sequence = 0;
} else if (self->futureResults.size() == 0) {
return Void();
}
@ -166,7 +163,7 @@ ACTOR Future<Void> serverPeekParallelGetMore( ILogSystem::ServerPeekCursor* self
choose {
when( TLogPeekReply res = wait( self->interf->get().present() ? self->futureResults.front() : Never() ) ) {
if(res.begin.get() != expectedBegin) {
throw timed_out();
throw operation_obsolete();
}
expectedBegin = res.end;
self->futureResults.pop_front();
@ -194,8 +191,11 @@ ACTOR Future<Void> serverPeekParallelGetMore( ILogSystem::ServerPeekCursor* self
if(e.code() == error_code_end_of_stream) {
self->end.reset( self->messageVersion.version );
return Void();
} else if(e.code() == error_code_timed_out) {
TraceEvent("PeekCursorTimedOut", self->randomID);
} else if(e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) {
TraceEvent("PeekCursorTimedOut", self->randomID).error(e);
// We *should* never get timed_out(), as it means the TLog got stuck while handling a parallel peek,
// and thus we've likely just wasted 10min.
ASSERT_WE_THINK(e.code() == error_code_operation_obsolete || SERVER_KNOBS->PEEK_TRACKER_EXPIRATION_TIME < 10);
self->interfaceChanged = self->interf->onChange();
self->randomID = deterministicRandom()->randomUniqueID();
self->sequence = 0;

View File

@ -876,18 +876,18 @@ namespace oldTLog_4_6 {
peekId = req.sequence.get().first;
sequence = req.sequence.get().second;
if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && self->peekTracker.find(peekId) == self->peekTracker.end()) {
throw timed_out();
throw operation_obsolete();
}
if(sequence > 0) {
auto& trackerData = self->peekTracker[peekId];
trackerData.lastUpdate = now();
Version ver = wait(trackerData.sequence_version[sequence].getFuture());
req.begin = ver;
req.begin = std::max(ver, req.begin);
wait(yield());
}
} catch( Error &e ) {
if(e.code() == error_code_timed_out) {
req.reply.sendError(timed_out());
if(e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
return Void();
} else {
throw;
@ -923,15 +923,15 @@ namespace oldTLog_4_6 {
auto& sequenceData = trackerData.sequence_version[sequence+1];
trackerData.lastUpdate = now();
if(trackerData.sequence_version.size() && sequence+1 < trackerData.sequence_version.begin()->first) {
req.reply.sendError(timed_out());
req.reply.sendError(operation_obsolete());
if (!sequenceData.isSet())
sequenceData.sendError(timed_out());
sequenceData.sendError(operation_obsolete());
return Void();
}
if(sequenceData.isSet()) {
if(sequenceData.getFuture().get() != rep.end) {
TEST(true); //tlog peek second attempt ended at a different version
req.reply.sendError(timed_out());
req.reply.sendError(operation_obsolete());
return Void();
}
} else {
@ -1002,7 +1002,7 @@ namespace oldTLog_4_6 {
if(sequenceData.isSet()) {
if(sequenceData.getFuture().get() != reply.end) {
TEST(true); //tlog peek second attempt ended at a different version
req.reply.sendError(timed_out());
req.reply.sendError(operation_obsolete());
return Void();
}
} else {

View File

@ -1052,7 +1052,7 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
peekId = req.sequence.get().first;
sequence = req.sequence.get().second;
if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && logData->peekTracker.find(peekId) == logData->peekTracker.end()) {
throw timed_out();
throw operation_obsolete();
}
auto& trackerData = logData->peekTracker[peekId];
if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) {
@ -1061,24 +1061,24 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
auto seqBegin = trackerData.sequence_version.begin();
while(trackerData.sequence_version.size() && seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) {
if(seqBegin->second.canBeSet()) {
seqBegin->second.sendError(timed_out());
seqBegin->second.sendError(operation_obsolete());
}
trackerData.sequence_version.erase(seqBegin);
seqBegin = trackerData.sequence_version.begin();
}
if(trackerData.sequence_version.size() && sequence < seqBegin->first) {
throw timed_out();
throw operation_obsolete();
}
trackerData.lastUpdate = now();
std::pair<Version, bool> prevPeekData = wait(trackerData.sequence_version[sequence].getFuture());
req.begin = prevPeekData.first;
req.begin = std::max(prevPeekData.first, req.begin);
req.onlySpilled = prevPeekData.second;
wait(yield());
} catch( Error &e ) {
if(e.code() == error_code_timed_out) {
req.reply.sendError(timed_out());
if(e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
return Void();
} else {
throw;
@ -1134,15 +1134,15 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
auto& sequenceData = trackerData.sequence_version[sequence+1];
trackerData.lastUpdate = now();
if(trackerData.sequence_version.size() && sequence+1 < trackerData.sequence_version.begin()->first) {
req.reply.sendError(timed_out());
req.reply.sendError(operation_obsolete());
if (!sequenceData.isSet())
sequenceData.sendError(timed_out());
sequenceData.sendError(operation_obsolete());
return Void();
}
if(sequenceData.isSet()) {
if(sequenceData.getFuture().get().first != rep.end) {
TEST(true); //tlog peek second attempt ended at a different version
req.reply.sendError(timed_out());
req.reply.sendError(operation_obsolete());
return Void();
}
} else {
@ -1210,15 +1210,15 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
trackerData.lastUpdate = now();
auto& sequenceData = trackerData.sequence_version[sequence+1];
if(trackerData.sequence_version.size() && sequence+1 < trackerData.sequence_version.begin()->first) {
req.reply.sendError(timed_out());
req.reply.sendError(operation_obsolete());
if(!sequenceData.isSet())
sequenceData.sendError(timed_out());
sequenceData.sendError(operation_obsolete());
return Void();
}
if(sequenceData.isSet()) {
if(sequenceData.getFuture().get().first != reply.end) {
TEST(true); //tlog peek second attempt ended at a different version
req.reply.sendError(timed_out());
req.reply.sendError(operation_obsolete());
return Void();
}
} else {

View File

@ -1356,6 +1356,9 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
try {
peekId = req.sequence.get().first;
sequence = req.sequence.get().second;
if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && logData->peekTracker.find(peekId) == logData->peekTracker.end()) {
throw operation_obsolete();
}
auto& trackerData = logData->peekTracker[peekId];
if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) {
trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled));
@ -1364,24 +1367,24 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
// The peek cursor and this comparison need to agree about the maximum number of in-flight requests.
while(trackerData.sequence_version.size() && seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) {
if(seqBegin->second.canBeSet()) {
seqBegin->second.sendError(timed_out());
seqBegin->second.sendError(operation_obsolete());
}
trackerData.sequence_version.erase(seqBegin);
seqBegin = trackerData.sequence_version.begin();
}
if(trackerData.sequence_version.size() && sequence < seqBegin->first) {
throw timed_out();
throw operation_obsolete();
}
trackerData.lastUpdate = now();
std::pair<Version, bool> prevPeekData = wait(trackerData.sequence_version[sequence].getFuture());
req.begin = prevPeekData.first;
req.begin = std::max(prevPeekData.first, req.begin);
req.onlySpilled = prevPeekData.second;
wait(yield());
} catch( Error &e ) {
if(e.code() == error_code_timed_out) {
req.reply.sendError(timed_out());
if(e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
return Void();
} else {
throw;
@ -1437,15 +1440,15 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
auto& sequenceData = trackerData.sequence_version[sequence+1];
trackerData.lastUpdate = now();
if(trackerData.sequence_version.size() && sequence+1 < trackerData.sequence_version.begin()->first) {
req.reply.sendError(timed_out());
req.reply.sendError(operation_obsolete());
if (!sequenceData.isSet())
sequenceData.sendError(timed_out());
sequenceData.sendError(operation_obsolete());
return Void();
}
if(sequenceData.isSet()) {
if(sequenceData.getFuture().get().first != rep.end) {
TEST(true); //tlog peek second attempt ended at a different version
req.reply.sendError(timed_out());
req.reply.sendError(operation_obsolete());
return Void();
}
} else {
@ -1601,15 +1604,15 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
trackerData.lastUpdate = now();
auto& sequenceData = trackerData.sequence_version[sequence+1];
if(trackerData.sequence_version.size() && sequence+1 < trackerData.sequence_version.begin()->first) {
req.reply.sendError(timed_out());
req.reply.sendError(operation_obsolete());
if(!sequenceData.isSet())
sequenceData.sendError(timed_out());
sequenceData.sendError(operation_obsolete());
return Void();
}
if(sequenceData.isSet()) {
if(sequenceData.getFuture().get().first != reply.end) {
TEST(true); //tlog peek second attempt ended at a different version
req.reply.sendError(timed_out());
req.reply.sendError(operation_obsolete());
return Void();
}
} else {

View File

@ -1374,7 +1374,7 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
peekId = req.sequence.get().first;
sequence = req.sequence.get().second;
if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && logData->peekTracker.find(peekId) == logData->peekTracker.end()) {
throw timed_out();
throw operation_obsolete();
}
auto& trackerData = logData->peekTracker[peekId];
if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) {
@ -1384,24 +1384,24 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
// The peek cursor and this comparison need to agree about the maximum number of in-flight requests.
while(trackerData.sequence_version.size() && seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) {
if(seqBegin->second.canBeSet()) {
seqBegin->second.sendError(timed_out());
seqBegin->second.sendError(operation_obsolete());
}
trackerData.sequence_version.erase(seqBegin);
seqBegin = trackerData.sequence_version.begin();
}
if(trackerData.sequence_version.size() && sequence < seqBegin->first) {
throw timed_out();
throw operation_obsolete();
}
trackerData.lastUpdate = now();
std::pair<Version, bool> prevPeekData = wait(trackerData.sequence_version[sequence].getFuture());
req.begin = prevPeekData.first;
req.begin = std::max(prevPeekData.first, req.begin);
req.onlySpilled = prevPeekData.second;
wait(yield());
} catch( Error &e ) {
if(e.code() == error_code_timed_out) {
req.reply.sendError(timed_out());
if(e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
return Void();
} else {
throw;
@ -1414,6 +1414,7 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
if(req.sequence.present()) {
auto& trackerData = logData->peekTracker[peekId];
auto& sequenceData = trackerData.sequence_version[sequence+1];
trackerData.lastUpdate = now();
if (!sequenceData.isSet()) {
sequenceData.send(std::make_pair(req.begin, req.onlySpilled));
}
@ -1457,15 +1458,15 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
auto& sequenceData = trackerData.sequence_version[sequence+1];
trackerData.lastUpdate = now();
if(trackerData.sequence_version.size() && sequence+1 < trackerData.sequence_version.begin()->first) {
req.reply.sendError(timed_out());
req.reply.sendError(operation_obsolete());
if (!sequenceData.isSet())
sequenceData.sendError(timed_out());
sequenceData.sendError(operation_obsolete());
return Void();
}
if(sequenceData.isSet()) {
if(sequenceData.getFuture().get().first != rep.end) {
TEST(true); //tlog peek second attempt ended at a different version
req.reply.sendError(timed_out());
req.reply.sendError(operation_obsolete());
return Void();
}
} else {
@ -1618,18 +1619,23 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
if(req.sequence.present()) {
auto& trackerData = logData->peekTracker[peekId];
trackerData.lastUpdate = now();
auto& sequenceData = trackerData.sequence_version[sequence+1];
trackerData.lastUpdate = now();
if(trackerData.sequence_version.size() && sequence+1 < trackerData.sequence_version.begin()->first) {
req.reply.sendError(timed_out());
if(!sequenceData.isSet())
sequenceData.sendError(timed_out());
req.reply.sendError(operation_obsolete());
if(!sequenceData.isSet()) {
// It would technically be more correct to .send({req.begin, req.onlySpilled}), as the next
// request might still be in the window of active requests, but LogSystemPeekCursor will
// throw away all future responses upon getting an operation_obsolete(), so computing a
// response will probably be a waste of CPU.
sequenceData.sendError(operation_obsolete());
}
return Void();
}
if(sequenceData.isSet()) {
if(sequenceData.getFuture().get().first != reply.end) {
TEST(true); //tlog peek second attempt ended at a different version
req.reply.sendError(timed_out());
req.reply.sendError(operation_obsolete());
return Void();
}
} else {

View File

@ -34,6 +34,7 @@ ERROR( success, 0, "Success" )
ERROR( end_of_stream, 1, "End of stream" )
ERROR( operation_failed, 1000, "Operation failed")
ERROR( wrong_shard_server, 1001, "Shard is not available from this server")
ERROR( operation_obsolete, 1002, "Operation result no longer necessary")
ERROR( timed_out, 1004, "Operation timed out" )
ERROR( coordinated_state_conflict, 1005, "Conflict occurred while changing coordination information" )
ERROR( all_alternatives_failed, 1006, "All alternatives failed" )