DD:getTeam may fail to find a team even when a feasible one exists

Due to randomness, when unhealthy teams are the majority but healthy teams
still exist, the getTeam function may fail to find any feasible (ok) team
within its limited number of tries, which leads to the BestTeamStuck situation.

This commit increases the tries from 10 to 20.

A long-term solution may first find all feasible teams and choose a random
one from them. Since this can affect the statistics of which team is picked,
it is not included in this commit.

Non-functional change: This commit removes unneeded printf calls introduced by
the fast restore PR 1404.
This commit is contained in:
Meng Xu 2019-09-07 20:05:59 -07:00
parent 7395081468
commit 0b785e5c1c
5 changed files with 24 additions and 5 deletions

View File

@ -819,6 +819,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
else { else {
int nTries = 0; int nTries = 0;
while( randomTeams.size() < SERVER_KNOBS->BEST_TEAM_OPTION_COUNT && nTries < SERVER_KNOBS->BEST_TEAM_MAX_TEAM_TRIES ) { while( randomTeams.size() < SERVER_KNOBS->BEST_TEAM_OPTION_COUNT && nTries < SERVER_KNOBS->BEST_TEAM_MAX_TEAM_TRIES ) {
// If unhealthy teams are the majority, we may not find an ok dest in this while loop
// TODO: We may want to collect the qualified teams first and choose among those.
Reference<IDataDistributionTeam> dest = deterministicRandom()->randomChoice(self->teams); Reference<IDataDistributionTeam> dest = deterministicRandom()->randomChoice(self->teams);
bool ok = dest->isHealthy() && (!req.preferLowerUtilization || dest->hasHealthyFreeSpace()); bool ok = dest->isHealthy() && (!req.preferLowerUtilization || dest->hasHealthyFreeSpace());
@ -872,6 +874,10 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
} }
} }
} }
// if (!bestOption.present()) {
// TraceEvent("GetTeamRequest").detail("Request", req.getDesc());
// self->traceAllInfo(true);
// }
req.reply.send( bestOption ); req.reply.send( bestOption );

View File

@ -114,6 +114,22 @@ struct GetTeamRequest {
GetTeamRequest() {} GetTeamRequest() {}
GetTeamRequest( bool wantsNewServers, bool wantsTrueBest, bool preferLowerUtilization, double inflightPenalty = 1.0 ) : wantsNewServers( wantsNewServers ), wantsTrueBest( wantsTrueBest ), preferLowerUtilization( preferLowerUtilization ), inflightPenalty( inflightPenalty ) {} GetTeamRequest( bool wantsNewServers, bool wantsTrueBest, bool preferLowerUtilization, double inflightPenalty = 1.0 ) : wantsNewServers( wantsNewServers ), wantsTrueBest( wantsTrueBest ), preferLowerUtilization( preferLowerUtilization ), inflightPenalty( inflightPenalty ) {}
// Returns a single-line, human-readable description of this request:
// the three selection flags and the inflight penalty, then the string form
// of every entry in `sources` and `completeSources`.
std::string getDesc() {
	std::stringstream desc;

	// Flags and penalty are written as "Name:value" pairs; the group ends with ';'.
	desc << "WantsNewServers:" << wantsNewServers;
	desc << " WantsTrueBest:" << wantsTrueBest;
	desc << " PreferLowerUtilization:" << preferLowerUtilization;
	desc << " inflightPenalty:" << inflightPenalty << ";";

	// Each list entry is followed by a comma, including the last one.
	desc << "Sources:";
	for (auto& src : sources) {
		desc << src.toString() << ",";
	}

	desc << "CompleteSources:";
	for (auto& complete : completeSources) {
		desc << complete.toString() << ",";
	}

	return desc.str();
}
}; };
struct GetMetricsRequest { struct GetMetricsRequest {

View File

@ -159,7 +159,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( INITIAL_FAILURE_REACTION_DELAY, 30.0 ); if( randomize && BUGGIFY ) INITIAL_FAILURE_REACTION_DELAY = 0.0; init( INITIAL_FAILURE_REACTION_DELAY, 30.0 ); if( randomize && BUGGIFY ) INITIAL_FAILURE_REACTION_DELAY = 0.0;
init( CHECK_TEAM_DELAY, 30.0 ); init( CHECK_TEAM_DELAY, 30.0 );
init( LOG_ON_COMPLETION_DELAY, DD_QUEUE_LOGGING_INTERVAL ); init( LOG_ON_COMPLETION_DELAY, DD_QUEUE_LOGGING_INTERVAL );
init( BEST_TEAM_MAX_TEAM_TRIES, 10 ); init( BEST_TEAM_MAX_TEAM_TRIES, 20 );
init( BEST_TEAM_OPTION_COUNT, 4 ); init( BEST_TEAM_OPTION_COUNT, 4 );
init( BEST_OF_AMT, 4 ); init( BEST_OF_AMT, 4 );
init( SERVER_LIST_DELAY, 1.0 ); init( SERVER_LIST_DELAY, 1.0 );

View File

@ -702,8 +702,6 @@ ACTOR Future<DistributedTestResults> runWorkload( Database cx, std::vector< Test
checks.push_back(workloads[i].check.template getReplyUnlessFailedFor<CheckReply>(waitForFailureTime, 0)); checks.push_back(workloads[i].check.template getReplyUnlessFailedFor<CheckReply>(waitForFailureTime, 0));
wait( waitForAll( checks ) ); wait( waitForAll( checks ) );
printf("checking tests DONE num_workloads:%d\n", workloads.size());
throwIfError(checks, "CheckFailedForWorkload" + printable(spec.title)); throwIfError(checks, "CheckFailedForWorkload" + printable(spec.title));
for(int i = 0; i < checks.size(); i++) { for(int i = 0; i < checks.size(); i++) {
@ -801,7 +799,6 @@ ACTOR Future<bool> runTest( Database cx, std::vector< TesterInterface > testers,
try { try {
Future<DistributedTestResults> fTestResults = runWorkload( cx, testers, spec ); Future<DistributedTestResults> fTestResults = runWorkload( cx, testers, spec );
if( spec.timeout > 0 ) { if( spec.timeout > 0 ) {
printf("[INFO] TestSpec, timeout:%d\n", spec.timeout);
fTestResults = timeoutError( fTestResults, spec.timeout ); fTestResults = timeoutError( fTestResults, spec.timeout );
} }
DistributedTestResults _testResults = wait( fTestResults ); DistributedTestResults _testResults = wait( fTestResults );

View File

@ -142,7 +142,7 @@ struct CycleWorkload : TestWorkload {
} }
void logTestData(const VectorRef<KeyValueRef>& data) { void logTestData(const VectorRef<KeyValueRef>& data) {
TraceEvent("MXTestFailureDetail"); TraceEvent("TestFailureDetail");
int index = 0; int index = 0;
for (auto& entry : data) { for (auto& entry : data) {
TraceEvent("CurrentDataEntry") TraceEvent("CurrentDataEntry")