DD: getTeam may fail to find a team even when a feasible one exists
Due to randomness, when unhealthy teams form the majority while healthy teams still exist, the getTeam function may be unlucky and fail to find any feasible (ok) team, which leads to a BestTeamStuck situation. This commit increases the number of tries from 10 to 20. A long-term solution could first collect all feasible teams and then choose one of them at random; since that would affect the statistics of which team is picked, it is not included in this commit. Non-functional change: this commit also removes an unneeded printf introduced by the fast restore PR 1404.
This commit is contained in:
parent
7395081468
commit
0b785e5c1c
|
@ -819,6 +819,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
||||||
else {
|
else {
|
||||||
int nTries = 0;
|
int nTries = 0;
|
||||||
while( randomTeams.size() < SERVER_KNOBS->BEST_TEAM_OPTION_COUNT && nTries < SERVER_KNOBS->BEST_TEAM_MAX_TEAM_TRIES ) {
|
while( randomTeams.size() < SERVER_KNOBS->BEST_TEAM_OPTION_COUNT && nTries < SERVER_KNOBS->BEST_TEAM_MAX_TEAM_TRIES ) {
|
||||||
|
// If unhealthy team is majority, we may not find an ok desk in this while loop
|
||||||
|
// TODO: We may want to create the qualified team and choose among those.
|
||||||
Reference<IDataDistributionTeam> dest = deterministicRandom()->randomChoice(self->teams);
|
Reference<IDataDistributionTeam> dest = deterministicRandom()->randomChoice(self->teams);
|
||||||
|
|
||||||
bool ok = dest->isHealthy() && (!req.preferLowerUtilization || dest->hasHealthyFreeSpace());
|
bool ok = dest->isHealthy() && (!req.preferLowerUtilization || dest->hasHealthyFreeSpace());
|
||||||
|
@ -872,6 +874,10 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// if (!bestOption.present()) {
|
||||||
|
// TraceEvent("GetTeamRequest").detail("Request", req.getDesc());
|
||||||
|
// self->traceAllInfo(true);
|
||||||
|
// }
|
||||||
|
|
||||||
req.reply.send( bestOption );
|
req.reply.send( bestOption );
|
||||||
|
|
||||||
|
|
|
@ -114,6 +114,22 @@ struct GetTeamRequest {
|
||||||
|
|
||||||
GetTeamRequest() {}
|
GetTeamRequest() {}
|
||||||
GetTeamRequest( bool wantsNewServers, bool wantsTrueBest, bool preferLowerUtilization, double inflightPenalty = 1.0 ) : wantsNewServers( wantsNewServers ), wantsTrueBest( wantsTrueBest ), preferLowerUtilization( preferLowerUtilization ), inflightPenalty( inflightPenalty ) {}
|
GetTeamRequest( bool wantsNewServers, bool wantsTrueBest, bool preferLowerUtilization, double inflightPenalty = 1.0 ) : wantsNewServers( wantsNewServers ), wantsTrueBest( wantsTrueBest ), preferLowerUtilization( preferLowerUtilization ), inflightPenalty( inflightPenalty ) {}
|
||||||
|
|
||||||
|
std::string getDesc() {
|
||||||
|
std::stringstream ss;
|
||||||
|
|
||||||
|
ss << "WantsNewServers:" << wantsNewServers << " WantsTrueBest:" << wantsTrueBest << " PreferLowerUtilization:" << preferLowerUtilization << " inflightPenalty:" << inflightPenalty << ";";
|
||||||
|
ss << "Sources:";
|
||||||
|
for (auto& s : sources) {
|
||||||
|
ss << s.toString() << ",";
|
||||||
|
}
|
||||||
|
ss << "CompleteSources:";
|
||||||
|
for (auto& cs : completeSources) {
|
||||||
|
ss << cs.toString() << ",";
|
||||||
|
}
|
||||||
|
|
||||||
|
return ss.str();
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct GetMetricsRequest {
|
struct GetMetricsRequest {
|
||||||
|
|
|
@ -159,7 +159,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
|
||||||
init( INITIAL_FAILURE_REACTION_DELAY, 30.0 ); if( randomize && BUGGIFY ) INITIAL_FAILURE_REACTION_DELAY = 0.0;
|
init( INITIAL_FAILURE_REACTION_DELAY, 30.0 ); if( randomize && BUGGIFY ) INITIAL_FAILURE_REACTION_DELAY = 0.0;
|
||||||
init( CHECK_TEAM_DELAY, 30.0 );
|
init( CHECK_TEAM_DELAY, 30.0 );
|
||||||
init( LOG_ON_COMPLETION_DELAY, DD_QUEUE_LOGGING_INTERVAL );
|
init( LOG_ON_COMPLETION_DELAY, DD_QUEUE_LOGGING_INTERVAL );
|
||||||
init( BEST_TEAM_MAX_TEAM_TRIES, 10 );
|
init( BEST_TEAM_MAX_TEAM_TRIES, 20 );
|
||||||
init( BEST_TEAM_OPTION_COUNT, 4 );
|
init( BEST_TEAM_OPTION_COUNT, 4 );
|
||||||
init( BEST_OF_AMT, 4 );
|
init( BEST_OF_AMT, 4 );
|
||||||
init( SERVER_LIST_DELAY, 1.0 );
|
init( SERVER_LIST_DELAY, 1.0 );
|
||||||
|
|
|
@ -702,8 +702,6 @@ ACTOR Future<DistributedTestResults> runWorkload( Database cx, std::vector< Test
|
||||||
checks.push_back(workloads[i].check.template getReplyUnlessFailedFor<CheckReply>(waitForFailureTime, 0));
|
checks.push_back(workloads[i].check.template getReplyUnlessFailedFor<CheckReply>(waitForFailureTime, 0));
|
||||||
wait( waitForAll( checks ) );
|
wait( waitForAll( checks ) );
|
||||||
|
|
||||||
printf("checking tests DONE num_workloads:%d\n", workloads.size());
|
|
||||||
|
|
||||||
throwIfError(checks, "CheckFailedForWorkload" + printable(spec.title));
|
throwIfError(checks, "CheckFailedForWorkload" + printable(spec.title));
|
||||||
|
|
||||||
for(int i = 0; i < checks.size(); i++) {
|
for(int i = 0; i < checks.size(); i++) {
|
||||||
|
@ -801,7 +799,6 @@ ACTOR Future<bool> runTest( Database cx, std::vector< TesterInterface > testers,
|
||||||
try {
|
try {
|
||||||
Future<DistributedTestResults> fTestResults = runWorkload( cx, testers, spec );
|
Future<DistributedTestResults> fTestResults = runWorkload( cx, testers, spec );
|
||||||
if( spec.timeout > 0 ) {
|
if( spec.timeout > 0 ) {
|
||||||
printf("[INFO] TestSpec, timeout:%d\n", spec.timeout);
|
|
||||||
fTestResults = timeoutError( fTestResults, spec.timeout );
|
fTestResults = timeoutError( fTestResults, spec.timeout );
|
||||||
}
|
}
|
||||||
DistributedTestResults _testResults = wait( fTestResults );
|
DistributedTestResults _testResults = wait( fTestResults );
|
||||||
|
|
|
@ -142,7 +142,7 @@ struct CycleWorkload : TestWorkload {
|
||||||
}
|
}
|
||||||
|
|
||||||
void logTestData(const VectorRef<KeyValueRef>& data) {
|
void logTestData(const VectorRef<KeyValueRef>& data) {
|
||||||
TraceEvent("MXTestFailureDetail");
|
TraceEvent("TestFailureDetail");
|
||||||
int index = 0;
|
int index = 0;
|
||||||
for (auto& entry : data) {
|
for (auto& entry : data) {
|
||||||
TraceEvent("CurrentDataEntry")
|
TraceEvent("CurrentDataEntry")
|
||||||
|
|
Loading…
Reference in New Issue