Change traceTeamCollectionInfo to actor
There are cases where traceTeamCollectionInfo was called within the same execution block, i.e., no wait between the two traceTeamCollectionInfo calls. Because simulation uses the same time for all execution instructions in the same execution block, having more than one traceTeamCollectionInfo at the same time will mess up the trackLatest semantics. When one of them is always chosen by simulator, simulation test will report false positive error. Changing this function to actor and adding a small delay inside the function can solve this problem.
This commit is contained in:
parent
4fe3c7f749
commit
f889843332
|
@ -535,6 +535,7 @@ Future<Void> storageServerTracker(
|
|||
Version const& addedVersion);
|
||||
|
||||
Future<Void> teamTracker(struct DDTeamCollection* const& self, Reference<TCTeamInfo> const& team, bool const& badTeam, bool const& redundantTeam);
|
||||
ACTOR static Future<Void> traceTeamCollectionInfo(DDTeamCollection* self);
|
||||
|
||||
struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
||||
enum { REQUESTING_WORKER = 0, GETTING_WORKER = 1, GETTING_STORAGE = 2 };
|
||||
|
@ -958,7 +959,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
|||
}
|
||||
|
||||
// Trace and record the current number of teams for correctness test
|
||||
self->traceTeamCollectionInfo();
|
||||
wait( self->traceTeamCollectionInfo(self) );
|
||||
|
||||
return Void();
|
||||
}
|
||||
|
@ -1873,29 +1874,29 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
|||
}
|
||||
|
||||
// Check if the number of server (and machine teams) is larger than the maximum allowed number
|
||||
void traceTeamCollectionInfo() {
|
||||
int totalHealthyServerCount = calculateHealthyServerCount();
|
||||
ACTOR static Future<Void> traceTeamCollectionInfo(DDTeamCollection* self) {
|
||||
int totalHealthyServerCount = self->calculateHealthyServerCount();
|
||||
int desiredServerTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyServerCount;
|
||||
int maxServerTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyServerCount;
|
||||
|
||||
int totalHealthyMachineCount = calculateHealthyMachineCount();
|
||||
int totalHealthyMachineCount = self->calculateHealthyMachineCount();
|
||||
int desiredMachineTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyMachineCount;
|
||||
int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount;
|
||||
int healthyMachineTeamCount = getHealthyMachineTeamCount();
|
||||
int healthyMachineTeamCount = self->getHealthyMachineTeamCount();
|
||||
|
||||
std::pair<int, int> minMaxTeamNumberOnServer = calculateMinMaxServerTeamNumOnServer();
|
||||
std::pair<int, int> minMaxMachineTeamNumberOnMachine = calculateMinMaxMachineTeamNumOnMachine();
|
||||
std::pair<int, int> minMaxTeamNumberOnServer = self->calculateMinMaxServerTeamNumOnServer();
|
||||
std::pair<int, int> minMaxMachineTeamNumberOnMachine = self->calculateMinMaxMachineTeamNumOnMachine();
|
||||
|
||||
TraceEvent("TeamCollectionInfo", distributorId)
|
||||
.detail("Primary", primary)
|
||||
TraceEvent("TeamCollectionInfo", self->distributorId)
|
||||
.detail("Primary", self->primary)
|
||||
.detail("AddedTeamNumber", 0)
|
||||
.detail("AimToBuildTeamNumber", 0)
|
||||
.detail("RemainingTeamBudget", 0)
|
||||
.detail("CurrentTeamNumber", teams.size())
|
||||
.detail("CurrentTeamNumber", self->teams.size())
|
||||
.detail("DesiredTeamNumber", desiredServerTeams)
|
||||
.detail("MaxTeamNumber", maxServerTeams)
|
||||
.detail("StorageTeamSize", configuration.storageTeamSize)
|
||||
.detail("CurrentMachineTeamNumber", machineTeams.size())
|
||||
.detail("StorageTeamSize", self->configuration.storageTeamSize)
|
||||
.detail("CurrentMachineTeamNumber", self->machineTeams.size())
|
||||
.detail("CurrentHealthyMachineTeamNumber", healthyMachineTeamCount)
|
||||
.detail("DesiredMachineTeams", desiredMachineTeams)
|
||||
.detail("MaxMachineTeams", maxMachineTeams)
|
||||
|
@ -1904,15 +1905,20 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
|||
.detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second)
|
||||
.detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first)
|
||||
.detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second)
|
||||
.detail("DoBuildTeams", doBuildTeams)
|
||||
.detail("DoBuildTeams", self->doBuildTeams)
|
||||
.trackLatest("TeamCollectionInfo");
|
||||
|
||||
// Advance time so that we will not have multiple TeamCollectionInfo at the same time, otherwise
|
||||
// simulation test will randomly pick one TeamCollectionInfo trace, which could be the one before build teams
|
||||
wait( delay(0.01) );
|
||||
|
||||
// Debug purpose
|
||||
// if (healthyMachineTeamCount > desiredMachineTeams || machineTeams.size() > maxMachineTeams) {
|
||||
// // When the number of machine teams is over the limit, print out the current team info.
|
||||
// traceAllInfo(true);
|
||||
// }
|
||||
|
||||
return Void();
|
||||
}
|
||||
|
||||
// Use the current set of known processes (from server_info) to compute an optimized set of storage server teams.
|
||||
|
@ -2474,7 +2480,7 @@ ACTOR Future<Void> teamRemover(DDTeamCollection* self) {
|
|||
.detail("CurrentMachineTeamNumber", self->machineTeams.size())
|
||||
.detail("DesiredMachineTeam", desiredMachineTeams)
|
||||
.detail("NumMachineTeamRemoved", numMachineTeamRemoved);
|
||||
self->traceTeamCollectionInfo();
|
||||
wait( self->traceTeamCollectionInfo(self) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -3087,6 +3093,7 @@ ACTOR Future<Void> storageServerTracker(
|
|||
}
|
||||
lastIsUnhealthy = status.isUnhealthy();
|
||||
|
||||
state bool recordTeamCollectionInfo = false;
|
||||
choose {
|
||||
when( wait( failureTracker ) ) {
|
||||
// The server is failed AND all data has been removed from it, so permanently remove it.
|
||||
|
@ -3190,7 +3197,8 @@ ACTOR Future<Void> storageServerTracker(
|
|||
self->badTeamRemover = removeBadTeams(self);
|
||||
self->addActor.send(self->badTeamRemover);
|
||||
// The team number changes, so we need to update the team number info
|
||||
self->traceTeamCollectionInfo();
|
||||
// wait( traceTeamCollectionInfo(self) );
|
||||
recordTeamCollectionInfo = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3198,12 +3206,14 @@ ACTOR Future<Void> storageServerTracker(
|
|||
// We rely on the old failureTracker being actorCancelled since the old actor now has a pointer to an invalid location
|
||||
status = ServerStatus( status.isFailed, status.isUndesired, server->lastKnownInterface.locality );
|
||||
|
||||
// wait( traceTeamCollectionInfo(self) );
|
||||
recordTeamCollectionInfo = true;
|
||||
//Restart the storeTracker for the new interface
|
||||
storeTracker = keyValueStoreTypeTracker(self, server);
|
||||
hasWrongStoreTypeOrDC = false;
|
||||
self->doBuildTeams = true;
|
||||
self->restartTeamBuilder.trigger();
|
||||
self->traceTeamCollectionInfo();
|
||||
|
||||
if(restartRecruiting)
|
||||
self->restartRecruiting.trigger();
|
||||
}
|
||||
|
@ -3224,6 +3234,10 @@ ACTOR Future<Void> storageServerTracker(
|
|||
server->wakeUpTracker = Promise<Void>();
|
||||
}
|
||||
}
|
||||
|
||||
if ( recordTeamCollectionInfo ) {
|
||||
wait( self->traceTeamCollectionInfo(self) );
|
||||
}
|
||||
}
|
||||
} catch( Error &e ) {
|
||||
if (e.code() != error_code_actor_cancelled && errorOut.canBeSet())
|
||||
|
@ -3458,7 +3472,7 @@ ACTOR Future<Void> dataDistributionTeamCollection(
|
|||
self->redundantTeamRemover = teamRemover(self);
|
||||
self->addActor.send(self->redundantTeamRemover);
|
||||
}
|
||||
self->traceTeamCollectionInfo();
|
||||
wait( self->traceTeamCollectionInfo(self) );
|
||||
|
||||
if(self->includedDCs.size()) {
|
||||
//start this actor before any potential recruitments can happen
|
||||
|
|
|
@ -323,7 +323,7 @@ ACTOR Future<bool> getTeamCollectionValid(Database cx, WorkerInterface dataDistr
|
|||
.detail("MaxMachineTeamNumberOnMachine", maxMachineTeamOnMachine)
|
||||
.detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER)
|
||||
.detail("MaxTeamsPerServer", SERVER_KNOBS->MAX_TEAMS_PER_SERVER);
|
||||
wait(delay(5.0));
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue