Merge pull request #3912 from dongxinEric/misc/log-detailed-teams-info-in-dd

Added a way to print detailed information about team collection for d…

Commit: b0d78ecf37
@@ -4,6 +4,11 @@
 Release Notes
 #############
 
+6.2.28
+======
+* Log detailed team collection information when median available space ratio of all teams is too low. `(PR #3912) <https://github.com/apple/foundationdb/pull/3912>`_
+
+
 6.2.27
 ======
 * For clusters with a large number of shards, avoid slow tasks in the data distributor by adding yields to the shard map destruction. `(PR #3834) <https://github.com/apple/foundationdb/pull/3834>`_
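The note above describes the behavior this PR adds: when the median of the teams' available-space ratios falls below the target ratio, the data distributor emits a warning trace event and dumps a detailed snapshot of its team state. A minimal standalone sketch of that decision, assuming a plain vector of per-team ratios and a hard-coded threshold in place of FoundationDB's knobs (medianAvailableSpace and targetAvailableSpaceRatio here are illustrative names, not the project's types):

#include <algorithm>
#include <cstdio>
#include <vector>

// Illustrative only: compute the median available-space ratio over all teams
// and decide whether a detailed team dump should be requested.
double medianAvailableSpace(std::vector<double> ratios) {
    // Middle element of the sorted ratios; even-sized inputs take the upper median.
    std::nth_element(ratios.begin(), ratios.begin() + ratios.size() / 2, ratios.end());
    return ratios[ratios.size() / 2];
}

int main() {
    const double targetAvailableSpaceRatio = 0.05;  // assumed threshold, analogous to TARGET_AVAILABLE_SPACE_RATIO
    std::vector<double> teamRatios = { 0.02, 0.04, 0.06, 0.03, 0.01 };

    double median = medianAvailableSpace(teamRatios);
    if (median < targetAvailableSpaceRatio) {
        // In the real change this is a SevWarn TraceEvent plus a trigger of the
        // detailed team-info printer; here we just log to stdout.
        std::printf("MedianAvailableSpaceTooSmall: median=%.3f target=%.3f\n", median, targetAvailableSpaceRatio);
    }
    return 0;
}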
@@ -630,6 +630,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
     int lowestUtilizationTeam;
     int highestUtilizationTeam;
 
+    AsyncTrigger printDetailedTeamsInfo;
+
     void resetLocalitySet() {
         storageServerSet = Reference<LocalitySet>(new LocalityMap<UID>());
         LocalityMap<UID>* storageServerMap = (LocalityMap<UID>*) storageServerSet.getPtr();

@@ -765,6 +767,13 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
         } else {
             self->medianAvailableSpace = SERVER_KNOBS->MIN_AVAILABLE_SPACE_RATIO;
         }
+        if (self->medianAvailableSpace < SERVER_KNOBS->TARGET_AVAILABLE_SPACE_RATIO) {
+            TraceEvent(SevWarn, "DDTeamMedianAvailableSpaceTooSmall", self->distributorId)
+                .detail("MedianAvailableSpaceRatio", self->medianAvailableSpace)
+                .detail("TargetAvailableSpaceRatio", SERVER_KNOBS->TARGET_AVAILABLE_SPACE_RATIO)
+                .detail("Primary", self->primary);
+            self->printDetailedTeamsInfo.trigger();
+        }
     }
 
     bool foundSrc = false;
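The two hunks above wire detection to reporting through an AsyncTrigger: the health-check path only fires printDetailedTeamsInfo.trigger(), while a separate long-lived actor (added below) waits on onTrigger() and does the expensive printing, so the check itself stays cheap. A simplified standalone sketch of that producer/consumer shape, using a condition variable and a thread in place of Flow's AsyncTrigger and actors (the Trigger class is a stand-in, not Flow's implementation):

#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>

// Simplified stand-in for Flow's AsyncTrigger: trigger() is cheap for the caller,
// and a separate consumer wakes up to do the expensive work.
class Trigger {
    std::mutex m;
    std::condition_variable cv;
    bool fired = false;
public:
    void trigger() {
        { std::lock_guard<std::mutex> lk(m); fired = true; }
        cv.notify_one();
    }
    void onTrigger() {
        std::unique_lock<std::mutex> lk(m);
        cv.wait(lk, [this] { return fired; });
        fired = false;
    }
};

int main() {
    Trigger printDetailedTeamsInfo;

    // Consumer: analogous to the actor that waits for the trigger and prints.
    std::thread printer([&] {
        printDetailedTeamsInfo.onTrigger();
        std::printf("printing detailed team snapshot...\n");
    });

    // Producer: analogous to the health check that only fires the trigger.
    printDetailedTeamsInfo.trigger();
    printer.join();
    return 0;
}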
@@ -2569,6 +2578,196 @@ ACTOR Future<Void> waitUntilHealthy(DDTeamCollection* self, double extraDelay =
     }
 }
 
+// Take a snapshot of necessary data structures from `DDTeamCollection` and print them out with yields to avoid slow
+// task on the run loop.
+ACTOR Future<Void> printSnapshotTeamsInfo(Reference<DDTeamCollection> self) {
+    state DatabaseConfiguration configuration;
+    state std::map<UID, Reference<TCServerInfo>> server_info;
+    state std::map<UID, ServerStatus> server_status;
+    state vector<Reference<TCTeamInfo>> teams;
+    state std::map<Standalone<StringRef>, Reference<TCMachineInfo>> machine_info;
+    state std::vector<Reference<TCMachineTeamInfo>> machineTeams;
+    // state std::vector<std::string> internedLocalityRecordKeyNameStrings;
+    // state int machineLocalityMapEntryArraySize;
+    // state std::vector<Reference<LocalityRecord>> machineLocalityMapRecordArray;
+    state int traceEventsPrinted = 0;
+    state std::vector<const UID*> serverIDs;
+    state double lastPrintTime = 0;
+    loop {
+        wait(self->printDetailedTeamsInfo.onTrigger());
+        if (now() - lastPrintTime < SERVER_KNOBS->DD_TEAMS_INFO_PRINT_INTERVAL) {
+            continue;
+        }
+        lastPrintTime = now();
+
+        traceEventsPrinted = 0;
+
+        double snapshotStart = now();
+
+        configuration = self->configuration;
+        server_info = self->server_info;
+        teams = self->teams;
+        machine_info = self->machine_info;
+        machineTeams = self->machineTeams;
+        // internedLocalityRecordKeyNameStrings = self->machineLocalityMap._keymap->_lookuparray;
+        // machineLocalityMapEntryArraySize = self->machineLocalityMap.size();
+        // machineLocalityMapRecordArray = self->machineLocalityMap.getRecordArray();
+        std::vector<const UID*> _uids = self->machineLocalityMap.getObjects();
+        serverIDs = _uids;
+
+        auto const& keys = self->server_status.getKeys();
+        for (auto const& key : keys) {
+            server_status.emplace(key, self->server_status.get(key));
+        }
+
+        TraceEvent("DDPrintSnapshotTeasmInfo", self->distributorId)
+            .detail("SnapshotSpeed", now() - snapshotStart)
+            .detail("Primary", self->primary);
+
+        // Print to TraceEvents
+        TraceEvent("DDConfig", self->distributorId)
+            .detail("StorageTeamSize", configuration.storageTeamSize)
+            .detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER)
+            .detail("MaxTeamsPerServer", SERVER_KNOBS->MAX_TEAMS_PER_SERVER)
+            .detail("Primary", self->primary);
+
+        TraceEvent("ServerInfo", self->distributorId)
+            .detail("Size", server_info.size())
+            .detail("Primary", self->primary);
+        state int i;
+        state std::map<UID, Reference<TCServerInfo>>::iterator server = server_info.begin();
+        for (i = 0; i < server_info.size(); i++) {
+            TraceEvent("ServerInfo", self->distributorId)
+                .detail("ServerInfoIndex", i)
+                .detail("ServerID", server->first.toString())
+                .detail("ServerTeamOwned", server->second->teams.size())
+                .detail("MachineID", server->second->machine->machineID.contents().toString())
+                .detail("Primary", self->primary);
+            server++;
+            if (++traceEventsPrinted % SERVER_KNOBS->DD_TEAMS_INFO_PRINT_YIELD_COUNT == 0) {
+                wait(yield());
+            }
+        }
+
+        server = server_info.begin();
+        for (i = 0; i < server_info.size(); i++) {
+            const UID& uid = server->first;
+            TraceEvent("ServerStatus", self->distributorId)
+                .detail("ServerUID", uid)
+                .detail("Healthy", !server_status.at(uid).isUnhealthy())
+                .detail("MachineIsValid", server_info[uid]->machine.isValid())
+                .detail("MachineTeamSize",
+                        server_info[uid]->machine.isValid() ? server_info[uid]->machine->machineTeams.size() : -1)
+                .detail("Primary", self->primary);
+            server++;
+            if (++traceEventsPrinted % SERVER_KNOBS->DD_TEAMS_INFO_PRINT_YIELD_COUNT == 0) {
+                wait(yield());
+            }
+        }
+
+        TraceEvent("ServerTeamInfo", self->distributorId).detail("Size", teams.size()).detail("Primary", self->primary);
+        for (i = 0; i < teams.size(); i++) {
+            const auto& team = teams[i];
+            TraceEvent("ServerTeamInfo", self->distributorId)
+                .detail("TeamIndex", i)
+                .detail("Healthy", team->isHealthy())
+                .detail("TeamSize", team->size())
+                .detail("MemberIDs", team->getServerIDsStr())
+                .detail("Primary", self->primary);
+            if (++traceEventsPrinted % SERVER_KNOBS->DD_TEAMS_INFO_PRINT_YIELD_COUNT == 0) {
+                wait(yield());
+            }
+        }
+
+        TraceEvent("MachineInfo", self->distributorId)
+            .detail("Size", machine_info.size())
+            .detail("Primary", self->primary);
+        state std::map<Standalone<StringRef>, Reference<TCMachineInfo>>::iterator machine = machine_info.begin();
+        state bool isMachineHealthy = false;
+        for (i = 0; i < machine_info.size(); i++) {
+            Reference<TCMachineInfo> _machine = machine->second;
+            if (!_machine.isValid() || machine_info.find(_machine->machineID) == machine_info.end() ||
+                _machine->serversOnMachine.empty()) {
+                isMachineHealthy = false;
+            }
+
+            // Healthy machine has at least one healthy server
+            for (auto& server : _machine->serversOnMachine) {
+                if (!server_status.at(server->id).isUnhealthy()) {
+                    isMachineHealthy = true;
+                }
+            }
+
+            isMachineHealthy = false;
+            TraceEvent("MachineInfo", self->distributorId)
+                .detail("MachineInfoIndex", i)
+                .detail("Healthy", isMachineHealthy)
+                .detail("MachineID", machine->first.contents().toString())
+                .detail("MachineTeamOwned", machine->second->machineTeams.size())
+                .detail("ServerNumOnMachine", machine->second->serversOnMachine.size())
+                .detail("ServersID", machine->second->getServersIDStr())
+                .detail("Primary", self->primary);
+            machine++;
+            if (++traceEventsPrinted % SERVER_KNOBS->DD_TEAMS_INFO_PRINT_YIELD_COUNT == 0) {
+                wait(yield());
+            }
+        }
+
+        TraceEvent("MachineTeamInfo", self->distributorId)
+            .detail("Size", machineTeams.size())
+            .detail("Primary", self->primary);
+        for (i = 0; i < machineTeams.size(); i++) {
+            const auto& team = machineTeams[i];
+            TraceEvent("MachineTeamInfo", self->distributorId)
+                .detail("TeamIndex", i)
+                .detail("MachineIDs", team->getMachineIDsStr())
+                .detail("ServerTeams", team->serverTeams.size())
+                .detail("Primary", self->primary);
+            if (++traceEventsPrinted % SERVER_KNOBS->DD_TEAMS_INFO_PRINT_YIELD_COUNT == 0) {
+                wait(yield());
+            }
+        }
+
+        // TODO: re-enable the following logging or remove them.
+        // TraceEvent("LocalityRecordKeyName", self->distributorId)
+        //     .detail("Size", internedLocalityRecordKeyNameStrings.size())
+        //     .detail("Primary", self->primary);
+        // for (i = 0; i < internedLocalityRecordKeyNameStrings.size(); i++) {
+        //     TraceEvent("LocalityRecordKeyIndexName", self->distributorId)
+        //         .detail("KeyIndex", i)
+        //         .detail("KeyName", internedLocalityRecordKeyNameStrings[i])
+        //         .detail("Primary", self->primary);
+        //     if (++traceEventsPrinted % SERVER_KNOBS->DD_TEAMS_INFO_PRINT_YIELD_COUNT == 0) {
+        //         wait(yield());
+        //     }
+        // }
+
+        // TraceEvent("MachineLocalityMap", self->distributorId)
+        //     .detail("Size", machineLocalityMapEntryArraySize)
+        //     .detail("Primary", self->primary);
+        // for (i = 0; i < serverIDs.size(); i++) {
+        //     const auto& serverID = serverIDs[i];
+        //     Reference<LocalityRecord> record = machineLocalityMapRecordArray[i];
+        //     if (record.isValid()) {
+        //         TraceEvent("MachineLocalityMap", self->distributorId)
+        //             .detail("LocalityIndex", i)
+        //             .detail("UID", serverID->toString())
+        //             .detail("LocalityRecord", record->toString())
+        //             .detail("Primary", self->primary);
+        //     } else {
+        //         TraceEvent("MachineLocalityMap", self->distributorId)
+        //             .detail("LocalityIndex", i)
+        //             .detail("UID", serverID->toString())
+        //             .detail("LocalityRecord", "[NotFound]")
+        //             .detail("Primary", self->primary);
+        //     }
+        //     if (++traceEventsPrinted % SERVER_KNOBS->DD_TEAMS_INFO_PRINT_YIELD_COUNT == 0) {
+        //         wait(yield());
+        //     }
+        // }
+    }
+}
+
 ACTOR Future<Void> removeBadTeams(DDTeamCollection* self) {
     wait(self->initialFailureReactionDelay);
     wait(waitUntilHealthy(self));
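The printer above first copies the configuration, server, team, and machine maps out of the collection, then walks the copies and calls wait(yield()) every DD_TEAMS_INFO_PRINT_YIELD_COUNT trace events, so a single dump never monopolizes the single-threaded run loop; whole dumps are additionally rate-limited by DD_TEAMS_INFO_PRINT_INTERVAL. A condensed standalone sketch of the snapshot-then-iterate-with-yields pattern, with plain std::map data and a no-op yieldToRunLoop() standing in for Flow's yield():

#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Stand-in for Flow's wait(yield()): in the real code this returns control to the
// run loop so other tasks can make progress before printing continues.
static void yieldToRunLoop() { /* no-op in this sketch */ }

int main() {
    const int printYieldCount = 100;  // analogous to DD_TEAMS_INFO_PRINT_YIELD_COUNT

    // Snapshot: copy the structures up front so printing reads a consistent view
    // and never touches the live collection again.
    std::map<std::string, std::vector<std::string>> liveTeams = {
        { "team-0", { "s1", "s2", "s3" } },
        { "team-1", { "s2", "s4", "s5" } },
    };
    std::map<std::string, std::vector<std::string>> teamsSnapshot = liveTeams;

    int traceEventsPrinted = 0;
    for (const auto& [teamId, members] : teamsSnapshot) {
        std::printf("ServerTeamInfo team=%s size=%zu\n", teamId.c_str(), members.size());
        // Every printYieldCount events, hand control back to the scheduler.
        if (++traceEventsPrinted % printYieldCount == 0) {
            yieldToRunLoop();
        }
    }
    return 0;
}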
@@ -4329,9 +4528,11 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self)
             teamCollectionsPtrs.push_back(remoteTeamCollection.getPtr());
             remoteTeamCollection->teamCollections = teamCollectionsPtrs;
             actors.push_back( reportErrorsExcept( dataDistributionTeamCollection( remoteTeamCollection, initData, tcis[1], self->dbInfo ), "DDTeamCollectionSecondary", self->ddId, &normalDDQueueErrors() ) );
+            actors.push_back(printSnapshotTeamsInfo(remoteTeamCollection));
         }
         primaryTeamCollection->teamCollections = teamCollectionsPtrs;
         actors.push_back( reportErrorsExcept( dataDistributionTeamCollection( primaryTeamCollection, initData, tcis[0], self->dbInfo ), "DDTeamCollectionPrimary", self->ddId, &normalDDQueueErrors() ) );
+        actors.push_back(printSnapshotTeamsInfo(primaryTeamCollection));
         actors.push_back(yieldPromiseStream(output.getFuture(), input));
 
         wait( waitForAll( actors ) );
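Here the new actor is registered alongside the existing team-collection actors, once for the primary team collection and once for the remote one when it exists, and is awaited with everything else through waitForAll. A rough standalone analogue using std::async futures in place of Flow actors (printSnapshotTeamsInfoStub is a placeholder, not the real printer):

#include <future>
#include <vector>

// Placeholder for the long-running printer loop.
static void printSnapshotTeamsInfoStub() {}

int main() {
    std::vector<std::future<void>> actors;
    actors.push_back(std::async(std::launch::async, printSnapshotTeamsInfoStub));  // primary collection
    actors.push_back(std::async(std::launch::async, printSnapshotTeamsInfoStub));  // remote collection, when present
    for (auto& f : actors) f.get();  // analogous to wait(waitForAll(actors))
    return 0;
}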
@@ -219,6 +219,8 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs, bool isSimula
     init( DD_VALIDATE_LOCALITY, true ); if( randomize && BUGGIFY ) DD_VALIDATE_LOCALITY = false;
     init( DD_CHECK_INVALID_LOCALITY_DELAY, 60 ); if( randomize && BUGGIFY ) DD_CHECK_INVALID_LOCALITY_DELAY = 1 + deterministicRandom()->random01() * 600;
     init( DD_ENABLE_VERBOSE_TRACING, false ); if( randomize && BUGGIFY ) DD_ENABLE_VERBOSE_TRACING = true;
+    init( DD_TEAMS_INFO_PRINT_INTERVAL, 60 ); if( randomize && BUGGIFY ) DD_TEAMS_INFO_PRINT_INTERVAL = 10;
+    init( DD_TEAMS_INFO_PRINT_YIELD_COUNT, 100 ); if( randomize && BUGGIFY ) DD_TEAMS_INFO_PRINT_YIELD_COUNT = deterministicRandom()->random01() * 1000 + 1;
 
     // TeamRemover
     init( TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true
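The two knobs above bound the cost of this logging: DD_TEAMS_INFO_PRINT_INTERVAL drops triggers that arrive within that many seconds of the previous dump, and DD_TEAMS_INFO_PRINT_YIELD_COUNT sets how many trace events are emitted between yields; both are randomized under BUGGIFY so simulation exercises the throttled paths. A small sketch of the interval gate, assuming std::chrono::steady_clock in place of Flow's now() (printIntervalSeconds is an illustrative constant):

#include <chrono>
#include <cstdio>

// Sketch of the interval gate: a trigger is ignored if the previous dump was
// less than printIntervalSeconds ago (analogous to DD_TEAMS_INFO_PRINT_INTERVAL).
int main() {
    using clock = std::chrono::steady_clock;
    const double printIntervalSeconds = 60.0;

    clock::time_point lastPrintTime{};  // epoch: the first trigger always passes
    for (int trigger = 0; trigger < 3; trigger++) {
        auto now = clock::now();
        double sinceLast = std::chrono::duration<double>(now - lastPrintTime).count();
        if (sinceLast < printIntervalSeconds) {
            std::printf("trigger %d skipped (only %.1fs since last dump)\n", trigger, sinceLast);
            continue;
        }
        lastPrintTime = now;
        std::printf("trigger %d accepted, dumping team info\n", trigger);
    }
    return 0;
}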
@@ -182,6 +182,8 @@ public:
     bool DD_VALIDATE_LOCALITY;
     int DD_CHECK_INVALID_LOCALITY_DELAY;
     bool DD_ENABLE_VERBOSE_TRACING;
+    int DD_TEAMS_INFO_PRINT_INTERVAL;
+    int DD_TEAMS_INFO_PRINT_YIELD_COUNT;
 
     // TeamRemover to remove redundant teams
     bool TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER; // disable the machineTeamRemover actor