Added detailed logging when there are no servers left in a server team, because that may indicate a data-loss incident.
Commit c037bfd001 (parent b0d78ecf37)
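In brief: the change threads a PromiseStream<GetMetricsRequest> into DDTeamCollection so that a new actor, zeroServerLeftLogger_impl, can look up the shards the dead team owned, emit a DDShardLost trace event per shard, and sum their sizes into one DDZeroServerLeftInTeam event. Below is a minimal standalone model of that aggregation; it is not FoundationDB code, and Shard is a hypothetical stand-in for the real KeyRange plus the StorageMetrics reply returned through getShardMetrics.

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Hypothetical stand-in for a shard's key range and its StorageMetrics reply.
struct Shard {
    std::string begin, end;
    int64_t bytes;
};

// Models what zeroServerLeftLogger_impl computes once the metrics replies arrive:
// one "shard lost" line per shard, then the total number of bytes at risk.
int64_t totalBytesLost(const std::vector<Shard>& shardsOfDeadTeam) {
    int64_t bytesLost = 0;
    for (const auto& shard : shardsOfDeadTeam) {
        std::fprintf(stderr, "DDShardLost ShardBegin=%s ShardEnd=%s\n", shard.begin.c_str(), shard.end.c_str());
        bytesLost += shard.bytes;
    }
    return bytesLost;
}

int main() {
    std::vector<Shard> shards = { { "a", "m", 1 << 20 }, { "m", "z", 2 << 20 } };
    std::printf("DDZeroServerLeftInTeam TotalBytesLost=%lld\n", (long long)totalBytesLost(shards));
    return 0;
}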
@@ -631,6 +631,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
 	int highestUtilizationTeam;
 
 	AsyncTrigger printDetailedTeamsInfo;
+	PromiseStream<GetMetricsRequest> getShardMetrics;
 
 	void resetLocalitySet() {
 		storageServerSet = Reference<LocalitySet>(new LocalityMap<UID>());
@@ -662,7 +663,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
 	                 DatabaseConfiguration configuration, std::vector<Optional<Key>> includedDCs,
 	                 Optional<std::vector<Optional<Key>>> otherTrackedDCs, Future<Void> readyToStart,
 	                 Reference<AsyncVar<bool>> zeroHealthyTeams, bool primary,
-	                 Reference<AsyncVar<bool>> processingUnhealthy)
+	                 Reference<AsyncVar<bool>> processingUnhealthy, PromiseStream<GetMetricsRequest> getShardMetrics)
 	  : cx(cx), distributorId(distributorId), lock(lock), output(output),
 	    shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), doBuildTeams(true), lastBuildTeamsFailed(false),
 	    teamBuilder(Void()), badTeamRemover(Void()), redundantMachineTeamRemover(Void()),
@@ -675,8 +676,10 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
 	    initializationDoneActor(logOnCompletion(readyToStart && initialFailureReactionDelay, this)),
 	    optimalTeamCount(0), recruitingStream(0), restartRecruiting(SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY),
 	    unhealthyServers(0), includedDCs(includedDCs), otherTrackedDCs(otherTrackedDCs),
-	    zeroHealthyTeams(zeroHealthyTeams), zeroOptimalTeams(true), primary(primary), medianAvailableSpace(SERVER_KNOBS->MIN_AVAILABLE_SPACE_RATIO),
-	    lastMedianAvailableSpaceUpdate(0), processingUnhealthy(processingUnhealthy), lowestUtilizationTeam(0), highestUtilizationTeam(0) {
+	    zeroHealthyTeams(zeroHealthyTeams), zeroOptimalTeams(true), primary(primary),
+	    medianAvailableSpace(SERVER_KNOBS->MIN_AVAILABLE_SPACE_RATIO), lastMedianAvailableSpaceUpdate(0),
+	    processingUnhealthy(processingUnhealthy), lowestUtilizationTeam(0), highestUtilizationTeam(0),
+	    getShardMetrics(getShardMetrics) {
 		if(!primary || configuration.usableRegions == 1) {
 			TraceEvent("DDTrackerStarting", distributorId)
 				.detail( "State", "Inactive" )
@@ -2965,6 +2968,30 @@ ACTOR Future<Void> serverTeamRemover(DDTeamCollection* self) {
 	}
 }
 
+ACTOR Future<Void> zeroServerLeftLogger_impl(DDTeamCollection* self, Reference<TCTeamInfo> team) {
+	wait(delay(SERVER_KNOBS->DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY));
+	state vector<KeyRange> shards = self->shardsAffectedByTeamFailure->getShardsFor(
+	    ShardsAffectedByTeamFailure::Team(team->getServerIDs(), self->primary));
+	state std::vector<Future<StorageMetrics>> sizes;
+	sizes.reserve(shards.size());
+
+	for (auto const& shard : shards) {
+		sizes.emplace_back(brokenPromiseToNever(self->getShardMetrics.getReply(GetMetricsRequest(shard))));
+		TraceEvent(SevError, "DDShardLost").detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end);
+	}
+
+	wait(waitForAll(sizes));
+
+	int64_t bytesLost = 0;
+	for (auto const& size : sizes) {
+		bytesLost += size.get().bytes;
+	}
+
+	TraceEvent(SevError, "DDZeroServerLeftInTeam").detail("Team", team->getDesc()).detail("TotalBytesLost", bytesLost);
+
+	return Void();
+}
+
 // Track a team and issue RelocateShards when the level of degradation changes
 // A badTeam can be unhealthy or just a redundantTeam removed by machineTeamRemover() or serverTeamRemover()
 ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> team, bool badTeam, bool redundantTeam) {
@@ -2979,6 +3006,8 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
 	state bool lastZeroHealthy = self->zeroHealthyTeams->get();
 	state bool firstCheck = true;
 
+	state Future<Void> zeroServerLeftLogger;
+
 	if(logTeamEvents) {
 		TraceEvent("TeamTrackerStarting", self->distributorId).detail("Reason", "Initial wait complete (sc)").detail("Team", team->getDesc());
 	}
@@ -3126,12 +3155,24 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
 				if(lastPriority != team->getPriority()) {
 					self->priority_teams[lastPriority]--;
 					self->priority_teams[team->getPriority()]++;
+					if (lastPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT &&
+					    team->getPriority() < SERVER_KNOBS->PRIORITY_TEAM_0_LEFT) {
+						zeroServerLeftLogger = Void();
+					}
+					if (logTeamEvents) {
+						auto dataLoss = team->getPriority() == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT;
+						auto severity = dataLoss ? SevError : SevInfo;
+						TraceEvent(severity, "TeamPriorityChange", self->distributorId)
+						    .detail("Priority", team->getPriority())
+						    .detail("Info", team->getDesc())
+						    .detail("ZeroHealthyTeams", self->zeroHealthyTeams->get());
+						if (team->getPriority() == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT) {
+							// 0 servers left in this team, data might be lost.
+							zeroServerLeftLogger = zeroServerLeftLogger_impl(self, team);
+						}
+					}
 				}
 
-				if(logTeamEvents) {
-					TraceEvent("TeamPriorityChange", self->distributorId).detail("Priority", team->getPriority())
-						.detail("Info", team->getDesc()).detail("ZeroHealthyTeams", self->zeroHealthyTeams->get());
-				}
-
 				lastZeroHealthy = self->zeroHealthyTeams->get(); //set this again in case it changed from this teams health changing
 				if( self->initialFailureReactionDelay.isReady() && !self->zeroHealthyTeams->get() ) {
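The key design point in the hunk above is the pairing of a grace period with cancellation: zeroServerLeftLogger_impl only fires after DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY, and the tracker resets zeroServerLeftLogger to Void() as soon as the team's priority drops back below PRIORITY_TEAM_0_LEFT, so a team that regains a server in time never produces the SevError events. Below is a minimal standalone model of that wait-or-cancel pattern, using std::thread and a condition variable instead of flow futures; none of these names are FoundationDB APIs.

#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>

// Toy stand-in for zeroServerLeftLogger_impl: wait out the grace period,
// then log, unless cancel() is called first (the team got a server back).
class DelayedDataLossLogger {
    std::mutex m;
    std::condition_variable cv;
    bool cancelled = false;
public:
    void run(std::chrono::seconds gracePeriod) {
        std::unique_lock<std::mutex> lk(m);
        if (cv.wait_for(lk, gracePeriod, [&] { return cancelled; }))
            return;  // priority dropped below TEAM_0_LEFT before the delay expired
        std::fprintf(stderr, "ZeroServerLeftInTeam: possible data loss\n");
    }
    void cancel() {
        { std::lock_guard<std::mutex> lk(m); cancelled = true; }
        cv.notify_all();
    }
};

int main() {
    DelayedDataLossLogger logger;
    std::thread t([&] { logger.run(std::chrono::seconds(2)); });
    std::this_thread::sleep_for(std::chrono::seconds(1));
    logger.cancel();  // team recovered in time, so nothing is logged
    t.join();
    return 0;
}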
@@ -4521,10 +4562,16 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self)
 			actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, self->ddId, storageTeamSize, configuration.storageTeamSize, &lastLimited ), "DDQueue", self->ddId, &normalDDQueueErrors() ) );
 
 			vector<DDTeamCollection*> teamCollectionsPtrs;
-			Reference<DDTeamCollection> primaryTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, primaryDcId, configuration.usableRegions > 1 ? remoteDcIds : std::vector<Optional<Key>>(), readyToStart.getFuture(), zeroHealthyTeams[0], true, processingUnhealthy) );
+			Reference<DDTeamCollection> primaryTeamCollection(new DDTeamCollection(
+			    cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, primaryDcId,
+			    configuration.usableRegions > 1 ? remoteDcIds : std::vector<Optional<Key>>(), readyToStart.getFuture(),
+			    zeroHealthyTeams[0], true, processingUnhealthy, getShardMetrics));
 			teamCollectionsPtrs.push_back(primaryTeamCollection.getPtr());
 			if (configuration.usableRegions > 1) {
-				Reference<DDTeamCollection> remoteTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, remoteDcIds, Optional<std::vector<Optional<Key>>>(), readyToStart.getFuture() && remoteRecovered(self->dbInfo), zeroHealthyTeams[1], false, processingUnhealthy) );
+				Reference<DDTeamCollection> remoteTeamCollection(new DDTeamCollection(
+				    cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, remoteDcIds,
+				    Optional<std::vector<Optional<Key>>>(), readyToStart.getFuture() && remoteRecovered(self->dbInfo),
+				    zeroHealthyTeams[1], false, processingUnhealthy, getShardMetrics));
 				teamCollectionsPtrs.push_back(remoteTeamCollection.getPtr());
 				remoteTeamCollection->teamCollections = teamCollectionsPtrs;
 				actors.push_back( reportErrorsExcept( dataDistributionTeamCollection( remoteTeamCollection, initData, tcis[1], self->dbInfo ), "DDTeamCollectionSecondary", self->ddId, &normalDDQueueErrors() ) );
@@ -4764,20 +4811,11 @@ DDTeamCollection* testTeamCollection(int teamSize, Reference<IReplicationPolicy>
 	conf.storageTeamSize = teamSize;
 	conf.storagePolicy = policy;
 
-	DDTeamCollection* collection = new DDTeamCollection(
-		database,
-		UID(0, 0),
-		MoveKeysLock(),
-		PromiseStream<RelocateShard>(),
-		Reference<ShardsAffectedByTeamFailure>(new ShardsAffectedByTeamFailure()),
-		conf,
-		{},
-		{},
-		Future<Void>(Void()),
-		Reference<AsyncVar<bool>>( new AsyncVar<bool>(true) ),
-		true,
-		Reference<AsyncVar<bool>>( new AsyncVar<bool>(false) )
-	);
+	DDTeamCollection* collection =
+	    new DDTeamCollection(database, UID(0, 0), MoveKeysLock(), PromiseStream<RelocateShard>(),
+	                         Reference<ShardsAffectedByTeamFailure>(new ShardsAffectedByTeamFailure()), conf, {}, {},
+	                         Future<Void>(Void()), Reference<AsyncVar<bool>>(new AsyncVar<bool>(true)), true,
+	                         Reference<AsyncVar<bool>>(new AsyncVar<bool>(false)), PromiseStream<GetMetricsRequest>());
 
 	for (int id = 1; id <= processCount; ++id) {
 		UID uid(id, 0);
@@ -4805,9 +4843,8 @@ DDTeamCollection* testMachineTeamCollection(int teamSize, Reference<IReplication
 	DDTeamCollection* collection =
 	    new DDTeamCollection(database, UID(0, 0), MoveKeysLock(), PromiseStream<RelocateShard>(),
 	                         Reference<ShardsAffectedByTeamFailure>(new ShardsAffectedByTeamFailure()), conf, {}, {},
-	                         Future<Void>(Void()),
-	                         Reference<AsyncVar<bool>>(new AsyncVar<bool>(true)), true,
-	                         Reference<AsyncVar<bool>>(new AsyncVar<bool>(false)));
+	                         Future<Void>(Void()), Reference<AsyncVar<bool>>(new AsyncVar<bool>(true)), true,
+	                         Reference<AsyncVar<bool>>(new AsyncVar<bool>(false)), PromiseStream<GetMetricsRequest>());
 
 	for (int id = 1; id <= processCount; id++) {
 		UID uid(id, 0);
@@ -221,6 +221,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs, bool isSimula
 	init( DD_ENABLE_VERBOSE_TRACING, false ); if( randomize && BUGGIFY ) DD_ENABLE_VERBOSE_TRACING = true;
 	init( DD_TEAMS_INFO_PRINT_INTERVAL, 60 ); if( randomize && BUGGIFY ) DD_TEAMS_INFO_PRINT_INTERVAL = 10;
 	init( DD_TEAMS_INFO_PRINT_YIELD_COUNT, 100 ); if( randomize && BUGGIFY ) DD_TEAMS_INFO_PRINT_YIELD_COUNT = deterministicRandom()->random01() * 1000 + 1;
+	init( DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY, 120 );
 
 	// TeamRemover
 	init( TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true
@@ -184,6 +184,7 @@ public:
 	bool DD_ENABLE_VERBOSE_TRACING;
 	int DD_TEAMS_INFO_PRINT_INTERVAL;
 	int DD_TEAMS_INFO_PRINT_YIELD_COUNT;
+	int DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY;
 
 	// TeamRemover to remove redundant teams
 	bool TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER; // disable the machineTeamRemover actor