Merge pull request #4495 from sfc-gh-etschannen/feature-fix-exlude-failed
Execute exclude failed commands after shutting down the rest of data distribution
commit 35700f919f
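The change, in brief: storageServerTracker no longer calls removeKeysFromFailedServer while it is still inside trackExcludedServers. Instead it sends the failed server's UID through a new Promise<UID> removeFailedServer and throws movekeys_conflict; the top-level dataDistribution error handler then runs removeKeysFromFailedServer and removeStorageServer only after the team collections and shard map have been torn down. Below is a minimal standalone sketch of that hand-off pattern, using std::promise/std::future in place of flow's Promise/Future; the helper functions are placeholders, not FDB APIs.

// Minimal sketch of the hand-off pattern, in plain C++ rather than flow.
// std::promise/std::future stand in for flow's Promise/Future; the helpers
// below are placeholders, not FDB APIs.
#include <chrono>
#include <future>
#include <iostream>
#include <stdexcept>
#include <string>

using UID = std::string;

struct MoveKeysConflict : std::runtime_error {
	MoveKeysConflict() : std::runtime_error("movekeys_conflict") {}
};

// Tracker stand-in: record the failed server and unwind by throwing, instead
// of removing its keys while still inside the exclusion-tracking code path.
void storageServerTracker(std::promise<UID>& removeFailedServer, const UID& failedId) {
	removeFailedServer.set_value(failedId);
	throw MoveKeysConflict();
}

// Cleanup stand-ins that must run only after everything else has shut down.
void removeKeysFromFailedServer(const UID& id) { std::cout << "remove keys for " << id << "\n"; }
void removeStorageServer(const UID& id) { std::cout << "remove storage server " << id << "\n"; }

void dataDistribution() {
	std::promise<UID> removeFailedServer;
	std::future<UID> failed = removeFailedServer.get_future();
	try {
		storageServerTracker(removeFailedServer, "ss-1");
	} catch (const MoveKeysConflict&) {
		// Team collections and shard maps would be destroyed here (elided),
		// so the exclude-failed work below runs with nothing else in flight.
		if (failed.wait_for(std::chrono::seconds(0)) == std::future_status::ready) {
			const UID id = failed.get();
			removeKeysFromFailedServer(id);
			removeStorageServer(id);
		} else {
			throw; // unrelated conflict: propagate as before
		}
	}
}

int main() {
	dataDistribution();
	return 0;
}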
@@ -658,6 +658,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
 
 AsyncTrigger printDetailedTeamsInfo;
 PromiseStream<GetMetricsRequest> getShardMetrics;
+Promise<UID> removeFailedServer;
 
 void resetLocalitySet() {
 	storageServerSet = Reference<LocalitySet>(new LocalityMap<UID>());
@@ -695,7 +696,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
 Reference<AsyncVar<bool>> zeroHealthyTeams,
 bool primary,
 Reference<AsyncVar<bool>> processingUnhealthy,
-PromiseStream<GetMetricsRequest> getShardMetrics)
+PromiseStream<GetMetricsRequest> getShardMetrics,
+Promise<UID> removeFailedServer)
 : cx(cx), distributorId(distributorId), lock(lock), output(output),
 shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), doBuildTeams(true), lastBuildTeamsFailed(false),
 teamBuilder(Void()), badTeamRemover(Void()), checkInvalidLocalities(Void()), wrongStoreTypeRemover(Void()),
@@ -710,7 +712,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
 zeroHealthyTeams(zeroHealthyTeams), zeroOptimalTeams(true), primary(primary),
 medianAvailableSpace(SERVER_KNOBS->MIN_AVAILABLE_SPACE_RATIO), lastMedianAvailableSpaceUpdate(0),
 processingUnhealthy(processingUnhealthy), lowestUtilizationTeam(0), highestUtilizationTeam(0),
-getShardMetrics(getShardMetrics) {
+getShardMetrics(getShardMetrics), removeFailedServer(removeFailedServer) {
 if (!primary || configuration.usableRegions == 1) {
 	TraceEvent("DDTrackerStarting", distributorId).detail("State", "Inactive").trackLatest("DDTrackerStarting");
 }
@@ -4145,10 +4147,14 @@ ACTOR Future<Void> storageServerTracker(
 TraceEvent(SevWarn, "FailedServerRemoveKeys", self->distributorId)
     .detail("Server", server->id)
     .detail("Excluded", worstAddr.toString());
-wait(removeKeysFromFailedServer(cx, server->id, self->lock, ddEnabledState));
-if (BUGGIFY)
-	wait(delay(5.0));
-self->shardsAffectedByTeamFailure->eraseServer(server->id);
+wait(delay(0.0)); //Do not throw an error while still inside trackExcludedServers
+while (!ddEnabledState->isDDEnabled()) {
+	wait(delay(1.0));
+}
+if (self->removeFailedServer.canBeSet()) {
+	self->removeFailedServer.send(server->id);
+}
+throw movekeys_conflict();
 }
 }
 
@@ -4944,6 +4950,7 @@ ACTOR Future<Void> monitorBatchLimitedTime(Reference<AsyncVar<ServerDBInfo>> db,
 }
 }
 
+// Runs the data distribution algorithm for FDB, including the DD Queue, DD tracker, and DD team collection
 ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self,
 PromiseStream<GetMetricsListRequest> getShardMetricsList,
 const DDEnabledState* ddEnabledState) {
@@ -4973,7 +4980,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self,
 // Stored outside of data distribution tracker to avoid slow tasks
 // when tracker is cancelled
 state KeyRangeMap<ShardTrackedData> shards;
+state Promise<UID> removeFailedServer;
 try {
 	loop {
 		TraceEvent("DDInitTakingMoveKeysLock", self->ddId);
@@ -5204,7 +5211,8 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self,
 zeroHealthyTeams[0],
 true,
 processingUnhealthy,
-getShardMetrics);
+getShardMetrics,
+removeFailedServer);
 teamCollectionsPtrs.push_back(primaryTeamCollection.getPtr());
 if (configuration.usableRegions > 1) {
 	remoteTeamCollection =
@@ -5220,7 +5228,8 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self,
 zeroHealthyTeams[1],
 false,
 processingUnhealthy,
-getShardMetrics);
+getShardMetrics,
+removeFailedServer);
 teamCollectionsPtrs.push_back(remoteTeamCollection.getPtr());
 remoteTeamCollection->teamCollections = teamCollectionsPtrs;
 actors.push_back(
@@ -5252,12 +5261,21 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self,
 primaryTeamCollection = Reference<DDTeamCollection>();
 remoteTeamCollection = Reference<DDTeamCollection>();
 wait(shards.clearAsync());
-if (err.code() != error_code_movekeys_conflict)
-	throw err;
-bool ddEnabled = wait(isDataDistributionEnabled(cx, ddEnabledState));
-TraceEvent("DataDistributionMoveKeysConflict").detail("DataDistributionEnabled", ddEnabled).error(err);
-if (ddEnabled)
-	throw err;
+TraceEvent("DataDistributorTeamCollectionsDestroyed").error(err);
+if (removeFailedServer.getFuture().isReady() && !removeFailedServer.getFuture().isError()) {
+	TraceEvent("RemoveFailedServer", removeFailedServer.getFuture().get()).error(err);
+	wait(removeKeysFromFailedServer(cx, removeFailedServer.getFuture().get(), lock, ddEnabledState));
+	wait(removeStorageServer(cx, removeFailedServer.getFuture().get(), lock, ddEnabledState));
+} else {
+	if (err.code() != error_code_movekeys_conflict) {
+		throw err;
+	}
+	bool ddEnabled = wait(isDataDistributionEnabled(cx, ddEnabledState));
+	TraceEvent("DataDistributionMoveKeysConflict").detail("DataDistributionEnabled", ddEnabled).error(err);
+	if (ddEnabled) {
+		throw err;
+	}
+}
 }
 }
 }
@@ -5682,7 +5700,8 @@ std::unique_ptr<DDTeamCollection> testTeamCollection(int teamSize,
 makeReference<AsyncVar<bool>>(true),
 true,
 makeReference<AsyncVar<bool>>(false),
-PromiseStream<GetMetricsRequest>()));
+PromiseStream<GetMetricsRequest>(),
+Promise<UID>()));
 
 for (int id = 1; id <= processCount; ++id) {
 	UID uid(id, 0);
@@ -5723,7 +5742,8 @@ std::unique_ptr<DDTeamCollection> testMachineTeamCollection(int teamSize,
 makeReference<AsyncVar<bool>>(true),
 true,
 makeReference<AsyncVar<bool>>(false),
-PromiseStream<GetMetricsRequest>()));
+PromiseStream<GetMetricsRequest>(),
+Promise<UID>()));
 
 for (int id = 1; id <= processCount; id++) {
 	UID uid(id, 0);
@@ -178,7 +178,6 @@ public:
 void moveShard(KeyRangeRef keys, std::vector<Team> destinationTeam);
 void finishMove(KeyRangeRef keys);
 void check();
-void eraseServer(UID ssID);
 
 private:
 struct OrderByTeamKey {
@@ -999,10 +999,6 @@ void ShardsAffectedByTeamFailure::erase(Team team, KeyRange const& range) {
 }
 }
 
-void ShardsAffectedByTeamFailure::eraseServer(UID ssID) {
-	storageServerShards[ssID] = 0;
-}
-
 void ShardsAffectedByTeamFailure::insert(Team team, KeyRange const& range) {
 	if (team_shards.insert(std::pair<Team, KeyRange>(team, range)).second) {
 		for (auto uid = team.servers.begin(); uid != team.servers.end(); ++uid)