DD:MisconfiguredLocality:Fix review comments
This commit is contained in:
parent
a8fff56f6e
commit
d2fd1f4931
|
@ -46,7 +46,7 @@ Fixes
|
|||
* The ``fileconfigure`` command in ``fdbcli`` could fail with an unknown error if the file did not contain a valid JSON object. `(PR #2017) <https://github.com/apple/foundationdb/pull/2017>`_.
|
||||
* Configuring regions would fail with an internal error if the cluster contained storage servers that didn't set a datacenter ID. `(PR #2017) <https://github.com/apple/foundationdb/pull/2017>`_.
|
||||
* Clients no longer prefer reading from servers with the same zone ID, because it could create hot shards. [6.2.3] `(PR #2019) <https://github.com/apple/foundationdb/pull/2019>`_.
|
||||
* Data distribution no longer gets stuck when storage servers' localities are misconfigured and later corrected. [6.2.5] `(PR #2110) <https://github.com/apple/foundationdb/pull/2110>`_.
|
||||
* Data distribution could fail to start if any storage servers had misconfigured locality information. This problem could persist even after the offending storage servers were removed or fixed. [6.2.5] `(PR #2110) <https://github.com/apple/foundationdb/pull/2110>`_.
|
||||
|
||||
Status
|
||||
------
|
||||
|
|
|
@ -42,6 +42,8 @@ class TCTeamInfo;
|
|||
struct TCMachineInfo;
|
||||
class TCMachineTeamInfo;
|
||||
|
||||
ACTOR Future<Void> checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self);
|
||||
|
||||
struct TCServerInfo : public ReferenceCounted<TCServerInfo> {
|
||||
UID id;
|
||||
StorageServerInterface lastKnownInterface;
|
||||
|
@ -993,6 +995,12 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
|||
TraceEvent(SevWarnAlways, "MissingLocality")
|
||||
.detail("Server", i->first.uniqueID)
|
||||
.detail("Locality", i->first.locality.toString());
|
||||
auto addr = i->first.address();
|
||||
self->invalidLocalityAddr.insert(AddressExclusion(addr.ip, addr.port));
|
||||
if (self->checkInvalidLocalities.isReady()) {
|
||||
self->checkInvalidLocalities = checkAndRemoveInvalidLocalityAddr(self);
|
||||
self->addActor.send(self->checkInvalidLocalities);
|
||||
}
|
||||
}
|
||||
self->addServer(i->first, i->second, self->serverTrackerErrorOut, 0);
|
||||
}
|
||||
|
@ -3578,7 +3586,7 @@ ACTOR Future<Void> monitorStorageServerRecruitment(DDTeamCollection* self) {
|
|||
}
|
||||
|
||||
ACTOR Future<Void> checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self) {
|
||||
state int checkCount = 0;
|
||||
state double start = now();
|
||||
state Transaction tr(self->cx);
|
||||
|
||||
loop {
|
||||
|
@ -3590,10 +3598,10 @@ ACTOR Future<Void> checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self) {
|
|||
|
||||
// Because worker's processId can be changed when its locality is changed, we cannot watch on the old
|
||||
// processId; This actor is inactive most time, so iterating all workers incurs little performance overhead.
|
||||
Future<vector<ProcessData>> workers = getWorkers(&tr);
|
||||
std::set<AddressExclusion> existingAddrs;
|
||||
for (int i = 0; i < workers.get().size(); i++) {
|
||||
const ProcessData& workerData = workers.get()[i];
|
||||
state vector<ProcessData> workers = wait(getWorkers(&tr));
|
||||
state std::set<AddressExclusion> existingAddrs;
|
||||
for (int i = 0; i < workers.size(); i++) {
|
||||
const ProcessData& workerData = workers[i];
|
||||
AddressExclusion addr(workerData.address.ip, workerData.address.port);
|
||||
existingAddrs.insert(addr);
|
||||
if (self->invalidLocalityAddr.count(addr) &&
|
||||
|
@ -3603,6 +3611,8 @@ ACTOR Future<Void> checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self) {
|
|||
}
|
||||
}
|
||||
|
||||
wait(yield(TaskPriority::DataDistribution));
|
||||
|
||||
// In case system operator permanently excludes workers on the address with invalid locality
|
||||
for (auto addr = self->invalidLocalityAddr.begin(); addr != self->invalidLocalityAddr.end();) {
|
||||
if (!existingAddrs.count(*addr)) {
|
||||
|
@ -3617,10 +3627,9 @@ ACTOR Future<Void> checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self) {
|
|||
break;
|
||||
}
|
||||
|
||||
checkCount++;
|
||||
if (checkCount > 100) {
|
||||
// The incorrect locality info is not properly correctly on time
|
||||
TraceEvent(SevWarn, "InvalidLocality").detail("Addresses", self->invalidLocalityAddr.size());
|
||||
if (now() - start > 300) { // Report warning if invalid locality is not corrected within 300 seconds
|
||||
// The incorrect locality info has not been properly corrected in a reasonable time
|
||||
TraceEvent(SevWarn, "PersistentInvalidLocality").detail("Addresses", self->invalidLocalityAddr.size());
|
||||
}
|
||||
} catch (Error& e) {
|
||||
wait(tr.onError(e));
|
||||
|
|
|
@ -188,7 +188,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
|
|||
init( REBALANCE_MAX_RETRIES, 100 );
|
||||
init( DD_OVERLAP_PENALTY, 10000 );
|
||||
init( DD_VALIDATE_LOCALITY, true ); if( randomize && BUGGIFY ) DD_VALIDATE_LOCALITY = false;
|
||||
init( DD_CHECK_INVALID_LOCALITY_DELAY, 60 ); if( randomize && BUGGIFY ) DD_CHECK_INVALID_LOCALITY_DELAY = 10 + deterministicRandom()->random01() * 600;
|
||||
init( DD_CHECK_INVALID_LOCALITY_DELAY, 60 ); if( randomize && BUGGIFY ) DD_CHECK_INVALID_LOCALITY_DELAY = 1 + deterministicRandom()->random01() * 600;
|
||||
|
||||
// TeamRemover
|
||||
TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = false; if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true
|
||||
|
|
Loading…
Reference in New Issue