Address PR comments.

Revert knob name change, fix comparison between new and old
recruitments, and get rid of empty `if` block.
Suraj Gupta 2021-09-15 13:41:59 -04:00 committed by Josh Slocum
parent 6b4eb06201
commit 6533678f0d
3 changed files with 11 additions and 15 deletions

View File

@@ -241,6 +241,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
     init( DD_LOCATION_CACHE_SIZE, 2000000 ); if( randomize && BUGGIFY ) DD_LOCATION_CACHE_SIZE = 3;
     init( MOVEKEYS_LOCK_POLLING_DELAY, 5.0 );
     init( DEBOUNCE_RECRUITING_DELAY, 5.0 );
+    init( DD_FAILURE_TIME, 1.0 ); if( randomize && BUGGIFY ) DD_FAILURE_TIME = 10.0;
     init( DD_ZERO_HEALTHY_TEAM_DELAY, 1.0 );
     init( REBALANCE_MAX_RETRIES, 100 );
     init( DD_OVERLAP_PENALTY, 10000 );
@@ -465,7 +466,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
     init( VERSION_LAG_METRIC_INTERVAL, 0.5 ); if( randomize && BUGGIFY ) VERSION_LAG_METRIC_INTERVAL = 10.0;
     init( MAX_VERSION_DIFFERENCE, 20 * VERSIONS_PER_SECOND );
     init( FORCE_RECOVERY_CHECK_DELAY, 5.0 );
-    init( DATA_DISTRIBUTOR_FAILURE_TIME, 1.0 ); if( randomize && BUGGIFY ) DATA_DISTRIBUTOR_FAILURE_TIME = 10.0;
     init( RATEKEEPER_FAILURE_TIME, 1.0 );
     init( REPLACE_INTERFACE_DELAY, 60.0 );
     init( REPLACE_INTERFACE_CHECK_DELAY, 5.0 );
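
Both hunks in ServerKnobs::initialize above follow the same init-plus-BUGGIFY idiom: each knob gets a production default, and a randomized simulation run may occasionally override it with an extreme value (here DD_FAILURE_TIME jumps from 1.0 to 10.0 seconds). A simplified standalone sketch of that idea, using a toy struct and a made-up coin flip in place of the real ServerKnobs class and BUGGIFY macro:

// Simplified, standalone sketch of the init(...) / BUGGIFY pattern shown above.
// ToyServerKnobs and the 1-in-4 coin flip are invented for illustration; the real
// ServerKnobs class and the BUGGIFY macro live in the FoundationDB source tree.
#include <cstdlib>
#include <iostream>

struct ToyServerKnobs {
    double DD_FAILURE_TIME;

    void initialize(bool randomize, bool buggify) {
        DD_FAILURE_TIME = 1.0; // production default, as in the hunk above
        if (randomize && buggify)
            DD_FAILURE_TIME = 10.0; // rare extreme value to shake out timing assumptions
    }
};

int main() {
    ToyServerKnobs knobs;
    const bool buggify = (std::rand() % 4) == 0; // stand-in for the BUGGIFY coin flip
    knobs.initialize(/*randomize=*/true, buggify);
    std::cout << "DD_FAILURE_TIME = " << knobs.DD_FAILURE_TIME << "\n";
    return 0;
}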

View File

@@ -224,6 +224,7 @@ public:
     // Remove wrong storage engines
     double DD_REMOVE_STORE_ENGINE_DELAY; // wait for the specified time before remove the next batch
+    double DD_FAILURE_TIME;
     double DD_ZERO_HEALTHY_TEAM_DELAY;
     // KeyValueStore SQLITE
@@ -389,7 +390,6 @@ public:
     double VERSION_LAG_METRIC_INTERVAL;
     int64_t MAX_VERSION_DIFFERENCE;
     double FORCE_RECOVERY_CHECK_DELAY;
-    double DATA_DISTRIBUTOR_FAILURE_TIME;
     double RATEKEEPER_FAILURE_TIME;
     double REPLACE_INTERFACE_DELAY;
     double REPLACE_INTERFACE_CHECK_DELAY;

View File

@@ -3510,23 +3510,21 @@ void checkBetterSingletons(ClusterControllerData* self) {
     // check if we can colocate the singletons in a more optimal way
     // TODO: verify that we don't need to get the pid from the worker like we were doing before
-    Optional<Standalone<StringRef>> currentRKProcessId = rkSingleton.interface.get().locality.processId();
-    Optional<Standalone<StringRef>> currentDDProcessId = ddSingleton.interface.get().locality.processId();
+    Optional<Standalone<StringRef>> currRKProcessId = rkSingleton.interface.get().locality.processId();
+    Optional<Standalone<StringRef>> currDDProcessId = ddSingleton.interface.get().locality.processId();
     Optional<Standalone<StringRef>> newRKProcessId = newRKWorker.interf.locality.processId();
     Optional<Standalone<StringRef>> newDDProcessId = newRKWorker.interf.locality.processId();
-    auto currColocMap = getColocCounts({ currentRKProcessId, currentDDProcessId });
+    auto currColocMap = getColocCounts({ currRKProcessId, currDDProcessId });
     auto newColocMap = getColocCounts({ newRKProcessId, newDDProcessId });
-    auto currColocCounts = std::make_tuple(currColocMap[newRKProcessId], currColocMap[newDDProcessId]);
-    auto newColocCounts = std::make_tuple(newColocMap[newRKProcessId], newColocMap[newDDProcessId]);
-    // if the new coloc counts are collectively better (i.e. each singleton's coloc count has not increased)
-    if (newColocCounts <= currColocCounts) {
+    // if the new coloc counts are not worse (i.e. each singleton's coloc count has not increased)
+    if (newColocMap[newRKProcessId] <= currColocMap[currRKProcessId] &&
+        newColocMap[newDDProcessId] <= currColocMap[currDDProcessId]) {
         // rerecruit the singleton for which we have found a better process, if any
-        if (newColocMap[newRKProcessId] < currColocMap[currentRKProcessId]) {
+        if (newColocMap[newRKProcessId] < currColocMap[currRKProcessId]) {
             rkSingleton.recruit(self);
-        } else if (newColocMap[newDDProcessId] < currColocMap[currentDDProcessId]) {
+        } else if (newColocMap[newDDProcessId] < currColocMap[currDDProcessId]) {
             ddSingleton.recruit(self);
         }
     }
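
A note on the comparison fix above: the removed code indexed currColocMap with the new process IDs rather than the current ones, and it compared the counts as std::tuple values, whose operator<= is lexicographic rather than element-wise. A minimal standalone sketch (the counts are invented for illustration) of why lexicographic <= can accept a recruitment that makes one singleton's colocation count worse:

// Standalone illustration of why the tuple comparison was replaced with an
// element-wise check; the counts below are invented for the example.
#include <cassert>
#include <tuple>

int main() {
    int currRK = 2, currDD = 0; // hypothetical current colocation counts
    int newRK = 1, newDD = 5;   // hypothetical counts for the proposed recruitment

    // std::tuple's operator<= is lexicographic: true because 1 < 2,
    // even though the DD count would grow from 0 to 5.
    bool lexicographic = std::make_tuple(newRK, newDD) <= std::make_tuple(currRK, currDD);

    // Element-wise check, matching the fixed condition above: rejects the move
    // because one singleton's colocation count would increase.
    bool elementWise = (newRK <= currRK) && (newDD <= currDD);

    assert(lexicographic && !elementWise);
    return 0;
}

The fixed condition in the diff requires each count individually not to increase, which is exactly what the element-wise check expresses.
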
@@ -3892,8 +3890,6 @@ void haltRegisteringOrCurrentSingleton(ClusterControllerData* self,
             .detail("DcID", printable(self->clusterControllerDcId))
             .detail("ReqDcID", printable(worker.locality.dcId()))
             .detail("Recruiting" + roleAbbr + "ID", recruitingID.present() ? recruitingID.get() : UID());
-        if (registeringSingleton.getClusterRole() == ProcessClass::DataDistributor) {
-        }
         registeringSingleton.halt(self, worker.locality.processId());
     } else if (!recruitingID.present()) {
         // if not currently recruiting, then halt previous one in favour of requesting one
@@ -4767,7 +4763,7 @@ ACTOR Future<Void> monitorDataDistributor(ClusterControllerData* self) {
     if (self->db.serverInfo->get().distributor.present() && !self->recruitDistributor.get()) {
         choose {
             when(wait(waitFailureClient(self->db.serverInfo->get().distributor.get().waitFailure,
-                                        SERVER_KNOBS->DATA_DISTRIBUTOR_FAILURE_TIME))) {
+                                        SERVER_KNOBS->DD_FAILURE_TIME))) {
                 TraceEvent("CCDataDistributorDied", self->id)
                     .detail("DDID", self->db.serverInfo->get().distributor.get().id());
                 self->db.clearInterf(ProcessClass::DataDistributorClass);
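
For context on the knob being renamed in this last hunk: roughly, the waitFailureClient future fires once the data distributor has been unresponsive for about DD_FAILURE_TIME seconds, at which point the cluster controller clears the stored interface and re-recruits. A rough standalone sketch of that timeout idea, using a plain heartbeat/clock model instead of FDB's flow actors (all names below are illustrative):

// Rough standalone sketch (plain C++, not FDB flow) of the failure-timeout idea:
// the distributor counts as dead only after DD_FAILURE_TIME seconds of silence.
#include <chrono>
#include <iostream>
#include <thread>

int main() {
    using clock = std::chrono::steady_clock;
    const double DD_FAILURE_TIME = 1.0; // seconds, mirroring the knob default above

    auto lastHeartbeat = clock::now();                            // hypothetical last sign of life
    std::this_thread::sleep_for(std::chrono::milliseconds(1100)); // simulate prolonged silence

    double silentFor = std::chrono::duration<double>(clock::now() - lastHeartbeat).count();
    if (silentFor > DD_FAILURE_TIME) {
        // Corresponds to the CCDataDistributorDied branch above: clear the stored
        // interface and let the recruitment logic pick a new data distributor.
        std::cout << "data distributor silent for " << silentFor << "s; re-recruiting\n";
    }
    return 0;
}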