Merge pull request #609 from etschannen/release-6.0
Improved simulation strength by only remove datacenters that have been killed
This commit is contained in:
commit
8879954254
|
@ -2,7 +2,7 @@
|
|||
Release Notes
|
||||
#############
|
||||
|
||||
6.0.1
|
||||
6.0.2
|
||||
=====
|
||||
|
||||
Features
|
||||
|
@ -22,6 +22,7 @@ Performance
|
|||
* Avoid assigning storage servers responsibility for keys they do not have.
|
||||
* Clients optimistically assume the first leader reply from a coordinator is correct. `(PR #425) <https://github.com/apple/foundationdb/pull/425>`_
|
||||
* Network connections are now closed after no interface needs the connection. [6.0.1] `(Issue #375) <https://github.com/apple/foundationdb/issues/375>`_
|
||||
* Significantly improved the CPU efficiency of copy mutations to transaction logs during recovery. [6.0.2] `(PR #595) <https://github.com/apple/foundationdb/pull/595>`_
|
||||
|
||||
Fixes
|
||||
-----
|
||||
|
@ -29,8 +30,8 @@ Fixes
|
|||
* Not all endpoint failures were reported to the failure monitor.
|
||||
* Watches registered on a lagging storage server would take a long time to trigger.
|
||||
* The cluster controller would not start a new generation until it recovered its files from disk.
|
||||
* Disk errors cause the server process to exit, preventing the process from being reused unless it can read its files from disk. `(PR #568) <https://github.com/apple/foundationdb/pull/568>`_
|
||||
|
||||
* Under heavy write load, storage servers would occasionally pause for ~100ms. [6.0.2] `(PR #597) <https://github.com/apple/foundationdb/pull/597>`_
|
||||
* Storage servers were not given time to rejoin the cluster before being marked as failed. [6.0.2] `(PR #592) <https://github.com/apple/foundationdb/pull/592>`_
|
||||
|
||||
Status
|
||||
------
|
||||
|
|
|
@ -1001,28 +1001,49 @@ public:
|
|||
}
|
||||
virtual bool isAvailable() const
|
||||
{
|
||||
std::vector<ProcessInfo*> processesLeft, processesDead;
|
||||
|
||||
std::vector<ProcessInfo*> processesLeft, processesDead;
|
||||
for (auto processInfo : getAllProcesses()) {
|
||||
// Add non-test processes (ie. datahall is not be set for test processes)
|
||||
if (processInfo->isAvailableClass()) {
|
||||
// Ignore excluded machines
|
||||
if (processInfo->isExcluded())
|
||||
if (processInfo->isExcluded() || processInfo->isCleared() || !processInfo->isAvailable()) {
|
||||
processesDead.push_back(processInfo);
|
||||
else if (processInfo->isCleared())
|
||||
processesDead.push_back(processInfo);
|
||||
// Mark all of the unavailable as dead
|
||||
else if (!processInfo->isAvailable())
|
||||
processesDead.push_back(processInfo);
|
||||
else if (protectedAddresses.count(processInfo->address))
|
||||
processesLeft.push_back(processInfo);
|
||||
else
|
||||
} else {
|
||||
processesLeft.push_back(processInfo);
|
||||
}
|
||||
}
|
||||
}
|
||||
return canKillProcesses(processesLeft, processesDead, KillInstantly, NULL);
|
||||
}
|
||||
|
||||
virtual bool datacenterDead(Optional<Standalone<StringRef>> dcId) const
|
||||
{
|
||||
if(!dcId.present()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
LocalityGroup primaryProcessesLeft, primaryProcessesDead;
|
||||
std::vector<LocalityData> primaryLocalitiesDead, primaryLocalitiesLeft;
|
||||
|
||||
for (auto processInfo : getAllProcesses()) {
|
||||
if (processInfo->isAvailableClass() && processInfo->locality.dcId() == dcId) {
|
||||
if (processInfo->isExcluded() || processInfo->isCleared() || !processInfo->isAvailable()) {
|
||||
primaryProcessesDead.add(processInfo->locality);
|
||||
primaryLocalitiesDead.push_back(processInfo->locality);
|
||||
} else {
|
||||
primaryProcessesLeft.add(processInfo->locality);
|
||||
primaryLocalitiesLeft.push_back(processInfo->locality);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<LocalityData> badCombo;
|
||||
bool primaryTLogsDead = tLogWriteAntiQuorum ? !validateAllCombinations(badCombo, primaryProcessesDead, tLogPolicy, primaryLocalitiesLeft, tLogWriteAntiQuorum, false) : primaryProcessesDead.validate(tLogPolicy);
|
||||
if(usableRegions > 1 && remoteTLogPolicy && !primaryTLogsDead) {
|
||||
primaryTLogsDead = primaryProcessesDead.validate(remoteTLogPolicy);
|
||||
}
|
||||
|
||||
return primaryTLogsDead || primaryProcessesDead.validate(storagePolicy);
|
||||
}
|
||||
|
||||
// The following function will determine if the specified configuration of available and dead processes can allow the cluster to survive
|
||||
virtual bool canKillProcesses(std::vector<ProcessInfo*> const& availableProcesses, std::vector<ProcessInfo*> const& deadProcesses, KillType kt, KillType* newKillType) const
|
||||
{
|
||||
|
@ -1264,9 +1285,7 @@ public:
|
|||
int protectedWorker = 0, unavailable = 0, excluded = 0, cleared = 0;
|
||||
|
||||
for (auto processInfo : getAllProcesses()) {
|
||||
// Add non-test processes (ie. datahall is not be set for test processes)
|
||||
if (processInfo->isAvailableClass()) {
|
||||
// Do not include any excluded machines
|
||||
if (processInfo->isExcluded()) {
|
||||
processesDead.push_back(processInfo);
|
||||
excluded++;
|
||||
|
@ -1283,11 +1302,11 @@ public:
|
|||
processesLeft.push_back(processInfo);
|
||||
protectedWorker++;
|
||||
}
|
||||
else if (processInfo->locality.zoneId() != zoneId)
|
||||
else if (processInfo->locality.zoneId() != zoneId) {
|
||||
processesLeft.push_back(processInfo);
|
||||
// Add processes from dead machines and datacenter machines to dead group
|
||||
else
|
||||
} else {
|
||||
processesDead.push_back(processInfo);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!canKillProcesses(processesLeft, processesDead, kt, &kt)) {
|
||||
|
@ -1385,22 +1404,14 @@ public:
|
|||
{
|
||||
std::vector<ProcessInfo*> processesLeft, processesDead;
|
||||
for (auto processInfo : getAllProcesses()) {
|
||||
// Add non-test processes (ie. datahall is not be set for test processes)
|
||||
if (processInfo->isAvailableClass()) {
|
||||
// Mark all of the unavailable as dead
|
||||
if (processInfo->isExcluded())
|
||||
if (processInfo->isExcluded() || processInfo->isCleared() || !processInfo->isAvailable()) {
|
||||
processesDead.push_back(processInfo);
|
||||
else if (processInfo->isCleared())
|
||||
processesDead.push_back(processInfo);
|
||||
else if (!processInfo->isAvailable())
|
||||
processesDead.push_back(processInfo);
|
||||
else if (protectedAddresses.count(processInfo->address))
|
||||
} else if (protectedAddresses.count(processInfo->address) || datacenterZones.find(processInfo->locality.zoneId()) == datacenterZones.end()) {
|
||||
processesLeft.push_back(processInfo);
|
||||
// Keep all not in the datacenter zones
|
||||
else if (datacenterZones.find(processInfo->locality.zoneId()) == datacenterZones.end())
|
||||
processesLeft.push_back(processInfo);
|
||||
else
|
||||
} else {
|
||||
processesDead.push_back(processInfo);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -154,6 +154,7 @@ public:
|
|||
//virtual KillType getMachineKillState( UID zoneID ) = 0;
|
||||
virtual bool canKillProcesses(std::vector<ProcessInfo*> const& availableProcesses, std::vector<ProcessInfo*> const& deadProcesses, KillType kt, KillType* newKillType) const = 0;
|
||||
virtual bool isAvailable() const = 0;
|
||||
virtual bool datacenterDead(Optional<Standalone<StringRef>> dcId) const = 0;
|
||||
virtual void displayWorkers() const;
|
||||
|
||||
virtual void addRole(NetworkAddress const& address, std::string const& role) {
|
||||
|
@ -282,6 +283,8 @@ public:
|
|||
Optional<Standalone<StringRef>> primaryDcId;
|
||||
IRepPolicyRef remoteTLogPolicy;
|
||||
int32_t usableRegions;
|
||||
std::string disablePrimary;
|
||||
std::string disableRemote;
|
||||
bool allowLogSetKills;
|
||||
Optional<Standalone<StringRef>> remoteDcId;
|
||||
bool hasSatelliteReplication;
|
||||
|
|
|
@ -283,27 +283,35 @@ ACTOR Future<bool> getStorageServersRecruiting( Database cx, Reference<AsyncVar<
|
|||
}
|
||||
}
|
||||
|
||||
ACTOR Future<Void> reconfigureAfter(Database cx, double time, Reference<AsyncVar<ServerDBInfo>> dbInfo) {
|
||||
Void _ = wait( delay(time) );
|
||||
ACTOR Future<Void> repairDeadDatacenter(Database cx, Reference<AsyncVar<ServerDBInfo>> dbInfo, std::string context) {
|
||||
if(g_network->isSimulated()) {
|
||||
bool primaryDead = g_simulator.datacenterDead(g_simulator.primaryDcId);
|
||||
bool remoteDead = g_simulator.datacenterDead(g_simulator.remoteDcId);
|
||||
|
||||
if(g_network->isSimulated() && g_simulator.allowLogSetKills) {
|
||||
TraceEvent(SevWarnAlways, "DisablingFearlessConfiguration").detail("Location", "QuietDatabase").detail("Stage", "Repopulate");
|
||||
g_simulator.usableRegions = 1;
|
||||
ConfigurationResult::Type _ = wait( changeConfig( cx, "repopulate_anti_quorum=1" ) );
|
||||
while( dbInfo->get().recoveryState < RecoveryState::STORAGE_RECOVERED ) {
|
||||
Void _ = wait( dbInfo->onChange() );
|
||||
ASSERT(!primaryDead || !remoteDead);
|
||||
if(primaryDead || remoteDead) {
|
||||
TraceEvent(SevWarnAlways, "DisablingFearlessConfiguration").detail("Location", context).detail("Stage", "Repopulate");
|
||||
g_simulator.usableRegions = 1;
|
||||
ConfigurationResult::Type _ = wait( changeConfig( cx, (primaryDead ? g_simulator.disablePrimary : g_simulator.disableRemote) + " repopulate_anti_quorum=1" ) );
|
||||
while( dbInfo->get().recoveryState < RecoveryState::STORAGE_RECOVERED ) {
|
||||
Void _ = wait( dbInfo->onChange() );
|
||||
}
|
||||
TraceEvent(SevWarnAlways, "DisablingFearlessConfiguration").detail("Location", context).detail("Stage", "Usable_Regions");
|
||||
ConfigurationResult::Type _ = wait( changeConfig( cx, "usable_regions=1" ) );
|
||||
}
|
||||
TraceEvent(SevWarnAlways, "DisablingFearlessConfiguration").detail("Location", "QuietDatabase").detail("Stage", "Usable_Regions");
|
||||
ConfigurationResult::Type _ = wait( changeConfig( cx, "usable_regions=1" ) );
|
||||
g_simulator.allowLogSetKills = false;
|
||||
}
|
||||
return Void();
|
||||
}
|
||||
|
||||
ACTOR Future<Void> reconfigureAfter(Database cx, double time, Reference<AsyncVar<ServerDBInfo>> dbInfo, std::string context) {
|
||||
Void _ = wait( delay(time) );
|
||||
Void _ = wait( repairDeadDatacenter(cx, dbInfo, context) );
|
||||
return Void();
|
||||
}
|
||||
|
||||
ACTOR Future<Void> waitForQuietDatabase( Database cx, Reference<AsyncVar<ServerDBInfo>> dbInfo, std::string phase, int64_t dataInFlightGate = 2e6,
|
||||
int64_t maxTLogQueueGate = 5e6, int64_t maxStorageServerQueueGate = 5e6, int64_t maxDataDistributionQueueSize = 0 ) {
|
||||
state Future<Void> reconfig = reconfigureAfter(cx, 100 + (g_random->random01()*100), dbInfo);
|
||||
state Future<Void> reconfig = reconfigureAfter(cx, 100 + (g_random->random01()*100), dbInfo, "QuietDatabase");
|
||||
|
||||
TraceEvent(("QuietDatabase" + phase + "Begin").c_str());
|
||||
|
||||
|
|
|
@ -35,5 +35,6 @@ Future<int64_t> getDataDistributionQueueSize( Database const &cx, Reference<Asyn
|
|||
Future<vector<StorageServerInterface>> getStorageServers( Database const& cx, bool const &use_system_priority = false);
|
||||
Future<vector<std::pair<WorkerInterface, ProcessClass>>> getWorkers( Reference<AsyncVar<ServerDBInfo>> const& dbInfo, int const& flags = 0 );
|
||||
Future<WorkerInterface> getMasterWorker( Database const& cx, Reference<AsyncVar<ServerDBInfo>> const& dbInfo );
|
||||
Future<Void> repairDeadDatacenter(Database const& cx, Reference<AsyncVar<ServerDBInfo>> const& dbInfo, std::string const& context);
|
||||
|
||||
#endif
|
|
@ -918,6 +918,16 @@ void SimulationConfig::generateNormalConfig(int minimumReplication) {
|
|||
}
|
||||
|
||||
set_config("regions=" + json_spirit::write_string(json_spirit::mValue(regionArr), json_spirit::Output_options::none));
|
||||
|
||||
if(needsRemote) {
|
||||
StatusArray disablePrimary = regionArr;
|
||||
disablePrimary[0].get_obj()["datacenters"].get_array()[0].get_obj()["priority"] = -1;
|
||||
g_simulator.disablePrimary = "regions=" + json_spirit::write_string(json_spirit::mValue(disablePrimary), json_spirit::Output_options::none);
|
||||
|
||||
StatusArray disableRemote = regionArr;
|
||||
disableRemote[1].get_obj()["datacenters"].get_array()[0].get_obj()["priority"] = -1;
|
||||
g_simulator.disableRemote = "regions=" + json_spirit::write_string(json_spirit::mValue(disableRemote), json_spirit::Output_options::none);
|
||||
}
|
||||
}
|
||||
|
||||
if(generateFearless && minimumReplication > 1) {
|
||||
|
|
|
@ -772,17 +772,7 @@ ACTOR Future<Void> checkConsistency(Database cx, std::vector< TesterInterface >
|
|||
spec.options[0].push_back_deep(spec.options.arena(), KeyValueRef(LiteralStringRef("failureIsError"), LiteralStringRef("true")));
|
||||
lastRun = true;
|
||||
}
|
||||
if(g_network->isSimulated() && g_simulator.allowLogSetKills) {
|
||||
TraceEvent(SevWarnAlways, "DisablingFearlessConfiguration").detail("Location", "ConsistencyCheck").detail("Stage", "Repopulate");
|
||||
g_simulator.usableRegions = 1;
|
||||
ConfigurationResult::Type _ = wait( changeConfig( cx, "repopulate_anti_quorum=1" ) );
|
||||
while( dbInfo->get().recoveryState < RecoveryState::STORAGE_RECOVERED ) {
|
||||
Void _ = wait( dbInfo->onChange() );
|
||||
}
|
||||
TraceEvent(SevWarnAlways, "DisablingFearlessConfiguration").detail("Location", "ConsistencyCheck").detail("Stage", "Usable_Regions");
|
||||
ConfigurationResult::Type _ = wait( changeConfig( cx, "usable_regions=1" ) );
|
||||
g_simulator.allowLogSetKills = false;
|
||||
}
|
||||
Void _ = wait( repairDeadDatacenter(cx, dbInfo, "ConsistencyCheck") );
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -146,7 +146,7 @@ ACTOR Future<Void> workerHandleErrors(FutureStream<ErrorInfo> errors) {
|
|||
|
||||
endRole(err.id, err.context, "Error", ok, err.error);
|
||||
|
||||
if (err.error.code() == error_code_please_reboot || err.error.code() == error_code_io_timeout || err.error.code() == error_code_io_error) throw err.error;
|
||||
if (err.error.code() == error_code_please_reboot || err.error.code() == error_code_io_timeout) throw err.error;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -663,10 +663,15 @@ public:
|
|||
setUnconditional(v);
|
||||
}
|
||||
void setUnconditional( V const& v ) {
|
||||
Promise<Void> trigger;
|
||||
this->nextChange.swap(trigger);
|
||||
Promise<Void> t;
|
||||
this->nextChange.swap(t);
|
||||
this->value = v;
|
||||
trigger.send(Void());
|
||||
t.send(Void());
|
||||
}
|
||||
void trigger() {
|
||||
Promise<Void> t;
|
||||
this->nextChange.swap(t);
|
||||
t.send(Void());
|
||||
}
|
||||
|
||||
private:
|
||||
|
@ -683,7 +688,7 @@ public:
|
|||
return v.onChange();
|
||||
}
|
||||
void trigger() {
|
||||
v.setUnconditional(Void());
|
||||
v.trigger();
|
||||
}
|
||||
private:
|
||||
AsyncVar<Void> v;
|
||||
|
|
Loading…
Reference in New Issue