Merge pull request #609 from etschannen/release-6.0

Improved simulation strength by only remove datacenters that have been killed
This commit is contained in:
A.J. Beamon 2018-07-16 15:59:28 -07:00 committed by GitHub
commit 8879954254
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 91 additions and 62 deletions

View File

@ -2,7 +2,7 @@
Release Notes
#############
6.0.1
6.0.2
=====
Features
@ -22,6 +22,7 @@ Performance
* Avoid assigning storage servers responsibility for keys they do not have.
* Clients optimistically assume the first leader reply from a coordinator is correct. `(PR #425) <https://github.com/apple/foundationdb/pull/425>`_
* Network connections are now closed after no interface needs the connection. [6.0.1] `(Issue #375) <https://github.com/apple/foundationdb/issues/375>`_
* Significantly improved the CPU efficiency of copy mutations to transaction logs during recovery. [6.0.2] `(PR #595) <https://github.com/apple/foundationdb/pull/595>`_
Fixes
-----
@ -29,8 +30,8 @@ Fixes
* Not all endpoint failures were reported to the failure monitor.
* Watches registered on a lagging storage server would take a long time to trigger.
* The cluster controller would not start a new generation until it recovered its files from disk.
* Disk errors cause the server process to exit, preventing the process from being reused unless it can read its files from disk. `(PR #568) <https://github.com/apple/foundationdb/pull/568>`_
* Under heavy write load, storage servers would occasionally pause for ~100ms. [6.0.2] `(PR #597) <https://github.com/apple/foundationdb/pull/597>`_
* Storage servers were not given time to rejoin the cluster before being marked as failed. [6.0.2] `(PR #592) <https://github.com/apple/foundationdb/pull/592>`_
Status
------

View File

@ -1001,28 +1001,49 @@ public:
}
virtual bool isAvailable() const
{
std::vector<ProcessInfo*> processesLeft, processesDead;
std::vector<ProcessInfo*> processesLeft, processesDead;
for (auto processInfo : getAllProcesses()) {
// Add non-test processes (ie. datahall is not be set for test processes)
if (processInfo->isAvailableClass()) {
// Ignore excluded machines
if (processInfo->isExcluded())
if (processInfo->isExcluded() || processInfo->isCleared() || !processInfo->isAvailable()) {
processesDead.push_back(processInfo);
else if (processInfo->isCleared())
processesDead.push_back(processInfo);
// Mark all of the unavailable as dead
else if (!processInfo->isAvailable())
processesDead.push_back(processInfo);
else if (protectedAddresses.count(processInfo->address))
processesLeft.push_back(processInfo);
else
} else {
processesLeft.push_back(processInfo);
}
}
}
return canKillProcesses(processesLeft, processesDead, KillInstantly, NULL);
}
virtual bool datacenterDead(Optional<Standalone<StringRef>> dcId) const
{
if(!dcId.present()) {
return false;
}
LocalityGroup primaryProcessesLeft, primaryProcessesDead;
std::vector<LocalityData> primaryLocalitiesDead, primaryLocalitiesLeft;
for (auto processInfo : getAllProcesses()) {
if (processInfo->isAvailableClass() && processInfo->locality.dcId() == dcId) {
if (processInfo->isExcluded() || processInfo->isCleared() || !processInfo->isAvailable()) {
primaryProcessesDead.add(processInfo->locality);
primaryLocalitiesDead.push_back(processInfo->locality);
} else {
primaryProcessesLeft.add(processInfo->locality);
primaryLocalitiesLeft.push_back(processInfo->locality);
}
}
}
std::vector<LocalityData> badCombo;
bool primaryTLogsDead = tLogWriteAntiQuorum ? !validateAllCombinations(badCombo, primaryProcessesDead, tLogPolicy, primaryLocalitiesLeft, tLogWriteAntiQuorum, false) : primaryProcessesDead.validate(tLogPolicy);
if(usableRegions > 1 && remoteTLogPolicy && !primaryTLogsDead) {
primaryTLogsDead = primaryProcessesDead.validate(remoteTLogPolicy);
}
return primaryTLogsDead || primaryProcessesDead.validate(storagePolicy);
}
// The following function will determine if the specified configuration of available and dead processes can allow the cluster to survive
virtual bool canKillProcesses(std::vector<ProcessInfo*> const& availableProcesses, std::vector<ProcessInfo*> const& deadProcesses, KillType kt, KillType* newKillType) const
{
@ -1264,9 +1285,7 @@ public:
int protectedWorker = 0, unavailable = 0, excluded = 0, cleared = 0;
for (auto processInfo : getAllProcesses()) {
// Add non-test processes (ie. datahall is not be set for test processes)
if (processInfo->isAvailableClass()) {
// Do not include any excluded machines
if (processInfo->isExcluded()) {
processesDead.push_back(processInfo);
excluded++;
@ -1283,11 +1302,11 @@ public:
processesLeft.push_back(processInfo);
protectedWorker++;
}
else if (processInfo->locality.zoneId() != zoneId)
else if (processInfo->locality.zoneId() != zoneId) {
processesLeft.push_back(processInfo);
// Add processes from dead machines and datacenter machines to dead group
else
} else {
processesDead.push_back(processInfo);
}
}
}
if (!canKillProcesses(processesLeft, processesDead, kt, &kt)) {
@ -1385,22 +1404,14 @@ public:
{
std::vector<ProcessInfo*> processesLeft, processesDead;
for (auto processInfo : getAllProcesses()) {
// Add non-test processes (ie. datahall is not be set for test processes)
if (processInfo->isAvailableClass()) {
// Mark all of the unavailable as dead
if (processInfo->isExcluded())
if (processInfo->isExcluded() || processInfo->isCleared() || !processInfo->isAvailable()) {
processesDead.push_back(processInfo);
else if (processInfo->isCleared())
processesDead.push_back(processInfo);
else if (!processInfo->isAvailable())
processesDead.push_back(processInfo);
else if (protectedAddresses.count(processInfo->address))
} else if (protectedAddresses.count(processInfo->address) || datacenterZones.find(processInfo->locality.zoneId()) == datacenterZones.end()) {
processesLeft.push_back(processInfo);
// Keep all not in the datacenter zones
else if (datacenterZones.find(processInfo->locality.zoneId()) == datacenterZones.end())
processesLeft.push_back(processInfo);
else
} else {
processesDead.push_back(processInfo);
}
}
}

View File

@ -154,6 +154,7 @@ public:
//virtual KillType getMachineKillState( UID zoneID ) = 0;
virtual bool canKillProcesses(std::vector<ProcessInfo*> const& availableProcesses, std::vector<ProcessInfo*> const& deadProcesses, KillType kt, KillType* newKillType) const = 0;
virtual bool isAvailable() const = 0;
virtual bool datacenterDead(Optional<Standalone<StringRef>> dcId) const = 0;
virtual void displayWorkers() const;
virtual void addRole(NetworkAddress const& address, std::string const& role) {
@ -282,6 +283,8 @@ public:
Optional<Standalone<StringRef>> primaryDcId;
IRepPolicyRef remoteTLogPolicy;
int32_t usableRegions;
std::string disablePrimary;
std::string disableRemote;
bool allowLogSetKills;
Optional<Standalone<StringRef>> remoteDcId;
bool hasSatelliteReplication;

View File

@ -283,27 +283,35 @@ ACTOR Future<bool> getStorageServersRecruiting( Database cx, Reference<AsyncVar<
}
}
ACTOR Future<Void> reconfigureAfter(Database cx, double time, Reference<AsyncVar<ServerDBInfo>> dbInfo) {
Void _ = wait( delay(time) );
ACTOR Future<Void> repairDeadDatacenter(Database cx, Reference<AsyncVar<ServerDBInfo>> dbInfo, std::string context) {
if(g_network->isSimulated()) {
bool primaryDead = g_simulator.datacenterDead(g_simulator.primaryDcId);
bool remoteDead = g_simulator.datacenterDead(g_simulator.remoteDcId);
if(g_network->isSimulated() && g_simulator.allowLogSetKills) {
TraceEvent(SevWarnAlways, "DisablingFearlessConfiguration").detail("Location", "QuietDatabase").detail("Stage", "Repopulate");
g_simulator.usableRegions = 1;
ConfigurationResult::Type _ = wait( changeConfig( cx, "repopulate_anti_quorum=1" ) );
while( dbInfo->get().recoveryState < RecoveryState::STORAGE_RECOVERED ) {
Void _ = wait( dbInfo->onChange() );
ASSERT(!primaryDead || !remoteDead);
if(primaryDead || remoteDead) {
TraceEvent(SevWarnAlways, "DisablingFearlessConfiguration").detail("Location", context).detail("Stage", "Repopulate");
g_simulator.usableRegions = 1;
ConfigurationResult::Type _ = wait( changeConfig( cx, (primaryDead ? g_simulator.disablePrimary : g_simulator.disableRemote) + " repopulate_anti_quorum=1" ) );
while( dbInfo->get().recoveryState < RecoveryState::STORAGE_RECOVERED ) {
Void _ = wait( dbInfo->onChange() );
}
TraceEvent(SevWarnAlways, "DisablingFearlessConfiguration").detail("Location", context).detail("Stage", "Usable_Regions");
ConfigurationResult::Type _ = wait( changeConfig( cx, "usable_regions=1" ) );
}
TraceEvent(SevWarnAlways, "DisablingFearlessConfiguration").detail("Location", "QuietDatabase").detail("Stage", "Usable_Regions");
ConfigurationResult::Type _ = wait( changeConfig( cx, "usable_regions=1" ) );
g_simulator.allowLogSetKills = false;
}
return Void();
}
ACTOR Future<Void> reconfigureAfter(Database cx, double time, Reference<AsyncVar<ServerDBInfo>> dbInfo, std::string context) {
Void _ = wait( delay(time) );
Void _ = wait( repairDeadDatacenter(cx, dbInfo, context) );
return Void();
}
ACTOR Future<Void> waitForQuietDatabase( Database cx, Reference<AsyncVar<ServerDBInfo>> dbInfo, std::string phase, int64_t dataInFlightGate = 2e6,
int64_t maxTLogQueueGate = 5e6, int64_t maxStorageServerQueueGate = 5e6, int64_t maxDataDistributionQueueSize = 0 ) {
state Future<Void> reconfig = reconfigureAfter(cx, 100 + (g_random->random01()*100), dbInfo);
state Future<Void> reconfig = reconfigureAfter(cx, 100 + (g_random->random01()*100), dbInfo, "QuietDatabase");
TraceEvent(("QuietDatabase" + phase + "Begin").c_str());

View File

@ -35,5 +35,6 @@ Future<int64_t> getDataDistributionQueueSize( Database const &cx, Reference<Asyn
Future<vector<StorageServerInterface>> getStorageServers( Database const& cx, bool const &use_system_priority = false);
Future<vector<std::pair<WorkerInterface, ProcessClass>>> getWorkers( Reference<AsyncVar<ServerDBInfo>> const& dbInfo, int const& flags = 0 );
Future<WorkerInterface> getMasterWorker( Database const& cx, Reference<AsyncVar<ServerDBInfo>> const& dbInfo );
Future<Void> repairDeadDatacenter(Database const& cx, Reference<AsyncVar<ServerDBInfo>> const& dbInfo, std::string const& context);
#endif

View File

@ -918,6 +918,16 @@ void SimulationConfig::generateNormalConfig(int minimumReplication) {
}
set_config("regions=" + json_spirit::write_string(json_spirit::mValue(regionArr), json_spirit::Output_options::none));
if(needsRemote) {
StatusArray disablePrimary = regionArr;
disablePrimary[0].get_obj()["datacenters"].get_array()[0].get_obj()["priority"] = -1;
g_simulator.disablePrimary = "regions=" + json_spirit::write_string(json_spirit::mValue(disablePrimary), json_spirit::Output_options::none);
StatusArray disableRemote = regionArr;
disableRemote[1].get_obj()["datacenters"].get_array()[0].get_obj()["priority"] = -1;
g_simulator.disableRemote = "regions=" + json_spirit::write_string(json_spirit::mValue(disableRemote), json_spirit::Output_options::none);
}
}
if(generateFearless && minimumReplication > 1) {

View File

@ -772,17 +772,7 @@ ACTOR Future<Void> checkConsistency(Database cx, std::vector< TesterInterface >
spec.options[0].push_back_deep(spec.options.arena(), KeyValueRef(LiteralStringRef("failureIsError"), LiteralStringRef("true")));
lastRun = true;
}
if(g_network->isSimulated() && g_simulator.allowLogSetKills) {
TraceEvent(SevWarnAlways, "DisablingFearlessConfiguration").detail("Location", "ConsistencyCheck").detail("Stage", "Repopulate");
g_simulator.usableRegions = 1;
ConfigurationResult::Type _ = wait( changeConfig( cx, "repopulate_anti_quorum=1" ) );
while( dbInfo->get().recoveryState < RecoveryState::STORAGE_RECOVERED ) {
Void _ = wait( dbInfo->onChange() );
}
TraceEvent(SevWarnAlways, "DisablingFearlessConfiguration").detail("Location", "ConsistencyCheck").detail("Stage", "Usable_Regions");
ConfigurationResult::Type _ = wait( changeConfig( cx, "usable_regions=1" ) );
g_simulator.allowLogSetKills = false;
}
Void _ = wait( repairDeadDatacenter(cx, dbInfo, "ConsistencyCheck") );
}
}

View File

@ -146,7 +146,7 @@ ACTOR Future<Void> workerHandleErrors(FutureStream<ErrorInfo> errors) {
endRole(err.id, err.context, "Error", ok, err.error);
if (err.error.code() == error_code_please_reboot || err.error.code() == error_code_io_timeout || err.error.code() == error_code_io_error) throw err.error;
if (err.error.code() == error_code_please_reboot || err.error.code() == error_code_io_timeout) throw err.error;
}
}
}

View File

@ -663,10 +663,15 @@ public:
setUnconditional(v);
}
void setUnconditional( V const& v ) {
Promise<Void> trigger;
this->nextChange.swap(trigger);
Promise<Void> t;
this->nextChange.swap(t);
this->value = v;
trigger.send(Void());
t.send(Void());
}
void trigger() {
Promise<Void> t;
this->nextChange.swap(t);
t.send(Void());
}
private:
@ -683,7 +688,7 @@ public:
return v.onChange();
}
void trigger() {
v.setUnconditional(Void());
v.trigger();
}
private:
AsyncVar<Void> v;