Merge branch 'release-6.1'

# Conflicts:
#	documentation/sphinx/source/release-notes.rst
#	versions.target
This commit is contained in:
Evan Tschannen 2019-04-08 18:38:42 -07:00
commit 21c0ba555c
32 changed files with 169 additions and 64 deletions

View File

@ -77,7 +77,9 @@ fdb_bool_t fdb_error_predicate( int predicate_test, fdb_error_t code ) {
return code == error_code_not_committed ||
code == error_code_transaction_too_old ||
code == error_code_future_version ||
code == error_code_database_locked;
code == error_code_database_locked ||
code == error_code_proxy_memory_limit_exceeded ||
code == error_code_process_behind;
}
return false;
}

View File

@ -6,11 +6,11 @@ if(WIN32)
target_compile_options(coveragetool PRIVATE "/langversion:6")
set_property(TARGET coveragetool PROPERTY VS_DOTNET_REFERENCES
"System"
"ystem.Core"
"System.Core"
"System.Xml.Linq"
"ystem.Data.DataSetExtensions"
"System.Data.DataSetExtensions"
"Microsoft.CSharp"
"ystem.Data"
"System.Data"
"System.Xml")
else()
set(COVERAGETOOL_COMPILER_REFERENCES

View File

@ -661,6 +661,11 @@ You can now remove old client library versions from your clients. This is only t
Version-specific notes on upgrading
===================================
Upgrading from 6.1.x
--------------------
Upgrades from 6.1.x will keep all your old data and configuration settings.
Upgrading from 6.0.x
--------------------

View File

@ -32,12 +32,16 @@ FoundationDB may return the following error codes from API functions. If you nee
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
| accessed_unreadable | 1036| Read or wrote an unreadable key |
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
| process_behind | 1037| Storage process does not have recent mutations |
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
| database_locked | 1038| Database is locked |
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
| cluster_version_changed | 1039| Cluster has been upgraded to a new protocol version |
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
| external_client_already_loaded | 1040| External client has already been loaded |
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
| proxy_memory_limit_exceeded | 1042| Proxy commit memory limit exceeded |
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
| operation_cancelled | 1101| Asynchronous operation cancelled |
+-----------------------------------------------+-----+--------------------------------------------------------------------------------+
| future_released | 1102| Future has been released |

View File

@ -10,38 +10,38 @@ macOS
The macOS installation package is supported on macOS 10.7+. It includes the client and (optionally) the server.
* `FoundationDB-6.1.0.pkg <https://www.foundationdb.org/downloads/6.1.0/macOS/installers/FoundationDB-6.1.0.pkg>`_
* `FoundationDB-6.1.1.pkg <https://www.foundationdb.org/downloads/6.1.1/macOS/installers/FoundationDB-6.1.1.pkg>`_
Ubuntu
------
The Ubuntu packages are supported on 64-bit Ubuntu 12.04+, but beware of the Linux kernel bug in Ubuntu 12.x.
* `foundationdb-clients-6.1.0-1_amd64.deb <https://www.foundationdb.org/downloads/6.1.0/ubuntu/installers/foundationdb-clients_6.1.0-1_amd64.deb>`_
* `foundationdb-server-6.1.0-1_amd64.deb <https://www.foundationdb.org/downloads/6.1.0/ubuntu/installers/foundationdb-server_6.1.0-1_amd64.deb>`_ (depends on the clients package)
* `foundationdb-clients-6.1.1-1_amd64.deb <https://www.foundationdb.org/downloads/6.1.1/ubuntu/installers/foundationdb-clients_6.1.1-1_amd64.deb>`_
* `foundationdb-server-6.1.1-1_amd64.deb <https://www.foundationdb.org/downloads/6.1.1/ubuntu/installers/foundationdb-server_6.1.1-1_amd64.deb>`_ (depends on the clients package)
RHEL/CentOS EL6
---------------
The RHEL/CentOS EL6 packages are supported on 64-bit RHEL/CentOS 6.x.
* `foundationdb-clients-6.1.0-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.0/rhel6/installers/foundationdb-clients-6.1.0-1.el6.x86_64.rpm>`_
* `foundationdb-server-6.1.0-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.0/rhel6/installers/foundationdb-server-6.1.0-1.el6.x86_64.rpm>`_ (depends on the clients package)
* `foundationdb-clients-6.1.1-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.1/rhel6/installers/foundationdb-clients-6.1.1-1.el6.x86_64.rpm>`_
* `foundationdb-server-6.1.1-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.1/rhel6/installers/foundationdb-server-6.1.1-1.el6.x86_64.rpm>`_ (depends on the clients package)
RHEL/CentOS EL7
---------------
The RHEL/CentOS EL7 packages are supported on 64-bit RHEL/CentOS 7.x.
* `foundationdb-clients-6.1.0-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.0/rhel7/installers/foundationdb-clients-6.1.0-1.el7.x86_64.rpm>`_
* `foundationdb-server-6.1.0-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.0/rhel7/installers/foundationdb-server-6.1.0-1.el7.x86_64.rpm>`_ (depends on the clients package)
* `foundationdb-clients-6.1.1-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.1/rhel7/installers/foundationdb-clients-6.1.1-1.el7.x86_64.rpm>`_
* `foundationdb-server-6.1.1-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.1/rhel7/installers/foundationdb-server-6.1.1-1.el7.x86_64.rpm>`_ (depends on the clients package)
Windows
-------
The Windows installer is supported on 64-bit Windows XP and later. It includes the client and (optionally) the server.
* `foundationdb-6.1.0-x64.msi <https://www.foundationdb.org/downloads/6.1.0/windows/installers/foundationdb-6.1.0-x64.msi>`_
* `foundationdb-6.1.1-x64.msi <https://www.foundationdb.org/downloads/6.1.1/windows/installers/foundationdb-6.1.1-x64.msi>`_
API Language Bindings
=====================
@ -58,18 +58,18 @@ On macOS and Windows, the FoundationDB Python API bindings are installed as part
If you need to use the FoundationDB Python API from other Python installations or paths, download the Python package:
* `foundationdb-6.1.0.tar.gz <https://www.foundationdb.org/downloads/6.1.0/bindings/python/foundationdb-6.1.0.tar.gz>`_
* `foundationdb-6.1.1.tar.gz <https://www.foundationdb.org/downloads/6.1.1/bindings/python/foundationdb-6.1.1.tar.gz>`_
Ruby 1.9.3/2.0.0+
-----------------
* `fdb-6.1.0.gem <https://www.foundationdb.org/downloads/6.1.0/bindings/ruby/fdb-6.1.0.gem>`_
* `fdb-6.1.1.gem <https://www.foundationdb.org/downloads/6.1.1/bindings/ruby/fdb-6.1.1.gem>`_
Java 8+
-------
* `fdb-java-6.1.0.jar <https://www.foundationdb.org/downloads/6.1.0/bindings/java/fdb-java-6.1.0.jar>`_
* `fdb-java-6.1.0-javadoc.jar <https://www.foundationdb.org/downloads/6.1.0/bindings/java/fdb-java-6.1.0-javadoc.jar>`_
* `fdb-java-6.1.1.jar <https://www.foundationdb.org/downloads/6.1.1/bindings/java/fdb-java-6.1.1.jar>`_
* `fdb-java-6.1.1-javadoc.jar <https://www.foundationdb.org/downloads/6.1.1/bindings/java/fdb-java-6.1.1-javadoc.jar>`_
Go 1.1+
-------

View File

@ -434,7 +434,8 @@
"triple",
"three_datacenter",
"three_datacenter_fallback",
"three_data_hall"
"three_data_hall",
"three_data_hall_fallback"
]},
"regions":[{
"datacenters":[{

View File

@ -448,9 +448,14 @@ ACTOR Future<Void> readCommitted(Database cx, PromiseStream<RangeResultWithVersi
}
}
catch (Error &e) {
if (e.code() != error_code_transaction_too_old && e.code() != error_code_future_version)
throw;
tr = Transaction(cx);
if (e.code() == error_code_transaction_too_old) {
// We are using this transaction until it's too old and then resetting to a fresh one,
// so we don't need to delay.
tr.fullReset();
}
else {
wait(tr.onError(e));
}
}
}
}
@ -539,9 +544,14 @@ ACTOR Future<Void> readCommitted(Database cx, PromiseStream<RCGroup> results, Fu
nextKey = firstGreaterThan(rangevalue.end()[-1].key);
}
catch (Error &e) {
if (e.code() != error_code_transaction_too_old && e.code() != error_code_future_version)
throw;
wait(tr.onError(e));
if (e.code() == error_code_transaction_too_old) {
// We are using this transaction until it's too old and then resetting to a fresh one,
// so we don't need to delay.
tr.fullReset();
}
else {
wait(tr.onError(e));
}
}
}
}

View File

@ -239,6 +239,8 @@ StatusObject DatabaseConfiguration::toJSON(bool noPolicies) const {
result["redundancy_mode"] = "triple";
} else if( tLogReplicationFactor == 4 && storageTeamSize == 3 && tlogInfo == "data_hall^2 x zoneid^2 x 1" && storageInfo == "data_hall^3 x 1" ) {
result["redundancy_mode"] = "three_data_hall";
} else if( tLogReplicationFactor == 4 && storageTeamSize == 2 && tlogInfo == "data_hall^2 x zoneid^2 x 1" && storageInfo == "data_hall^2 x 1" ) {
result["redundancy_mode"] = "three_data_hall_fallback";
} else {
customRedundancy = true;
}

View File

@ -147,6 +147,7 @@ public:
int64_t transactionsNotCommitted;
int64_t transactionsMaybeCommitted;
int64_t transactionsResourceConstrained;
int64_t transactionsProcessBehind;
ContinuousSample<double> latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit, bytesPerCommit;
int outstandingWatches;

View File

@ -169,7 +169,11 @@ ACTOR Future<Void> failureMonitorClientLoop(
ACTOR Future<Void> failureMonitorClient( Reference<AsyncVar<Optional<struct ClusterInterface>>> ci, bool trackMyStatus ) {
state SimpleFailureMonitor* monitor = static_cast<SimpleFailureMonitor*>( &IFailureMonitor::failureMonitor() );
state Reference<FailureMonitorClientState> fmState = Reference<FailureMonitorClientState>(new FailureMonitorClientState());
auto localAddr = g_network->getLocalAddresses();
monitor->setStatus(localAddr.address, FailureStatus(false));
if(localAddr.secondaryAddress.present()) {
monitor->setStatus(localAddr.secondaryAddress.get(), FailureStatus(false));
}
loop {
state Future<Void> client = ci->get().present() ? failureMonitorClientLoop(monitor, ci->get().get(), fmState, trackMyStatus) : Void();
wait( ci->onChange() );

View File

@ -145,6 +145,13 @@ std::map<std::string, std::string> configForToken( std::string const& mode ) {
tLogPolicy = Reference<IReplicationPolicy>(new PolicyAcross(2, "data_hall",
Reference<IReplicationPolicy>(new PolicyAcross(2, "zoneid", Reference<IReplicationPolicy>(new PolicyOne())))
));
} else if(mode == "three_data_hall_fallback") {
redundancy="2";
log_replicas="4";
storagePolicy = Reference<IReplicationPolicy>(new PolicyAcross(2, "data_hall", Reference<IReplicationPolicy>(new PolicyOne())));
tLogPolicy = Reference<IReplicationPolicy>(new PolicyAcross(2, "data_hall",
Reference<IReplicationPolicy>(new PolicyAcross(2, "zoneid", Reference<IReplicationPolicy>(new PolicyOne())))
));
} else
redundancySpecified = false;
if (redundancySpecified) {
@ -510,6 +517,12 @@ ConfigureAutoResult parseConfig( StatusObject const& status ) {
} else if( result.old_replication == "three_datacenter_fallback" ) {
storage_replication = 4;
log_replication = 4;
} else if( result.old_replication == "three_data_hall" ) {
storage_replication = 3;
log_replication = 4;
} else if( result.old_replication == "three_data_hall_fallback" ) {
storage_replication = 2;
log_replication = 4;
} else
return ConfigureAutoResult();

View File

@ -222,6 +222,7 @@ ACTOR Future<Void> databaseLogger( DatabaseContext *cx ) {
.detail("NotCommitted", cx->transactionsNotCommitted)
.detail("MaybeCommitted", cx->transactionsMaybeCommitted)
.detail("ResourceConstrained", cx->transactionsResourceConstrained)
.detail("ProcessBehind", cx->transactionsProcessBehind)
.detail("MeanLatency", cx->latencies.mean())
.detail("MedianLatency", cx->latencies.median())
.detail("Latency90", cx->latencies.percentile(0.90))
@ -513,7 +514,7 @@ DatabaseContext::DatabaseContext(
lockAware(lockAware), apiVersion(apiVersion), provisional(false),
transactionReadVersions(0), transactionLogicalReads(0), transactionPhysicalReads(0), transactionCommittedMutations(0), transactionCommittedMutationBytes(0),
transactionsCommitStarted(0), transactionsCommitCompleted(0), transactionsTooOld(0), transactionsFutureVersions(0), transactionsNotCommitted(0),
transactionsMaybeCommitted(0), transactionsResourceConstrained(0), outstandingWatches(0), transactionTimeout(0.0), transactionMaxRetries(-1),
transactionsMaybeCommitted(0), transactionsResourceConstrained(0), transactionsProcessBehind(0), outstandingWatches(0), transactionTimeout(0.0), transactionMaxRetries(-1),
latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvCacheInsertLocation(0),
healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0)
{
@ -1487,8 +1488,9 @@ ACTOR Future< Void > watchValue( Future<Version> version, Key key, Optional<Valu
if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) {
cx->invalidateCache( key );
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, info.taskID));
} else if( e.code() == error_code_watch_cancelled ) {
TEST( true ); // Too many watches on the storage server, poll for changes instead
} else if( e.code() == error_code_watch_cancelled || e.code() == error_code_process_behind ) {
TEST( e.code() == error_code_watch_cancelled ); // Too many watches on the storage server, poll for changes instead
TEST( e.code() == error_code_process_behind ); // The storage servers are all behind
wait(delay(CLIENT_KNOBS->WATCH_POLLING_TIME, info.taskID));
} else if ( e.code() == error_code_timed_out ) { //The storage server occasionally times out watches in case it was cancelled
TEST( true ); // A watch timed out
@ -3028,7 +3030,8 @@ Future<Void> Transaction::onError( Error const& e ) {
if (e.code() == error_code_not_committed ||
e.code() == error_code_commit_unknown_result ||
e.code() == error_code_database_locked ||
e.code() == error_code_proxy_memory_limit_exceeded)
e.code() == error_code_proxy_memory_limit_exceeded ||
e.code() == error_code_process_behind)
{
if(e.code() == error_code_not_committed)
cx->transactionsNotCommitted++;
@ -3036,6 +3039,8 @@ Future<Void> Transaction::onError( Error const& e ) {
cx->transactionsMaybeCommitted++;
if (e.code() == error_code_proxy_memory_limit_exceeded)
cx->transactionsResourceConstrained++;
if (e.code() == error_code_process_behind)
cx->transactionsProcessBehind++;
double backoff = getBackoff(e.code());
reset();

View File

@ -458,7 +458,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"triple",
"three_datacenter",
"three_datacenter_fallback",
"three_data_hall"
"three_data_hall",
"three_data_hall_fallback"
]},
"regions":[{
"datacenters":[{
@ -667,7 +668,8 @@ const KeyRef JSONSchemas::clusterConfigurationSchema = LiteralStringRef(R"config
"triple",
"three_datacenter",
"three_datacenter_fallback",
"three_data_hall"
"three_data_hall",
"three_data_hall_fallback"
]},
"regions":[{
"datacenters":[{

View File

@ -138,6 +138,12 @@ FailureStatus SimpleFailureMonitor::getState( Endpoint const& endpoint ) {
}
}
FailureStatus SimpleFailureMonitor::getState( NetworkAddress const& address ) {
auto a = addressStatus.find(address);
if (a == addressStatus.end()) return FailureStatus();
else return a->second;
}
bool SimpleFailureMonitor::onlyEndpointFailed( Endpoint const& endpoint ) {
if(!endpointKnownFailed.get(endpoint))
return false;

View File

@ -84,6 +84,9 @@ public:
// Returns the currently known status for the endpoint
virtual FailureStatus getState( Endpoint const& endpoint ) = 0;
// Returns the currently known status for the address
virtual FailureStatus getState( NetworkAddress const& address ) = 0;
// Only use this function when the endpoint is known to be failed
virtual void endpointNotFound( Endpoint const& ) = 0;
@ -130,6 +133,7 @@ public:
virtual Future<Void> onStateChanged( Endpoint const& endpoint );
virtual FailureStatus getState( Endpoint const& endpoint );
virtual FailureStatus getState( NetworkAddress const& address );
virtual Future<Void> onDisconnectOrFailure( Endpoint const& endpoint );
virtual bool onlyEndpointFailed( Endpoint const& endpoint );
virtual bool permanentlyFailed( Endpoint const& endpoint );

View File

@ -149,7 +149,9 @@ public:
lastIncompatibleMessage(0),
transportId(transportId),
numIncompatibleConnections(0)
{}
{
degraded = Reference<AsyncVar<bool>>( new AsyncVar<bool>(false) );
}
~TransportData();
@ -170,6 +172,8 @@ public:
NetworkAddressList localAddresses;
std::vector<Future<Void>> listeners;
std::unordered_map<NetworkAddress, struct Peer*> peers;
std::unordered_map<NetworkAddress, std::pair<double, double>> closedPeers;
Reference<AsyncVar<bool>> degraded;
bool warnAlwaysForLargePacket;
// These declarations must be in exactly this order
@ -483,6 +487,17 @@ struct Peer : NonCopyable {
TraceEvent(ok ? SevInfo : SevWarnAlways, "IncompatibleConnectionClosed", conn ? conn->getDebugID() : UID()).error(e, true).suppressFor(1.0).detail("PeerAddr", self->destination);
}
if(self->destination.isPublic() && IFailureMonitor::failureMonitor().getState(self->destination).isAvailable()) {
auto& it = self->transport->closedPeers[self->destination];
if(now() - it.second > FLOW_KNOBS->TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY) {
it.first = now();
} else if(now() - it.first > FLOW_KNOBS->TOO_MANY_CONNECTIONS_CLOSED_TIMEOUT) {
TraceEvent(SevWarnAlways, "TooManyConnectionsClosed", conn ? conn->getDebugID() : UID()).suppressFor(5.0).detail("PeerAddr", self->destination);
self->transport->degraded->set(true);
}
it.second = now();
}
if (conn) {
conn->close();
conn = Reference<IConnection>();
@ -1100,6 +1115,10 @@ int FlowTransport::getEndpointCount() {
return -1;
}
Reference<AsyncVar<bool>> FlowTransport::getDegraded() {
return self->degraded;
}
bool FlowTransport::incompatibleOutgoingConnectionsPresent() {
return self->numIncompatibleConnections > 0;
}

View File

@ -143,6 +143,9 @@ public:
// Makes PacketID "unreliable" (either the data or a connection close event will be delivered
// eventually). It can still be used safely to send a reply to a "reliable" request.
Reference<AsyncVar<bool>> getDegraded();
// This async var will be set to true when the process cannot connect to a public network address that the failure monitor thinks is healthy.
void sendUnreliable( ISerializeSource const& what, const Endpoint& destination, bool openConnection = true );// { cancelReliable(sendReliable(what,destination)); }
int getEndpointCount();

View File

@ -108,7 +108,7 @@ bool checkAndProcessResult(ErrorOr<T> result, Reference<ModelHolder> holder, boo
}
if(triedAllOptions && errCode == error_code_process_behind) {
throw future_version();
throw result.getError();
}
return false;

View File

@ -310,12 +310,18 @@ public:
// See IFailureMonitor::onFailedFor() for an explanation of the duration and slope parameters.
template <class X>
Future<ErrorOr<REPLY_TYPE(X)>> getReplyUnlessFailedFor(const X& value, double sustainedFailureDuration, double sustainedFailureSlope, int taskID) const {
return waitValueOrSignal(getReply(value, taskID), makeDependent<T>(IFailureMonitor::failureMonitor()).onFailedFor(getEndpoint(taskID), sustainedFailureDuration, sustainedFailureSlope), getEndpoint(taskID));
// If it is local endpoint, no need for failure monitoring
return waitValueOrSignal(getReply(value, taskID),
makeDependent<T>(IFailureMonitor::failureMonitor()).onFailedFor(getEndpoint(taskID), sustainedFailureDuration, sustainedFailureSlope),
getEndpoint(taskID));
}
template <class X>
Future<ErrorOr<REPLY_TYPE(X)>> getReplyUnlessFailedFor(const X& value, double sustainedFailureDuration, double sustainedFailureSlope) const {
return waitValueOrSignal(getReply(value), makeDependent<T>(IFailureMonitor::failureMonitor()).onFailedFor(getEndpoint(), sustainedFailureDuration, sustainedFailureSlope), getEndpoint());
// If it is local endpoint, no need for failure monitoring
return waitValueOrSignal(getReply(value),
makeDependent<T>(IFailureMonitor::failureMonitor()).onFailedFor(getEndpoint(), sustainedFailureDuration, sustainedFailureSlope),
getEndpoint());
}
template <class X>

View File

@ -75,7 +75,6 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( DISK_QUEUE_FILE_SHRINK_BYTES, 100<<20 ); // BUGGIFYd per file within the DiskQueue
init( TLOG_DEGRADED_DELAY_COUNT, 5 );
init( TLOG_DEGRADED_DURATION, 5.0 );
init( TLOG_DEGRADED_RESET_INTERVAL, 48*60*60 ); if ( randomize && BUGGIFY ) TLOG_DEGRADED_RESET_INTERVAL = 10;
// Data distribution queue
init( HEALTH_POLL_TIME, 1.0 );
@ -417,6 +416,9 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
//Worker
init( WORKER_LOGGING_INTERVAL, 5.0 );
init( INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING, 5.0 );
init( DEGRADED_RESET_INTERVAL, 24*60*60 ); if ( randomize && BUGGIFY ) DEGRADED_RESET_INTERVAL = 10;
init( DEGRADED_WARNING_LIMIT, 1 );
init( DEGRADED_WARNING_RESET_DELAY, 7*24*60*60 );
// Test harness
init( WORKER_POLL_DELAY, 1.0 );

View File

@ -79,7 +79,6 @@ public:
int64_t DISK_QUEUE_FILE_SHRINK_BYTES; // When we shrink the disk queue, by how many bytes should it shrink?
int TLOG_DEGRADED_DELAY_COUNT;
double TLOG_DEGRADED_DURATION;
double TLOG_DEGRADED_RESET_INTERVAL;
// Data distribution queue
double HEALTH_POLL_TIME;
@ -356,6 +355,9 @@ public:
//Worker
double WORKER_LOGGING_INTERVAL;
double INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING;
double DEGRADED_RESET_INTERVAL;
double DEGRADED_WARNING_LIMIT;
double DEGRADED_WARNING_RESET_DELAY;
// Test harness
double WORKER_POLL_DELAY;

View File

@ -1810,7 +1810,7 @@ ACTOR Future<Standalone<RangeResultRef>> tryGetRange( Database cx, Version versi
}
}
} catch( Error &e ) {
if( begin.getKey() != keys.begin && ( e.code() == error_code_transaction_too_old || e.code() == error_code_future_version ) ) {
if( begin.getKey() != keys.begin && ( e.code() == error_code_transaction_too_old || e.code() == error_code_future_version || e.code() == error_code_process_behind ) ) {
if( e.code() == error_code_transaction_too_old )
*isTooOld = true;
output.more = true;
@ -2014,8 +2014,8 @@ ACTOR Future<Void> fetchKeys( StorageServer *data, AddingShard* shard ) {
debug_nextRetryToLog += std::min(debug_nextRetryToLog, 1024);
TraceEvent(SevWarn, "FetchPast", data->thisServerID).detail("TotalAttempts", debug_getRangeRetries).detail("FKID", interval.pairID).detail("V", lastFV).detail("N", fetchVersion).detail("E", data->version.get());
}
} else if (e.code() == error_code_future_version) {
TEST(true); // fetchKeys got future_version, so there must be a huge storage lag somewhere. Keep trying.
} else if (e.code() == error_code_future_version || e.code() == error_code_process_behind) {
TEST(true); // fetchKeys got future_version or process_behind, so there must be a huge storage lag somewhere. Keep trying.
} else {
throw;
}

View File

@ -625,7 +625,7 @@ ACTOR Future<Void> workerServer( Reference<ClusterConnectionFile> connFile, Refe
state WorkerCache<InitializeStorageReply> storageCache;
state Reference<AsyncVar<ServerDBInfo>> dbInfo( new AsyncVar<ServerDBInfo>(ServerDBInfo()) );
state Future<Void> metricsLogger;
state Reference<AsyncVar<bool>> degraded( new AsyncVar<bool>(false) );
state Reference<AsyncVar<bool>> degraded = FlowTransport::transport().getDegraded();
// tLogFnForOptions() can return a function that doesn't correspond with the FDB version that the
// TLogVersion represents. This can be done if the newer TLog doesn't support a requested option.
// As (store type, spill type) can map to the same TLogFn across multiple TLogVersions, we need to
@ -652,7 +652,7 @@ ACTOR Future<Void> workerServer( Reference<ClusterConnectionFile> connFile, Refe
}
}
errorForwarders.add( resetAfter(degraded, SERVER_KNOBS->TLOG_DEGRADED_RESET_INTERVAL, false));
errorForwarders.add( resetAfter(degraded, SERVER_KNOBS->DEGRADED_RESET_INTERVAL, false, SERVER_KNOBS->DEGRADED_WARNING_LIMIT, SERVER_KNOBS->DEGRADED_WARNING_RESET_DELAY, "DegradedReset"));
errorForwarders.add( loadedPonger( interf.debugPing.getFuture() ) );
errorForwarders.add( waitFailureServer( interf.waitFailure.getFuture() ) );
errorForwarders.add( monitorServerDBInfo( ccInterface, connFile, locality, dbInfo ) );

View File

@ -288,7 +288,8 @@ struct ConsistencyCheckWorkload : TestWorkload
}
catch(Error &e)
{
if(e.code() == error_code_transaction_too_old || e.code() == error_code_future_version || e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || e.code() == error_code_server_request_queue_full)
if (e.code() == error_code_transaction_too_old || e.code() == error_code_future_version ||
e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || e.code() == error_code_process_behind)
TraceEvent("ConsistencyCheck_Retry").error(e); // FIXME: consistency check does not retry in this case
else
self->testFailure(format("Error %d - %s", e.code(), e.name()));
@ -387,6 +388,7 @@ struct ConsistencyCheckWorkload : TestWorkload
state Key beginKey = allKeys.begin.withPrefix(keyServersPrefix);
state Key endKey = allKeys.end.withPrefix(keyServersPrefix);
state int i = 0;
state Transaction onErrorTr(cx); // This transaction exists only to access onError and its backoff behavior
//If the responses are too big, we may use multiple requests to get the key locations. Each request begins where the last left off
for ( ; i < shards.size(); i++) {
@ -466,11 +468,9 @@ struct ConsistencyCheckWorkload : TestWorkload
keyLocations.push_back_deep(keyLocations.arena(), currentLocations.end()[-1]);
}
catch (Error& e) {
//If we failed because of a version problem, then retry
if(e.code() == error_code_transaction_too_old || e.code() == error_code_future_version || e.code() == error_code_transaction_too_old)
TraceEvent("ConsistencyCheck_RetryGetKeyLocations").error(e);
else
throw;
state Error err = e;
wait(onErrorTr.onError(err));
TraceEvent("ConsistencyCheck_RetryGetKeyLocations").error(err);
}
}
}
@ -713,6 +713,7 @@ struct ConsistencyCheckWorkload : TestWorkload
state int64_t totalReadAmount = 0;
state KeySelector begin = firstGreaterOrEqual(range.begin);
state Transaction onErrorTr(cx); // This transaction exists only to access onError and its backoff behavior
//Read a limited number of entries at a time, repeating until all keys in the shard have been read
loop
@ -933,11 +934,9 @@ struct ConsistencyCheckWorkload : TestWorkload
}
catch(Error &e)
{
//If we failed because of a version problem, then retry
if(e.code() == error_code_transaction_too_old || e.code() == error_code_future_version || e.code() == error_code_transaction_too_old)
TraceEvent("ConsistencyCheck_RetryDataConsistency").error(e);
else
throw;
state Error err = e;
wait(onErrorTr.onError(err));
TraceEvent("ConsistencyCheck_RetryDataConsistency").error(err);
}
}

View File

@ -58,7 +58,8 @@ struct ExceptionContract {
e.code() == error_code_future_version ||
e.code() == error_code_transaction_cancelled ||
e.code() == error_code_key_too_large ||
e.code() == error_code_value_too_large)
e.code() == error_code_value_too_large ||
e.code() == error_code_process_behind)
{
return;
}

View File

@ -64,6 +64,8 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) {
init( RECONNECTION_TIME_GROWTH_RATE, 1.2 );
init( RECONNECTION_RESET_TIME, 5.0 );
init( CONNECTION_ACCEPT_DELAY, 0.01 );
init( TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY, 5.0 );
init( TOO_MANY_CONNECTIONS_CLOSED_TIMEOUT, 20.0 );
init( TLS_CERT_REFRESH_DELAY_SECONDS, 12*60*60 );

View File

@ -95,6 +95,8 @@ public:
int64_t BUGGIFY_SIM_PAGE_CACHE_64K;
int MAX_EVICT_ATTEMPTS;
double PAGE_CACHE_TRUNCATE_LOOKUP_FRACTION;
double TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY;
int TOO_MANY_CONNECTIONS_CLOSED_TIMEOUT;
//AsyncFileKAIO
int MAX_OUTSTANDING;

View File

@ -615,7 +615,7 @@ void getNetworkTraffic(const IPAddress& ip, uint64_t& bytesSent, uint64_t& bytes
snmp_stream >> retransSegs;
}
void getMachineLoad(uint64_t& idleTime, uint64_t& totalTime) {
void getMachineLoad(uint64_t& idleTime, uint64_t& totalTime, bool logDetails) {
INJECT_FAULT( platform_error, "getMachineLoad" ); // Even though this function doesn't throw errors, the equivalents for other platforms do, and since all of our simulation testing is on Linux...
std::ifstream stat_stream("/proc/stat", std::ifstream::in);
@ -628,7 +628,7 @@ void getMachineLoad(uint64_t& idleTime, uint64_t& totalTime) {
totalTime = t_user+t_nice+t_system+t_idle+t_iowait+t_irq+t_softirq+t_steal+t_guest;
idleTime = t_idle+t_iowait;
if( !DEBUG_DETERMINISM )
if( !DEBUG_DETERMINISM && logDetails )
TraceEvent("MachineLoadDetail").detail("User", t_user).detail("Nice", t_nice).detail("System", t_system).detail("Idle", t_idle).detail("IOWait", t_iowait).detail("IRQ", t_irq).detail("SoftIRQ", t_softirq).detail("Steal", t_steal).detail("Guest", t_guest);
}
@ -818,7 +818,7 @@ void getNetworkTraffic(const IPAddress& ip, uint64_t& bytesSent, uint64_t& bytes
free(buf);
}
void getMachineLoad(uint64_t& idleTime, uint64_t& totalTime) {
void getMachineLoad(uint64_t& idleTime, uint64_t& totalTime, bool logDetails) {
INJECT_FAULT( platform_error, "getMachineLoad" );
mach_msg_type_number_t count = HOST_CPU_LOAD_INFO_COUNT;
host_cpu_load_info_data_t r_load;
@ -1103,7 +1103,7 @@ void initPdhStrings(SystemStatisticsState *state, std::string dataFolder) {
}
#endif
SystemStatistics getSystemStatistics(std::string dataFolder, const IPAddress* ip, SystemStatisticsState** statState) {
SystemStatistics getSystemStatistics(std::string dataFolder, const IPAddress* ip, SystemStatisticsState** statState, bool logDetails) {
if( (*statState) == NULL )
(*statState) = new SystemStatisticsState();
SystemStatistics returnStats;
@ -1238,7 +1238,7 @@ SystemStatistics getSystemStatistics(std::string dataFolder, const IPAddress* ip
uint64_t clockIdleTime = (*statState)->lastClockIdleTime;
uint64_t clockTotalTime = (*statState)->lastClockTotalTime;
getMachineLoad(clockIdleTime, clockTotalTime);
getMachineLoad(clockIdleTime, clockTotalTime, logDetails);
returnStats.machineCPUSeconds = clockTotalTime - (*statState)->lastClockTotalTime != 0 ? ( 1 - ((clockIdleTime - (*statState)->lastClockIdleTime) / ((double)(clockTotalTime - (*statState)->lastClockTotalTime)))) * returnStats.elapsed : 0;
(*statState)->lastClockIdleTime = clockIdleTime;
(*statState)->lastClockTotalTime = clockTotalTime;

View File

@ -247,7 +247,7 @@ struct SystemStatisticsState;
struct IPAddress;
SystemStatistics getSystemStatistics(std::string dataFolder, const IPAddress* ip, SystemStatisticsState **statState);
SystemStatistics getSystemStatistics(std::string dataFolder, const IPAddress* ip, SystemStatisticsState **statState, bool logDetails);
double getProcessorTimeThread();
@ -272,7 +272,7 @@ void getNetworkTraffic(uint64_t& bytesSent, uint64_t& bytesReceived, uint64_t& o
void getDiskStatistics(std::string const& directory, uint64_t& currentIOs, uint64_t& busyTicks, uint64_t& reads, uint64_t& writes, uint64_t& writeSectors);
void getMachineLoad(uint64_t& idleTime, uint64_t& totalTime);
void getMachineLoad(uint64_t& idleTime, uint64_t& totalTime, bool logDetails);
double timer(); // Returns the system real time clock with high precision. May jump around when system time is adjusted!
double timer_monotonic(); // Returns a high precision monotonic clock which is adjusted to be kind of similar to timer() at startup, but might not be a globally accurate time.

View File

@ -45,7 +45,7 @@ SystemStatistics getSystemStatistics() {
static StatisticsState statState = StatisticsState();
const IPAddress ipAddr = machineState.ip.present() ? machineState.ip.get() : IPAddress();
return getSystemStatistics(
machineState.folder.present() ? machineState.folder.get() : "", &ipAddr, &statState.systemState);
machineState.folder.present() ? machineState.folder.get() : "", &ipAddr, &statState.systemState, false);
}
#define TRACEALLOCATOR( size ) TraceEvent("MemSample").detail("Count", FastAllocator<size>::getApproximateMemoryUnused()/size).detail("TotalSize", FastAllocator<size>::getApproximateMemoryUnused()).detail("SampleCount", 1).detail("Hash", "FastAllocatedUnused" #size ).detail("Bt", "na")
@ -54,7 +54,7 @@ SystemStatistics getSystemStatistics() {
SystemStatistics customSystemMonitor(std::string eventName, StatisticsState *statState, bool machineMetrics) {
const IPAddress ipAddr = machineState.ip.present() ? machineState.ip.get() : IPAddress();
SystemStatistics currentStats = getSystemStatistics(machineState.folder.present() ? machineState.folder.get() : "",
&ipAddr, &statState->systemState);
&ipAddr, &statState->systemState, true);
NetworkData netData;
netData.init();
if (!DEBUG_DETERMINISM && currentStats.initialized) {

View File

@ -776,13 +776,23 @@ Future<Void> setAfter( Reference<AsyncVar<T>> var, double time, T val ) {
}
ACTOR template <class T>
Future<Void> resetAfter( Reference<AsyncVar<T>> var, double time, T val ) {
Future<Void> resetAfter( Reference<AsyncVar<T>> var, double time, T val, int warningLimit = -1, double warningResetDelay = 0, const char* context = NULL ) {
state bool isEqual = var->get() == val;
state Future<Void> resetDelay = isEqual ? Never() : delay(time);
state int resetCount = 0;
state double lastReset = now();
loop {
choose {
when( wait( resetDelay ) ) {
var->set( val );
if(now() - lastReset > warningResetDelay) {
resetCount = 0;
}
resetCount++;
if(context && warningLimit >= 0 && resetCount > warningLimit) {
TraceEvent(SevWarnAlways, context).detail("ResetCount", resetCount).detail("LastReset", now() - lastReset);
}
lastReset = now();
isEqual = true;
resetDelay = Never();
}

View File

@ -32,7 +32,7 @@
<Wix xmlns='http://schemas.microsoft.com/wix/2006/wi'>
<Product Name='$(var.Title)'
Id='{1A6617BB-F6FE-48AE-B5FA-161D7BA3D9CD}'
Id='{1F036D0A-3560-4A5C-BD40-F1B254876257}'
UpgradeCode='{A95EA002-686E-4164-8356-C715B7F8B1C8}'
Version='$(var.Version)'
Manufacturer='$(var.Manufacturer)'