Merge branch 'release-6.1'
# Conflicts:
#	documentation/sphinx/source/release-notes.rst
#	versions.target
commit 21c0ba555c
@@ -77,7 +77,9 @@ fdb_bool_t fdb_error_predicate( int predicate_test, fdb_error_t code ) {
 		return code == error_code_not_committed ||
 				code == error_code_transaction_too_old ||
 				code == error_code_future_version ||
-				code == error_code_database_locked;
+				code == error_code_database_locked ||
+				code == error_code_proxy_memory_limit_exceeded ||
+				code == error_code_process_behind;
 	}
 	return false;
 }
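Note: this predicate backs the C API's retryable-error test, so clients that gate retries on it now also retry on proxy_memory_limit_exceeded and process_behind. A minimal caller-side sketch, not part of this diff, assuming the standard fdb_c header and the FDB_ERROR_PREDICATE_RETRYABLE constant:

    #include <foundationdb/fdb_c.h>

    // Returns non-zero when err is worth retrying rather than surfacing.
    // After this change, 1042 (proxy_memory_limit_exceeded) and
    // 1037 (process_behind) also satisfy the predicate.
    fdb_bool_t is_retryable(fdb_error_t err) {
        return fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE, err);
    }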
@@ -6,11 +6,11 @@ if(WIN32)
   target_compile_options(coveragetool PRIVATE "/langversion:6")
   set_property(TARGET coveragetool PROPERTY VS_DOTNET_REFERENCES
     "System"
-    "ystem.Core"
+    "System.Core"
     "System.Xml.Linq"
-    "ystem.Data.DataSetExtensions"
+    "System.Data.DataSetExtensions"
     "Microsoft.CSharp"
-    "ystem.Data"
+    "System.Data"
     "System.Xml")
 else()
   set(COVERAGETOOL_COMPILER_REFERENCES
@@ -661,6 +661,11 @@ You can now remove old client library versions from your clients. This is only t
 Version-specific notes on upgrading
 ===================================

+Upgrading from 6.1.x
+--------------------
+
+Upgrades from 6.1.x will keep all your old data and configuration settings.
+
 Upgrading from 6.0.x
 --------------------

@@ -32,12 +32,16 @@ FoundationDB may return the following error codes from API functions. If you nee
 +-----------------------------------------------+-----+--------------------------------------------------------------------------------+
 | accessed_unreadable                           | 1036| Read or wrote an unreadable key                                                |
 +-----------------------------------------------+-----+--------------------------------------------------------------------------------+
+| process_behind                                | 1037| Storage process does not have recent mutations                                 |
++-----------------------------------------------+-----+--------------------------------------------------------------------------------+
 | database_locked                               | 1038| Database is locked                                                             |
 +-----------------------------------------------+-----+--------------------------------------------------------------------------------+
 | cluster_version_changed                       | 1039| Cluster has been upgraded to a new protocol version                            |
 +-----------------------------------------------+-----+--------------------------------------------------------------------------------+
 | external_client_already_loaded                | 1040| External client has already been loaded                                        |
 +-----------------------------------------------+-----+--------------------------------------------------------------------------------+
+| proxy_memory_limit_exceeded                   | 1042| Proxy commit memory limit exceeded                                             |
++-----------------------------------------------+-----+--------------------------------------------------------------------------------+
 | operation_cancelled                           | 1101| Asynchronous operation cancelled                                               |
 +-----------------------------------------------+-----+--------------------------------------------------------------------------------+
 | future_released                               | 1102| Future has been released                                                       |
@@ -10,38 +10,38 @@ macOS

 The macOS installation package is supported on macOS 10.7+. It includes the client and (optionally) the server.

-* `FoundationDB-6.1.0.pkg <https://www.foundationdb.org/downloads/6.1.0/macOS/installers/FoundationDB-6.1.0.pkg>`_
+* `FoundationDB-6.1.1.pkg <https://www.foundationdb.org/downloads/6.1.1/macOS/installers/FoundationDB-6.1.1.pkg>`_

 Ubuntu
 ------

 The Ubuntu packages are supported on 64-bit Ubuntu 12.04+, but beware of the Linux kernel bug in Ubuntu 12.x.

-* `foundationdb-clients-6.1.0-1_amd64.deb <https://www.foundationdb.org/downloads/6.1.0/ubuntu/installers/foundationdb-clients_6.1.0-1_amd64.deb>`_
-* `foundationdb-server-6.1.0-1_amd64.deb <https://www.foundationdb.org/downloads/6.1.0/ubuntu/installers/foundationdb-server_6.1.0-1_amd64.deb>`_ (depends on the clients package)
+* `foundationdb-clients-6.1.1-1_amd64.deb <https://www.foundationdb.org/downloads/6.1.1/ubuntu/installers/foundationdb-clients_6.1.1-1_amd64.deb>`_
+* `foundationdb-server-6.1.1-1_amd64.deb <https://www.foundationdb.org/downloads/6.1.1/ubuntu/installers/foundationdb-server_6.1.1-1_amd64.deb>`_ (depends on the clients package)

 RHEL/CentOS EL6
 ---------------

 The RHEL/CentOS EL6 packages are supported on 64-bit RHEL/CentOS 6.x.

-* `foundationdb-clients-6.1.0-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.0/rhel6/installers/foundationdb-clients-6.1.0-1.el6.x86_64.rpm>`_
-* `foundationdb-server-6.1.0-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.0/rhel6/installers/foundationdb-server-6.1.0-1.el6.x86_64.rpm>`_ (depends on the clients package)
+* `foundationdb-clients-6.1.1-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.1/rhel6/installers/foundationdb-clients-6.1.1-1.el6.x86_64.rpm>`_
+* `foundationdb-server-6.1.1-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.1/rhel6/installers/foundationdb-server-6.1.1-1.el6.x86_64.rpm>`_ (depends on the clients package)

 RHEL/CentOS EL7
 ---------------

 The RHEL/CentOS EL7 packages are supported on 64-bit RHEL/CentOS 7.x.

-* `foundationdb-clients-6.1.0-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.0/rhel7/installers/foundationdb-clients-6.1.0-1.el7.x86_64.rpm>`_
-* `foundationdb-server-6.1.0-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.0/rhel7/installers/foundationdb-server-6.1.0-1.el7.x86_64.rpm>`_ (depends on the clients package)
+* `foundationdb-clients-6.1.1-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.1/rhel7/installers/foundationdb-clients-6.1.1-1.el7.x86_64.rpm>`_
+* `foundationdb-server-6.1.1-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.1/rhel7/installers/foundationdb-server-6.1.1-1.el7.x86_64.rpm>`_ (depends on the clients package)

 Windows
 -------

 The Windows installer is supported on 64-bit Windows XP and later. It includes the client and (optionally) the server.

-* `foundationdb-6.1.0-x64.msi <https://www.foundationdb.org/downloads/6.1.0/windows/installers/foundationdb-6.1.0-x64.msi>`_
+* `foundationdb-6.1.1-x64.msi <https://www.foundationdb.org/downloads/6.1.1/windows/installers/foundationdb-6.1.1-x64.msi>`_

 API Language Bindings
 =====================
@@ -58,18 +58,18 @@ On macOS and Windows, the FoundationDB Python API bindings are installed as part

 If you need to use the FoundationDB Python API from other Python installations or paths, download the Python package:

-* `foundationdb-6.1.0.tar.gz <https://www.foundationdb.org/downloads/6.1.0/bindings/python/foundationdb-6.1.0.tar.gz>`_
+* `foundationdb-6.1.1.tar.gz <https://www.foundationdb.org/downloads/6.1.1/bindings/python/foundationdb-6.1.1.tar.gz>`_

 Ruby 1.9.3/2.0.0+
 -----------------

-* `fdb-6.1.0.gem <https://www.foundationdb.org/downloads/6.1.0/bindings/ruby/fdb-6.1.0.gem>`_
+* `fdb-6.1.1.gem <https://www.foundationdb.org/downloads/6.1.1/bindings/ruby/fdb-6.1.1.gem>`_

 Java 8+
 -------

-* `fdb-java-6.1.0.jar <https://www.foundationdb.org/downloads/6.1.0/bindings/java/fdb-java-6.1.0.jar>`_
-* `fdb-java-6.1.0-javadoc.jar <https://www.foundationdb.org/downloads/6.1.0/bindings/java/fdb-java-6.1.0-javadoc.jar>`_
+* `fdb-java-6.1.1.jar <https://www.foundationdb.org/downloads/6.1.1/bindings/java/fdb-java-6.1.1.jar>`_
+* `fdb-java-6.1.1-javadoc.jar <https://www.foundationdb.org/downloads/6.1.1/bindings/java/fdb-java-6.1.1-javadoc.jar>`_

 Go 1.1+
 -------
@@ -434,7 +434,8 @@
                     "triple",
                     "three_datacenter",
                     "three_datacenter_fallback",
-                    "three_data_hall"
+                    "three_data_hall",
+                    "three_data_hall_fallback"
                 ]},
                 "regions":[{
                     "datacenters":[{
@@ -448,9 +448,14 @@ ACTOR Future<Void> readCommitted(Database cx, PromiseStream<RangeResultWithVersi
 			}
 		}
 		catch (Error &e) {
-			if (e.code() != error_code_transaction_too_old && e.code() != error_code_future_version)
-				throw;
-			tr = Transaction(cx);
+			if (e.code() == error_code_transaction_too_old) {
+				// We are using this transaction until it's too old and then resetting to a fresh one,
+				// so we don't need to delay.
+				tr.fullReset();
+			}
+			else {
+				wait(tr.onError(e));
+			}
 		}
 	}
 }
@@ -539,9 +544,14 @@ ACTOR Future<Void> readCommitted(Database cx, PromiseStream<RCGroup> results, Fu
 			nextKey = firstGreaterThan(rangevalue.end()[-1].key);
 		}
 		catch (Error &e) {
-			if (e.code() != error_code_transaction_too_old && e.code() != error_code_future_version)
-				throw;
-			wait(tr.onError(e));
+			if (e.code() == error_code_transaction_too_old) {
+				// We are using this transaction until it's too old and then resetting to a fresh one,
+				// so we don't need to delay.
+				tr.fullReset();
+			}
+			else {
+				wait(tr.onError(e));
+			}
 		}
 	}
 }
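Note: both readCommitted overloads now apply the same pattern: transaction_too_old is the expected end-of-life signal for a deliberately long-lived read transaction, so it resets immediately with fullReset() and no backoff, while every other error goes through tr.onError() for the usual backoff-or-rethrow handling. A generic flow-style sketch of the pattern, for illustration only:

    loop {
        try {
            // ... read as much as possible at this transaction's version ...
        } catch (Error& e) {
            if (e.code() == error_code_transaction_too_old)
                tr.fullReset();       // expected: restart at a newer read version, no delay
            else
                wait(tr.onError(e));  // unexpected: back off, or rethrow if not retryable
        }
    }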
@@ -239,6 +239,8 @@ StatusObject DatabaseConfiguration::toJSON(bool noPolicies) const {
 		result["redundancy_mode"] = "triple";
 	} else if( tLogReplicationFactor == 4 && storageTeamSize == 3 && tlogInfo == "data_hall^2 x zoneid^2 x 1" && storageInfo == "data_hall^3 x 1" ) {
 		result["redundancy_mode"] = "three_data_hall";
+	} else if( tLogReplicationFactor == 4 && storageTeamSize == 2 && tlogInfo == "data_hall^2 x zoneid^2 x 1" && storageInfo == "data_hall^2 x 1" ) {
+		result["redundancy_mode"] = "three_data_hall_fallback";
 	} else {
 		customRedundancy = true;
 	}
@@ -147,6 +147,7 @@ public:
 	int64_t transactionsNotCommitted;
 	int64_t transactionsMaybeCommitted;
 	int64_t transactionsResourceConstrained;
+	int64_t transactionsProcessBehind;
 	ContinuousSample<double> latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit, bytesPerCommit;

 	int outstandingWatches;
@@ -169,7 +169,11 @@ ACTOR Future<Void> failureMonitorClientLoop(
 ACTOR Future<Void> failureMonitorClient( Reference<AsyncVar<Optional<struct ClusterInterface>>> ci, bool trackMyStatus ) {
 	state SimpleFailureMonitor* monitor = static_cast<SimpleFailureMonitor*>( &IFailureMonitor::failureMonitor() );
 	state Reference<FailureMonitorClientState> fmState = Reference<FailureMonitorClientState>(new FailureMonitorClientState());
-
+	auto localAddr = g_network->getLocalAddresses();
+	monitor->setStatus(localAddr.address, FailureStatus(false));
+	if(localAddr.secondaryAddress.present()) {
+		monitor->setStatus(localAddr.secondaryAddress.get(), FailureStatus(false));
+	}
 	loop {
 		state Future<Void> client = ci->get().present() ? failureMonitorClientLoop(monitor, ci->get().get(), fmState, trackMyStatus) : Void();
 		wait( ci->onChange() );
@@ -145,6 +145,13 @@ std::map<std::string, std::string> configForToken( std::string const& mode ) {
 		tLogPolicy = Reference<IReplicationPolicy>(new PolicyAcross(2, "data_hall",
 			Reference<IReplicationPolicy>(new PolicyAcross(2, "zoneid", Reference<IReplicationPolicy>(new PolicyOne())))
 		));
+	} else if(mode == "three_data_hall_fallback") {
+		redundancy="2";
+		log_replicas="4";
+		storagePolicy = Reference<IReplicationPolicy>(new PolicyAcross(2, "data_hall", Reference<IReplicationPolicy>(new PolicyOne())));
+		tLogPolicy = Reference<IReplicationPolicy>(new PolicyAcross(2, "data_hall",
+			Reference<IReplicationPolicy>(new PolicyAcross(2, "zoneid", Reference<IReplicationPolicy>(new PolicyOne())))
+		));
 	} else
 		redundancySpecified = false;
 	if (redundancySpecified) {
@@ -510,6 +517,12 @@ ConfigureAutoResult parseConfig( StatusObject const& status ) {
 	} else if( result.old_replication == "three_datacenter_fallback" ) {
 		storage_replication = 4;
 		log_replication = 4;
+	} else if( result.old_replication == "three_data_hall" ) {
+		storage_replication = 3;
+		log_replication = 4;
+	} else if( result.old_replication == "three_data_hall_fallback" ) {
+		storage_replication = 2;
+		log_replication = 4;
 	} else
 		return ConfigureAutoResult();

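Note: with configForToken and parseConfig both aware of the new mode, it should be reachable from fdbcli like any other redundancy mode. A sketch (the mode token comes from the configForToken hunk above; configure is the standard fdbcli command):

    fdbcli> configure three_data_hall_fallback

As the parseConfig branch shows, the fallback mode keeps four log replicas but drops storage replication to two, matching the storage_replication = 2 / log_replication = 4 pair.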
@@ -222,6 +222,7 @@ ACTOR Future<Void> databaseLogger( DatabaseContext *cx ) {
 			.detail("NotCommitted", cx->transactionsNotCommitted)
 			.detail("MaybeCommitted", cx->transactionsMaybeCommitted)
 			.detail("ResourceConstrained", cx->transactionsResourceConstrained)
+			.detail("ProcessBehind", cx->transactionsProcessBehind)
 			.detail("MeanLatency", cx->latencies.mean())
 			.detail("MedianLatency", cx->latencies.median())
 			.detail("Latency90", cx->latencies.percentile(0.90))
@@ -513,7 +514,7 @@ DatabaseContext::DatabaseContext(
 	lockAware(lockAware), apiVersion(apiVersion), provisional(false),
 	transactionReadVersions(0), transactionLogicalReads(0), transactionPhysicalReads(0), transactionCommittedMutations(0), transactionCommittedMutationBytes(0),
 	transactionsCommitStarted(0), transactionsCommitCompleted(0), transactionsTooOld(0), transactionsFutureVersions(0), transactionsNotCommitted(0),
-	transactionsMaybeCommitted(0), transactionsResourceConstrained(0), outstandingWatches(0), transactionTimeout(0.0), transactionMaxRetries(-1),
+	transactionsMaybeCommitted(0), transactionsResourceConstrained(0), transactionsProcessBehind(0), outstandingWatches(0), transactionTimeout(0.0), transactionMaxRetries(-1),
 	latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvCacheInsertLocation(0),
 	healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0)
 {
@@ -1487,8 +1488,9 @@ ACTOR Future< Void > watchValue( Future<Version> version, Key key, Optional<Valu
 			if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) {
 				cx->invalidateCache( key );
 				wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, info.taskID));
-			} else if( e.code() == error_code_watch_cancelled ) {
-				TEST( true ); // Too many watches on the storage server, poll for changes instead
+			} else if( e.code() == error_code_watch_cancelled || e.code() == error_code_process_behind ) {
+				TEST( e.code() == error_code_watch_cancelled ); // Too many watches on the storage server, poll for changes instead
+				TEST( e.code() == error_code_process_behind ); // The storage servers are all behind
 				wait(delay(CLIENT_KNOBS->WATCH_POLLING_TIME, info.taskID));
 			} else if ( e.code() == error_code_timed_out ) { //The storage server occasionally times out watches in case it was cancelled
 				TEST( true ); // A watch timed out
@@ -3028,7 +3030,8 @@ Future<Void> Transaction::onError( Error const& e ) {
 	if (e.code() == error_code_not_committed ||
 		e.code() == error_code_commit_unknown_result ||
 		e.code() == error_code_database_locked ||
-		e.code() == error_code_proxy_memory_limit_exceeded)
+		e.code() == error_code_proxy_memory_limit_exceeded ||
+		e.code() == error_code_process_behind)
 	{
 		if(e.code() == error_code_not_committed)
 			cx->transactionsNotCommitted++;
@@ -3036,6 +3039,8 @@ Future<Void> Transaction::onError( Error const& e ) {
 			cx->transactionsMaybeCommitted++;
 		if (e.code() == error_code_proxy_memory_limit_exceeded)
 			cx->transactionsResourceConstrained++;
+		if (e.code() == error_code_process_behind)
+			cx->transactionsProcessBehind++;

 		double backoff = getBackoff(e.code());
 		reset();
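Note: since Transaction::onError() now treats process_behind as retryable with backoff, the canonical client retry loop picks the change up without modification. A minimal flow-style sketch (needs the actor compiler; key and value are placeholders):

    ACTOR Future<Void> setValue(Database cx, Key key, Value value) {
        state Transaction tr(cx);
        loop {
            try {
                tr.set(key, value);
                wait(tr.commit());
                return Void();
            } catch (Error& e) {
                // Backs off and retries for retryable codes (now including
                // process_behind); rethrows everything else.
                wait(tr.onError(e));
            }
        }
    }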
@@ -458,7 +458,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
                     "triple",
                     "three_datacenter",
                     "three_datacenter_fallback",
-                    "three_data_hall"
+                    "three_data_hall",
+                    "three_data_hall_fallback"
                 ]},
                 "regions":[{
                     "datacenters":[{
@@ -667,7 +668,8 @@ const KeyRef JSONSchemas::clusterConfigurationSchema = LiteralStringRef(R"config
                     "triple",
                     "three_datacenter",
                     "three_datacenter_fallback",
-                    "three_data_hall"
+                    "three_data_hall",
+                    "three_data_hall_fallback"
                 ]},
                 "regions":[{
                     "datacenters":[{
@@ -138,6 +138,12 @@ FailureStatus SimpleFailureMonitor::getState( Endpoint const& endpoint ) {
 	}
 }

+FailureStatus SimpleFailureMonitor::getState( NetworkAddress const& address ) {
+	auto a = addressStatus.find(address);
+	if (a == addressStatus.end()) return FailureStatus();
+	else return a->second;
+}
+
 bool SimpleFailureMonitor::onlyEndpointFailed( Endpoint const& endpoint ) {
 	if(!endpointKnownFailed.get(endpoint))
 		return false;
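Note: the new address-level overload is what the transport layer uses later in this diff to ask whether the failure monitor still considers a peer healthy, without constructing an Endpoint. A usage sketch mirroring the Peer code below (peerAddress is a placeholder):

    // peerAddress is the NetworkAddress of an outgoing connection.
    FailureStatus st = IFailureMonitor::failureMonitor().getState(peerAddress);
    if (st.isAvailable()) {
        // The monitor still believes this peer is healthy; repeated connection
        // closes in this state are exactly what the degraded tracking counts.
    }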
@@ -84,6 +84,9 @@ public:
 	// Returns the currently known status for the endpoint
 	virtual FailureStatus getState( Endpoint const& endpoint ) = 0;

+	// Returns the currently known status for the address
+	virtual FailureStatus getState( NetworkAddress const& address ) = 0;
+
 	// Only use this function when the endpoint is known to be failed
 	virtual void endpointNotFound( Endpoint const& ) = 0;

@@ -130,6 +133,7 @@ public:

 	virtual Future<Void> onStateChanged( Endpoint const& endpoint );
 	virtual FailureStatus getState( Endpoint const& endpoint );
+	virtual FailureStatus getState( NetworkAddress const& address );
 	virtual Future<Void> onDisconnectOrFailure( Endpoint const& endpoint );
 	virtual bool onlyEndpointFailed( Endpoint const& endpoint );
 	virtual bool permanentlyFailed( Endpoint const& endpoint );
|
@ -149,7 +149,9 @@ public:
|
|||
lastIncompatibleMessage(0),
|
||||
transportId(transportId),
|
||||
numIncompatibleConnections(0)
|
||||
{}
|
||||
{
|
||||
degraded = Reference<AsyncVar<bool>>( new AsyncVar<bool>(false) );
|
||||
}
|
||||
|
||||
~TransportData();
|
||||
|
||||
|
@@ -170,6 +172,8 @@ public:
 	NetworkAddressList localAddresses;
 	std::vector<Future<Void>> listeners;
 	std::unordered_map<NetworkAddress, struct Peer*> peers;
+	std::unordered_map<NetworkAddress, std::pair<double, double>> closedPeers;
+	Reference<AsyncVar<bool>> degraded;
 	bool warnAlwaysForLargePacket;

 	// These declarations must be in exactly this order
@@ -483,6 +487,17 @@ struct Peer : NonCopyable {
 			TraceEvent(ok ? SevInfo : SevWarnAlways, "IncompatibleConnectionClosed", conn ? conn->getDebugID() : UID()).error(e, true).suppressFor(1.0).detail("PeerAddr", self->destination);
 		}

+		if(self->destination.isPublic() && IFailureMonitor::failureMonitor().getState(self->destination).isAvailable()) {
+			auto& it = self->transport->closedPeers[self->destination];
+			if(now() - it.second > FLOW_KNOBS->TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY) {
+				it.first = now();
+			} else if(now() - it.first > FLOW_KNOBS->TOO_MANY_CONNECTIONS_CLOSED_TIMEOUT) {
+				TraceEvent(SevWarnAlways, "TooManyConnectionsClosed", conn ? conn->getDebugID() : UID()).suppressFor(5.0).detail("PeerAddr", self->destination);
+				self->transport->degraded->set(true);
+			}
+			it.second = now();
+		}
+
 		if (conn) {
 			conn->close();
 			conn = Reference<IConnection>();
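Note: closedPeers maps each destination to (start of the current close streak, time of the last close). A close more than TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY after the previous one restarts the streak; if closes keep arriving within that delay for longer than TOO_MANY_CONNECTIONS_CLOSED_TIMEOUT while the failure monitor still reports the peer available, the transport flags itself degraded. A standalone sketch of the same sliding-window test, with the knob values from the FlowKnobs hunk later in this diff (5.0s reset delay, 20.0s timeout):

    struct CloseStreak { double first = 0, last = 0; };

    // Record one connection close at time 'now'; returns true when the
    // streak has lasted long enough to count as degraded.
    bool recordClose(CloseStreak& s, double now) {
        bool degraded = false;
        if (now - s.last > 5.0)
            s.first = now;            // previous streak expired; restart it
        else if (now - s.first > 20.0)
            degraded = true;          // closes kept coming for more than 20s
        s.last = now;
        return degraded;
    }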
@@ -1100,6 +1115,10 @@ int FlowTransport::getEndpointCount() {
 	return -1;
 }

+Reference<AsyncVar<bool>> FlowTransport::getDegraded() {
+	return self->degraded;
+}
+
 bool FlowTransport::incompatibleOutgoingConnectionsPresent() {
 	return self->numIncompatibleConnections > 0;
 }
@@ -143,6 +143,9 @@ public:
 	// Makes PacketID "unreliable" (either the data or a connection close event will be delivered
 	// eventually). It can still be used safely to send a reply to a "reliable" request.

+	Reference<AsyncVar<bool>> getDegraded();
+	// This async var will be set to true when the process cannot connect to a public network address that the failure monitor thinks is healthy.
+
 	void sendUnreliable( ISerializeSource const& what, const Endpoint& destination, bool openConnection = true );// { cancelReliable(sendReliable(what,destination)); }

 	int getEndpointCount();
@@ -108,7 +108,7 @@ bool checkAndProcessResult(ErrorOr<T> result, Reference<ModelHolder> holder, boo
 	}

 	if(triedAllOptions && errCode == error_code_process_behind) {
-		throw future_version();
+		throw result.getError();
 	}

 	return false;
@@ -310,12 +310,18 @@ public:
 	// See IFailureMonitor::onFailedFor() for an explanation of the duration and slope parameters.
 	template <class X>
 	Future<ErrorOr<REPLY_TYPE(X)>> getReplyUnlessFailedFor(const X& value, double sustainedFailureDuration, double sustainedFailureSlope, int taskID) const {
-		return waitValueOrSignal(getReply(value, taskID), makeDependent<T>(IFailureMonitor::failureMonitor()).onFailedFor(getEndpoint(taskID), sustainedFailureDuration, sustainedFailureSlope), getEndpoint(taskID));
+		// If it is local endpoint, no need for failure monitoring
+		return waitValueOrSignal(getReply(value, taskID),
+			makeDependent<T>(IFailureMonitor::failureMonitor()).onFailedFor(getEndpoint(taskID), sustainedFailureDuration, sustainedFailureSlope),
+			getEndpoint(taskID));
 	}

 	template <class X>
 	Future<ErrorOr<REPLY_TYPE(X)>> getReplyUnlessFailedFor(const X& value, double sustainedFailureDuration, double sustainedFailureSlope) const {
-		return waitValueOrSignal(getReply(value), makeDependent<T>(IFailureMonitor::failureMonitor()).onFailedFor(getEndpoint(), sustainedFailureDuration, sustainedFailureSlope), getEndpoint());
+		// If it is local endpoint, no need for failure monitoring
+		return waitValueOrSignal(getReply(value),
+			makeDependent<T>(IFailureMonitor::failureMonitor()).onFailedFor(getEndpoint(), sustainedFailureDuration, sustainedFailureSlope),
+			getEndpoint());
 	}

 	template <class X>
|
@ -75,7 +75,6 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
|
|||
init( DISK_QUEUE_FILE_SHRINK_BYTES, 100<<20 ); // BUGGIFYd per file within the DiskQueue
|
||||
init( TLOG_DEGRADED_DELAY_COUNT, 5 );
|
||||
init( TLOG_DEGRADED_DURATION, 5.0 );
|
||||
init( TLOG_DEGRADED_RESET_INTERVAL, 48*60*60 ); if ( randomize && BUGGIFY ) TLOG_DEGRADED_RESET_INTERVAL = 10;
|
||||
|
||||
// Data distribution queue
|
||||
init( HEALTH_POLL_TIME, 1.0 );
|
||||
|
@@ -417,6 +416,9 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
 	//Worker
 	init( WORKER_LOGGING_INTERVAL, 5.0 );
 	init( INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING, 5.0 );
+	init( DEGRADED_RESET_INTERVAL, 24*60*60 ); if ( randomize && BUGGIFY ) DEGRADED_RESET_INTERVAL = 10;
+	init( DEGRADED_WARNING_LIMIT, 1 );
+	init( DEGRADED_WARNING_RESET_DELAY, 7*24*60*60 );

 	// Test harness
 	init( WORKER_POLL_DELAY, 1.0 );
@@ -79,7 +79,6 @@ public:
 	int64_t DISK_QUEUE_FILE_SHRINK_BYTES; // When we shrink the disk queue, by how many bytes should it shrink?
 	int TLOG_DEGRADED_DELAY_COUNT;
 	double TLOG_DEGRADED_DURATION;
-	double TLOG_DEGRADED_RESET_INTERVAL;

 	// Data distribution queue
 	double HEALTH_POLL_TIME;
@@ -356,6 +355,9 @@ public:
 	//Worker
 	double WORKER_LOGGING_INTERVAL;
 	double INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING;
+	double DEGRADED_RESET_INTERVAL;
+	double DEGRADED_WARNING_LIMIT;
+	double DEGRADED_WARNING_RESET_DELAY;

 	// Test harness
 	double WORKER_POLL_DELAY;
@@ -1810,7 +1810,7 @@ ACTOR Future<Standalone<RangeResultRef>> tryGetRange( Database cx, Version versi
 			}
 		}
 	} catch( Error &e ) {
-		if( begin.getKey() != keys.begin && ( e.code() == error_code_transaction_too_old || e.code() == error_code_future_version ) ) {
+		if( begin.getKey() != keys.begin && ( e.code() == error_code_transaction_too_old || e.code() == error_code_future_version || e.code() == error_code_process_behind ) ) {
 			if( e.code() == error_code_transaction_too_old )
 				*isTooOld = true;
 			output.more = true;
@@ -2014,8 +2014,8 @@ ACTOR Future<Void> fetchKeys( StorageServer *data, AddingShard* shard ) {
 					debug_nextRetryToLog += std::min(debug_nextRetryToLog, 1024);
 					TraceEvent(SevWarn, "FetchPast", data->thisServerID).detail("TotalAttempts", debug_getRangeRetries).detail("FKID", interval.pairID).detail("V", lastFV).detail("N", fetchVersion).detail("E", data->version.get());
 				}
-			} else if (e.code() == error_code_future_version) {
-				TEST(true); // fetchKeys got future_version, so there must be a huge storage lag somewhere. Keep trying.
+			} else if (e.code() == error_code_future_version || e.code() == error_code_process_behind) {
+				TEST(true); // fetchKeys got future_version or process_behind, so there must be a huge storage lag somewhere. Keep trying.
 			} else {
 				throw;
 			}
@@ -625,7 +625,7 @@ ACTOR Future<Void> workerServer( Reference<ClusterConnectionFile> connFile, Refe
 	state WorkerCache<InitializeStorageReply> storageCache;
 	state Reference<AsyncVar<ServerDBInfo>> dbInfo( new AsyncVar<ServerDBInfo>(ServerDBInfo()) );
 	state Future<Void> metricsLogger;
-	state Reference<AsyncVar<bool>> degraded( new AsyncVar<bool>(false) );
+	state Reference<AsyncVar<bool>> degraded = FlowTransport::transport().getDegraded();
 	// tLogFnForOptions() can return a function that doesn't correspond with the FDB version that the
 	// TLogVersion represents. This can be done if the newer TLog doesn't support a requested option.
 	// As (store type, spill type) can map to the same TLogFn across multiple TLogVersions, we need to
@@ -652,7 +652,7 @@ ACTOR Future<Void> workerServer( Reference<ClusterConnectionFile> connFile, Refe
 		}
 	}

-	errorForwarders.add( resetAfter(degraded, SERVER_KNOBS->TLOG_DEGRADED_RESET_INTERVAL, false));
+	errorForwarders.add( resetAfter(degraded, SERVER_KNOBS->DEGRADED_RESET_INTERVAL, false, SERVER_KNOBS->DEGRADED_WARNING_LIMIT, SERVER_KNOBS->DEGRADED_WARNING_RESET_DELAY, "DegradedReset"));
 	errorForwarders.add( loadedPonger( interf.debugPing.getFuture() ) );
 	errorForwarders.add( waitFailureServer( interf.waitFailure.getFuture() ) );
 	errorForwarders.add( monitorServerDBInfo( ccInterface, connFile, locality, dbInfo ) );
@@ -288,7 +288,8 @@ struct ConsistencyCheckWorkload : TestWorkload
 			}
 			catch(Error &e)
 			{
-				if(e.code() == error_code_transaction_too_old || e.code() == error_code_future_version || e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || e.code() == error_code_server_request_queue_full)
+				if (e.code() == error_code_transaction_too_old || e.code() == error_code_future_version ||
+					e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || e.code() == error_code_process_behind)
 					TraceEvent("ConsistencyCheck_Retry").error(e); // FIXME: consistency check does not retry in this case
 				else
 					self->testFailure(format("Error %d - %s", e.code(), e.name()));
@@ -387,6 +388,7 @@ struct ConsistencyCheckWorkload : TestWorkload
 		state Key beginKey = allKeys.begin.withPrefix(keyServersPrefix);
 		state Key endKey = allKeys.end.withPrefix(keyServersPrefix);
 		state int i = 0;
+		state Transaction onErrorTr(cx); // This transaction exists only to access onError and its backoff behavior

 		//If the responses are too big, we may use multiple requests to get the key locations. Each request begins where the last left off
 		for ( ; i < shards.size(); i++) {
@@ -466,11 +468,9 @@ struct ConsistencyCheckWorkload : TestWorkload
 					keyLocations.push_back_deep(keyLocations.arena(), currentLocations.end()[-1]);
 				}
 				catch (Error& e) {
-					//If we failed because of a version problem, then retry
-					if(e.code() == error_code_transaction_too_old || e.code() == error_code_future_version || e.code() == error_code_transaction_too_old)
-						TraceEvent("ConsistencyCheck_RetryGetKeyLocations").error(e);
-					else
-						throw;
+					state Error err = e;
+					wait(onErrorTr.onError(err));
+					TraceEvent("ConsistencyCheck_RetryGetKeyLocations").error(err);
 				}
 			}
 		}
@@ -713,6 +713,7 @@ struct ConsistencyCheckWorkload : TestWorkload
 		state int64_t totalReadAmount = 0;

 		state KeySelector begin = firstGreaterOrEqual(range.begin);
+		state Transaction onErrorTr(cx); // This transaction exists only to access onError and its backoff behavior

 		//Read a limited number of entries at a time, repeating until all keys in the shard have been read
 		loop
@@ -933,11 +934,9 @@ struct ConsistencyCheckWorkload : TestWorkload
 			}
 			catch(Error &e)
 			{
-				//If we failed because of a version problem, then retry
-				if(e.code() == error_code_transaction_too_old || e.code() == error_code_future_version || e.code() == error_code_transaction_too_old)
-					TraceEvent("ConsistencyCheck_RetryDataConsistency").error(e);
-				else
-					throw;
+				state Error err = e;
+				wait(onErrorTr.onError(err));
+				TraceEvent("ConsistencyCheck_RetryDataConsistency").error(err);
 			}
 		}

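Note: the old code filtered a hand-maintained list of retryable codes (with a duplicated transaction_too_old test) and never backed off. Routing the error through the dedicated onErrorTr transaction delegates both the retryable-or-fatal decision and the backoff to Transaction::onError(). The pattern in isolation, as a flow-style sketch (doWork is a hypothetical body issuing the reads):

    state Transaction onErrorTr(cx);  // exists only for onError's backoff behavior
    loop {
        try {
            wait(doWork());                // hypothetical read logic
            break;
        } catch (Error& e) {
            state Error err = e;           // keep the error across the wait
            wait(onErrorTr.onError(err));  // rethrows if not retryable, else backs off
        }
    }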
@@ -58,7 +58,8 @@ struct ExceptionContract {
 			e.code() == error_code_future_version ||
 			e.code() == error_code_transaction_cancelled ||
 			e.code() == error_code_key_too_large ||
-			e.code() == error_code_value_too_large)
+			e.code() == error_code_value_too_large ||
+			e.code() == error_code_process_behind)
 		{
 			return;
 		}
@@ -64,6 +64,8 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) {
 	init( RECONNECTION_TIME_GROWTH_RATE, 1.2 );
 	init( RECONNECTION_RESET_TIME, 5.0 );
 	init( CONNECTION_ACCEPT_DELAY, 0.01 );
+	init( TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY, 5.0 );
+	init( TOO_MANY_CONNECTIONS_CLOSED_TIMEOUT, 20.0 );

 	init( TLS_CERT_REFRESH_DELAY_SECONDS, 12*60*60 );

@@ -95,6 +95,8 @@ public:
 	int64_t BUGGIFY_SIM_PAGE_CACHE_64K;
 	int MAX_EVICT_ATTEMPTS;
 	double PAGE_CACHE_TRUNCATE_LOOKUP_FRACTION;
+	double TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY;
+	int TOO_MANY_CONNECTIONS_CLOSED_TIMEOUT;

 	//AsyncFileKAIO
 	int MAX_OUTSTANDING;
@@ -615,7 +615,7 @@ void getNetworkTraffic(const IPAddress& ip, uint64_t& bytesSent, uint64_t& bytes
 		snmp_stream >> retransSegs;
 }

-void getMachineLoad(uint64_t& idleTime, uint64_t& totalTime) {
+void getMachineLoad(uint64_t& idleTime, uint64_t& totalTime, bool logDetails) {
 	INJECT_FAULT( platform_error, "getMachineLoad" ); // Even though this function doesn't throw errors, the equivalents for other platforms do, and since all of our simulation testing is on Linux...
 	std::ifstream stat_stream("/proc/stat", std::ifstream::in);

@@ -628,7 +628,7 @@ void getMachineLoad(uint64_t& idleTime, uint64_t& totalTime) {
 	totalTime = t_user+t_nice+t_system+t_idle+t_iowait+t_irq+t_softirq+t_steal+t_guest;
 	idleTime = t_idle+t_iowait;

-	if( !DEBUG_DETERMINISM )
+	if( !DEBUG_DETERMINISM && logDetails )
 		TraceEvent("MachineLoadDetail").detail("User", t_user).detail("Nice", t_nice).detail("System", t_system).detail("Idle", t_idle).detail("IOWait", t_iowait).detail("IRQ", t_irq).detail("SoftIRQ", t_softirq).detail("Steal", t_steal).detail("Guest", t_guest);
 }

@@ -818,7 +818,7 @@ void getNetworkTraffic(const IPAddress& ip, uint64_t& bytesSent, uint64_t& bytes
 	free(buf);
 }

-void getMachineLoad(uint64_t& idleTime, uint64_t& totalTime) {
+void getMachineLoad(uint64_t& idleTime, uint64_t& totalTime, bool logDetails) {
 	INJECT_FAULT( platform_error, "getMachineLoad" );
 	mach_msg_type_number_t count = HOST_CPU_LOAD_INFO_COUNT;
 	host_cpu_load_info_data_t r_load;
@@ -1103,7 +1103,7 @@ void initPdhStrings(SystemStatisticsState *state, std::string dataFolder) {
 }
 #endif

-SystemStatistics getSystemStatistics(std::string dataFolder, const IPAddress* ip, SystemStatisticsState** statState) {
+SystemStatistics getSystemStatistics(std::string dataFolder, const IPAddress* ip, SystemStatisticsState** statState, bool logDetails) {
 	if( (*statState) == NULL )
 		(*statState) = new SystemStatisticsState();
 	SystemStatistics returnStats;
@@ -1238,7 +1238,7 @@ SystemStatistics getSystemStatistics(std::string dataFolder, const IPAddress* ip
 	uint64_t clockIdleTime = (*statState)->lastClockIdleTime;
 	uint64_t clockTotalTime = (*statState)->lastClockTotalTime;

-	getMachineLoad(clockIdleTime, clockTotalTime);
+	getMachineLoad(clockIdleTime, clockTotalTime, logDetails);
 	returnStats.machineCPUSeconds = clockTotalTime - (*statState)->lastClockTotalTime != 0 ? ( 1 - ((clockIdleTime - (*statState)->lastClockIdleTime) / ((double)(clockTotalTime - (*statState)->lastClockTotalTime)))) * returnStats.elapsed : 0;
 	(*statState)->lastClockIdleTime = clockIdleTime;
 	(*statState)->lastClockTotalTime = clockTotalTime;
@@ -247,7 +247,7 @@ struct SystemStatisticsState;

 struct IPAddress;

-SystemStatistics getSystemStatistics(std::string dataFolder, const IPAddress* ip, SystemStatisticsState **statState);
+SystemStatistics getSystemStatistics(std::string dataFolder, const IPAddress* ip, SystemStatisticsState **statState, bool logDetails);

 double getProcessorTimeThread();

@@ -272,7 +272,7 @@ void getNetworkTraffic(uint64_t& bytesSent, uint64_t& bytesReceived, uint64_t& o

 void getDiskStatistics(std::string const& directory, uint64_t& currentIOs, uint64_t& busyTicks, uint64_t& reads, uint64_t& writes, uint64_t& writeSectors);

-void getMachineLoad(uint64_t& idleTime, uint64_t& totalTime);
+void getMachineLoad(uint64_t& idleTime, uint64_t& totalTime, bool logDetails);

 double timer(); // Returns the system real time clock with high precision. May jump around when system time is adjusted!
 double timer_monotonic(); // Returns a high precision monotonic clock which is adjusted to be kind of similar to timer() at startup, but might not be a globally accurate time.
@@ -45,7 +45,7 @@ SystemStatistics getSystemStatistics() {
 	static StatisticsState statState = StatisticsState();
 	const IPAddress ipAddr = machineState.ip.present() ? machineState.ip.get() : IPAddress();
 	return getSystemStatistics(
-		machineState.folder.present() ? machineState.folder.get() : "", &ipAddr, &statState.systemState);
+		machineState.folder.present() ? machineState.folder.get() : "", &ipAddr, &statState.systemState, false);
 }

 #define TRACEALLOCATOR( size ) TraceEvent("MemSample").detail("Count", FastAllocator<size>::getApproximateMemoryUnused()/size).detail("TotalSize", FastAllocator<size>::getApproximateMemoryUnused()).detail("SampleCount", 1).detail("Hash", "FastAllocatedUnused" #size ).detail("Bt", "na")
@@ -54,7 +54,7 @@ SystemStatistics getSystemStatistics() {
 SystemStatistics customSystemMonitor(std::string eventName, StatisticsState *statState, bool machineMetrics) {
 	const IPAddress ipAddr = machineState.ip.present() ? machineState.ip.get() : IPAddress();
 	SystemStatistics currentStats = getSystemStatistics(machineState.folder.present() ? machineState.folder.get() : "",
-														&ipAddr, &statState->systemState);
+														&ipAddr, &statState->systemState, true);
 	NetworkData netData;
 	netData.init();
 	if (!DEBUG_DETERMINISM && currentStats.initialized) {
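Note: the new logDetails flag lets the periodic customSystemMonitor() pass keep emitting MachineLoadDetail (true) while the lightweight getSystemStatistics() overload samples silently (false). A caller-side sketch:

    // One-off sample without emitting a MachineLoadDetail trace event:
    uint64_t idleTime = 0, totalTime = 0;
    getMachineLoad(idleTime, totalTime, /*logDetails=*/false);
    // Callers wanting utilization diff two successive samples, as the
    // machineCPUSeconds computation above does.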
@@ -776,13 +776,23 @@ Future<Void> setAfter( Reference<AsyncVar<T>> var, double time, T val ) {
 }

 ACTOR template <class T>
-Future<Void> resetAfter( Reference<AsyncVar<T>> var, double time, T val ) {
+Future<Void> resetAfter( Reference<AsyncVar<T>> var, double time, T val, int warningLimit = -1, double warningResetDelay = 0, const char* context = NULL ) {
 	state bool isEqual = var->get() == val;
 	state Future<Void> resetDelay = isEqual ? Never() : delay(time);
+	state int resetCount = 0;
+	state double lastReset = now();
 	loop {
 		choose {
 			when( wait( resetDelay ) ) {
 				var->set( val );
+				if(now() - lastReset > warningResetDelay) {
+					resetCount = 0;
+				}
+				resetCount++;
+				if(context && warningLimit >= 0 && resetCount > warningLimit) {
+					TraceEvent(SevWarnAlways, context).detail("ResetCount", resetCount).detail("LastReset", now() - lastReset);
+				}
+				lastReset = now();
 				isEqual = true;
 				resetDelay = Never();
 			}
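Note: the extra parameters are exactly what workerServer() wires up earlier in this diff. With warningLimit = 1, a second reset inside the warningResetDelay window (7 days by default) emits the SevWarnAlways "DegradedReset" trace, while the defaults (-1, 0, NULL) keep every other caller's behavior unchanged:

    // As wired up in workerServer() earlier in this diff:
    errorForwarders.add( resetAfter(degraded, SERVER_KNOBS->DEGRADED_RESET_INTERVAL, false,
                                    SERVER_KNOBS->DEGRADED_WARNING_LIMIT,
                                    SERVER_KNOBS->DEGRADED_WARNING_RESET_DELAY, "DegradedReset") );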
@@ -32,7 +32,7 @@

 <Wix xmlns='http://schemas.microsoft.com/wix/2006/wi'>
     <Product Name='$(var.Title)'
-            Id='{1A6617BB-F6FE-48AE-B5FA-161D7BA3D9CD}'
+            Id='{1F036D0A-3560-4A5C-BD40-F1B254876257}'
             UpgradeCode='{A95EA002-686E-4164-8356-C715B7F8B1C8}'
             Version='$(var.Version)'
             Manufacturer='$(var.Manufacturer)'