Merge pull request #2776 from etschannen/feature-dd-region-queue

When configured with multiple regions, the DD queue could start too many relocations.
Evan Tschannen 2020-03-04 18:42:39 -08:00 committed by GitHub
commit 93becf1986
7 changed files with 48 additions and 30 deletions


@@ -10,38 +10,38 @@ macOS
The macOS installation package is supported on macOS 10.7+. It includes the client and (optionally) the server.
* `FoundationDB-6.2.17.pkg <https://www.foundationdb.org/downloads/6.2.17/macOS/installers/FoundationDB-6.2.17.pkg>`_
* `FoundationDB-6.2.18.pkg <https://www.foundationdb.org/downloads/6.2.18/macOS/installers/FoundationDB-6.2.18.pkg>`_
Ubuntu
------
The Ubuntu packages are supported on 64-bit Ubuntu 12.04+, but beware of the Linux kernel bug in Ubuntu 12.x.
* `foundationdb-clients-6.2.17-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.17/ubuntu/installers/foundationdb-clients_6.2.17-1_amd64.deb>`_
* `foundationdb-server-6.2.17-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.17/ubuntu/installers/foundationdb-server_6.2.17-1_amd64.deb>`_ (depends on the clients package)
* `foundationdb-clients-6.2.18-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.18/ubuntu/installers/foundationdb-clients_6.2.18-1_amd64.deb>`_
* `foundationdb-server-6.2.18-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.18/ubuntu/installers/foundationdb-server_6.2.18-1_amd64.deb>`_ (depends on the clients package)
RHEL/CentOS EL6
---------------
The RHEL/CentOS EL6 packages are supported on 64-bit RHEL/CentOS 6.x.
* `foundationdb-clients-6.2.17-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.17/rhel6/installers/foundationdb-clients-6.2.17-1.el6.x86_64.rpm>`_
* `foundationdb-server-6.2.17-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.17/rhel6/installers/foundationdb-server-6.2.17-1.el6.x86_64.rpm>`_ (depends on the clients package)
* `foundationdb-clients-6.2.18-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.18/rhel6/installers/foundationdb-clients-6.2.18-1.el6.x86_64.rpm>`_
* `foundationdb-server-6.2.18-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.18/rhel6/installers/foundationdb-server-6.2.18-1.el6.x86_64.rpm>`_ (depends on the clients package)
RHEL/CentOS EL7
---------------
The RHEL/CentOS EL7 packages are supported on 64-bit RHEL/CentOS 7.x.
* `foundationdb-clients-6.2.17-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.17/rhel7/installers/foundationdb-clients-6.2.17-1.el7.x86_64.rpm>`_
* `foundationdb-server-6.2.17-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.17/rhel7/installers/foundationdb-server-6.2.17-1.el7.x86_64.rpm>`_ (depends on the clients package)
* `foundationdb-clients-6.2.18-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.18/rhel7/installers/foundationdb-clients-6.2.18-1.el7.x86_64.rpm>`_
* `foundationdb-server-6.2.18-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.18/rhel7/installers/foundationdb-server-6.2.18-1.el7.x86_64.rpm>`_ (depends on the clients package)
Windows
-------
The Windows installer is supported on 64-bit Windows XP and later. It includes the client and (optionally) the server.
* `foundationdb-6.2.17-x64.msi <https://www.foundationdb.org/downloads/6.2.17/windows/installers/foundationdb-6.2.17-x64.msi>`_
* `foundationdb-6.2.18-x64.msi <https://www.foundationdb.org/downloads/6.2.18/windows/installers/foundationdb-6.2.18-x64.msi>`_
API Language Bindings
=====================
@@ -58,18 +58,18 @@ On macOS and Windows, the FoundationDB Python API bindings are installed as part
If you need to use the FoundationDB Python API from other Python installations or paths, download the Python package:
* `foundationdb-6.2.17.tar.gz <https://www.foundationdb.org/downloads/6.2.17/bindings/python/foundationdb-6.2.17.tar.gz>`_
* `foundationdb-6.2.18.tar.gz <https://www.foundationdb.org/downloads/6.2.18/bindings/python/foundationdb-6.2.18.tar.gz>`_
Ruby 1.9.3/2.0.0+
-----------------
* `fdb-6.2.17.gem <https://www.foundationdb.org/downloads/6.2.17/bindings/ruby/fdb-6.2.17.gem>`_
* `fdb-6.2.18.gem <https://www.foundationdb.org/downloads/6.2.18/bindings/ruby/fdb-6.2.18.gem>`_
Java 8+
-------
* `fdb-java-6.2.17.jar <https://www.foundationdb.org/downloads/6.2.17/bindings/java/fdb-java-6.2.17.jar>`_
* `fdb-java-6.2.17-javadoc.jar <https://www.foundationdb.org/downloads/6.2.17/bindings/java/fdb-java-6.2.17-javadoc.jar>`_
* `fdb-java-6.2.18.jar <https://www.foundationdb.org/downloads/6.2.18/bindings/java/fdb-java-6.2.18.jar>`_
* `fdb-java-6.2.18-javadoc.jar <https://www.foundationdb.org/downloads/6.2.18/bindings/java/fdb-java-6.2.18-javadoc.jar>`_
Go 1.11+
--------


@@ -5,6 +5,18 @@ Release Notes
6.2.18
======
Fixes
-----
* When configuring a cluster to usable_regions=2, data distribution would not react to machine failures while copying data to the remote region. `(PR #2774) <https://github.com/apple/foundationdb/pull/2774>`_.
* When a cluster is configured with usable_regions=2, data distribution could push the cluster into saturation by relocating too many shards simultaneously. `(PR #2776) <https://github.com/apple/foundationdb/pull/2776>`_.
* Backup could not establish TLS connections (broken in 6.2.16). `(PR #2775) <https://github.com/apple/foundationdb/pull/2775>`_.
Performance
-----------
* Improved the efficiency of establishing large numbers of network connections. `(PR #2777) <https://github.com/apple/foundationdb/pull/2777>`_.
Features
--------
@@ -21,7 +33,7 @@ Other Changes
Fixes
-----
* Restored the ability to set TLS configuration using environment variables. `(PR #2755) <https://github.com/apple/foundationdb/pull/2755>`_.
* Restored the ability to set TLS configuration using environment variables (broken in 6.2.16). `(PR #2755) <https://github.com/apple/foundationdb/pull/2755>`_.
6.2.16
======


@@ -4282,7 +4282,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self)
actors.push_back( pollMoveKeysLock(cx, lock) );
actors.push_back( reportErrorsExcept( dataDistributionTracker( initData, cx, output, shardsAffectedByTeamFailure, getShardMetrics, getAverageShardBytes.getFuture(), readyToStart, anyZeroHealthyTeams, self->ddId ), "DDTracker", self->ddId, &normalDDQueueErrors() ) );
actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, self->ddId, storageTeamSize, &lastLimited ), "DDQueue", self->ddId, &normalDDQueueErrors() ) );
actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, self->ddId, storageTeamSize, configuration.storageTeamSize, &lastLimited ), "DDQueue", self->ddId, &normalDDQueueErrors() ) );
vector<DDTeamCollection*> teamCollectionsPtrs;
Reference<DDTeamCollection> primaryTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, primaryDcId, configuration.usableRegions > 1 ? remoteDcIds : std::vector<Optional<Key>>(), readyToStart.getFuture(), zeroHealthyTeams[0], true, processingUnhealthy) );
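// A minimal sketch (not part of this diff) of how the two team-size arguments
// relate: `storageTeamSize` above is the total replica count across all regions,
// while `configuration.storageTeamSize` is the per-region team size. The doubling
// under usable_regions > 1 is an assumption based on the surrounding code.
//
//   int singleRegionTeamSize = configuration.storageTeamSize;    // e.g. 3 for triple
//   int storageTeamSize = configuration.usableRegions > 1
//                             ? 2 * singleRegionTeamSize         // primary + remote replicas
//                             : singleRegionTeamSize;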


@@ -204,6 +204,7 @@ Future<Void> dataDistributionQueue(
PromiseStream<Promise<int64_t>> const& getAverageShardBytes,
UID const& distributorId,
int const& teamSize,
int const& singleRegionTeamSize,
double* const& lastLimited);
//Holds the permitted size and IO Bounds for a shard


@@ -286,29 +286,30 @@ struct Busyness {
};
// find the "workFactor" for this, were it launched now
int getWorkFactor( RelocateData const& relocation ) {
// Avoid the divide by 0!
ASSERT( relocation.src.size() );
int getWorkFactor( RelocateData const& relocation, int singleRegionTeamSize ) {
if( relocation.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT || relocation.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT )
return WORK_FULL_UTILIZATION / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER;
else if( relocation.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT )
return WORK_FULL_UTILIZATION / 2 / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER;
else // for now, assume that any relocation at a lower priority still has a full team available for the work
return WORK_FULL_UTILIZATION / relocation.src.size() / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER;
return WORK_FULL_UTILIZATION / singleRegionTeamSize / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER;
}
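// Worked example (a sketch; the constants below are assumed defaults, not taken
// from this diff): with WORK_FULL_UTILIZATION = 10000,
// RELOCATION_PARALLELISM_PER_SOURCE_SERVER = 2, and singleRegionTeamSize = 3:
//   PRIORITY_TEAM_0_LEFT / PRIORITY_TEAM_1_LEFT -> 10000 / 2     = 5000 (2 relocations per server)
//   PRIORITY_TEAM_2_LEFT                        -> 10000 / 2 / 2 = 2500 (4 relocations per server)
//   all lower priorities                        -> 10000 / 3 / 2 = 1666 (6 relocations per server)
// The old code divided by relocation.src.size() instead; with usable_regions=2 the
// source set can span both regions, doubling the divisor and therefore doubling the
// number of relocations each source server could be assigned.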
// Data movement's resource control: Do not overload source servers used for the RelocateData
// return true if servers are not too busy to launch the relocation
bool canLaunch( RelocateData & relocation, int teamSize, std::map<UID, Busyness> & busymap,
bool canLaunch( RelocateData & relocation, int teamSize, int singleRegionTeamSize, std::map<UID, Busyness> & busymap,
std::vector<RelocateData> cancellableRelocations ) {
// assert this has not already been launched
ASSERT( relocation.workFactor == 0 );
ASSERT( relocation.src.size() != 0 );
ASSERT( teamSize >= singleRegionTeamSize );
// find the "workFactor" for this, were it launched now
int workFactor = getWorkFactor( relocation );
int neededServers = std::max( 1, (int)relocation.src.size() - teamSize + 1 );
int workFactor = getWorkFactor( relocation, singleRegionTeamSize );
int neededServers = std::min<int>( relocation.src.size(), teamSize - singleRegionTeamSize + 1 );
if(SERVER_KNOBS->USE_OLD_NEEDED_SERVERS) {
neededServers = std::max( 1, (int)relocation.src.size() - teamSize + 1 );
}
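// Worked example of the formula change (the concrete numbers are illustrative
// assumptions): with usable_regions=2, teamSize = 6, singleRegionTeamSize = 3,
// and relocation.src.size() = 6:
//   old: neededServers = max(1, 6 - 6 + 1) = 1 -> one idle source server suffices
//   new: neededServers = min(6, 6 - 3 + 1) = 4 -> four sources must have spare capacity
// Requiring only one non-busy source let almost every queued relocation launch at
// once, which is how the queue could push a multi-region cluster into saturation.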
// see if each of the SS can launch this task
for( int i = 0; i < relocation.src.size(); i++ ) {
// For each source server for this relocation, copy and modify its busyness to reflect work that WOULD be cancelled
@@ -329,9 +330,9 @@ bool canLaunch( RelocateData & relocation, int teamSize, std::map<UID, Busyness>
}
// update busyness for each server
void launch( RelocateData & relocation, std::map<UID, Busyness> & busymap ) {
void launch( RelocateData & relocation, std::map<UID, Busyness> & busymap, int singleRegionTeamSize ) {
// if we are here this means that we can launch and should adjust all the work the servers can do
relocation.workFactor = getWorkFactor( relocation );
relocation.workFactor = getWorkFactor( relocation, singleRegionTeamSize );
for( int i = 0; i < relocation.src.size(); i++ )
busymap[ relocation.src[i] ].addWork( relocation.priority, relocation.workFactor );
}
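// Descriptive note (inferred from context, not shown in this diff): launch()
// charges workFactor against every source server's busyness, and canLaunch()
// only admits a relocation if at least neededServers of its sources can absorb
// that extra work, so a larger workFactor directly lowers per-server parallelism.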
@@ -360,6 +361,7 @@ struct DDQueueData {
int queuedRelocations;
int64_t bytesWritten;
int teamSize;
int singleRegionTeamSize;
std::map<UID, Busyness> busymap;
@@ -416,10 +418,10 @@ struct DDQueueData {
DDQueueData( UID mid, MoveKeysLock lock, Database cx, std::vector<TeamCollectionInterface> teamCollections,
Reference<ShardsAffectedByTeamFailure> sABTF, PromiseStream<Promise<int64_t>> getAverageShardBytes,
int teamSize, PromiseStream<RelocateShard> output, FutureStream<RelocateShard> input, PromiseStream<GetMetricsRequest> getShardMetrics, double* lastLimited ) :
int teamSize, int singleRegionTeamSize, PromiseStream<RelocateShard> output, FutureStream<RelocateShard> input, PromiseStream<GetMetricsRequest> getShardMetrics, double* lastLimited ) :
activeRelocations( 0 ), queuedRelocations( 0 ), bytesWritten ( 0 ), teamCollections( teamCollections ),
shardsAffectedByTeamFailure( sABTF ), getAverageShardBytes( getAverageShardBytes ), distributorId( mid ), lock( lock ),
cx( cx ), teamSize( teamSize ), output( output ), input( input ), getShardMetrics( getShardMetrics ), startMoveKeysParallelismLock( SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM ),
cx( cx ), teamSize( teamSize ), singleRegionTeamSize( singleRegionTeamSize ), output( output ), input( input ), getShardMetrics( getShardMetrics ), startMoveKeysParallelismLock( SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM ),
finishMoveKeysParallelismLock( SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM ), lastLimited(lastLimited),
suppressIntervals(0), lastInterval(0), unhealthyRelocations(0), rawProcessingUnhealthy( new AsyncVar<bool>(false) ) {}
@@ -816,7 +818,7 @@ struct DDQueueData {
// Data movement avoids overloading source servers in moving data.
// SOMEDAY: the list of source servers may be outdated since they were fetched when the work was put in the queue
// FIXME: we need spare capacity even when we're just going to be cancelling work via TEAM_HEALTHY
if( !canLaunch( rd, teamSize, busymap, cancellableRelocations ) ) {
if( !canLaunch( rd, teamSize, singleRegionTeamSize, busymap, cancellableRelocations ) ) {
//logRelocation( rd, "SkippingQueuedRelocation" );
continue;
}
@@ -854,7 +856,7 @@ struct DDQueueData {
RelocateData& rrs = inFlight.rangeContaining(ranges[r].begin)->value();
rrs.keys = ranges[r];
launch( rrs, busymap );
launch( rrs, busymap, singleRegionTeamSize );
activeRelocations++;
startRelocation(rrs.priority, rrs.healthPriority);
inFlightActors.insert( rrs.keys, dataDistributionRelocator( this, rrs ) );
@@ -1410,9 +1412,10 @@ ACTOR Future<Void> dataDistributionQueue(
PromiseStream<Promise<int64_t>> getAverageShardBytes,
UID distributorId,
int teamSize,
int singleRegionTeamSize,
double* lastLimited)
{
state DDQueueData self( distributorId, lock, cx, teamCollections, shardsAffectedByTeamFailure, getAverageShardBytes, teamSize, output, input, getShardMetrics, lastLimited );
state DDQueueData self( distributorId, lock, cx, teamCollections, shardsAffectedByTeamFailure, getAverageShardBytes, teamSize, singleRegionTeamSize, output, input, getShardMetrics, lastLimited );
state std::set<UID> serversToLaunchFrom;
state KeyRange keysToLaunchFrom;
state RelocateData launchData;


@@ -104,6 +104,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs, bool isSimula
init( INFLIGHT_PENALTY_HEALTHY, 1.0 );
init( INFLIGHT_PENALTY_UNHEALTHY, 500.0 );
init( INFLIGHT_PENALTY_ONE_LEFT, 1000.0 );
init( USE_OLD_NEEDED_SERVERS, false );
init( PRIORITY_RECOVER_MOVE, 110 );
init( PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, 120 );
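// The new neededServers formula is on by default (USE_OLD_NEEDED_SERVERS = false);
// the knob is an escape hatch back to the old behavior. A hedged usage sketch,
// relying on the general knob_<name> override mechanism in foundationdb.conf
// (verify the exact spelling for your version):
//
//   [fdbserver]
//   knob_use_old_needed_servers = true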


@@ -104,7 +104,8 @@ public:
double INFLIGHT_PENALTY_REDUNDANT;
double INFLIGHT_PENALTY_UNHEALTHY;
double INFLIGHT_PENALTY_ONE_LEFT;
bool USE_OLD_NEEDED_SERVERS;
// Higher priorities are executed first
// Priority/100 is the "priority group"/"superpriority". Priority inversion
// is possible within but not between priority groups; fewer priority groups