/*
 * MoveKeys.actor.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "flow/Util.h"
#include "fdbrpc/FailureMonitor.h"
#include "fdbclient/SystemData.h"
#include "fdbserver/MoveKeys.actor.h"
#include "fdbserver/Knobs.h"
#include "flow/actorcompiler.h" // This must be the last #include.

using std::min;
using std::max;

// in-memory flag to disable DD
bool ddEnabled = true;
UID ddEnabledStatusUID = UID();

bool isDDEnabled() {
    return ddEnabled;
}

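// Enables or disables data distribution. The snapshot UID must be non-zero: disabling fails if a
// disable is already in progress, and re-enabling succeeds only when called with the same snapUID
// that performed the disable. Returns true if the state change was applied.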
bool setDDEnabled(bool status, UID snapUID) {
    TraceEvent("SetDDEnabled")
        .detail("Status", status)
        .detail("SnapUID", snapUID);
    ASSERT(snapUID != UID());
    if (!status) {
        // disabling DD
        if (ddEnabledStatusUID != UID()) {
            // disabling DD is not allowed while another disable is already in progress
            return false;
        }
        ddEnabled = status;
        ddEnabledStatusUID = snapUID;
        return true;
    }
    // enabling DD
    if (snapUID != ddEnabledStatusUID) {
        // enabling DD is not allowed if the UID does not match the disable request
        return false;
    }
    // reset to default status
    ddEnabled = status;
    ddEnabledStatusUID = UID();
    return true;
}

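// Acquires the MoveKeys lock by recording the current owner/write UIDs and writing a new owner UID.
// The returned MoveKeysLock is later validated by checkMoveKeysLock() in every moveKeys transaction.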
ACTOR Future<MoveKeysLock> takeMoveKeysLock(Database cx, UID ddId) {
    state Transaction tr(cx);
    loop {
        try {
            state MoveKeysLock lock;
            tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
            if (!g_network->isSimulated()) {
                UID id(deterministicRandom()->randomUniqueID());
                TraceEvent("TakeMoveKeysLockTransaction", ddId)
                    .detail("TransactionUID", id);
                tr.debugTransaction(id);
            }
            {
                Optional<Value> readVal = wait(tr.get(moveKeysLockOwnerKey));
                lock.prevOwner = readVal.present() ? BinaryReader::fromStringRef<UID>(readVal.get(), Unversioned()) : UID();
            }
            {
                Optional<Value> readVal = wait(tr.get(moveKeysLockWriteKey));
                lock.prevWrite = readVal.present() ? BinaryReader::fromStringRef<UID>(readVal.get(), Unversioned()) : UID();
            }
            lock.myOwner = deterministicRandom()->randomUniqueID();
            tr.set(moveKeysLockOwnerKey, BinaryWriter::toValue(lock.myOwner, Unversioned()));
            wait(tr.commit());
            return lock;
        } catch (Error& e) {
            wait(tr.onError(e));
            TEST(true); // takeMoveKeysLock retry
        }
    }
}

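// Verifies that the in-memory DD flag is enabled and that this MoveKeysLock is still the most recent
// owner of the lock keys. If isWrite is set, also touches the lock's write key so that concurrent
// attempts to take or use the lock conflict with this transaction.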
ACTOR Future<Void> checkMoveKeysLock(Transaction* tr, MoveKeysLock lock, bool isWrite = true) {
    if (!isDDEnabled()) {
        TraceEvent(SevDebug, "DDDisabledByInMemoryCheck");
        throw movekeys_conflict();
    }
    Optional<Value> readVal = wait(tr->get(moveKeysLockOwnerKey));
    UID currentOwner = readVal.present() ? BinaryReader::fromStringRef<UID>(readVal.get(), Unversioned()) : UID();

    if (currentOwner == lock.prevOwner) {
        // Check that the previous owner hasn't touched the lock since we took it
        Optional<Value> readVal = wait(tr->get(moveKeysLockWriteKey));
        UID lastWrite = readVal.present() ? BinaryReader::fromStringRef<UID>(readVal.get(), Unversioned()) : UID();
        if (lastWrite != lock.prevWrite) {
            TEST(true); // checkMoveKeysLock: Conflict with previous owner
            throw movekeys_conflict();
        }

        // Take the lock
        if (isWrite) {
            BinaryWriter wrMyOwner(Unversioned()); wrMyOwner << lock.myOwner;
            tr->set(moveKeysLockOwnerKey, wrMyOwner.toValue());
            BinaryWriter wrLastWrite(Unversioned()); wrLastWrite << deterministicRandom()->randomUniqueID();
            tr->set(moveKeysLockWriteKey, wrLastWrite.toValue());
        }

        return Void();
    } else if (currentOwner == lock.myOwner) {
        if (isWrite) {
            // Touch the lock, preventing overlapping attempts to take it
            BinaryWriter wrLastWrite(Unversioned()); wrLastWrite << deterministicRandom()->randomUniqueID();
            tr->set(moveKeysLockWriteKey, wrLastWrite.toValue());
            // Make this transaction self-conflicting so the database will not execute it twice with the same write key
            tr->makeSelfConflicting();
        }
        return Void();
    } else {
        TEST(true); // checkMoveKeysLock: Conflict with new owner
        throw movekeys_conflict();
    }
}

Future<Void> checkMoveKeysLockReadOnly(Transaction* tr, MoveKeysLock lock) {
    return checkMoveKeysLock(tr, lock, false);
}

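// Returns the storage server's UID if the GetShardState reply arrived and reports a shard version of
// at least 'version'; otherwise returns an empty Optional.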
ACTOR Future<Optional<UID>> checkReadWrite(Future<ErrorOr<std::pair<Version, Version>>> fReply, UID uid, Version version) {
    ErrorOr<std::pair<Version, Version>> reply = wait(fReply);
    if (!reply.present() || reply.get().first < version)
        return Optional<UID>();
    return Optional<UID>(uid);
}

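// Marks serverKeys[oldDest] as serverKeysFalse for every subrange of currentKeys that is not covered
// by 'shards' (the ranges for which oldDest remains a source), coalescing the internal writes ourselves.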
Future<Void> removeOldDestinations(Transaction* tr, UID oldDest, VectorRef<KeyRangeRef> shards, KeyRangeRef currentKeys) {
    KeyRef beginKey = currentKeys.begin;

    vector<Future<Void>> actors;
    for (int i = 0; i < shards.size(); i++) {
        if (beginKey < shards[i].begin)
            actors.push_back(krmSetRangeCoalescing(tr, serverKeysPrefixFor(oldDest), KeyRangeRef(beginKey, shards[i].begin), allKeys, serverKeysFalse));

        beginKey = shards[i].end;
    }

    if (beginKey < currentKeys.end)
        actors.push_back(krmSetRangeCoalescing(tr, serverKeysPrefixFor(oldDest), KeyRangeRef(beginKey, currentKeys.end), allKeys, serverKeysFalse));

    return waitForAll(actors);
}

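// For one shard, probes the shard state of the source and destination storage servers and returns the
// UIDs of readable destinations, up to (desiredHealthy - healthy sources) of them and without letting
// the total number of servers for the shard exceed maxServers.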
ACTOR Future<vector<UID>> addReadWriteDestinations(KeyRangeRef shard, vector<StorageServerInterface> srcInterfs, vector<StorageServerInterface> destInterfs, Version version, int desiredHealthy, int maxServers) {
    if (srcInterfs.size() >= maxServers) {
        return vector<UID>();
    }

    state vector<Future<Optional<UID>>> srcChecks;
    for (int s = 0; s < srcInterfs.size(); s++) {
        srcChecks.push_back(checkReadWrite(srcInterfs[s].getShardState.getReplyUnlessFailedFor(GetShardStateRequest(shard, GetShardStateRequest::NO_WAIT), SERVER_KNOBS->SERVER_READY_QUORUM_INTERVAL, 0, TaskPriority::MoveKeys), srcInterfs[s].id(), 0));
    }

    state vector<Future<Optional<UID>>> destChecks;
    for (int s = 0; s < destInterfs.size(); s++) {
        destChecks.push_back(checkReadWrite(destInterfs[s].getShardState.getReplyUnlessFailedFor(GetShardStateRequest(shard, GetShardStateRequest::NO_WAIT), SERVER_KNOBS->SERVER_READY_QUORUM_INTERVAL, 0, TaskPriority::MoveKeys), destInterfs[s].id(), version));
    }

    wait(waitForAll(srcChecks) && waitForAll(destChecks));

    int healthySrcs = 0;
    for (auto it : srcChecks) {
        if (it.get().present()) {
            healthySrcs++;
        }
    }

    vector<UID> result;
    int totalDesired = std::min<int>(desiredHealthy - healthySrcs, maxServers - srcInterfs.size());
    for (int s = 0; s < destInterfs.size() && result.size() < totalDesired; s++) {
        if (destChecks[s].get().present()) {
            result.push_back(destChecks[s].get().get());
        }
    }

    return result;
}

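// For each shard in 'shards', determines which of its destination servers are already readable at the
// transaction's read version and can therefore be added to the shard's source list by startMoveKeys.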
ACTOR Future<vector<vector<UID>>> additionalSources(Standalone<RangeResultRef> shards, Transaction* tr, int desiredHealthy, int maxServers) {
    vector<Future<Optional<Value>>> serverListEntries;
    std::set<UID> fetching;
    for (int i = 0; i < shards.size() - 1; ++i) {
        vector<UID> src;
        vector<UID> dest;

        decodeKeyServersValue(shards[i].value, src, dest);

        for (int s = 0; s < src.size(); s++) {
            if (!fetching.count(src[s])) {
                fetching.insert(src[s]);
                serverListEntries.push_back(tr->get(serverListKeyFor(src[s])));
            }
        }

        for (int s = 0; s < dest.size(); s++) {
            if (!fetching.count(dest[s])) {
                fetching.insert(dest[s]);
                serverListEntries.push_back(tr->get(serverListKeyFor(dest[s])));
            }
        }
    }

    vector<Optional<Value>> serverListValues = wait(getAll(serverListEntries));

    std::map<UID, StorageServerInterface> ssiMap;
    for (int s = 0; s < serverListValues.size(); s++) {
        StorageServerInterface ssi = decodeServerListValue(serverListValues[s].get());
        ssiMap[ssi.id()] = ssi;
    }

    vector<Future<vector<UID>>> allChecks;
    for (int i = 0; i < shards.size() - 1; ++i) {
        KeyRangeRef rangeIntersectKeys(shards[i].key, shards[i + 1].key);
        vector<UID> src;
        vector<UID> dest;
        vector<StorageServerInterface> srcInterfs;
        vector<StorageServerInterface> destInterfs;

        decodeKeyServersValue(shards[i].value, src, dest);

        for (int s = 0; s < src.size(); s++) {
            srcInterfs.push_back(ssiMap[src[s]]);
        }

        for (int s = 0; s < dest.size(); s++) {
            if (std::find(src.begin(), src.end(), dest[s]) == src.end()) {
                destInterfs.push_back(ssiMap[dest[s]]);
            }
        }

        allChecks.push_back(addReadWriteDestinations(rangeIntersectKeys, srcInterfs, destInterfs, tr->getReadVersion().get(), desiredHealthy, maxServers));
    }

    vector<vector<UID>> result = wait(getAll(allChecks));
    return result;
}

// keyServer: map from keys to destination servers
// serverKeys: two-dimensional map: [servers][keys], value is the server's state of having the keys:
//   active (does not have them yet), complete (already has them), "" (does not hold the keys and is not a destination)
// MXQ: What does serverKeys[dest][keys] mean? It seems to have the same meaning as serverKeys[servers][keys]. (I think so.)

// Set keyServers[keys].dest = servers
// Set serverKeys[servers][keys] = active for each subrange of keys that the server did not already have, complete for each subrange that it already has
// Set serverKeys[dest][keys] = "" for the dest servers of each existing shard in keys (unless that destination is a member of servers OR if the source list is sufficiently degraded)
ACTOR Future<Void> startMoveKeys(Database occ, KeyRange keys, vector<UID> servers, MoveKeysLock lock, FlowLock* startMoveKeysLock, UID relocationIntervalId) {
    state TraceInterval interval("RelocateShard_StartMoveKeys");
    // state TraceInterval waitInterval("");

    wait(startMoveKeysLock->take(TaskPriority::DataDistributionLaunch));
    state FlowLock::Releaser releaser(*startMoveKeysLock);

    TraceEvent(SevDebug, interval.begin(), relocationIntervalId);

    try {
        state Key begin = keys.begin;
        state int batches = 0;
        state int shards = 0;
        state int maxRetries = 0;

        // If this is split into multiple transactions, how do we achieve atomicity?
        // This process can be split up into multiple transactions if there are too many existing overlapping shards
        // In that case, each iteration of this loop will have begin set to the end of the last processed shard
        while (begin < keys.end) {
            TEST(begin > keys.begin); // Multi-transactional startMoveKeys
            batches++;

            state Transaction tr(occ);
            state int retries = 0;

            loop {
                try {
                    retries++;

                    // Keep track of old dests that may need to have ranges removed from serverKeys
                    state std::set<UID> oldDests;

                    // Keep track of shards for all src servers so that we can preserve their values in serverKeys
                    state Map<UID, VectorRef<KeyRangeRef>> shardMap;

                    tr.info.taskID = TaskPriority::MoveKeys;
                    tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);

                    wait(checkMoveKeysLock(&tr, lock));

                    vector<Future<Optional<Value>>> serverListEntries;
                    for (int s = 0; s < servers.size(); s++)
                        serverListEntries.push_back(tr.get(serverListKeyFor(servers[s])));
                    state vector<Optional<Value>> serverListValues = wait(getAll(serverListEntries));

                    for (int s = 0; s < serverListValues.size(); s++) {
                        if (!serverListValues[s].present()) {
                            // Attempt to move onto a server that isn't in serverList (removed or never added to the
                            // database). This can happen (why?) and is handled by the data distribution algorithm
                            // FIXME: Answer why this can happen?
                            TEST(true); // start move keys moving to a removed server
                            throw move_to_removed_server();
                        }
                    }

                    // Get all existing shards overlapping keys (exclude any that have been processed in a previous iteration of the outer loop)
                    state KeyRange currentKeys = KeyRangeRef(begin, keys.end);
                    state Standalone<RangeResultRef> old = wait(krmGetRanges(&tr, keyServersPrefix, currentKeys, SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT, SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT_BYTES));

                    // Determine the last processed key (which will be the beginning for the next iteration)
                    state Key endKey = old.end()[-1].key;
                    currentKeys = KeyRangeRef(currentKeys.begin, endKey);

                    // TraceEvent("StartMoveKeysBatch", relocationIntervalId)
                    //     .detail("KeyBegin", currentKeys.begin.toString())
                    //     .detail("KeyEnd", currentKeys.end.toString());

                    // printf("Moving '%s'-'%s' (%d) to %d servers\n", keys.begin.toString().c_str(),
                    //     keys.end.toString().c_str(), old.size(), servers.size());
                    // for(int i=0; i<old.size(); i++)
                    //     printf("'%s': '%s'\n", old[i].key.toString().c_str(), old[i].value.toString().c_str());

                    // Check that enough servers for each shard are in the correct state
                    vector<vector<UID>> addAsSource = wait(additionalSources(old, &tr, servers.size(), SERVER_KNOBS->MAX_ADDED_SOURCES_MULTIPLIER * servers.size()));

                    // For each intersecting range, update keyServers[range] dest to be servers and clear existing dest servers from serverKeys
                    for (int i = 0; i < old.size() - 1; ++i) {
                        KeyRangeRef rangeIntersectKeys(old[i].key, old[i + 1].key);
                        vector<UID> src;
                        vector<UID> dest;
                        decodeKeyServersValue(old[i].value, src, dest);

                        // TraceEvent("StartMoveKeysOldRange", relocationIntervalId)
                        //     .detail("KeyBegin", rangeIntersectKeys.begin.toString())
                        //     .detail("KeyEnd", rangeIntersectKeys.end.toString())
                        //     .detail("OldSrc", describe(src))
                        //     .detail("OldDest", describe(dest))
                        //     .detail("ReadVersion", tr.getReadVersion().get());

                        for (auto& uid : addAsSource[i]) {
                            src.push_back(uid);
                        }
                        uniquify(src);

                        // Update dest servers for this range to be equal to servers
                        krmSetPreviouslyEmptyRange(&tr, keyServersPrefix, rangeIntersectKeys, keyServersValue(src, servers), old[i + 1].value);

                        // Track old destination servers. They may be removed from serverKeys soon, since they are about to be overwritten in keyServers
                        for (auto s = dest.begin(); s != dest.end(); ++s) {
                            oldDests.insert(*s);
                            // TraceEvent("StartMoveKeysOldDestAdd", relocationIntervalId).detail("Server", *s);
                        }

                        // Keep track of src shards so that we can preserve their values when we overwrite serverKeys
                        for (auto& uid : src) {
                            shardMap[uid].push_back(old.arena(), rangeIntersectKeys);
                            // TraceEvent("StartMoveKeysShardMapAdd", relocationIntervalId).detail("Server", uid);
                        }
                    }

                    state std::set<UID>::iterator oldDest;

                    // Remove old dests from serverKeys. In order for krmSetRangeCoalescing to work correctly in the same prefix for a single transaction, we must
                    // do most of the coalescing ourselves. Only the shards on the boundary of currentRange are actually coalesced with the ranges outside of currentRange.
                    // For all shards internal to currentRange, we overwrite all consecutive keys whose value is or should be serverKeysFalse in a single write
                    vector<Future<Void>> actors;
                    for (oldDest = oldDests.begin(); oldDest != oldDests.end(); ++oldDest)
                        if (std::find(servers.begin(), servers.end(), *oldDest) == servers.end())
                            actors.push_back(removeOldDestinations(&tr, *oldDest, shardMap[*oldDest], currentKeys));

                    // Update serverKeys to include keys (or the currently processed subset of keys) for each SS in servers
                    for (int i = 0; i < servers.size(); i++) {
                        // Since we are setting this for the entire range, serverKeys and keyServers aren't guaranteed to have the same shard boundaries
                        // If that invariant was important, we would have to move this inside the loop above and also set it for the src servers
                        actors.push_back(krmSetRangeCoalescing(&tr, serverKeysPrefixFor(servers[i]), currentKeys, allKeys, serverKeysTrue));
                    }

                    wait(waitForAll(actors));

                    wait(tr.commit());

                    /*TraceEvent("StartMoveKeysCommitDone", relocationIntervalId)
                        .detail("CommitVersion", tr.getCommittedVersion())
                        .detail("ShardsInBatch", old.size() - 1);*/
                    begin = endKey;
                    shards += old.size() - 1;
                    break;
                } catch (Error& e) {
                    state Error err = e;
                    if (err.code() == error_code_move_to_removed_server)
                        throw;
                    wait(tr.onError(e));

                    if (retries % 10 == 0) {
                        TraceEvent(retries == 50 ? SevWarnAlways : SevWarn, "StartMoveKeysRetrying", relocationIntervalId)
                            .error(err)
                            .detail("Keys", keys)
                            .detail("BeginKey", begin)
                            .detail("NumTries", retries);
                    }
                }
            }

            if (retries > maxRetries) {
                maxRetries = retries;
            }
        }

        // printf("Committed moving '%s'-'%s' (version %lld)\n", keys.begin.toString().c_str(), keys.end.toString().c_str(), tr.getCommittedVersion());
        TraceEvent(SevDebug, interval.end(), relocationIntervalId)
            .detail("Batches", batches)
            .detail("Shards", shards)
            .detail("MaxRetries", maxRetries);
    } catch (Error& e) {
        TraceEvent(SevDebug, interval.end(), relocationIntervalId).error(e, true);
        throw;
    }

    return Void();
}

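// Polls a storage server until it reports the requested shard state (e.g. FETCHING or READABLE) for
// 'keys' at a version of at least minVersion. Never returns if the server has failed (broken promise).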
ACTOR Future<Void> waitForShardReady(StorageServerInterface server, KeyRange keys, Version minVersion, GetShardStateRequest::waitMode mode) {
    loop {
        try {
            std::pair<Version, Version> rep = wait(server.getShardState.getReply(GetShardStateRequest(keys, mode), TaskPriority::MoveKeys));
            if (rep.first >= minVersion) {
                return Void();
            }
            wait(delayJittered(SERVER_KNOBS->SHARD_READY_DELAY, TaskPriority::MoveKeys));
        } catch (Error& e) {
            if (e.code() != error_code_timed_out) {
                if (e.code() != error_code_broken_promise)
                    throw e;
                wait(Never()); // Never return: A storage server which has failed will never be ready
                throw internal_error(); // does not happen
            }
        }
    }
}

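// Waits until every destination server has at least started fetching 'keys'
// (GetShardStateRequest::FETCHING) and then sends dataMovementComplete.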
ACTOR Future<Void> checkFetchingState(Database cx, vector<UID> dest, KeyRange keys,
                                      Promise<Void> dataMovementComplete, UID relocationIntervalId) {
    state Transaction tr(cx);

    loop {
        try {
            if (BUGGIFY) wait(delay(5));

            tr.info.taskID = TaskPriority::MoveKeys;
            tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);

            vector<Future<Optional<Value>>> serverListEntries;
            for (int s = 0; s < dest.size(); s++)
                serverListEntries.push_back(tr.get(serverListKeyFor(dest[s])));
            state vector<Optional<Value>> serverListValues = wait(getAll(serverListEntries));
            vector<Future<Void>> requests;
            for (int s = 0; s < serverListValues.size(); s++) {
                if (!serverListValues[s].present()) {
                    // FIXME: Is this the right behavior? dataMovementComplete will never be sent!
                    TEST(true); // check fetching state moved to removed server
                    throw move_to_removed_server();
                }
                auto si = decodeServerListValue(serverListValues[s].get());
                ASSERT(si.id() == dest[s]);
                requests.push_back(waitForShardReady(si, keys, tr.getReadVersion().get(), GetShardStateRequest::FETCHING));
            }

            wait(timeoutError(waitForAll(requests),
                              SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT, TaskPriority::MoveKeys));

            dataMovementComplete.send(Void());
            return Void();
        } catch (Error& e) {
            if (e.code() == error_code_timed_out)
                tr.reset();
            else
                wait(tr.onError(e));
        }
    }
}

// Set keyServers[keys].src = keyServers[keys].dest and keyServers[keys].dest=[], return when successful
// keyServers[k].dest must be the same for all k in keys
// Set serverKeys[dest][keys] = true; serverKeys[src][keys] = false for all src not in dest
// Should be cancelled and restarted if keyServers[keys].dest changes (?so this is no longer true?)
ACTOR Future<Void> finishMoveKeys(Database occ, KeyRange keys, vector<UID> destinationTeam, MoveKeysLock lock, FlowLock* finishMoveKeysParallelismLock, bool hasRemote, UID relocationIntervalId) {
    state TraceInterval interval("RelocateShard_FinishMoveKeys");
    state TraceInterval waitInterval("");
    state Key begin = keys.begin;
    state Key endKey;
    state int retries = 0;
    state FlowLock::Releaser releaser;

    ASSERT(!destinationTeam.empty());

    try {
        TraceEvent(SevDebug, interval.begin(), relocationIntervalId).detail("KeyBegin", keys.begin).detail("KeyEnd", keys.end);

        // This process can be split up into multiple transactions if there are too many existing overlapping shards
        // In that case, each iteration of this loop will have begin set to the end of the last processed shard
        while (begin < keys.end) {
            TEST(begin > keys.begin); // Multi-transactional finishMoveKeys

            state Transaction tr(occ);

            // printf("finishMoveKeys( '%s'-'%s' )\n", keys.begin.toString().c_str(), keys.end.toString().c_str());
            loop {
                try {
                    tr.info.taskID = TaskPriority::MoveKeys;
                    tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);

                    releaser.release();
                    wait(finishMoveKeysParallelismLock->take(TaskPriority::DataDistributionLaunch));
                    releaser = FlowLock::Releaser(*finishMoveKeysParallelismLock);

                    wait(checkMoveKeysLock(&tr, lock));

                    state KeyRange currentKeys = KeyRangeRef(begin, keys.end);
                    state Standalone<RangeResultRef> keyServers = wait(krmGetRanges(&tr, keyServersPrefix, currentKeys, SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT, SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT_BYTES));

                    // Determine the last processed key (which will be the beginning for the next iteration)
                    endKey = keyServers.end()[-1].key;
                    currentKeys = KeyRangeRef(currentKeys.begin, endKey);

                    // printf("  finishMoveKeys( '%s'-'%s' ): read keyServers at %lld\n", keys.begin.toString().c_str(), keys.end.toString().c_str(), tr.getReadVersion().get());

                    // Decode and sanity check the result (dest must be the same for all ranges)
                    bool alreadyMoved = true;

                    state vector<UID> dest;
                    state std::set<UID> allServers;
                    state std::set<UID> intendedTeam(destinationTeam.begin(), destinationTeam.end());
                    state vector<UID> src;
                    vector<UID> completeSrc;

                    // Iterate through the beginning of keyServers until we find one that hasn't already been processed
                    int currentIndex;
                    for (currentIndex = 0; currentIndex < keyServers.size() - 1 && alreadyMoved; currentIndex++) {
                        decodeKeyServersValue(keyServers[currentIndex].value, src, dest);

                        std::set<UID> srcSet;
                        for (int s = 0; s < src.size(); s++) {
                            srcSet.insert(src[s]);
                        }

                        if (currentIndex == 0) {
                            completeSrc = src;
                        } else {
                            for (int i = 0; i < completeSrc.size(); i++) {
                                if (!srcSet.count(completeSrc[i])) {
                                    swapAndPop(&completeSrc, i--);
                                }
                            }
                        }

                        std::set<UID> destSet;
                        for (int s = 0; s < dest.size(); s++) {
                            destSet.insert(dest[s]);
                        }

                        allServers.insert(srcSet.begin(), srcSet.end());
                        allServers.insert(destSet.begin(), destSet.end());

                        alreadyMoved = destSet.empty() && srcSet == intendedTeam;
                        if (destSet != intendedTeam && !alreadyMoved) {
                            TraceEvent(SevWarn, "MoveKeysDestTeamNotIntended", relocationIntervalId)
                                .detail("KeyBegin", keys.begin)
                                .detail("KeyEnd", keys.end)
                                .detail("IterationBegin", begin)
                                .detail("IterationEnd", endKey)
                                .detail("DestSet", describe(destSet))
                                .detail("IntendedTeam", describe(intendedTeam))
                                .detail("KeyServers", keyServers);
                            // ASSERT( false );

                            ASSERT(!dest.empty()); // The range has already been moved, but to a different dest (or maybe dest was cleared)

                            intendedTeam.clear();
                            for (int i = 0; i < dest.size(); i++)
                                intendedTeam.insert(dest[i]);
                        } else if (alreadyMoved) {
                            dest.clear();
                            src.clear();
                            TEST(true); // FinishMoveKeys first key in iteration sub-range has already been processed
                        }
                    }

                    // Process the rest of the key servers
                    for (; currentIndex < keyServers.size() - 1; currentIndex++) {
                        vector<UID> src2, dest2;
                        decodeKeyServersValue(keyServers[currentIndex].value, src2, dest2);

                        std::set<UID> srcSet;
                        for (int s = 0; s < src2.size(); s++)
                            srcSet.insert(src2[s]);

                        for (int i = 0; i < completeSrc.size(); i++) {
                            if (!srcSet.count(completeSrc[i])) {
                                swapAndPop(&completeSrc, i--);
                            }
                        }

                        allServers.insert(srcSet.begin(), srcSet.end());

                        alreadyMoved = dest2.empty() && srcSet == intendedTeam;
                        if (dest2 != dest && !alreadyMoved) {
                            TraceEvent(SevError, "FinishMoveKeysError", relocationIntervalId)
                                .detail("Reason", "dest mismatch")
                                .detail("Dest", describe(dest))
                                .detail("Dest2", describe(dest2));
                            ASSERT(false);
                        }
                    }

                    if (!dest.size()) {
                        TEST(true); // A previous finishMoveKeys for this range committed just as it was cancelled to start this one?
                        TraceEvent("FinishMoveKeysNothingToDo", relocationIntervalId)
                            .detail("KeyBegin", keys.begin)
                            .detail("KeyEnd", keys.end)
                            .detail("IterationBegin", begin)
                            .detail("IterationEnd", endKey);
                        begin = keyServers.end()[-1].key;
                        break;
                    }

                    waitInterval = TraceInterval("RelocateShard_FinishMoveKeysWaitDurable");
                    TraceEvent(SevDebug, waitInterval.begin(), relocationIntervalId)
                        .detail("KeyBegin", keys.begin)
                        .detail("KeyEnd", keys.end);

                    // Wait for a durable quorum of servers in destServers to have keys available (readWrite)
                    // They must also have at least the transaction read version so they can't "forget" the shard between
                    // now and when this transaction commits.
                    state vector<Future<Void>> serverReady; // only for count below
                    state vector<UID> newDestinations;
                    std::set<UID> completeSrcSet(completeSrc.begin(), completeSrc.end());
                    for (auto& it : dest) {
                        if (!hasRemote || !completeSrcSet.count(it)) {
                            newDestinations.push_back(it);
                        }
                    }

                    // for smartQuorum
                    state vector<StorageServerInterface> storageServerInterfaces;
                    vector<Future<Optional<Value>>> serverListEntries;
                    for (int s = 0; s < newDestinations.size(); s++)
                        serverListEntries.push_back(tr.get(serverListKeyFor(newDestinations[s])));
                    state vector<Optional<Value>> serverListValues = wait(getAll(serverListEntries));

                    releaser.release();

                    for (int s = 0; s < serverListValues.size(); s++) {
                        ASSERT(serverListValues[s].present()); // There should always be server list entries for servers in keyServers
                        auto si = decodeServerListValue(serverListValues[s].get());
                        ASSERT(si.id() == newDestinations[s]);
                        storageServerInterfaces.push_back(si);
                    }

                    // Wait for new destination servers to fetch the keys
                    for (int s = 0; s < storageServerInterfaces.size(); s++)
                        serverReady.push_back(waitForShardReady(storageServerInterfaces[s], keys, tr.getReadVersion().get(), GetShardStateRequest::READABLE));
                    wait(timeout(waitForAll(serverReady), SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT, Void(), TaskPriority::MoveKeys));

                    int count = dest.size() - newDestinations.size();
                    for (int s = 0; s < serverReady.size(); s++)
                        count += serverReady[s].isReady() && !serverReady[s].isError();

                    // printf("  fMK: moved data to %d/%d servers\n", count, serverReady.size());
                    TraceEvent(SevDebug, waitInterval.end(), relocationIntervalId).detail("ReadyServers", count);

                    if (count == dest.size()) {
                        // update keyServers, serverKeys
                        // SOMEDAY: Doing these in parallel is safe because none of them overlap or touch (one per server)
                        wait(krmSetRangeCoalescing(&tr, keyServersPrefix, currentKeys, keys, keyServersValue(dest)));

                        std::set<UID>::iterator asi = allServers.begin();
                        std::vector<Future<Void>> actors;
                        while (asi != allServers.end()) {
                            bool destHasServer = std::find(dest.begin(), dest.end(), *asi) != dest.end();
                            actors.push_back(krmSetRangeCoalescing(&tr, serverKeysPrefixFor(*asi), currentKeys, allKeys, destHasServer ? serverKeysTrue : serverKeysFalse));
                            ++asi;
                        }

                        wait(waitForAll(actors));
                        wait(tr.commit());

                        begin = endKey;
                        break;
                    }
                    tr.reset();
                } catch (Error& error) {
                    if (error.code() == error_code_actor_cancelled) throw;
                    state Error err = error;
                    wait(tr.onError(error));
                    retries++;
                    if (retries % 10 == 0) {
                        TraceEvent(retries == 20 ? SevWarnAlways : SevWarn, "RelocateShard_FinishMoveKeysRetrying", relocationIntervalId)
                            .error(err)
                            .detail("KeyBegin", keys.begin)
                            .detail("KeyEnd", keys.end)
                            .detail("IterationBegin", begin)
                            .detail("IterationEnd", endKey);
                    }
                }
            }
        }

        TraceEvent(SevDebug, interval.end(), relocationIntervalId);
    } catch (Error& e) {
        TraceEvent(SevDebug, interval.end(), relocationIntervalId).error(e, true);
        throw;
    }

    return Void();
}

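// Registers a new storage server: assigns it a tag whose locality is derived from its dcId and whose
// id is the lowest unused tag id (skipping a random number of ids, presumably to reduce conflicts with
// concurrent recruitments), writes its serverList entry, and returns the commit version together with
// the assigned tag. Fails with recruitment_failed if the server's address is excluded.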
ACTOR Future<std::pair<Version, Tag>> addStorageServer(Database cx, StorageServerInterface server) {
    state Transaction tr(cx);
    state int maxSkipTags = 1;

    loop {
        try {
            state Future<Standalone<RangeResultRef>> fTagLocalities = tr.getRange(tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY);
            state Future<Optional<Value>> fv = tr.get(serverListKeyFor(server.id()));
            state Future<Optional<Value>> fExclProc = tr.get(
                StringRef(encodeExcludedServersKey(AddressExclusion(server.address().ip, server.address().port))));
            state Future<Optional<Value>> fExclIP = tr.get(
                StringRef(encodeExcludedServersKey(AddressExclusion(server.address().ip))));
            state Future<Standalone<RangeResultRef>> fTags = tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY, true);
            state Future<Standalone<RangeResultRef>> fHistoryTags = tr.getRange(serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY, true);

            wait(success(fTagLocalities) && success(fv) && success(fExclProc) && success(fExclIP) && success(fTags) && success(fHistoryTags));

            // If we have been added to the excluded state servers list, we have to fail
            if (fExclProc.get().present() || fExclIP.get().present())
                throw recruitment_failed();

            if (fTagLocalities.get().more || fTags.get().more || fHistoryTags.get().more)
                ASSERT(false);

            int8_t maxTagLocality = 0;
            state int8_t locality = -1;
            for (auto& kv : fTagLocalities.get()) {
                int8_t loc = decodeTagLocalityListValue(kv.value);
                if (decodeTagLocalityListKey(kv.key) == server.locality.dcId()) {
                    locality = loc;
                    break;
                }
                maxTagLocality = std::max(maxTagLocality, loc);
            }

            if (locality == -1) {
                locality = maxTagLocality + 1;
                if (locality < 0)
                    throw recruitment_failed();
                tr.set(tagLocalityListKeyFor(server.locality.dcId()), tagLocalityListValue(locality));
            }

            int skipTags = deterministicRandom()->randomInt(0, maxSkipTags);

            state uint16_t tagId = 0;
            std::vector<uint16_t> usedTags;
            for (auto& it : fTags.get()) {
                Tag t = decodeServerTagValue(it.value);
                if (t.locality == locality) {
                    usedTags.push_back(t.id);
                }
            }
            for (auto& it : fHistoryTags.get()) {
                Tag t = decodeServerTagValue(it.value);
                if (t.locality == locality) {
                    usedTags.push_back(t.id);
                }
            }
            std::sort(usedTags.begin(), usedTags.end());

            int usedIdx = 0;
            for (; usedTags.size() > 0 && tagId <= usedTags.end()[-1]; tagId++) {
                if (tagId < usedTags[usedIdx]) {
                    if (skipTags == 0)
                        break;
                    skipTags--;
                } else {
                    usedIdx++;
                }
            }
            tagId += skipTags;

            state Tag tag(locality, tagId);
            tr.set(serverTagKeyFor(server.id()), serverTagValue(tag));
            tr.set(serverListKeyFor(server.id()), serverListValue(server));
            KeyRange conflictRange = singleKeyRange(serverTagConflictKeyFor(tag));
            tr.addReadConflictRange(conflictRange);
            tr.addWriteConflictRange(conflictRange);

            wait(tr.commit());
            return std::make_pair(tr.getCommittedVersion(), tag);
        } catch (Error& e) {
            if (e.code() == error_code_commit_unknown_result)
                throw recruitment_failed(); // There is a remote possibility that we successfully added ourselves and then someone removed us, so we have to fail

            if (e.code() == error_code_not_committed) {
                maxSkipTags = SERVER_KNOBS->MAX_SKIP_TAGS;
            }

            wait(tr.onError(e));
        }
    }
}

// A storage server can be removed only if all the data (shards) it holds has already been moved away from it.
ACTOR Future<bool> canRemoveStorageServer(Transaction* tr, UID serverID) {
    Standalone<RangeResultRef> keys = wait(krmGetRanges(tr, serverKeysPrefixFor(serverID), allKeys, 2));

    ASSERT(keys.size() >= 2);

    if (keys[0].value == keys[1].value && keys[1].key != allKeys.end) {
        TraceEvent("ServerKeysCoalescingError", serverID).detail("Key1", keys[0].key).detail("Key2", keys[1].key).detail("Value", keys[0].value);
        ASSERT(false);
    }

    // Return true if the entire range is false. Since these values are coalesced, we can return false if there is more than one result
    return keys[0].value == serverKeysFalse && keys[1].key == allKeys.end;
}

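// Removes a storage server's metadata (serverList, serverTag, tag history) once canRemoveStorageServer
// reports that it no longer owns any keys, also clearing its tag locality entry when no other server
// or tLog datacenter still uses that locality. Retries with a jittered delay while keys remain assigned.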
ACTOR Future<Void> removeStorageServer(Database cx, UID serverID, MoveKeysLock lock) {
    state Transaction tr(cx);
    state bool retry = false;
    state int noCanRemoveCount = 0;

    loop {
        try {
            tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
            wait(checkMoveKeysLock(&tr, lock));
            TraceEvent("RemoveStorageServerLocked").detail("ServerID", serverID).detail("Version", tr.getReadVersion().get());

            state bool canRemove = wait(canRemoveStorageServer(&tr, serverID));
            if (!canRemove) {
                TEST(true); // The caller had a transaction in flight that assigned keys to the server. Wait for it to reverse its mistake.
                TraceEvent(SevWarn, "NoCanRemove").detail("Count", noCanRemoveCount++).detail("ServerID", serverID);
                wait(delayJittered(SERVER_KNOBS->REMOVE_RETRY_DELAY, TaskPriority::DataDistributionLaunch));
                tr.reset();
                TraceEvent("RemoveStorageServerRetrying").detail("CanRemove", canRemove);
            } else {
                state Future<Optional<Value>> fListKey = tr.get(serverListKeyFor(serverID));
                state Future<Standalone<RangeResultRef>> fTags = tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY);
                state Future<Standalone<RangeResultRef>> fHistoryTags = tr.getRange(serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY);
                state Future<Standalone<RangeResultRef>> fTagLocalities = tr.getRange(tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY);
                state Future<Standalone<RangeResultRef>> fTLogDatacenters = tr.getRange(tLogDatacentersKeys, CLIENT_KNOBS->TOO_MANY);

                wait(success(fListKey) && success(fTags) && success(fHistoryTags) && success(fTagLocalities) && success(fTLogDatacenters));

                if (!fListKey.get().present()) {
                    if (retry) {
                        TEST(true); // Storage server already removed after retrying transaction
                        return Void();
                    }
                    ASSERT(false); // Removing an already-removed server? A never added server?
                }

                int8_t locality = -100;
                std::set<int8_t> allLocalities;
                for (auto& it : fTags.get()) {
                    UID sId = decodeServerTagKey(it.key);
                    Tag t = decodeServerTagValue(it.value);
                    if (sId == serverID) {
                        locality = t.locality;
                    } else {
                        allLocalities.insert(t.locality);
                    }
                }
                for (auto& it : fHistoryTags.get()) {
                    Tag t = decodeServerTagValue(it.value);
                    allLocalities.insert(t.locality);
                }

                std::map<Optional<Value>, int8_t> dcId_locality;
                for (auto& kv : fTagLocalities.get()) {
                    dcId_locality[decodeTagLocalityListKey(kv.key)] = decodeTagLocalityListValue(kv.value);
                }
                for (auto& it : fTLogDatacenters.get()) {
                    allLocalities.insert(dcId_locality[decodeTLogDatacentersKey(it.key)]);
                }

                if (locality >= 0 && !allLocalities.count(locality)) {
                    for (auto& it : fTagLocalities.get()) {
                        if (locality == decodeTagLocalityListValue(it.value)) {
                            tr.clear(it.key);
                            break;
                        }
                    }
                }

                tr.clear(serverListKeyFor(serverID));
                tr.clear(serverTagKeyFor(serverID));
                tr.clear(serverTagHistoryRangeFor(serverID));

                retry = true;
                wait(tr.commit());
                return Void();
            }
        } catch (Error& e) {
            state Error err = e;
            wait(tr.onError(e));
            TraceEvent("RemoveStorageServerRetrying").error(err);
        }
    }
}

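// Top-level entry point for relocating 'keys' to destinationTeam: startMoveKeys marks the new
// destinations, checkFetchingState signals dataMovementComplete once the healthy destinations begin
// fetching, and finishMoveKeys commits the ownership change after the data is readable.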
ACTOR Future<Void> moveKeys(
    Database cx,
    KeyRange keys,
    vector<UID> destinationTeam,
    vector<UID> healthyDestinations,
    MoveKeysLock lock,
    Promise<Void> dataMovementComplete,
    FlowLock* startMoveKeysParallelismLock,
    FlowLock* finishMoveKeysParallelismLock,
    bool hasRemote,
    UID relocationIntervalId)
{
    ASSERT(destinationTeam.size());
    std::sort(destinationTeam.begin(), destinationTeam.end());
    wait(startMoveKeys(cx, keys, destinationTeam, lock, startMoveKeysParallelismLock, relocationIntervalId));

    state Future<Void> completionSignaller = checkFetchingState(cx, healthyDestinations, keys, dataMovementComplete, relocationIntervalId);

    wait(finishMoveKeys(cx, keys, destinationTeam, lock, finishMoveKeysParallelismLock, hasRemote, relocationIntervalId));

    // This is defensive, but make sure that we always say that the movement is complete before moveKeys completes
    completionSignaller.cancel();
    if (!dataMovementComplete.isSet())
        dataMovementComplete.send(Void());

    return Void();
}

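// Used when the database is first created: assigns a tag to each seed storage server, writes their
// serverList entries, and assigns the entire key space to all of them in both keyServers and serverKeys.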
void seedShardServers(Arena& arena, CommitTransactionRef& tr, vector<StorageServerInterface> servers) {
    std::map<Optional<Value>, Tag> dcId_locality;
    std::map<UID, Tag> server_tag;
    int8_t nextLocality = 0;
    for (auto& s : servers) {
        if (!dcId_locality.count(s.locality.dcId())) {
            tr.set(arena, tagLocalityListKeyFor(s.locality.dcId()), tagLocalityListValue(nextLocality));
            dcId_locality[s.locality.dcId()] = Tag(nextLocality, 0);
            nextLocality++;
        }
        Tag& t = dcId_locality[s.locality.dcId()];
        server_tag[s.id()] = Tag(t.locality, t.id);
        t.id++;
    }
    std::sort(servers.begin(), servers.end());

    // This isn't strictly necessary, but make sure this is the first transaction
    tr.read_snapshot = 0;
    tr.read_conflict_ranges.push_back_deep(arena, allKeys);

    for (int s = 0; s < servers.size(); s++) {
        tr.set(arena, serverTagKeyFor(servers[s].id()), serverTagValue(server_tag[servers[s].id()]));
        tr.set(arena, serverListKeyFor(servers[s].id()), serverListValue(servers[s]));
    }

    std::vector<UID> serverIds;
    for (int i = 0; i < servers.size(); i++)
        serverIds.push_back(servers[i].id());

    // We have to set this range in two blocks, because the master tracking of "keyServersLocations" depends on a change to a specific
    // key (keyServersKeyServersKey)
    krmSetPreviouslyEmptyRange(tr, arena, keyServersPrefix, KeyRangeRef(KeyRef(), allKeys.end), keyServersValue(serverIds), Value());

    for (int s = 0; s < servers.size(); s++)
        krmSetPreviouslyEmptyRange(tr, arena, serverKeysPrefixFor(servers[s].id()), allKeys, serverKeysTrue, serverKeysFalse);
}