Merge pull request #339 from etschannen/feature-remote-logs

Non-copying transaction log recovery
Alex Miller 2018-05-11 17:02:37 -07:00 committed by GitHub
commit e8afc37487
45 changed files with 2077 additions and 1334 deletions


@ -37,7 +37,7 @@ void DatabaseConfiguration::resetInternal() {
regions.clear();
tLogPolicy = storagePolicy = remoteTLogPolicy = IRepPolicyRef();
remoteDesiredTLogCount = desiredLogRouterCount = -1;
remoteDesiredTLogCount = -1;
remoteTLogReplicationFactor = 0;
}
@ -153,7 +153,6 @@ bool DatabaseConfiguration::isValid() const {
storagePolicy &&
tLogPolicy &&
getDesiredRemoteLogs() >= 1 &&
getDesiredLogRouters() >= 1 &&
remoteTLogReplicationFactor >= 0 &&
regions.size() <= 2 &&
( remoteTLogReplicationFactor == 0 || ( remoteTLogPolicy && regions.size() == 2 && durableStorageQuorum == storageTeamSize ) ) ) ) {
@ -297,9 +296,6 @@ StatusObject DatabaseConfiguration::toJSON(bool noPolicies) const {
if( remoteDesiredTLogCount != -1 ) {
result["remote_logs"] = remoteDesiredTLogCount;
}
if( desiredLogRouterCount != -1 ) {
result["log_routers"] = desiredLogRouterCount;
}
if( autoMasterProxyCount != CLIENT_KNOBS->DEFAULT_AUTO_PROXIES ) {
result["auto_proxies"] = autoMasterProxyCount;
}
@ -340,7 +336,6 @@ bool DatabaseConfiguration::setInternal(KeyRef key, ValueRef value) {
else if (ck == LiteralStringRef("remote_logs")) parse(&remoteDesiredTLogCount, value);
else if (ck == LiteralStringRef("remote_log_replicas")) parse(&remoteTLogReplicationFactor, value);
else if (ck == LiteralStringRef("remote_log_policy")) parseReplicationPolicy(&remoteTLogPolicy, value);
else if (ck == LiteralStringRef("log_routers")) parse(&desiredLogRouterCount, value);
else if (ck == LiteralStringRef("regions")) parse(&regions, value);
else return false;
return true; // All of the above options currently require recovery to take effect


@ -18,8 +18,8 @@
* limitations under the License.
*/
#ifndef FDBSERVER_DATABASECONFIGURATION_H
#define FDBSERVER_DATABASECONFIGURATION_H
#ifndef FDBCLIENT_DATABASECONFIGURATION_H
#define FDBCLIENT_DATABASECONFIGURATION_H
#pragma once
#include "fdbclient/FDBTypes.h"
@ -65,7 +65,7 @@ struct RegionInfo {
template <class Ar>
void serialize(Ar& ar) {
ar & dcId & priority & satelliteTLogPolicy & satelliteDesiredTLogCount & satelliteTLogReplicationFactor & satelliteTLogWriteAntiQuorum & & satelliteTLogUsableDcs & satellites;
ar & dcId & priority & satelliteTLogPolicy & satelliteDesiredTLogCount & satelliteTLogReplicationFactor & satelliteTLogWriteAntiQuorum & satelliteTLogUsableDcs & satellites;
}
};
@ -163,7 +163,6 @@ struct DatabaseConfiguration {
// Remote TLogs
int32_t remoteDesiredTLogCount;
int32_t remoteTLogReplicationFactor;
int32_t desiredLogRouterCount;
IRepPolicyRef remoteTLogPolicy;
//Data centers
@ -176,8 +175,7 @@ struct DatabaseConfiguration {
int32_t getDesiredProxies() const { if(masterProxyCount == -1) return autoMasterProxyCount; return masterProxyCount; }
int32_t getDesiredResolvers() const { if(resolverCount == -1) return autoResolverCount; return resolverCount; }
int32_t getDesiredLogs() const { if(desiredTLogCount == -1) return autoDesiredTLogCount; return desiredTLogCount; }
int32_t getDesiredRemoteLogs() const { if(remoteDesiredTLogCount == -1) return autoDesiredTLogCount; return remoteDesiredTLogCount; }
int32_t getDesiredLogRouters() const { if(desiredLogRouterCount == -1) return getDesiredRemoteLogs(); return desiredLogRouterCount; }
int32_t getDesiredRemoteLogs() const { if(remoteDesiredTLogCount == -1) return getDesiredLogs(); return remoteDesiredTLogCount; }
int32_t getDesiredSatelliteLogs( Optional<Key> dcId ) const {
auto desired = getRegion(dcId).satelliteDesiredTLogCount;
if(desired == -1) return autoDesiredTLogCount; return desired;
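Aside (not part of this diff): the change above makes an unset remote log count fall back to getDesiredLogs() instead of the auto-configured default. A minimal standalone sketch of the -1-means-unset convention, using simplified fields rather than the real DatabaseConfiguration:

#include <cassert>

// Simplified stand-ins for the fields used by the accessors above.
struct Cfg {
	int desiredTLogCount = -1;        // -1 == unset
	int autoDesiredTLogCount = 3;     // auto-configured default
	int remoteDesiredTLogCount = -1;  // -1 == unset

	int getDesiredLogs() const { return desiredTLogCount == -1 ? autoDesiredTLogCount : desiredTLogCount; }
	int getDesiredRemoteLogs() const { return remoteDesiredTLogCount == -1 ? getDesiredLogs() : remoteDesiredTLogCount; }
};

int main() {
	Cfg c;
	assert(c.getDesiredRemoteLogs() == 3);  // unset remote logs follow getDesiredLogs()
	c.desiredTLogCount = 8;
	assert(c.getDesiredRemoteLogs() == 8);  // ...so they now track the primary log count
	c.remoteDesiredTLogCount = 5;
	assert(c.getDesiredRemoteLogs() == 5);  // an explicit setting still wins
	return 0;
}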


@ -33,7 +33,7 @@ typedef StringRef KeyRef;
typedef StringRef ValueRef;
typedef int64_t Generation;
enum { tagLocalitySpecial = -1, tagLocalityLogRouter = -2, tagLocalityRemoteLog = -3, tagLocalityUpgraded = -4}; //The TLog and LogRouter require these numbers to be as compact as possible
enum { tagLocalitySpecial = -1, tagLocalityLogRouter = -2, tagLocalityRemoteLog = -3, tagLocalityUpgraded = -4, tagLocalityInvalid = -99 }; //The TLog and LogRouter require these numbers to be as compact as possible
#pragma pack(push, 1)
struct Tag {


@ -122,7 +122,7 @@ ClientKnobs::ClientKnobs(bool randomize) {
init( BACKUP_LOGFILE_BLOCK_SIZE, 1024 * 1024);
init( BACKUP_DISPATCH_ADDTASK_SIZE, 50 );
init( RESTORE_DISPATCH_ADDTASK_SIZE, 150 );
init( RESTORE_DISPATCH_BATCH_SIZE, 30000 ); if( randomize && BUGGIFY ) RESTORE_DISPATCH_BATCH_SIZE = 1;
init( RESTORE_DISPATCH_BATCH_SIZE, 30000 ); if( randomize && BUGGIFY ) RESTORE_DISPATCH_BATCH_SIZE = 20;
init( RESTORE_WRITE_TX_SIZE, 256 * 1024 );
init( APPLY_MAX_LOCK_BYTES, 1e9 );
init( APPLY_MIN_LOCK_BYTES, 11e6 ); //Must be bigger than TRANSACTION_SIZE_LIMIT


@ -65,7 +65,7 @@ std::map<std::string, std::string> configForToken( std::string const& mode ) {
std::string key = mode.substr(0, pos);
std::string value = mode.substr(pos+1);
if( (key == "logs" || key == "proxies" || key == "resolvers" || key == "remote_logs" || key == "satellite_logs" || key == "log_routers") && isInteger(value) ) {
if( (key == "logs" || key == "proxies" || key == "resolvers" || key == "remote_logs" || key == "satellite_logs") && isInteger(value) ) {
out[p+key] = value;
}
@ -1167,6 +1167,64 @@ ACTOR Future<Void> waitForExcludedServers( Database cx, vector<AddressExclusion>
}
}
ACTOR Future<DatabaseConfiguration> getDatabaseConfiguration( Database cx ) {
state Transaction tr(cx);
loop {
try {
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
Standalone<RangeResultRef> res = wait( tr.getRange(configKeys, CLIENT_KNOBS->TOO_MANY) );
ASSERT( res.size() < CLIENT_KNOBS->TOO_MANY );
DatabaseConfiguration config;
config.fromKeyValues((VectorRef<KeyValueRef>) res);
return config;
} catch( Error &e ) {
Void _ = wait( tr.onError(e) );
}
}
}
ACTOR Future<Void> waitForFullReplication( Database cx ) {
loop {
state ReadYourWritesTransaction tr(cx);
loop {
try {
tr.setOption( FDBTransactionOptions::READ_SYSTEM_KEYS );
tr.setOption( FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE );
tr.setOption( FDBTransactionOptions::LOCK_AWARE );
Standalone<RangeResultRef> confResults = wait( tr.getRange(configKeys, CLIENT_KNOBS->TOO_MANY) );
ASSERT( !confResults.more && confResults.size() < CLIENT_KNOBS->TOO_MANY );
state DatabaseConfiguration config;
config.fromKeyValues((VectorRef<KeyValueRef>) confResults);
state std::vector<Future<Optional<Value>>> replicasFutures;
for(auto& region : config.regions) {
replicasFutures.push_back(tr.get(datacenterReplicasKeyFor(region.dcId)));
}
Void _ = wait( waitForAll(replicasFutures) );
state std::vector<Future<Void>> watchFutures;
for(int i = 0; i < config.regions.size(); i++) {
if( !replicasFutures[i].get().present() || decodeDatacenterReplicasValue(replicasFutures[i].get().get()) < config.storageTeamSize ) {
watchFutures.push_back(tr.watch(datacenterReplicasKeyFor(config.regions[i].dcId)));
}
}
if( !watchFutures.size() || (config.remoteTLogReplicationFactor == 0 && watchFutures.size() < config.regions.size())) {
return Void();
}
Void _ = wait( tr.commit() );
Void _ = wait( waitForAny(watchFutures) );
break;
} catch (Error& e) {
Void _ = wait( tr.onError(e) );
}
}
}
}
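Aside (not part of this diff): a usage sketch for the two new actors, assuming an open Database handle cx; the actor name and trace events below are made up for illustration:

// Hypothetical caller, e.g. from a test harness: read the configuration, then
// block until every configured region has reported full replication.
ACTOR Future<Void> reportWhenFullyReplicated( Database cx ) {
	state DatabaseConfiguration config = wait( getDatabaseConfiguration(cx) );
	TraceEvent("WaitingForFullReplication").detail("Regions", (int)config.regions.size());
	Void _ = wait( waitForFullReplication(cx) );
	TraceEvent("FullyReplicated").detail("StorageTeamSize", config.storageTeamSize);
	return Void();
}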
ACTOR Future<Void> timeKeeperSetDisable(Database cx) {
loop {
state Transaction tr(cx);


@ -34,6 +34,7 @@ standard API and some knowledge of the contents of the system key space.
#include "NativeAPI.h"
#include "Status.h"
#include "ReadYourWrites.h"
#include "DatabaseConfiguration.h"
// ConfigurationResult enumerates normal outcomes of changeConfig() and various error
// conditions specific to it. changeConfig may also throw an Error to report other problems.
@ -107,6 +108,9 @@ ConfigureAutoResult parseConfig( StatusObject const& status );
Future<ConfigurationResult::Type> changeConfig( Database const& cx, std::vector<StringRef> const& modes, Optional<ConfigureAutoResult> const& conf ); // Accepts a vector of configuration tokens
Future<ConfigurationResult::Type> changeConfig( Database const& cx, std::map<std::string, std::string> const& m ); // Accepts a full configuration in key/value format (from buildConfiguration)
Future<DatabaseConfiguration> getDatabaseConfiguration( Database const& cx );
Future<Void> waitForFullReplication( Database const& cx );
struct IQuorumChange : ReferenceCounted<IQuorumChange> {
virtual ~IQuorumChange() {}
virtual Future<vector<NetworkAddress>> getDesiredCoordinators( Transaction* tr, vector<NetworkAddress> oldCoordinators, Reference<ClusterConnectionFile>, CoordinatorsResult::Type& ) = 0;


@ -107,10 +107,6 @@ const KeyRangeRef serverTagKeys(
LiteralStringRef("\xff/serverTag/"),
LiteralStringRef("\xff/serverTag0") );
const KeyRef serverTagPrefix = serverTagKeys.begin;
const KeyRef serverTagMaxOldKey = LiteralStringRef("\xff/serverTagMax");
const KeyRangeRef serverTagMaxKeys(
LiteralStringRef("\xff/serverTagMax/"),
LiteralStringRef("\xff/serverTagMax0") );
const KeyRangeRef serverTagConflictKeys(
LiteralStringRef("\xff/serverTagConflict/"),
LiteralStringRef("\xff/serverTagConflict0") );
@ -120,13 +116,6 @@ const KeyRangeRef serverTagHistoryKeys(
LiteralStringRef("\xff/serverTagHistory0") );
const KeyRef serverTagHistoryPrefix = serverTagHistoryKeys.begin;
const Key serverMaxTagKeyFor( int8_t tagLocality ) {
BinaryWriter wr(Unversioned());
wr.serializeBytes( serverTagMaxKeys.begin );
wr << tagLocality;
return wr.toStringRef();
}
const Key serverTagKeyFor( UID serverID ) {
BinaryWriter wr(Unversioned());
wr.serializeBytes( serverTagKeys.begin );
@ -209,36 +198,6 @@ const Key serverTagConflictKeyFor( Tag tag ) {
return wr.toStringRef();
}
const Value serverTagMaxValue( Tag tag ) {
BinaryWriter wr(Unversioned()); //This has to be unversioned because we are using an atomic op to max it
wr << tag;
return wr.toStringRef();
}
Tag decodeServerTagMaxValue( ValueRef const& value ) {
Tag s;
BinaryReader reader( value, Unversioned() );
reader >> s;
return s;
}
Tag decodeServerTagMaxValueOld( ValueRef const& value ) {
Tag s;
BinaryReader reader( value, Unversioned() );
int16_t id;
reader >> id;
if(id == invalidTagOld) {
s = invalidTag;
} else if(id == txsTagOld) {
s = txsTag;
} else {
ASSERT(id >= 0);
s.id = id;
s.locality = tagLocalityUpgraded;
}
return s;
}
const KeyRangeRef tagLocalityListKeys(
LiteralStringRef("\xff/tagLocalityList/"),
LiteralStringRef("\xff/tagLocalityList0") );
@ -269,6 +228,36 @@ int8_t decodeTagLocalityListValue( ValueRef const& value ) {
return s;
}
const KeyRangeRef datacenterReplicasKeys(
LiteralStringRef("\xff\x02/datacenterReplicas/"),
LiteralStringRef("\xff\x02/datacenterReplicas0") );
const KeyRef datacenterReplicasPrefix = datacenterReplicasKeys.begin;
const Key datacenterReplicasKeyFor( Optional<Value> dcID ) {
BinaryWriter wr(AssumeVersion(currentProtocolVersion));
wr.serializeBytes( datacenterReplicasKeys.begin );
wr << dcID;
return wr.toStringRef();
}
const Value datacenterReplicasValue( int const& replicas ) {
BinaryWriter wr(IncludeVersion());
wr << replicas;
return wr.toStringRef();
}
Optional<Value> decodeDatacenterReplicasKey( KeyRef const& key ) {
Optional<Value> dcID;
BinaryReader rd( key.removePrefix(datacenterReplicasKeys.begin), AssumeVersion(currentProtocolVersion) );
rd >> dcID;
return dcID;
}
int decodeDatacenterReplicasValue( ValueRef const& value ) {
int s;
BinaryReader reader( value, IncludeVersion() );
reader >> s;
return s;
}
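Aside (not part of this diff): a round-trip sketch for the new datacenterReplicas encodings, with placeholder values; it assumes the declarations above are in scope:

// The dcId and replica count here are made-up example values.
void checkDatacenterReplicasRoundTrip() {
	Optional<Value> dcId = Value(LiteralStringRef("dc1"));
	Key key = datacenterReplicasKeyFor( dcId );     // "\xff\x02/datacenterReplicas/" + serialized dcId
	Value val = datacenterReplicasValue( 3 );       // e.g. replicas == storageTeamSize

	ASSERT( decodeDatacenterReplicasKey(key).get() == dcId.get() );
	ASSERT( decodeDatacenterReplicasValue(val) == 3 );
}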
// serverListKeys.contains(k) iff k.startsWith( serverListKeys.begin ) because '/'+1 == '0'
const KeyRangeRef serverListKeys(
LiteralStringRef("\xff/serverList/"),


@ -54,14 +54,12 @@ bool serverHasKey( ValueRef storedValue );
extern const KeyRangeRef serverTagKeys;
extern const KeyRef serverTagPrefix;
extern const KeyRef serverTagMaxOldKey;
extern const KeyRangeRef serverTagMaxKeys;
extern const KeyRangeRef serverTagConflictKeys;
extern const KeyRef serverTagConflictPrefix;
extern const KeyRangeRef serverTagHistoryKeys;
extern const KeyRef serverTagHistoryPrefix;
const Key serverMaxTagKeyFor( int8_t tagLocality );
const Key serverTagKeyFor( UID serverID );
const Key serverTagHistoryKeyFor( UID serverID );
const KeyRange serverTagHistoryRangeFor( UID serverID );
@ -71,9 +69,6 @@ UID decodeServerTagKey( KeyRef const& );
Version decodeServerTagHistoryKey( KeyRef const& );
Tag decodeServerTagValue( ValueRef const& );
const Key serverTagConflictKeyFor( Tag );
const Value serverTagMaxValue( Tag );
Tag decodeServerTagMaxValue( ValueRef const& );
Tag decodeServerTagMaxValueOld( ValueRef const& );
// "\xff/tagLocalityList/[[datacenterID]]" := "[[tagLocality]]"
extern const KeyRangeRef tagLocalityListKeys;
@ -83,6 +78,14 @@ const Value tagLocalityListValue( int8_t const& );
Optional<Value> decodeTagLocalityListKey( KeyRef const& );
int8_t decodeTagLocalityListValue( ValueRef const& );
// "\xff\x02/DatacenterReplicas/[[datacenterID]]" := "[[replicas]]"
extern const KeyRangeRef datacenterReplicasKeys;
extern const KeyRef datacenterReplicasPrefix;
const Key datacenterReplicasKeyFor( Optional<Value> dcID );
const Value datacenterReplicasValue( int const& );
Optional<Value> decodeDatacenterReplicasKey( KeyRef const& );
int decodeDatacenterReplicasValue( ValueRef const& );
// "\xff/serverList/[[serverID]]" := "[[StorageServerInterface]]"
// Storage servers are listed here when they are recruited - always before assigning them keys
// Storage servers removed from here are never replaced. The same fdbserver, if re-recruited, will always


@ -33,6 +33,7 @@
<ClInclude Include="ClusterInterface.h" />
<ClInclude Include="CommitTransaction.h" />
<ClInclude Include="CoordinationInterface.h" />
<ClInclude Include="DatabaseConfiguration.h" />
<ActorCompiler Include="DatabaseContext.h" />
<ActorCompiler Include="EventTypes.actor.h">
<EnableCompile Condition="'$(Configuration)|$(Platform)'=='Debug|X64'">false</EnableCompile>
@ -86,6 +87,7 @@
<ActorCompiler Include="BackupAgentBase.actor.cpp" />
<ActorCompiler Include="BackupContainer.actor.cpp" />
<ActorCompiler Include="DatabaseBackupAgent.actor.cpp" />
<ClCompile Include="DatabaseConfiguration.cpp" />
<ClCompile Include="AutoPublicAddress.cpp" />
<ClCompile Include="FDBOptions.g.cpp" />
<ActorCompiler Include="FileBackupAgent.actor.cpp" />


@ -188,7 +188,8 @@ description is not currently required but encouraged.
<Option name="read_lock_aware" code="702"
description="The transaction can read from locked databases."/>
<Option name="first_in_batch" code="710"
description="No other transactions will be applied before this transaction within the same commit version."/>
description="No other transactions will be applied before this transaction within the same commit version."
hidden="true" />
</Scope>
<!-- The enumeration values matter - do not change them without


@ -46,8 +46,8 @@ struct applyMutationsData {
// the same operations will be done on all proxies at the same time. Otherwise, the data stored in
// txnStateStore will become corrupted.
static void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRef> const& mutations, IKeyValueStore* txnStateStore, LogPushData* toCommit, bool *confChange, Reference<ILogSystem> logSystem = Reference<ILogSystem>(), Version popVersion = 0,
KeyRangeMap<std::set<Key> >* vecBackupKeys = NULL, KeyRangeMap<ServerCacheInfo>* keyInfo = NULL, std::map<Key, applyMutationsData>* uid_applyMutationsData = NULL,
RequestStream<CommitTransactionRequest> commit = RequestStream<CommitTransactionRequest>(), Database cx = Database(), NotifiedVersion* commitVersion = NULL, std::map<UID, Reference<StorageInfo>>* storageCache = NULL, bool initialCommit = false ) {
KeyRangeMap<std::set<Key> >* vecBackupKeys = NULL, KeyRangeMap<ServerCacheInfo>* keyInfo = NULL, std::map<Key, applyMutationsData>* uid_applyMutationsData = NULL, RequestStream<CommitTransactionRequest> commit = RequestStream<CommitTransactionRequest>(),
Database cx = Database(), NotifiedVersion* commitVersion = NULL, std::map<UID, Reference<StorageInfo>>* storageCache = NULL, std::map<Tag, Version>* tag_popped = NULL, bool initialCommit = false ) {
for (auto const& m : mutations) {
//TraceEvent("MetadataMutation", dbgid).detail("M", m.toString());
@ -291,8 +291,10 @@ static void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<Mut
if (logSystem && popVersion) {
auto serverKeysCleared = txnStateStore->readRange( range & serverTagKeys ).get(); // read is expected to be immediately available
for(auto &kv : serverKeysCleared) {
TraceEvent("ServerTagRemove").detail("popVersion", popVersion).detail("tag", decodeServerTagValue(kv.value).toString()).detail("server", decodeServerTagKey(kv.key));
Tag tag = decodeServerTagValue(kv.value);
TraceEvent("ServerTagRemove").detail("popVersion", popVersion).detail("tag", tag.toString()).detail("server", decodeServerTagKey(kv.key));
logSystem->pop( popVersion, decodeServerTagValue(kv.value) );
(*tag_popped)[tag] = popVersion;
if(toCommit) {
MutationRef privatized = m;
@ -317,8 +319,10 @@ static void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<Mut
if (logSystem && popVersion) {
auto serverKeysCleared = txnStateStore->readRange( range & serverTagHistoryKeys ).get(); // read is expected to be immediately available
for(auto &kv : serverKeysCleared) {
TraceEvent("ServerTagHistoryRemove").detail("popVersion", popVersion).detail("tag", decodeServerTagValue(kv.value).toString()).detail("version", decodeServerTagHistoryKey(kv.key));
logSystem->pop( popVersion, decodeServerTagValue(kv.value) );
Tag tag = decodeServerTagValue(kv.value);
TraceEvent("ServerTagHistoryRemove").detail("popVersion", popVersion).detail("tag", tag.toString()).detail("version", decodeServerTagHistoryKey(kv.key));
logSystem->pop( popVersion, tag );
(*tag_popped)[tag] = popVersion;
}
}
if(!initialCommit) txnStateStore->clear( range & serverTagHistoryKeys );


@ -242,7 +242,7 @@ public:
return results;
}
std::vector<std::pair<WorkerInterface, ProcessClass>> getWorkersForTlogs( DatabaseConfiguration const& conf, int32_t tLogReplicationFactor, int32_t desired, IRepPolicyRef const& policy, std::map< Optional<Standalone<StringRef>>, int>& id_used, bool checkStable = false, std::set<Optional<Key>> dcIds = std::set<Optional<Key>>() ) {
std::vector<std::pair<WorkerInterface, ProcessClass>> getWorkersForTlogs( DatabaseConfiguration const& conf, int32_t required, int32_t desired, IRepPolicyRef const& policy, std::map< Optional<Standalone<StringRef>>, int>& id_used, bool checkStable = false, std::set<Optional<Key>> dcIds = std::set<Optional<Key>>() ) {
std::map<ProcessClass::Fitness, vector<std::pair<WorkerInterface, ProcessClass>>> fitness_workers;
std::vector<std::pair<WorkerInterface, ProcessClass>> results;
std::vector<LocalityData> unavailableLocals;
@ -272,13 +272,8 @@ public:
for (auto& worker : fitness_workers[(ProcessClass::Fitness) fitness] ) {
logServerMap->add(worker.first.locality, &worker);
}
if (logServerSet->size() < tLogReplicationFactor) {
TraceEvent(SevWarn,"GWFTADTooFew", id)
.detail("Fitness", fitness)
.detail("Processes", logServerSet->size())
.detail("tLogReplicationFactor", tLogReplicationFactor)
.detail("tLogPolicy", policy->info())
.detail("DesiredLogs", desired);
if (logServerSet->size() < required) {
TraceEvent(SevWarn,"GWFTADTooFew", id).detail("Fitness", fitness).detail("Processes", logServerSet->size()).detail("Required", required).detail("tLogPolicy", policy->info()).detail("DesiredLogs", desired);
}
else if (logServerSet->size() <= desired) {
if (logServerSet->validate(policy)) {
@ -288,14 +283,7 @@ public:
bCompleted = true;
break;
}
else {
TraceEvent(SevWarn,"GWFTADNotAcceptable", id)
.detail("Fitness", fitness)
.detail("Processes", logServerSet->size())
.detail("tLogReplicationFactor", tLogReplicationFactor)
.detail("tLogPolicy",policy->info())
.detail("DesiredLogs", desired);
}
TraceEvent(SevWarn,"GWFTADNotAcceptable", id).detail("Fitness", fitness).detail("Processes", logServerSet->size()).detail("Required", required).detail("tLogPolicy",policy->info()).detail("DesiredLogs", desired);
}
// Try to select the desired size, if larger
else {
@ -303,9 +291,7 @@ public:
std::vector<LocalityData> tLocalities;
// Try to find the best team of servers to fulfill the policy
if (findBestPolicySet(bestSet, logServerSet, policy, desired,
SERVER_KNOBS->POLICY_RATING_TESTS, SERVER_KNOBS->POLICY_GENERATIONS))
{
if (findBestPolicySet(bestSet, logServerSet, policy, desired, SERVER_KNOBS->POLICY_RATING_TESTS, SERVER_KNOBS->POLICY_GENERATIONS)) {
results.reserve(results.size() + bestSet.size());
for (auto& entry : bestSet) {
auto object = logServerMap->getObject(entry);
@ -313,53 +299,27 @@ public:
results.push_back(*object);
tLocalities.push_back(object->first.locality);
}
TraceEvent("GWFTADBestResults", id)
.detail("Fitness", fitness)
.detail("Processes", logServerSet->size())
.detail("BestCount", bestSet.size())
.detail("BestZones", ::describeZones(tLocalities))
.detail("BestDataHalls", ::describeDataHalls(tLocalities))
.detail("tLogPolicy", policy->info())
.detail("TotalResults", results.size())
.detail("DesiredLogs", desired);
TraceEvent("GWFTADBestResults", id).detail("Fitness", fitness).detail("Processes", logServerSet->size()).detail("BestCount", bestSet.size()).detail("BestZones", ::describeZones(tLocalities))
.detail("BestDataHalls", ::describeDataHalls(tLocalities)).detail("tLogPolicy", policy->info()).detail("TotalResults", results.size()).detail("DesiredLogs", desired);
bCompleted = true;
break;
}
else {
TraceEvent(SevWarn,"GWFTADNoBest", id)
.detail("Fitness", fitness)
.detail("Processes", logServerSet->size())
.detail("tLogReplicationFactor", tLogReplicationFactor)
.detail("tLogPolicy", policy->info())
.detail("DesiredLogs", desired);
}
TraceEvent(SevWarn,"GWFTADNoBest", id).detail("Fitness", fitness).detail("Processes", logServerSet->size()).detail("Required", required).detail("tLogPolicy", policy->info()).detail("DesiredLogs", desired);
}
}
// If policy cannot be satisfied
if (!bCompleted)
{
std::vector<LocalityData> tLocalities;
for (auto& object : logServerMap->getObjects()) {
tLocalities.push_back(object->first.locality);
}
if (!bCompleted) {
std::vector<LocalityData> tLocalities;
for (auto& object : logServerMap->getObjects()) {
tLocalities.push_back(object->first.locality);
}
TraceEvent(SevWarn, "GetTLogTeamFailed")
.detail("Policy", policy->info())
.detail("Processes", logServerSet->size())
.detail("Workers", id_worker.size())
.detail("FitnessGroups", fitness_workers.size())
.detail("TLogZones", ::describeZones(tLocalities))
.detail("TLogDataHalls", ::describeDataHalls(tLocalities))
.detail("MissingZones", ::describeZones(unavailableLocals))
.detail("MissingDataHalls", ::describeDataHalls(unavailableLocals))
.detail("Replication", tLogReplicationFactor)
.detail("DesiredLogs", desired)
.detail("RatingTests",SERVER_KNOBS->POLICY_RATING_TESTS)
.detail("checkStable", checkStable)
.detail("PolicyGenerations",SERVER_KNOBS->POLICY_GENERATIONS).backtrace();
TraceEvent(SevWarn, "GetTLogTeamFailed").detail("Policy", policy->info()).detail("Processes", logServerSet->size()).detail("Workers", id_worker.size()).detail("FitnessGroups", fitness_workers.size())
.detail("TLogZones", ::describeZones(tLocalities)).detail("TLogDataHalls", ::describeDataHalls(tLocalities)).detail("MissingZones", ::describeZones(unavailableLocals))
.detail("MissingDataHalls", ::describeDataHalls(unavailableLocals)).detail("Required", required).detail("DesiredLogs", desired).detail("RatingTests",SERVER_KNOBS->POLICY_RATING_TESTS)
.detail("checkStable", checkStable).detail("PolicyGenerations",SERVER_KNOBS->POLICY_GENERATIONS).backtrace();
// Free the set
logServerSet->clear();
logServerSet.clear();
throw no_more_servers();
@ -369,16 +329,9 @@ public:
id_used[result.first.locality.processId()]++;
}
TraceEvent("GetTLogTeamDone")
.detail("Completed", bCompleted).detail("Policy", policy->info())
.detail("Results", results.size()).detail("Processes", logServerSet->size())
.detail("Workers", id_worker.size())
.detail("Replication", tLogReplicationFactor)
.detail("Desired", desired)
.detail("RatingTests",SERVER_KNOBS->POLICY_RATING_TESTS)
.detail("PolicyGenerations",SERVER_KNOBS->POLICY_GENERATIONS);
TraceEvent("GetTLogTeamDone").detail("Completed", bCompleted).detail("Policy", policy->info()).detail("Results", results.size()).detail("Processes", logServerSet->size()).detail("Workers", id_worker.size())
.detail("Required", required).detail("Desired", desired).detail("RatingTests",SERVER_KNOBS->POLICY_RATING_TESTS).detail("PolicyGenerations",SERVER_KNOBS->POLICY_GENERATIONS);
// Free the set
logServerSet->clear();
logServerSet.clear();
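Aside (not part of this diff): getWorkersForTlogs now takes a required count in place of the replication factor. A simplified standalone illustration of the selection loop, in plain C++ with dummy types; the real code uses LocalityMap, IRepPolicyRef and findBestPolicySet, and the "policy" here is just "cover N distinct zones":

#include <set>
#include <stdexcept>
#include <string>
#include <vector>

struct Worker { std::string zone; int fitness; };  // lower fitness == better

static bool zonePolicy( const std::vector<Worker>& ws, size_t zonesNeeded ) {
	std::set<std::string> zones;
	for (auto& w : ws) zones.insert(w.zone);
	return zones.size() >= zonesNeeded;
}

std::vector<Worker> pickTlogs( const std::vector<Worker>& all, size_t required, size_t desired, size_t zonesNeeded ) {
	std::vector<Worker> chosen;
	for (int fitness = 0; fitness <= 3; fitness++) {        // walk fitness classes, best first
		for (auto& w : all)
			if (w.fitness == fitness) chosen.push_back(w);  // candidates accumulate across classes
		if (chosen.size() < required) continue;             // too few: "GWFTADTooFew"
		std::vector<Worker> team = chosen;
		if (team.size() > desired) team.resize(desired);    // crude stand-in for findBestPolicySet()
		if (zonePolicy(team, zonesNeeded)) return team;     // policy satisfied: done
	}                                                       // otherwise try the next (worse) fitness class
	throw std::runtime_error("no_more_servers");            // policy cannot be satisfied: "GetTLogTeamFailed"
}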
@ -518,7 +471,7 @@ public:
if( now() - startTime < SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY &&
( ( RoleFitness(remoteLogs, ProcessClass::TLog) > RoleFitness(SERVER_KNOBS->EXPECTED_TLOG_FITNESS, req.configuration.getDesiredLogs()) ) ||
( RoleFitness(logRouters, ProcessClass::LogRouter) > RoleFitness(SERVER_KNOBS->EXPECTED_LOG_ROUTER_FITNESS, req.configuration.getDesiredLogRouters()) ) ) ) {
( RoleFitness(logRouters, ProcessClass::LogRouter) > RoleFitness(SERVER_KNOBS->EXPECTED_LOG_ROUTER_FITNESS, req.logRouterCount) ) ) ) {
throw operation_failed();
}
@ -537,27 +490,18 @@ public:
primaryDC.insert(dcId);
result.dcId = dcId;
Optional<Key> remoteDcId;
RegionInfo region;
for(auto& r : req.configuration.regions) {
if(r.dcId != dcId.get()) {
ASSERT(!remoteDcId.present());
remoteDcId = r.dcId;
} else {
ASSERT(region.dcId == StringRef());
if(r.dcId == dcId.get()) {
region = r;
break;
}
}
if(req.recruitSeedServers) {
auto primaryStorageServers = getWorkersForSeedServers( req.configuration, req.configuration.storagePolicy, dcId );
for(int i = 0; i < primaryStorageServers.size(); i++)
for(int i = 0; i < primaryStorageServers.size(); i++) {
result.storageServers.push_back(primaryStorageServers[i].first);
if(req.configuration.remoteTLogReplicationFactor > 0) {
auto remoteStorageServers = getWorkersForSeedServers( req.configuration, req.configuration.storagePolicy, remoteDcId );
for(int i = 0; i < remoteStorageServers.size(); i++)
result.storageServers.push_back(remoteStorageServers[i].first);
}
}
@ -594,8 +538,10 @@ public:
for(int i = 0; i < proxies.size(); i++)
result.proxies.push_back(proxies[i].first);
auto logRouters = getWorkersForRoleInDatacenter( remoteDcId, ProcessClass::LogRouter, req.configuration.getDesiredLogRouters(), req.configuration, id_used );
result.logRouterCount = logRouters.size() ? logRouters.size() : 1;
auto oldLogRouters = getWorkersForRoleInDatacenter( dcId, ProcessClass::LogRouter, req.maxOldLogRouters, req.configuration, id_used );
for(int i = 0; i < oldLogRouters.size(); i++) {
result.oldLogRouters.push_back(oldLogRouters[i].first);
}
if( now() - startTime < SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY &&
( RoleFitness(tlogs, ProcessClass::TLog) > RoleFitness(SERVER_KNOBS->EXPECTED_TLOG_FITNESS, req.configuration.getDesiredLogs()) ||
@ -656,7 +602,6 @@ public:
throw no_more_servers();
} else {
RecruitFromConfigurationReply result;
result.logRouterCount = 0;
std::map< Optional<Standalone<StringRef>>, int> id_used;
id_used[masterProcessId]++;
id_used[clusterControllerProcessId]++;
@ -700,6 +645,11 @@ public:
result.resolvers.push_back(resolvers[i].first);
for(int i = 0; i < proxies.size(); i++)
result.proxies.push_back(proxies[i].first);
auto oldLogRouters = getWorkersForRoleInDatacenter( dcId, ProcessClass::LogRouter, req.maxOldLogRouters, req.configuration, used );
for(int i = 0; i < oldLogRouters.size(); i++) {
result.oldLogRouters.push_back(oldLogRouters[i].first);
}
break;
} else {
if(fitness < bestFitness) {
@ -795,6 +745,7 @@ public:
std::vector<std::pair<WorkerInterface, ProcessClass>> remote_tlogs;
std::vector<std::pair<WorkerInterface, ProcessClass>> satellite_tlogs;
std::vector<std::pair<WorkerInterface, ProcessClass>> log_routers;
std::set<NetworkAddress> logRouterAddresses;
for( auto& logSet : dbi.logSystemConfig.tLogs ) {
for( auto& it : logSet.tLogs ) {
@ -819,7 +770,10 @@ public:
return false;
if ( tlogWorker->second.priorityInfo.isExcluded )
return true;
log_routers.push_back(std::make_pair(tlogWorker->second.interf, tlogWorker->second.processClass));
if( !logRouterAddresses.count( tlogWorker->second.interf.address() ) ) {
logRouterAddresses.insert( tlogWorker->second.interf.address() );
log_routers.push_back(std::make_pair(tlogWorker->second.interf, tlogWorker->second.processClass));
}
}
}
@ -901,7 +855,14 @@ public:
if(oldRemoteTLogFit < newRemoteTLogFit) return false;
RoleFitness oldLogRoutersFit(log_routers, ProcessClass::LogRouter);
RoleFitness newLogRoutersFit((db.config.remoteTLogReplicationFactor > 0 && dbi.recoveryState == RecoveryState::REMOTE_RECOVERED) ? getWorkersForRoleInDatacenter( *remoteDC.begin(), ProcessClass::LogRouter, db.config.getDesiredLogRouters(), db.config, id_used, Optional<WorkerFitnessInfo>(), true ) : log_routers, ProcessClass::LogRouter);
RoleFitness newLogRoutersFit((db.config.remoteTLogReplicationFactor > 0 && dbi.recoveryState == RecoveryState::REMOTE_RECOVERED) ? getWorkersForRoleInDatacenter( *remoteDC.begin(), ProcessClass::LogRouter, newTLogFit.count, db.config, id_used, Optional<WorkerFitnessInfo>(), true ) : log_routers, ProcessClass::LogRouter);
if(oldLogRoutersFit.count < oldTLogFit.count) {
oldLogRoutersFit.worstFit = ProcessClass::NeverAssign;
}
if(newLogRoutersFit.count < newTLogFit.count) {
newLogRoutersFit.worstFit = ProcessClass::NeverAssign;
}
if(oldLogRoutersFit < newLogRoutersFit) return false;
@ -2061,7 +2022,8 @@ ACTOR Future<Void> clusterController( ServerCoordinators coordinators, Reference
}
}
ACTOR Future<Void> clusterController( Reference<ClusterConnectionFile> connFile, Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> currentCC, Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo ) {
ACTOR Future<Void> clusterController( Reference<ClusterConnectionFile> connFile, Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> currentCC, Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo, Future<Void> recoveredDiskFiles ) {
Void _ = wait(recoveredDiskFiles);
state bool hasConnected = false;
loop {
try {


@ -25,7 +25,7 @@
#include "fdbclient/ClusterInterface.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/MasterProxyInterface.h"
#include "DatabaseConfiguration.h"
#include "fdbclient/DatabaseConfiguration.h"
#include "MasterInterface.h"
#include "TLogInterface.h"
#include "WorkerInterface.h"
@ -67,15 +67,16 @@ struct ClusterControllerFullInterface {
struct RecruitFromConfigurationRequest {
DatabaseConfiguration configuration;
bool recruitSeedServers;
int maxOldLogRouters;
ReplyPromise< struct RecruitFromConfigurationReply > reply;
RecruitFromConfigurationRequest() {}
explicit RecruitFromConfigurationRequest(DatabaseConfiguration const& configuration, bool recruitSeedServers)
: configuration(configuration), recruitSeedServers(recruitSeedServers) {}
explicit RecruitFromConfigurationRequest(DatabaseConfiguration const& configuration, bool recruitSeedServers, int maxOldLogRouters)
: configuration(configuration), recruitSeedServers(recruitSeedServers), maxOldLogRouters(maxOldLogRouters) {}
template <class Ar>
void serialize( Ar& ar ) {
ar & configuration & recruitSeedServers & reply;
ar & configuration & recruitSeedServers & maxOldLogRouters & reply;
}
};
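Aside (not part of this diff): maxOldLogRouters is appended to the serialize() chain, and field order in the "ar & a & b & ..." idiom is the wire format, so sender and receiver must list fields identically. A toy illustration of that idiom (this is not flow's real serializer):

#include <cassert>
#include <cstring>
#include <vector>

struct Writer {
	std::vector<char> bytes;
	template <class T> Writer& operator&( const T& t ) {    // append raw bytes of the next field
		const char* p = reinterpret_cast<const char*>(&t);
		bytes.insert(bytes.end(), p, p + sizeof(T));
		return *this;
	}
};

struct Reader {
	const char* p;
	template <class T> Reader& operator&( T& t ) {          // consume the next field, in order
		std::memcpy(&t, p, sizeof(T));
		p += sizeof(T);
		return *this;
	}
};

int main() {
	bool recruitSeedServers = true; int maxOldLogRouters = 4;
	Writer w; w & recruitSeedServers & maxOldLogRouters;     // write order defines the format

	bool rs = false; int m = 0;
	Reader r{ w.bytes.data() }; r & rs & m;                  // reads must follow the same order
	assert(rs && m == 4);
	return 0;
}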
@ -85,12 +86,12 @@ struct RecruitFromConfigurationReply {
vector<WorkerInterface> proxies;
vector<WorkerInterface> resolvers;
vector<WorkerInterface> storageServers;
int logRouterCount;
vector<WorkerInterface> oldLogRouters;
Optional<Key> dcId;
template <class Ar>
void serialize( Ar& ar ) {
ar & tLogs & satelliteTLogs & proxies & resolvers & storageServers & dcId & logRouterCount;
ar & tLogs & satelliteTLogs & proxies & resolvers & storageServers & oldLogRouters & dcId;
}
};


@ -43,34 +43,36 @@ struct CoreTLogSet {
bool isLocal;
int32_t hasBestPolicy;
int8_t locality;
Version startVersion;
CoreTLogSet() : tLogWriteAntiQuorum(0), tLogReplicationFactor(0), isLocal(true), hasBestPolicy(HasBestPolicyId), locality(-99) {}
CoreTLogSet() : tLogWriteAntiQuorum(0), tLogReplicationFactor(0), isLocal(true), hasBestPolicy(HasBestPolicyId), locality(tagLocalityUpgraded), startVersion(invalidVersion) {}
bool operator == (CoreTLogSet const& rhs) const {
return tLogs == rhs.tLogs && tLogWriteAntiQuorum == rhs.tLogWriteAntiQuorum && tLogReplicationFactor == rhs.tLogReplicationFactor && isLocal == rhs.isLocal &&
hasBestPolicy == rhs.hasBestPolicy && locality == rhs.locality && ((!tLogPolicy && !rhs.tLogPolicy) || (tLogPolicy && rhs.tLogPolicy && (tLogPolicy->info() == rhs.tLogPolicy->info())));
return tLogs == rhs.tLogs && tLogWriteAntiQuorum == rhs.tLogWriteAntiQuorum && tLogReplicationFactor == rhs.tLogReplicationFactor && isLocal == rhs.isLocal && hasBestPolicy == rhs.hasBestPolicy &&
locality == rhs.locality && startVersion == rhs.startVersion && ((!tLogPolicy && !rhs.tLogPolicy) || (tLogPolicy && rhs.tLogPolicy && (tLogPolicy->info() == rhs.tLogPolicy->info())));
}
template <class Archive>
void serialize(Archive& ar) {
ar & tLogs & tLogWriteAntiQuorum & tLogReplicationFactor & tLogPolicy & tLogLocalities & isLocal & hasBestPolicy & locality;
ar & tLogs & tLogWriteAntiQuorum & tLogReplicationFactor & tLogPolicy & tLogLocalities & isLocal & hasBestPolicy & locality & startVersion;
}
};
struct OldTLogCoreData {
std::vector<CoreTLogSet> tLogs;
int32_t logRouterTags;
Version epochEnd;
OldTLogCoreData() : epochEnd(0) {}
OldTLogCoreData() : epochEnd(0), logRouterTags(0) {}
bool operator == (OldTLogCoreData const& rhs) const {
return tLogs == rhs.tLogs && epochEnd == rhs.epochEnd;
return tLogs == rhs.tLogs && logRouterTags == rhs.logRouterTags && epochEnd == rhs.epochEnd;
}
template <class Archive>
void serialize(Archive& ar) {
if( ar.protocolVersion() >= 0x0FDB00A560010001LL) {
ar & tLogs & epochEnd;
ar & tLogs & logRouterTags & epochEnd;
}
else if(ar.isDeserializing) {
tLogs.push_back(CoreTLogSet());
@ -81,11 +83,12 @@ struct OldTLogCoreData {
struct DBCoreState {
std::vector<CoreTLogSet> tLogs;
int32_t logRouterTags;
std::vector<OldTLogCoreData> oldTLogData;
DBRecoveryCount recoveryCount; // Increases with sequential successful recoveries.
int logSystemType;
DBCoreState() : recoveryCount(0), logSystemType(0) {}
DBCoreState() : logRouterTags(0), recoveryCount(0), logSystemType(0) {}
vector<UID> getPriorCommittedLogServers() {
vector<UID> priorCommittedLogServers;
@ -100,7 +103,7 @@ struct DBCoreState {
}
bool isEqual(DBCoreState const& r) const {
return logSystemType == r.logSystemType && recoveryCount == r.recoveryCount && tLogs == r.tLogs && oldTLogData == r.oldTLogData;
return logSystemType == r.logSystemType && recoveryCount == r.recoveryCount && tLogs == r.tLogs && oldTLogData == r.oldTLogData && logRouterTags == r.logRouterTags;
}
bool operator == ( const DBCoreState& rhs ) const { return isEqual(rhs); }
@ -114,7 +117,7 @@ struct DBCoreState {
ASSERT(ar.protocolVersion() >= 0x0FDB00A460010001LL);
if(ar.protocolVersion() >= 0x0FDB00A560010001LL) {
ar & tLogs & oldTLogData & recoveryCount & logSystemType;
ar & tLogs & logRouterTags & oldTLogData & recoveryCount & logSystemType;
} else if(ar.isDeserializing) {
tLogs.push_back(CoreTLogSet());
ar & tLogs[0].tLogs & tLogs[0].tLogWriteAntiQuorum & recoveryCount & tLogs[0].tLogReplicationFactor & logSystemType;
@ -128,6 +131,13 @@ struct DBCoreState {
ar & locality;
tLogs[0].tLogLocalities.push_back(locality);
}
if(oldTLogData.size()) {
tLogs[0].startVersion = oldTLogData[0].epochEnd;
for(int i = 0; i < oldTLogData.size() - 1; i++) {
oldTLogData[i].tLogs[0].startVersion = oldTLogData[i+1].epochEnd;
}
}
}
}
}
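Aside (not part of this diff): when an old-format DBCoreState is deserialized, start versions are backfilled so that each generation of tlogs starts where the next-older generation's epoch ended (oldTLogData is ordered newest-to-oldest, which is what the assignments above imply). A worked example in plain C++ with simplified types:

#include <cassert>
#include <cstdint>
#include <vector>

using Version = int64_t;
struct OldGen { Version epochEnd; Version startVersion = -1; };  // -1 stands in for invalidVersion

int main() {
	// Three older generations, newest first, whose epochs ended at 300, 200 and 100.
	std::vector<OldGen> oldTLogData = { {300}, {200}, {100} };
	Version currentStartVersion = -1;

	if (!oldTLogData.empty()) {
		currentStartVersion = oldTLogData[0].epochEnd;                 // current tlogs start at 300
		for (size_t i = 0; i + 1 < oldTLogData.size(); i++)
			oldTLogData[i].startVersion = oldTLogData[i + 1].epochEnd; // each old generation starts where the next-older one ended
	}

	assert(currentStartVersion == 300);
	assert(oldTLogData[0].startVersion == 200);
	assert(oldTLogData[1].startVersion == 100);
	assert(oldTLogData[2].startVersion == -1);  // the oldest generation keeps the invalid default
	return 0;
}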


@ -268,14 +268,20 @@ struct ServerStatus {
};
typedef AsyncMap<UID, ServerStatus> ServerStatusMap;
ACTOR Future<Void> waitForAllDataRemoved( Database cx, UID serverID ) {
ACTOR Future<Void> waitForAllDataRemoved( Database cx, UID serverID, Version addedVersion ) {
state Transaction tr(cx);
loop {
try {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
bool canRemove = wait( canRemoveStorageServer( &tr, serverID ) );
if (canRemove)
return Void();
Version ver = wait( tr.getReadVersion() );
//we cannot remove a server immediately after adding it, because
if(ver > addedVersion + SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS) {
bool canRemove = wait( canRemoveStorageServer( &tr, serverID ) );
if (canRemove) {
return Void();
}
}
// Wait for any change to the serverKeys for this server
Void _ = wait( delay(SERVER_KNOBS->ALL_DATA_REMOVED_DELAY, TaskDataDistribution) );
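Aside (not part of this diff): the new guard means a just-added storage server is not even considered for removal until the read version has advanced past its addedVersion by MAX_READ_TRANSACTION_LIFE_VERSIONS. A tiny standalone sketch of the check; the knob value below is an assumption, the real one comes from SERVER_KNOBS:

#include <cassert>
#include <cstdint>

using Version = int64_t;

// Assumed value: roughly 5 seconds' worth of versions at ~1,000,000 versions per second.
const Version MAX_READ_TRANSACTION_LIFE_VERSIONS = 5000000;

bool oldEnoughToRemove( Version readVersion, Version addedVersion ) {
	return readVersion > addedVersion + MAX_READ_TRANSACTION_LIFE_VERSIONS;
}

int main() {
	assert( !oldEnoughToRemove( 1000000, 0 ) );  // too soon after the server was added
	assert(  oldEnoughToRemove( 6000001, 0 ) );  // enough versions have passed; removal may proceed
	return 0;
}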
@ -295,7 +301,8 @@ ACTOR Future<Void> storageServerFailureTracker(
ServerStatus *status,
PromiseStream<Void> serverFailures,
int64_t *unhealthyServers,
UID masterId )
UID masterId,
Version addedVersion )
{
loop {
bool unhealthy = statusMap->count(server.id()) && statusMap->get(server.id()).isUnhealthy();
@ -319,7 +326,7 @@ ACTOR Future<Void> storageServerFailureTracker(
TraceEvent("StatusMapChange", masterId).detail("ServerID", server.id()).detail("Status", status->toString()).
detail("Available", IFailureMonitor::failureMonitor().getState(server.waitFailure.getEndpoint()).isAvailable());
}
when ( Void _ = wait( status->isUnhealthy() ? waitForAllDataRemoved(cx, server.id()) : Never() ) ) { break; }
when ( Void _ = wait( status->isUnhealthy() ? waitForAllDataRemoved(cx, server.id(), addedVersion) : Never() ) ) { break; }
}
}
@ -479,7 +486,8 @@ Future<Void> storageServerTracker(
std::map<UID, Reference<TCServerInfo>>* const& other_servers,
PromiseStream< std::pair<UID, Optional<StorageServerInterface>> > const& changes,
PromiseStream<Void> const& serverFailures,
Promise<Void> const& errorOut);
Promise<Void> const& errorOut,
Version const& addedVersion);
Future<Void> teamTracker( struct DDTeamCollection* const& self, Reference<IDataDistributionTeam> const& team );
@ -525,6 +533,8 @@ struct DDTeamCollection {
std::vector<Optional<Key>> includedDCs;
Optional<std::vector<Optional<Key>>> otherTrackedDCs;
bool primary;
Reference<AsyncVar<bool>> processingUnhealthy;
DDTeamCollection(
Database const& cx,
UID masterId,
@ -535,12 +545,13 @@ struct DDTeamCollection {
std::vector<Optional<Key>> includedDCs,
Optional<std::vector<Optional<Key>>> otherTrackedDCs,
PromiseStream< std::pair<UID, Optional<StorageServerInterface>> > const& serverChanges,
Future<Void> readyToStart, Reference<AsyncVar<bool>> zeroHealthyTeams, bool primary )
Future<Void> readyToStart, Reference<AsyncVar<bool>> zeroHealthyTeams, bool primary,
Reference<AsyncVar<bool>> processingUnhealthy)
:cx(cx), masterId(masterId), lock(lock), output(output), shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), doBuildTeams( true ), teamBuilder( Void() ),
configuration(configuration), serverChanges(serverChanges),
initialFailureReactionDelay( delay( BUGGIFY ? 0 : SERVER_KNOBS->INITIAL_FAILURE_REACTION_DELAY, TaskDataDistribution ) ), healthyTeamCount( 0 ),
initializationDoneActor(logOnCompletion(readyToStart && initialFailureReactionDelay, this)), optimalTeamCount( 0 ), recruitingStream(0), restartRecruiting( SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY ),
unhealthyServers(0), includedDCs(includedDCs), otherTrackedDCs(otherTrackedDCs), zeroHealthyTeams(zeroHealthyTeams), zeroOptimalTeams(true), primary(primary)
unhealthyServers(0), includedDCs(includedDCs), otherTrackedDCs(otherTrackedDCs), zeroHealthyTeams(zeroHealthyTeams), zeroOptimalTeams(true), primary(primary), processingUnhealthy(processingUnhealthy)
{
TraceEvent("DDTrackerStarting", masterId)
.detail( "State", "Inactive" )
@ -728,9 +739,9 @@ struct DDTeamCollection {
}
int bestSize = 0;
for( int i = 0; i < req.sources.size(); i++ ) {
if( self->server_info.count( req.sources[i] ) ) {
auto& teamList = self->server_info[ req.sources[i] ]->teams;
for( int i = 0; i < req.completeSources.size(); i++ ) {
if( self->server_info.count( req.completeSources[i] ) ) {
auto& teamList = self->server_info[ req.completeSources[i] ]->teams;
for( int j = 0; j < teamList.size(); j++ ) {
bool found = true;
for( int k = 0; k < teamList[j]->serverIDs.size(); k++ ) {
@ -799,7 +810,7 @@ struct DDTeamCollection {
// we preferentially mark the least used server as undesirable?
for (auto i = initTeams.allServers.begin(); i != initTeams.allServers.end(); ++i) {
if (shouldHandleServer(i->first)) {
addServer(i->first, i->second, serverTrackerErrorOut);
addServer(i->first, i->second, serverTrackerErrorOut, 0);
}
}
@ -1153,7 +1164,7 @@ struct DDTeamCollection {
return (includedDCs.empty() || std::find(includedDCs.begin(), includedDCs.end(), newServer.locality.dcId()) != includedDCs.end() || (otherTrackedDCs.present() && std::find(otherTrackedDCs.get().begin(), otherTrackedDCs.get().end(), newServer.locality.dcId()) == otherTrackedDCs.get().end()));
}
void addServer( StorageServerInterface newServer, ProcessClass processClass, Promise<Void> errorOut ) {
void addServer( StorageServerInterface newServer, ProcessClass processClass, Promise<Void> errorOut, Version addedVersion ) {
if (!shouldHandleServer(newServer)) {
return;
}
@ -1161,7 +1172,7 @@ struct DDTeamCollection {
TraceEvent("AddedStorageServer", masterId).detail("ServerID", newServer.id()).detail("ProcessClass", processClass.toString()).detail("WaitFailureToken", newServer.waitFailure.getEndpoint().token).detail("address", newServer.waitFailure.getEndpoint().address);
auto &r = server_info[newServer.id()] = Reference<TCServerInfo>( new TCServerInfo( newServer, processClass ) );
r->tracker = storageServerTracker( this, cx, r.getPtr(), &server_status, lock, masterId, &server_info, serverChanges, serverFailures, errorOut );
r->tracker = storageServerTracker( this, cx, r.getPtr(), &server_status, lock, masterId, &server_info, serverChanges, serverFailures, errorOut, addedVersion );
restartTeamBuilder.trigger();
}
@ -1265,8 +1276,9 @@ ACTOR Future<Void> teamTracker( DDTeamCollection *self, Reference<IDataDistribut
int serversLeft = teamLocality->size();
bool matchesPolicy = self->configuration.storagePolicy->validate(teamLocality->getEntries(), teamLocality);
if( !self->initialFailureReactionDelay.isReady() )
if( !self->initialFailureReactionDelay.isReady() ) {
change.push_back( self->initialFailureReactionDelay );
}
change.push_back( self->zeroHealthyTeams->onChange() );
bool recheck = (lastReady != self->initialFailureReactionDelay.isReady() || (lastZeroHealthy && !self->zeroHealthyTeams->get())) && (!matchesPolicy || anyUndesired || team->getServerIDs().size() != self->configuration.storageTeamSize);
@ -1327,7 +1339,7 @@ ACTOR Future<Void> teamTracker( DDTeamCollection *self, Reference<IDataDistribut
else
team->setPriority( PRIORITY_TEAM_UNHEALTHY );
}
else if ( team->getServerIDs().size() != self->configuration.storageTeamSize )
else if ( team->getServerIDs().size() != self->configuration.storageTeamSize || anyWrongConfiguration )
team->setPriority( PRIORITY_TEAM_UNHEALTHY );
else if( anyUndesired )
team->setPriority( PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER );
@ -1343,9 +1355,7 @@ ACTOR Future<Void> teamTracker( DDTeamCollection *self, Reference<IDataDistribut
int maxPriority = team->getPriority();
auto teams = self->shardsAffectedByTeamFailure->getTeamsFor( shards[i] );
for( int t=0; t<teams.size(); t++) {
ASSERT( teams[t].servers.size() );
if( self->server_info.count( teams[t].servers[0] ) ) {
if( teams[t].servers.size() && self->server_info.count( teams[t].servers[0] ) ) {
auto& info = self->server_info[teams[t].servers[0]];
bool found = false;
@ -1359,7 +1369,7 @@ ACTOR Future<Void> teamTracker( DDTeamCollection *self, Reference<IDataDistribut
TEST(!found); // A removed team is still associated with a shard in SABTF
} else {
TEST(true); // A removed server is still associated with a team in SABTF
TEST(teams[t].servers.size()); // A removed server is still associated with a team in SABTF
}
}
@ -1513,7 +1523,7 @@ ACTOR Future<Void> waitServerListChange( DDTeamCollection *self, Database cx, Fu
currentInterfaceChanged.send( std::make_pair(ssi,processClass) );
}
} else if( !self->recruitingIds.count(ssi.id()) ) {
self->addServer( ssi, processClass, self->serverTrackerErrorOut );
self->addServer( ssi, processClass, self->serverTrackerErrorOut, tr.getReadVersion().get() );
self->doBuildTeams = true;
}
}
@ -1565,7 +1575,8 @@ ACTOR Future<Void> storageServerTracker(
std::map<UID, Reference<TCServerInfo>>* other_servers,
PromiseStream< std::pair<UID, Optional<StorageServerInterface>> > changes,
PromiseStream<Void> serverFailures,
Promise<Void> errorOut)
Promise<Void> errorOut,
Version addedVersion)
{
state Future<Void> failureTracker;
state ServerStatus status( false, false, server->lastKnownInterface.locality );
@ -1649,7 +1660,7 @@ ACTOR Future<Void> storageServerTracker(
otherChanges.push_back( self->excludedServers.onChange( addr ) );
otherChanges.push_back( self->excludedServers.onChange( ipaddr ) );
failureTracker = storageServerFailureTracker( cx, server->lastKnownInterface, statusMap, &status, serverFailures, &self->unhealthyServers, masterId );
failureTracker = storageServerFailureTracker( cx, server->lastKnownInterface, statusMap, &status, serverFailures, &self->unhealthyServers, masterId, addedVersion );
//We need to recruit new storage servers if the key value store type has changed
if(hasWrongStoreTypeOrDC)
@ -1764,7 +1775,7 @@ ACTOR Future<Void> initializeStorage( DDTeamCollection *self, RecruitStorageRepl
self->recruitingIds.insert(interfaceId);
self->recruitingLocalities.insert(candidateWorker.worker.address());
ErrorOr<StorageServerInterface> newServer = wait( candidateWorker.worker.storage.tryGetReply( isr, TaskDataDistribution ) );
ErrorOr<InitializeStorageReply> newServer = wait( candidateWorker.worker.storage.tryGetReply( isr, TaskDataDistribution ) );
self->recruitingIds.erase(interfaceId);
self->recruitingLocalities.erase(candidateWorker.worker.address());
@ -1780,8 +1791,8 @@ ACTOR Future<Void> initializeStorage( DDTeamCollection *self, RecruitStorageRepl
Void _ = wait( delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY, TaskDataDistribution) );
}
else if( newServer.present() ) {
if( !self->server_info.count( newServer.get().id() ) )
self->addServer( newServer.get(), candidateWorker.processClass, self->serverTrackerErrorOut );
if( !self->server_info.count( newServer.get().interf.id() ) )
self->addServer( newServer.get().interf, candidateWorker.processClass, self->serverTrackerErrorOut, newServer.get().addedVersion );
else
TraceEvent(SevWarn, "DDRecruitmentError").detail("Reason", "Server ID already recruited");
@ -1859,6 +1870,28 @@ ACTOR Future<Void> storageRecruiter( DDTeamCollection *self, Reference<AsyncVar<
}
}
ACTOR Future<Void> updateReplicasKey(DDTeamCollection* self, Optional<Key> dcId) {
Void _ = wait(self->initialFailureReactionDelay);
Void _ = wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY, TaskLowPriority)); //After the team trackers wait on the initial failure reaction delay, they yield. We want to make sure every tracker has had the opportunity to send their relocations to the queue.
while(self->zeroHealthyTeams->get() || self->processingUnhealthy->get()) {
TraceEvent("DDUpdatingStalled", self->masterId).detail("dcId", printable(dcId)).detail("zeroHealthy", self->zeroHealthyTeams->get()).detail("processingUnhealthy", self->processingUnhealthy->get());
Void _ = wait(self->zeroHealthyTeams->onChange() || self->processingUnhealthy->onChange());
}
TraceEvent("DDUpdatingReplicas", self->masterId).detail("dcId", printable(dcId)).detail("replicas", self->configuration.storageTeamSize);
state Transaction tr(self->cx);
loop {
try {
tr.addReadConflictRange(singleKeyRange(datacenterReplicasKeyFor(dcId)));
tr.set(datacenterReplicasKeyFor(dcId), datacenterReplicasValue(self->configuration.storageTeamSize));
Void _ = wait( tr.commit() );
TraceEvent("DDUpdatedReplicas", self->masterId).detail("dcId", printable(dcId)).detail("replicas", self->configuration.storageTeamSize);
return Void();
} catch( Error &e ) {
Void _ = wait( tr.onError(e) );
}
}
}
// Keep track of servers and teams -- serves requests for getRandomTeam
ACTOR Future<Void> dataDistributionTeamCollection(
Reference<InitialDataDistribution> initData,
@ -1875,27 +1908,28 @@ ACTOR Future<Void> dataDistributionTeamCollection(
PromiseStream< std::pair<UID, Optional<StorageServerInterface>> > serverChanges,
Future<Void> readyToStart,
Reference<AsyncVar<bool>> zeroHealthyTeams,
bool primary)
bool primary,
Reference<AsyncVar<bool>> processingUnhealthy)
{
state DDTeamCollection self( cx, masterId, lock, output, shardsAffectedByTeamFailure, configuration, includedDCs, otherTrackedDCs, serverChanges, readyToStart, zeroHealthyTeams, primary );
state DDTeamCollection self( cx, masterId, lock, output, shardsAffectedByTeamFailure, configuration, includedDCs, otherTrackedDCs, serverChanges, readyToStart, zeroHealthyTeams, primary, processingUnhealthy );
state Future<Void> loggingTrigger = Void();
state PromiseStream<Void> serverRemoved;
state Future<Void> interfaceChanges;
state Future<Void> error = actorCollection( self.addActor.getFuture() );
state Future<Void> storageServerRecruitment;
state Future<Void> storageServerRecruitmentMonitor;
state Future<Void> trackExcluded;
TraceEvent("DDTeamCollectionBegin", masterId).detail("primary", primary);
Void _ = wait( readyToStart );
TraceEvent("DDTeamCollectionReadyToStart", masterId).detail("primary", primary);
try {
self.init( *initData );
initData = Reference<InitialDataDistribution>();
storageServerRecruitment = storageRecruiter( &self, db );
storageServerRecruitmentMonitor = monitorStorageServerRecruitment( &self );
interfaceChanges = waitServerListChange( &self, cx, serverRemoved.getFuture() );
trackExcluded = trackExcludedServers( &self, cx );
self.addActor.send(storageRecruiter( &self, db ));
self.addActor.send(monitorStorageServerRecruitment( &self ));
self.addActor.send(waitServerListChange( &self, cx, serverRemoved.getFuture() ));
self.addActor.send(trackExcludedServers( &self, cx ));
if(includedDCs.size()) {
self.addActor.send(updateReplicasKey(&self, includedDCs[0]));
}
// SOMEDAY: Monitor FF/serverList for (new) servers that aren't in allServers and add or remove them
loop choose {
@ -1926,10 +1960,7 @@ ACTOR Future<Void> dataDistributionTeamCollection(
self.countHealthyTeams();
}
when( Void _ = wait( self.serverTrackerErrorOut.getFuture() ) ) {} // Propagate errors from storageServerTracker
when( Void _ = wait( interfaceChanges ) ) {}
when( Void _ = wait( trackExcluded ) ) {}
when( Void _ = wait( error ) ) {}
when( Void _ = wait( storageServerRecruitment ) ) {}
}
} catch (Error& e) {
if (e.code() != error_code_movekeys_conflict)
@ -2022,69 +2053,6 @@ static std::set<int> const& normalDDQueueErrors() {
return s;
}
ACTOR Future<Void> popOldTags( Transaction* tr, Reference<ILogSystem> logSystem, Version recoveryCommitVersion, int8_t tagLocality, std::vector<Tag> tags ) {
Optional<Standalone<StringRef>> val = wait( tr->get( tagLocality == tagLocalityUpgraded ? serverTagMaxOldKey : serverMaxTagKeyFor(tagLocality) ) );
if(!val.present())
return Void();
state Tag maxTag = tagLocality == tagLocalityUpgraded ? decodeServerTagMaxValueOld(val.get()) : decodeServerTagMaxValue( val.get() );
TraceEvent("PopOldTags").detail("maxTag", maxTag.toString());
std::set<Tag> unusedTags;
for(uint16_t i = 0; i <= maxTag.id; i++)
unusedTags.insert(Tag(tagLocality, i));
for(Tag& t : tags) {
if(t.locality == tagLocality) {
unusedTags.erase(t);
}
}
for(auto tag : unusedTags)
logSystem->pop(recoveryCommitVersion, tag);
return Void();
}
ACTOR Future<Void> popOldTags( Database cx, Reference<ILogSystem> logSystem, Version recoveryCommitVersion ) {
state Transaction tr(cx);
if( recoveryCommitVersion == 1 )
return Void();
loop {
try {
state Future<Standalone<RangeResultRef>> fTagLocalities = tr.getRange( tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY );
state Future<Standalone<RangeResultRef>> fTags = tr.getRange( serverTagKeys, CLIENT_KNOBS->TOO_MANY );
state Future<Standalone<RangeResultRef>> fHistoryTags = tr.getRange( serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY );
Void _ = wait( success(fTagLocalities) && success(fTags) && success(fHistoryTags) );
state std::vector<Future<Void>> popActors;
state std::vector<Tag> tags;
for(auto& kv : fTags.get()) {
tags.push_back(decodeServerTagValue( kv.value ));
}
for(auto& kv : fHistoryTags.get()) {
tags.push_back(decodeServerTagValue( kv.value ));
}
//FIXME: we have to check the old locality indefinitely, because we can never be sure when pops have succeeded, we can remove this code when we no longer need to support upgrades from 5.X to 6.0
popActors.push_back(popOldTags(&tr, logSystem, recoveryCommitVersion, tagLocalityUpgraded, tags));
for(auto& kv : fTagLocalities.get()) {
popActors.push_back(popOldTags(&tr, logSystem, recoveryCommitVersion, decodeTagLocalityListValue(kv.value), tags));
}
Void _ = wait( waitForAll(popActors) );
return Void();
} catch( Error &e ) {
Void _ = wait( tr.onError(e) );
}
}
}
ACTOR Future<Void> pollMoveKeysLock( Database cx, MoveKeysLock lock ) {
loop {
Void _ = wait(delay(SERVER_KNOBS->MOVEKEYS_LOCK_POLLING_DELAY));
@ -2108,45 +2076,37 @@ ACTOR Future<Void> dataDistribution(
Version recoveryCommitVersion,
std::vector<Optional<Key>> primaryDcId,
std::vector<Optional<Key>> remoteDcIds,
double* lastLimited)
double* lastLimited,
Future<Void> remoteRecovered)
{
state Database cx = openDBOnServer(db, TaskDataDistributionLaunch, true, true);
cx->locationCacheSize = SERVER_KNOBS->DD_LOCATION_CACHE_SIZE;
state Transaction trVer(cx);
state Transaction tr(cx);
loop {
try {
trVer.setOption( FDBTransactionOptions::ACCESS_SYSTEM_KEYS );
trVer.setOption( FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE );
tr.setOption( FDBTransactionOptions::ACCESS_SYSTEM_KEYS );
tr.setOption( FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE );
if( !g_network->isSimulated() ) {
UID id(g_random->randomUniqueID());
TraceEvent("UpgradeProcessClassTransaction", mi.id())
.detail("TransactionUID", id);
trVer.debugTransaction( id );
Standalone<RangeResultRef> replicaKeys = wait(tr.getRange(datacenterReplicasKeys, CLIENT_KNOBS->TOO_MANY));
for(auto& kv : replicaKeys) {
auto dcId = decodeDatacenterReplicasKey(kv.key);
auto replicas = decodeDatacenterReplicasValue(kv.value);
if((primaryDcId.size() && primaryDcId[0] == dcId) || (remoteDcIds.size() && remoteDcIds[0] == dcId)) {
if(replicas > configuration.storageTeamSize) {
tr.set(kv.key, datacenterReplicasValue(configuration.storageTeamSize));
}
} else {
tr.clear(kv.key);
}
}
Optional<Value> val = wait(trVer.get(processClassVersionKey));
if (val.present())
break;
Standalone<RangeResultRef> processClasses = wait( trVer.getRange( processClassKeys, CLIENT_KNOBS->TOO_MANY ) );
ASSERT( !processClasses.more && processClasses.size() < CLIENT_KNOBS->TOO_MANY );
trVer.clear(processClassKeys);
trVer.set(processClassVersionKey, processClassVersionValue);
for (auto it : processClasses) {
UID processUid = decodeProcessClassKeyOld(it.key);
trVer.set(processClassKeyFor(processUid.toString()), it.value);
}
Void _ = wait(trVer.commit());
TraceEvent("ProcessClassUpgrade");
Void _ = wait(tr.commit());
break;
}
catch(Error &e) {
Void _ = wait( trVer.onError(e) );
Void _ = wait(tr.onError(e));
}
}
@ -2194,6 +2154,7 @@ ACTOR Future<Void> dataDistribution(
state PromiseStream<RelocateShard> output;
state PromiseStream<Promise<int64_t>> getAverageShardBytes;
state PromiseStream<GetMetricsRequest> getShardMetrics;
state Reference<AsyncVar<bool>> processingUnhealthy( new AsyncVar<bool>(false) );
state Promise<Void> readyToStart;
vector<TeamCollectionInterface> tcis;
@ -2234,12 +2195,11 @@ ACTOR Future<Void> dataDistribution(
}
actors.push_back( pollMoveKeysLock(cx, lock) );
actors.push_back( popOldTags( cx, logSystem, recoveryCommitVersion) );
actors.push_back( reportErrorsExcept( dataDistributionTracker( initData, cx, output, getShardMetrics, getAverageShardBytes.getFuture(), readyToStart, anyZeroHealthyTeams, mi.id() ), "DDTracker", mi.id(), &normalDDQueueErrors() ) );
actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, getShardMetrics, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, mi, storageTeamSize, configuration.durableStorageQuorum, lastLimited ), "DDQueue", mi.id(), &normalDDQueueErrors() ) );
actors.push_back( reportErrorsExcept( dataDistributionTeamCollection( initData, tcis[0], cx, db, shardsAffectedByTeamFailure, lock, output, mi.id(), configuration, primaryDcId, configuration.remoteTLogReplicationFactor > 0 ? remoteDcIds : std::vector<Optional<Key>>(), serverChanges, readyToStart.getFuture(), zeroHealthyTeams[0], true ), "DDTeamCollectionPrimary", mi.id(), &normalDDQueueErrors() ) );
actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, mi, storageTeamSize, configuration.durableStorageQuorum, lastLimited ), "DDQueue", mi.id(), &normalDDQueueErrors() ) );
actors.push_back( reportErrorsExcept( dataDistributionTeamCollection( initData, tcis[0], cx, db, shardsAffectedByTeamFailure, lock, output, mi.id(), configuration, primaryDcId, configuration.remoteTLogReplicationFactor > 0 ? remoteDcIds : std::vector<Optional<Key>>(), serverChanges, readyToStart.getFuture(), zeroHealthyTeams[0], true, processingUnhealthy ), "DDTeamCollectionPrimary", mi.id(), &normalDDQueueErrors() ) );
if (configuration.remoteTLogReplicationFactor > 0) {
actors.push_back( reportErrorsExcept( dataDistributionTeamCollection( initData, tcis[1], cx, db, shardsAffectedByTeamFailure, lock, output, mi.id(), configuration, remoteDcIds, Optional<std::vector<Optional<Key>>>(), serverChanges, readyToStart.getFuture(), zeroHealthyTeams[1], false ), "DDTeamCollectionSecondary", mi.id(), &normalDDQueueErrors() ) );
actors.push_back( reportErrorsExcept( dataDistributionTeamCollection( initData, tcis[1], cx, db, shardsAffectedByTeamFailure, lock, output, mi.id(), configuration, remoteDcIds, Optional<std::vector<Optional<Key>>>(), serverChanges, readyToStart.getFuture() && remoteRecovered, zeroHealthyTeams[1], false, processingUnhealthy ), "DDTeamCollectionSecondary", mi.id(), &normalDDQueueErrors() ) );
}
Void _ = wait( waitForAll( actors ) );
@ -2281,7 +2241,8 @@ DDTeamCollection* testTeamCollection(int teamSize, IRepPolicyRef policy, int pro
PromiseStream<std::pair<UID, Optional<StorageServerInterface>>>(),
Future<Void>(Void()),
Reference<AsyncVar<bool>>( new AsyncVar<bool>(true) ),
true
true,
Reference<AsyncVar<bool>>( new AsyncVar<bool>(false) )
);
for(int id = 1; id <= processCount; id++) {

View File

@ -27,7 +27,7 @@ struct RelocateShard {
KeyRange keys;
int priority;
RelocateShard() {}
RelocateShard() : priority(0) {}
RelocateShard( KeyRange const& keys, int priority ) : keys(keys), priority(priority) {}
};
@ -202,7 +202,8 @@ Future<Void> dataDistribution(
Version const& recoveryCommitVersion,
std::vector<Optional<Key>> const& primaryDcId,
std::vector<Optional<Key>> const& remoteDcIds,
double* const& lastLimited);
double* const& lastLimited,
Future<Void> const& remoteRecovered);
Future<Void> dataDistributionTracker(
Reference<InitialDataDistribution> const& initData,
@ -218,6 +219,7 @@ Future<Void> dataDistributionQueue(
Database const& cx,
PromiseStream<RelocateShard> const& input,
PromiseStream<GetMetricsRequest> const& getShardMetrics,
Reference<AsyncVar<bool>> const& processingUnhealthy,
vector<TeamCollectionInterface> const& teamCollection,
Reference<ShardsAffectedByTeamFailure> const& shardsAffectedByTeamFailure,
MoveKeysLock const& lock,

View File

@ -343,7 +343,6 @@ struct DDQueueData {
int activeRelocations;
int queuedRelocations;
int bytesWritten;
std::map<int, int> priority_relocations;
int teamSize;
int durableStorageQuorumPerTeam;
@ -370,6 +369,28 @@ struct DDQueueData {
double lastInterval;
int suppressIntervals;
Reference<AsyncVar<bool>> rawProcessingUnhealthy; //many operations remove a relocation just before adding a new one, so wait a short time before settling on a new value.
std::map<int, int> priority_relocations;
int unhealthyRelocations;
void startRelocation(int priority) {
if(priority >= PRIORITY_TEAM_UNHEALTHY) {
unhealthyRelocations++;
rawProcessingUnhealthy->set(true);
}
priority_relocations[priority]++;
}
void finishRelocation(int priority) {
if(priority >= PRIORITY_TEAM_UNHEALTHY) {
unhealthyRelocations--;
ASSERT(unhealthyRelocations >= 0);
if(unhealthyRelocations == 0) {
rawProcessingUnhealthy->set(false);
}
}
priority_relocations[priority]--;
}
DDQueueData( MasterInterface mi, MoveKeysLock lock, Database cx, std::vector<TeamCollectionInterface> teamCollections,
Reference<ShardsAffectedByTeamFailure> sABTF, PromiseStream<Promise<int64_t>> getAverageShardBytes,
int teamSize, int durableStorageQuorumPerTeam, PromiseStream<RelocateShard> input,
@ -378,7 +399,8 @@ struct DDQueueData {
shardsAffectedByTeamFailure( sABTF ), getAverageShardBytes( getAverageShardBytes ), mi( mi ), lock( lock ),
cx( cx ), teamSize( teamSize ), durableStorageQuorumPerTeam( durableStorageQuorumPerTeam ), input( input ),
getShardMetrics( getShardMetrics ), startMoveKeysParallelismLock( SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM ),
finishMoveKeysParallelismLock( SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM ), lastLimited(lastLimited), suppressIntervals(0), lastInterval(0) {}
finishMoveKeysParallelismLock( SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM ), lastLimited(lastLimited),
suppressIntervals(0), lastInterval(0), unhealthyRelocations(0), rawProcessingUnhealthy( new AsyncVar<bool>(false) ) {}
void validate() {
if( EXPENSIVE_VALIDATION ) {
@ -596,7 +618,7 @@ struct DDQueueData {
/*TraceEvent(rrs.interval.end(), mi.id()).detail("Result","Cancelled")
.detail("WasFetching", foundActiveFetching).detail("Contained", rd.keys.contains( rrs.keys ));*/
queuedRelocations--;
priority_relocations[ rrs.priority ]--;
finishRelocation(rrs.priority);
}
}
@ -623,7 +645,7 @@ struct DDQueueData {
.detail("KeyBegin", printable(rrs.keys.begin)).detail("KeyEnd", printable(rrs.keys.end))
.detail("Priority", rrs.priority).detail("WantsNewServers", rrs.wantsNewServers);*/
queuedRelocations++;
priority_relocations[rrs.priority]++;
startRelocation(rrs.priority);
fetchingSourcesQueue.insert( rrs );
getSourceActors.insert( rrs.keys, getSourceServersForRange( cx, mi, rrs, fetchSourceServersComplete ) );
@ -643,7 +665,7 @@ struct DDQueueData {
.detail("KeyBegin", printable(newData.keys.begin)).detail("KeyEnd", printable(newData.keys.end))
.detail("Priority", newData.priority).detail("WantsNewServers", newData.wantsNewServers);*/
queuedRelocations++;
priority_relocations[newData.priority]++;
startRelocation(newData.priority);
foundActiveRelocation = true;
}
@ -776,7 +798,7 @@ struct DDQueueData {
//TraceEvent(rd.interval.end(), mi.id()).detail("Result","Success");
queuedRelocations--;
priority_relocations[rd.priority]--;
finishRelocation(rd.priority);
// now we are launching: remove this entry from the queue of all the src servers
for( int i = 0; i < rd.src.size(); i++ ) {
@ -804,7 +826,7 @@ struct DDQueueData {
launch( rrs, busymap );
activeRelocations++;
priority_relocations[ rrs.priority ]++;
startRelocation(rrs.priority);
inFlightActors.insert( rrs.keys, dataDistributionRelocator( this, rrs ) );
}
@ -1123,6 +1145,7 @@ ACTOR Future<Void> dataDistributionQueue(
Database cx,
PromiseStream<RelocateShard> input,
PromiseStream<GetMetricsRequest> getShardMetrics,
Reference<AsyncVar<bool>> processingUnhealthy,
std::vector<TeamCollectionInterface> teamCollections,
Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure,
MoveKeysLock lock,
@ -1148,6 +1171,7 @@ ACTOR Future<Void> dataDistributionQueue(
balancingFutures.push_back(BgDDMountainChopper(&self, i));
balancingFutures.push_back(BgDDValleyFiller(&self, i));
}
balancingFutures.push_back(delayedAsyncVar(self.rawProcessingUnhealthy, processingUnhealthy, 0));
try {
loop {
@ -1189,7 +1213,7 @@ ACTOR Future<Void> dataDistributionQueue(
}
when ( RelocateData done = waitNext( self.relocationComplete.getFuture() ) ) {
self.activeRelocations--;
self.priority_relocations[ done.priority ]--;
self.finishRelocation(done.priority);
self.fetchKeysComplete.erase( done );
//self.logRelocation( done, "ShardRelocatorDone" );
actors.add( tag( delay(0, TaskDataDistributionLaunch), done.keys, rangesComplete ) );
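The startRelocation/finishRelocation pair added earlier in this file counts in-flight relocations at or above PRIORITY_TEAM_UNHEALTHY and folds that count into the rawProcessingUnhealthy flag; dataDistributionQueue then publishes it to the team collections through delayedAsyncVar, whose implementation is not part of this diff but is presumably a small debounce so the flag does not flap while one move finishes just before the next starts. A minimal sketch of the bookkeeping with plain std types (the priority constant here is an illustrative value, not the real knob):

#include <cassert>
#include <map>

constexpr int PRIORITY_TEAM_UNHEALTHY = 700; // illustrative value only

struct RelocationBookkeeping {
    std::map<int, int> priorityRelocations;
    int unhealthyRelocations = 0;
    bool rawProcessingUnhealthy = false; // the real code publishes this through an AsyncVar

    void startRelocation(int priority) {
        if (priority >= PRIORITY_TEAM_UNHEALTHY) {
            ++unhealthyRelocations;
            rawProcessingUnhealthy = true;
        }
        ++priorityRelocations[priority];
    }

    void finishRelocation(int priority) {
        if (priority >= PRIORITY_TEAM_UNHEALTHY) {
            --unhealthyRelocations;
            assert(unhealthyRelocations >= 0);
            if (unhealthyRelocations == 0)
                rawProcessingUnhealthy = false; // consumers see this only after the
                                                // delayedAsyncVar smoothing step
        }
        --priorityRelocations[priority];
    }
};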

View File

@ -23,6 +23,7 @@
#pragma once
#include "fdbclient/FDBTypes.h"
#include "Knobs.h"
class IClosable {
public:
@ -77,7 +78,7 @@ protected:
extern IKeyValueStore* keyValueStoreSQLite( std::string const& filename, UID logID, KeyValueStoreType storeType, bool checkChecksums=false, bool checkIntegrity=false );
extern IKeyValueStore* keyValueStoreMemory( std::string const& basename, UID logID, int64_t memoryLimit );
extern IKeyValueStore* keyValueStoreLogSystem( class IDiskQueue* queue, UID logID, int64_t memoryLimit, bool disableSnapshot );
extern IKeyValueStore* keyValueStoreLogSystem( class IDiskQueue* queue, UID logID, int64_t memoryLimit, bool disableSnapshot, bool replaceContent );
inline IKeyValueStore* openKVStore( KeyValueStoreType storeType, std::string const& filename, UID logID, int64_t memoryLimit, bool checkChecksums=false, bool checkIntegrity=false ) {
switch( storeType ) {

View File

@ -56,7 +56,7 @@ extern bool noUnseed;
class KeyValueStoreMemory : public IKeyValueStore, NonCopyable {
public:
KeyValueStoreMemory( IDiskQueue* log, UID id, int64_t memoryLimit, bool disableSnapshot );
KeyValueStoreMemory( IDiskQueue* log, UID id, int64_t memoryLimit, bool disableSnapshot, bool replaceContent );
// IClosable
virtual Future<Void> getError() { return log->getError(); }
@ -154,6 +154,12 @@ public:
if(!recovering.isReady())
return waitAndCommit(this, sequential);
if(!disableSnapshot && replaceContent && !firstCommitWithSnapshot) {
transactionSize += SERVER_KNOBS->REPLACE_CONTENTS_BYTES;
committedWriteBytes += SERVER_KNOBS->REPLACE_CONTENTS_BYTES;
semiCommit();
}
if(transactionIsLarge) {
fullSnapshot(data);
resetSnapshot = true;
@ -186,6 +192,7 @@ public:
committedDataSize = data.sumTo(data.end());
transactionSize = 0;
transactionIsLarge = false;
firstCommitWithSnapshot = false;
addActor.send( commitAndUpdateVersions( this, c, previousSnapshotEnd ) );
return c;
@ -353,6 +360,9 @@ private:
bool resetSnapshot; //Set to true after a fullSnapshot is performed. This causes the regular snapshot mechanism to restart
bool disableSnapshot;
bool replaceContent;
bool firstCommitWithSnapshot;
int snapshotCount;
int64_t memoryLimit; //The upper limit on the memory used by the store (excluding, possibly, some clear operations)
std::vector<std::pair<KeyValueMapPair, uint64_t>> dataSets;
@ -579,6 +589,7 @@ private:
//Snapshots an entire data set
void fullSnapshot( IndexedSet< KeyValueMapPair, uint64_t> &snapshotData ) {
previousSnapshotEnd = log_op(OpSnapshotAbort, StringRef(), StringRef());
replaceContent = false;
//Clear everything since we are about to write the whole database
log_op(OpClearToEnd, allKeys.begin, StringRef());
@ -646,6 +657,10 @@ private:
ASSERT(thisSnapshotEnd >= self->currentSnapshotEnd);
self->previousSnapshotEnd = self->currentSnapshotEnd;
self->currentSnapshotEnd = thisSnapshotEnd;
if(++self->snapshotCount == 2) {
self->replaceContent = false;
}
nextKey = Key();
nextKeyAfter = false;
snapItems = 0;
@ -689,10 +704,9 @@ private:
}
};
KeyValueStoreMemory::KeyValueStoreMemory( IDiskQueue* log, UID id, int64_t memoryLimit, bool disableSnapshot )
: log(log), id(id), previousSnapshotEnd(-1), currentSnapshotEnd(-1),
resetSnapshot(false), memoryLimit(memoryLimit), committedWriteBytes(0),
committedDataSize(0), transactionSize(0), transactionIsLarge(false), disableSnapshot(disableSnapshot)
KeyValueStoreMemory::KeyValueStoreMemory( IDiskQueue* log, UID id, int64_t memoryLimit, bool disableSnapshot, bool replaceContent )
: log(log), id(id), previousSnapshotEnd(-1), currentSnapshotEnd(-1), resetSnapshot(false), memoryLimit(memoryLimit), committedWriteBytes(0),
committedDataSize(0), transactionSize(0), transactionIsLarge(false), disableSnapshot(disableSnapshot), replaceContent(replaceContent), snapshotCount(0), firstCommitWithSnapshot(true)
{
recovering = recover( this );
snapshotting = snapshot( this );
@ -702,9 +716,9 @@ KeyValueStoreMemory::KeyValueStoreMemory( IDiskQueue* log, UID id, int64_t memor
IKeyValueStore* keyValueStoreMemory( std::string const& basename, UID logID, int64_t memoryLimit ) {
TraceEvent("KVSMemOpening", logID).detail("Basename", basename).detail("MemoryLimit", memoryLimit);
IDiskQueue *log = openDiskQueue( basename, logID );
return new KeyValueStoreMemory( log, logID, memoryLimit, false );
return new KeyValueStoreMemory( log, logID, memoryLimit, false, false );
}
IKeyValueStore* keyValueStoreLogSystem( class IDiskQueue* queue, UID logID, int64_t memoryLimit, bool disableSnapshot ) {
return new KeyValueStoreMemory( queue, logID, memoryLimit, disableSnapshot );
IKeyValueStore* keyValueStoreLogSystem( class IDiskQueue* queue, UID logID, int64_t memoryLimit, bool disableSnapshot, bool replaceContent ) {
return new KeyValueStoreMemory( queue, logID, memoryLimit, disableSnapshot, replaceContent );
}
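The new replaceContent mode treats whatever is already in the disk queue as stale: while the flag is set, every commit after the first charges REPLACE_CONTENTS_BYTES of artificial write volume (when snapshotting is enabled) so the background snapshot keeps advancing, and the flag is cleared once either an explicit fullSnapshot runs or two incremental snapshot passes have covered the key space. A small standalone sketch of that flag-and-counter interplay, with simplified byte accounting and illustrative names:

#include <cstdint>
#include <iostream>

struct ReplaceContentState {
    bool replaceContent;
    bool snapshotEnabled = true;             // i.e. !disableSnapshot
    bool firstCommitWithSnapshot = true;
    int  snapshotCount = 0;
    int64_t committedWriteBytes = 0;
    static constexpr int64_t REPLACE_CONTENTS_BYTES = 100000; // default knob value in this patch

    explicit ReplaceContentState(bool replace) : replaceContent(replace) {}

    // Mirrors the new branch in KeyValueStoreMemory::commit().
    void onCommit() {
        if (snapshotEnabled && replaceContent && !firstCommitWithSnapshot)
            committedWriteBytes += REPLACE_CONTENTS_BYTES; // push the snapshotter forward
        firstCommitWithSnapshot = false;
    }

    // A whole-database snapshot rewrites everything at once.
    void onFullSnapshot() { replaceContent = false; }

    // Called each time the incremental snapshot loop wraps around the key space.
    void onSnapshotPass() {
        if (++snapshotCount == 2)
            replaceContent = false; // the second pass guarantees full coverage even if the first was partial
    }
};

int main() {
    ReplaceContentState s(true);
    s.onCommit();                   // first commit: no extra bytes
    s.onCommit();                   // later commits keep the snapshot moving
    s.onSnapshotPass();
    s.onSnapshotPass();
    std::cout << std::boolalpha << s.replaceContent << "\n"; // false
}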

View File

@ -201,6 +201,9 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( SPRING_CLEANING_MIN_VACUUM_PAGES, 1 ); if( randomize && BUGGIFY ) SPRING_CLEANING_MIN_VACUUM_PAGES = g_random->randomInt(0, 100);
init( SPRING_CLEANING_MAX_VACUUM_PAGES, 1e9 ); if( randomize && BUGGIFY ) SPRING_CLEANING_MAX_VACUUM_PAGES = g_random->coinflip() ? 0 : g_random->randomInt(1, 1e4);
// KeyValueStoreMemory
init( REPLACE_CONTENTS_BYTES, 1e5 ); if( randomize && BUGGIFY ) REPLACE_CONTENTS_BYTES = 1e3;
// Leader election
bool longLeaderElection = randomize && BUGGIFY;
init( CANDIDATE_MIN_DELAY, 0.05 );
@ -298,8 +301,9 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( METRIC_UPDATE_RATE, .1 ); if( slowRateKeeper ) METRIC_UPDATE_RATE = 0.5;
bool smallStorageTarget = randomize && BUGGIFY;
init( TARGET_BYTES_PER_STORAGE_SERVER, 1000e6 ); if( smallStorageTarget ) TARGET_BYTES_PER_STORAGE_SERVER = 1000e3;
init( SPRING_BYTES_STORAGE_SERVER, 100e6 ); if( smallStorageTarget ) SPRING_BYTES_STORAGE_SERVER = 100e3;
init( TARGET_BYTES_PER_STORAGE_SERVER, 1000e6 ); if( smallStorageTarget ) TARGET_BYTES_PER_STORAGE_SERVER = 3000e3;
init( SPRING_BYTES_STORAGE_SERVER, 100e6 ); if( smallStorageTarget ) SPRING_BYTES_STORAGE_SERVER = 300e3;
init( STORAGE_HARD_LIMIT_BYTES, 1500e6 ); if( smallStorageTarget ) STORAGE_HARD_LIMIT_BYTES = 4500e3;
bool smallTlogTarget = randomize && BUGGIFY;
init( TARGET_BYTES_PER_TLOG, 2000e6 ); if( smallTlogTarget ) TARGET_BYTES_PER_TLOG = 2000e3;
@ -330,7 +334,6 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( FETCH_BLOCK_BYTES, 2e6 );
init( FETCH_KEYS_PARALLELISM_BYTES, 5e6 ); if( randomize && BUGGIFY ) FETCH_KEYS_PARALLELISM_BYTES = 4e6;
init( BUGGIFY_BLOCK_BYTES, 10000 );
init( STORAGE_HARD_LIMIT_BYTES, 1500e6 ); if( randomize && BUGGIFY ) STORAGE_HARD_LIMIT_BYTES = 1500e3;
init( STORAGE_COMMIT_BYTES, 10000000 ); if( randomize && BUGGIFY ) STORAGE_COMMIT_BYTES = 2000000;
init( STORAGE_COMMIT_INTERVAL, 0.5 ); if( randomize && BUGGIFY ) STORAGE_COMMIT_INTERVAL = 2.0;
init( UPDATE_SHARD_VERSION_INTERVAL, 0.25 ); if( randomize && BUGGIFY ) UPDATE_SHARD_VERSION_INTERVAL = 1.0;

View File

@ -149,6 +149,9 @@ public:
int SPRING_CLEANING_MIN_VACUUM_PAGES;
int SPRING_CLEANING_MAX_VACUUM_PAGES;
// KeyValueStoreMemory
int64_t REPLACE_CONTENTS_BYTES;
// Leader election
double CANDIDATE_MIN_DELAY;
double CANDIDATE_MAX_DELAY;

View File

@ -37,15 +37,17 @@ struct LogRouterData {
struct TagData : NonCopyable, public ReferenceCounted<TagData> {
std::deque<std::pair<Version, LengthPrefixedStringRef>> version_messages;
Version popped;
Version knownCommittedVersion;
Tag tag;
TagData( Tag tag, Version popped ) : tag(tag), popped(popped) {}
TagData( Tag tag, Version popped, Version knownCommittedVersion ) : tag(tag), popped(popped), knownCommittedVersion(knownCommittedVersion) {}
TagData(TagData&& r) noexcept(true) : version_messages(std::move(r.version_messages)), tag(r.tag), popped(r.popped) {}
TagData(TagData&& r) noexcept(true) : version_messages(std::move(r.version_messages)), tag(r.tag), popped(r.popped), knownCommittedVersion(r.knownCommittedVersion) {}
void operator= (TagData&& r) noexcept(true) {
version_messages = std::move(r.version_messages);
tag = r.tag;
popped = r.popped;
knownCommittedVersion = r.knownCommittedVersion;
}
// Erase messages not needed to update *from* versions >= before (thus, messages with toversion <= before)
@ -75,10 +77,12 @@ struct LogRouterData {
UID dbgid;
Reference<AsyncVar<Reference<ILogSystem>>> logSystem;
NotifiedVersion version;
Version minPopped;
NotifiedVersion minPopped;
Version startVersion;
Deque<std::pair<Version, Standalone<VectorRef<uint8_t>>>> messageBlocks;
Tag routerTag;
int logSet;
bool allowPops;
LogSet logSet;
std::vector<Reference<TagData>> tag_data; //we only store data for the remote tag locality
@ -91,13 +95,28 @@ struct LogRouterData {
}
//only callable after getTagData returns a null reference
Reference<TagData> createTagData(Tag tag, Version popped) {
Reference<TagData> newTagData = Reference<TagData>( new TagData(tag, popped) );
Reference<TagData> createTagData(Tag tag, Version popped, Version knownCommittedVersion) {
Reference<TagData> newTagData = Reference<TagData>( new TagData(tag, popped, knownCommittedVersion) );
tag_data[tag.id] = newTagData;
return newTagData;
}
LogRouterData(UID dbgid, Tag routerTag, int logSet) : dbgid(dbgid), routerTag(routerTag), logSet(logSet), logSystem(new AsyncVar<Reference<ILogSystem>>()) {}
LogRouterData(UID dbgid, InitializeLogRouterRequest req) : dbgid(dbgid), routerTag(req.routerTag), logSystem(new AsyncVar<Reference<ILogSystem>>()), version(req.startVersion-1), minPopped(req.startVersion-1), startVersion(req.startVersion), allowPops(false) {
//setup just enough of a logSet to be able to call getPushLocations
logSet.logServers.resize(req.tLogLocalities.size());
logSet.tLogPolicy = req.tLogPolicy;
logSet.hasBestPolicy = req.hasBestPolicy;
logSet.locality = req.locality;
logSet.updateLocalitySet(req.tLogLocalities);
for(int i = 0; i < req.tLogLocalities.size(); i++) {
Tag tag(tagLocalityRemoteLog, i);
auto tagData = getTagData(tag);
if(!tagData) {
tagData = createTagData(tag, 0, 0);
}
}
}
};
void commitMessages( LogRouterData* self, Version version, const std::vector<TagsAndMessage>& taggedMessages ) {
@ -134,7 +153,7 @@ void commitMessages( LogRouterData* self, Version version, const std::vector<Tag
for(auto& tag : msg.tags) {
auto tagData = self->getTagData(tag);
if(!tagData) {
tagData = self->createTagData(tag, 0);
tagData = self->createTagData(tag, 0, 0);
}
if (version >= tagData->popped) {
@ -150,10 +169,10 @@ void commitMessages( LogRouterData* self, Version version, const std::vector<Tag
self->messageBlocks.push_back( std::make_pair(version, block) );
}
ACTOR Future<Void> pullAsyncData( LogRouterData *self, Tag tag ) {
ACTOR Future<Void> pullAsyncData( LogRouterData *self ) {
state Future<Void> dbInfoChange = Void();
state Reference<ILogSystem::IPeekCursor> r;
state Version tagAt = self->version.get()+1;
state Version tagAt = self->version.get() + 1;
state Version tagPopped = 0;
state Version lastVer = 0;
state std::vector<int> tags;
@ -167,7 +186,7 @@ ACTOR Future<Void> pullAsyncData( LogRouterData *self, Tag tag ) {
when( Void _ = wait( dbInfoChange ) ) { //FIXME: does this actually happen?
if(r) tagPopped = std::max(tagPopped, r->popped());
if( self->logSystem->get() )
r = self->logSystem->get()->peekSingle( tagAt, tag );
r = self->logSystem->get()->peekLogRouter( self->dbgid, tagAt, self->routerTag );
else
r = Reference<ILogSystem::IPeekCursor>();
dbInfoChange = self->logSystem->onChange();
@ -175,13 +194,14 @@ ACTOR Future<Void> pullAsyncData( LogRouterData *self, Tag tag ) {
}
}
Version ver = 0;
std::vector<TagsAndMessage> messages;
state Version ver = 0;
state std::vector<TagsAndMessage> messages;
while (true) {
bool foundMessage = r->hasMessage();
state bool foundMessage = r->hasMessage();
if (!foundMessage || r->version().version != ver) {
ASSERT(r->version().version > lastVer);
if (ver) {
Void _ = wait(self->minPopped.whenAtLeast(std::min(self->version.get(), ver - SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS)));
commitMessages(self, ver, messages);
self->version.set( ver );
//TraceEvent("LogRouterVersion").detail("ver",ver);
@ -193,6 +213,7 @@ ACTOR Future<Void> pullAsyncData( LogRouterData *self, Tag tag ) {
if (!foundMessage) {
ver--; //ver is the next possible version we will get data for
if(ver > self->version.get()) {
Void _ = wait(self->minPopped.whenAtLeast(std::min(self->version.get(), ver - SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS)));
self->version.set( ver );
}
break;
@ -202,7 +223,7 @@ ACTOR Future<Void> pullAsyncData( LogRouterData *self, Tag tag ) {
TagsAndMessage tagAndMsg;
tagAndMsg.message = r->getMessageWithTags();
tags.clear();
self->logSystem->get()->addRemoteTags(self->logSet, r->getTags(), tags);
self->logSet.getPushLocations(r->getTags(), tags, 0);
for(auto t : tags) {
tagAndMsg.tags.push_back(Tag(tagLocalityRemoteLog, t));
}
@ -211,7 +232,7 @@ ACTOR Future<Void> pullAsyncData( LogRouterData *self, Tag tag ) {
r->nextMessage();
}
tagAt = r->version().version;
tagAt = std::max( r->version().version, self->version.get() + 1 );
}
}
@ -236,7 +257,7 @@ void peekMessagesFromMemory( LogRouterData* self, TLogPeekRequest const& req, Bi
for(; it != deque.end(); ++it) {
if(it->first != currentVersion) {
if (messages.getLength() >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) {
endVersion = it->first;
endVersion = currentVersion + 1;
//TraceEvent("tLogPeekMessagesReached2", self->dbgid);
break;
}
@ -273,13 +294,11 @@ ACTOR Future<Void> logRouterPeekMessages( LogRouterData* self, TLogPeekRequest r
Version poppedVer = poppedVersion(self, req.tag);
if(poppedVer > req.begin) {
//TraceEvent("LogRouterPeek3", self->dbgid);
TLogPeekReply rep;
rep.maxKnownVersion = self->version.get();
rep.popped = poppedVer;
rep.end = poppedVer;
req.reply.send( rep );
if(poppedVer > req.begin || req.begin < self->startVersion) {
//This should only happen if a packet is sent multiple times and the reply is not needed.
// Since we are using popped differently, do not send a reply.
TraceEvent(SevWarnAlways, "LogRouterPeekPopped", self->dbgid).detail("begin", req.begin).detail("popped", poppedVer).detail("start", self->startVersion);
req.reply.send( Never() );
return Void();
}
@ -289,6 +308,7 @@ ACTOR Future<Void> logRouterPeekMessages( LogRouterData* self, TLogPeekRequest r
TLogPeekReply reply;
reply.maxKnownVersion = self->version.get();
reply.messages = messages.toStringRef();
reply.popped = self->minPopped.get() >= self->startVersion ? self->minPopped.get() : 0;
reply.end = endVersion;
req.reply.send( reply );
@ -299,16 +319,19 @@ ACTOR Future<Void> logRouterPeekMessages( LogRouterData* self, TLogPeekRequest r
ACTOR Future<Void> logRouterPop( LogRouterData* self, TLogPopRequest req ) {
auto tagData = self->getTagData(req.tag);
if (!tagData) {
tagData = self->createTagData(req.tag, req.to);
tagData = self->createTagData(req.tag, req.to, req.knownCommittedVersion);
} else if (req.to > tagData->popped) {
tagData->popped = req.to;
tagData->knownCommittedVersion = req.knownCommittedVersion;
Void _ = wait(tagData->eraseMessagesBefore( req.to, self, TaskTLogPop ));
}
state Version minPopped = std::numeric_limits<Version>::max();
state Version minKnownCommittedVersion = std::numeric_limits<Version>::max();
for( auto it : self->tag_data ) {
if(it) {
minPopped = std::min( it->popped, minPopped );
minKnownCommittedVersion = std::min( it->knownCommittedVersion, minKnownCommittedVersion );
}
}
@ -317,35 +340,31 @@ ACTOR Future<Void> logRouterPop( LogRouterData* self, TLogPopRequest req ) {
Void _ = wait(yield(TaskUpdateStorage));
}
if(self->logSystem->get()) {
self->logSystem->get()->pop(minPopped, self->routerTag);
if(self->logSystem->get() && self->allowPops) {
self->logSystem->get()->pop(minKnownCommittedVersion - SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS, self->routerTag);
}
req.reply.send(Void());
self->minPopped.set(std::max(minPopped, self->minPopped.get()));
return Void();
}
ACTOR Future<Void> logRouterCore(
TLogInterface interf,
Tag tag,
int logSet,
InitializeLogRouterRequest req,
Reference<AsyncVar<ServerDBInfo>> db)
{
state LogRouterData logRouterData(interf.id(), tag, logSet);
state LogRouterData logRouterData(interf.id(), req);
state PromiseStream<Future<Void>> addActor;
state Future<Void> error = actorCollection( addActor.getFuture() );
state Future<Void> dbInfoChange = Void();
addActor.send( pullAsyncData(&logRouterData, tag) );
addActor.send( pullAsyncData(&logRouterData) );
loop choose {
when( Void _ = wait( dbInfoChange ) ) {
dbInfoChange = db->onChange();
if( db->get().recoveryState >= RecoveryState::FULLY_RECOVERED && logSet < db->get().logSystemConfig.tLogs.size() &&
std::count( db->get().logSystemConfig.tLogs[logSet].logRouters.begin(), db->get().logSystemConfig.tLogs[logSet].logRouters.end(), interf.id() ) ) {
logRouterData.logSystem->set(ILogSystem::fromServerDBInfo( logRouterData.dbgid, db->get() ));
} else {
logRouterData.logSystem->set(Reference<ILogSystem>());
}
logRouterData.allowPops = db->get().recoveryState == 7;
logRouterData.logSystem->set(ILogSystem::fromServerDBInfo( logRouterData.dbgid, db->get() ));
}
when( TLogPeekRequest req = waitNext( interf.peekMessages.getFuture() ) ) {
addActor.send( logRouterPeekMessages( &logRouterData, req ) );
@ -357,11 +376,28 @@ ACTOR Future<Void> logRouterCore(
}
}
ACTOR Future<Void> checkRemoved(Reference<AsyncVar<ServerDBInfo>> db, uint64_t recoveryCount, TLogInterface myInterface, int logSet) {
ACTOR Future<Void> checkRemoved(Reference<AsyncVar<ServerDBInfo>> db, uint64_t recoveryCount, TLogInterface myInterface) {
loop{
if ( ( (db->get().recoveryCount > recoveryCount && db->get().recoveryState != 0) || (db->get().recoveryCount == recoveryCount && db->get().recoveryState == 7) ) &&
( logSet >= db->get().logSystemConfig.expectedLogSets || ( logSet < db->get().logSystemConfig.tLogs.size() &&
!std::count(db->get().logSystemConfig.tLogs[logSet].logRouters.begin(), db->get().logSystemConfig.tLogs[logSet].logRouters.end(), myInterface.id()) ) )) {
bool isDisplaced = ( (db->get().recoveryCount > recoveryCount && db->get().recoveryState != 0) || (db->get().recoveryCount == recoveryCount && db->get().recoveryState == 7) );
if(isDisplaced) {
for(auto& log : db->get().logSystemConfig.tLogs) {
if( std::count( log.logRouters.begin(), log.logRouters.end(), myInterface.id() ) ) {
isDisplaced = false;
break;
}
}
}
if(isDisplaced) {
for(auto& old : db->get().logSystemConfig.oldTLogs) {
for(auto& log : old.tLogs) {
if( std::count( log.logRouters.begin(), log.logRouters.end(), myInterface.id() ) ) {
isDisplaced = false;
break;
}
}
}
}
if (isDisplaced) {
throw worker_removed();
}
Void _ = wait(db->onChange());
@ -374,10 +410,11 @@ ACTOR Future<Void> logRouter(
Reference<AsyncVar<ServerDBInfo>> db)
{
try {
state Future<Void> core = logRouterCore(interf, req.routerTag, req.logSet, db);
TraceEvent("LogRouterStart", interf.id()).detail("start", req.startVersion).detail("tag", req.routerTag.toString()).detail("localities", req.tLogLocalities.size()).detail("hasBestPolicy", req.hasBestPolicy).detail("locality", req.locality);
state Future<Void> core = logRouterCore(interf, req, db);
loop choose{
when(Void _ = wait(core)) { return Void(); }
when(Void _ = wait(checkRemoved(db, req.recoveryCount, interf, req.logSet))) {}
when(Void _ = wait(checkRemoved(db, req.recoveryCount, interf))) {}
}
}
catch (Error& e) {
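Two version bounds drive the new log router behavior above: pullAsyncData waits on minPopped so that it never buffers more than MAX_READ_TRANSACTION_LIFE_VERSIONS of unpopped data, and logRouterPop takes the minimum popped version and minimum knownCommittedVersion across all tags, popping the router's own tag from the upstream logs at the latter minus MAX_READ_TRANSACTION_LIFE_VERSIONS once pops are allowed. A standalone sketch of the two minimum computations, with plain structs standing in for TagData and an illustrative knob value:

#include <algorithm>
#include <cstdint>
#include <limits>
#include <memory>
#include <utility>
#include <vector>

using Version = int64_t;
constexpr Version MAX_READ_TRANSACTION_LIFE_VERSIONS = 5000000; // illustrative stand-in for the knob

struct TagData {
    Version popped = 0;
    Version knownCommittedVersion = 0;
};

// Mirrors the loop over tag_data in logRouterPop: returns the new minPopped and
// the version at which the router may pop its own router tag upstream
// (clamped at 0 here for the sketch).
std::pair<Version, Version> popBounds(const std::vector<std::shared_ptr<TagData>>& tagData) {
    Version minPopped = std::numeric_limits<Version>::max();
    Version minKnownCommitted = std::numeric_limits<Version>::max();
    for (const auto& t : tagData) {
        if (!t) continue; // tags with no data yet are skipped, as in the original
        minPopped = std::min(minPopped, t->popped);
        minKnownCommitted = std::min(minKnownCommitted, t->knownCommittedVersion);
    }
    return { minPopped, std::max<Version>(0, minKnownCommitted - MAX_READ_TRANSACTION_LIFE_VERSIONS) };
}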

View File

@ -24,7 +24,7 @@
#include "TLogInterface.h"
#include "WorkerInterface.h"
#include "DatabaseConfiguration.h"
#include "fdbclient/DatabaseConfiguration.h"
#include "flow/IndexedSet.h"
#include "fdbrpc/ReplicationPolicy.h"
#include "fdbrpc/Locality.h"
@ -46,8 +46,32 @@ public:
bool isLocal;
int32_t hasBestPolicy;
int8_t locality;
Version startVersion;
std::vector<Future<TLogLockResult>> replies;
LogSet() : tLogWriteAntiQuorum(0), tLogReplicationFactor(0), isLocal(true), hasBestPolicy(HasBestPolicyId), locality(-99) {}
LogSet() : tLogWriteAntiQuorum(0), tLogReplicationFactor(0), isLocal(true), hasBestPolicy(HasBestPolicyId), locality(tagLocalityInvalid), startVersion(invalidVersion) {}
std::string logRouterString() {
std::string result;
for(int i = 0; i < logRouters.size(); i++) {
if(i>0) {
result += ", ";
}
result += logRouters[i]->get().id().toString();
}
return result;
}
std::string logServerString() {
std::string result;
for(int i = 0; i < logServers.size(); i++) {
if(i>0) {
result += ", ";
}
result += logServers[i]->get().id().toString();
}
return result;
}
int bestLocationFor( Tag tag ) {
if(hasBestPolicy == HasBestPolicyNone) {
@ -81,7 +105,7 @@ public:
}
}
void updateLocalitySet( vector<WorkerInterface> const& workers ) {
void updateLocalitySet( vector<LocalityData> const& localities ) {
LocalityMap<int>* logServerMap;
logServerSet = LocalitySetRef(new LocalityMap<int>());
@ -89,12 +113,12 @@ public:
logEntryMap.clear();
logIndexArray.clear();
logIndexArray.reserve(workers.size());
logIndexArray.reserve(localities.size());
for( int i = 0; i < workers.size(); i++ ) {
for( int i = 0; i < localities.size(); i++ ) {
ASSERT(logEntryMap.find(i) == logEntryMap.end());
logIndexArray.push_back(i);
logEntryMap[logIndexArray.back()] = logServerMap->add(workers[i].locality, &logIndexArray.back());
logEntryMap[logIndexArray.back()] = logServerMap->add(localities[i], &logIndexArray.back());
}
}
@ -241,39 +265,23 @@ struct ILogSystem {
Future<Void> interfaceChanged;
ServerPeekCursor( Reference<AsyncVar<OptionalInterface<TLogInterface>>> const& interf, Tag tag, Version begin, Version end, bool returnIfBlocked, bool parallelGetMore );
ServerPeekCursor( TLogPeekReply const& results, LogMessageVersion const& messageVersion, LogMessageVersion const& end, int32_t messageLength, int32_t rawLength, bool hasMsg, Version poppedVersion, Tag tag );
virtual Reference<IPeekCursor> cloneNoMore();
virtual void setProtocolVersion( uint64_t version );
virtual Arena& arena();
virtual ArenaReader* reader();
virtual bool hasMessage();
virtual void nextMessage();
virtual StringRef getMessage();
virtual StringRef getMessageWithTags();
virtual const std::vector<Tag>& getTags();
virtual void advanceTo(LogMessageVersion n);
virtual Future<Void> getMore(int taskID = TaskTLogPeekReply);
virtual Future<Void> onFailed();
virtual bool isActive();
virtual bool isExhausted();
virtual LogMessageVersion version();
virtual Version popped();
virtual void addref() {
@ -288,6 +296,7 @@ struct ILogSystem {
};
struct MergedPeekCursor : IPeekCursor, ReferenceCounted<MergedPeekCursor> {
LocalityGroup localityGroup;
vector< Reference<IPeekCursor> > serverCursors;
std::vector< std::pair<LogMessageVersion, int> > sortedVersions;
Tag tag;
@ -298,47 +307,34 @@ struct ILogSystem {
UID randomID;
int tLogReplicationFactor;
IRepPolicyRef tLogPolicy;
std::vector< LocalityData > tLogLocalities;
Arena messageArena;
//FIXME: collectTags is needed to support upgrades from 5.X to 6.0. Remove this code when we no longer support that upgrade.
bool collectTags;
std::vector<Tag> tags;
MergedPeekCursor( std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> const& logServers, int bestServer, int readQuorum, Tag tag, Version begin, Version end, bool parallelGetMore );
MergedPeekCursor( vector< Reference<IPeekCursor> > const& serverCursors, LogMessageVersion const& messageVersion, int bestServer, int readQuorum, Optional<LogMessageVersion> nextVersion );
// if server_cursors[c]->hasMessage(), then nextSequence <= server_cursors[c]->sequence() and there are no messages known to that server with sequences in [nextSequence,server_cursors[c]->sequence())
MergedPeekCursor( vector< Reference<ILogSystem::IPeekCursor> > const& serverCursors, Version begin, bool collectTags );
MergedPeekCursor( std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> const& logServers, int bestServer, int readQuorum, Tag tag, Version begin, Version end, bool parallelGetMore, std::vector<LocalityData> const& tLogLocalities, IRepPolicyRef const tLogPolicy, int tLogReplicationFactor );
MergedPeekCursor( vector< Reference<IPeekCursor> > const& serverCursors, LogMessageVersion const& messageVersion, int bestServer, int readQuorum, Optional<LogMessageVersion> nextVersion, std::vector<LocalityData> const& tLogLocalities, IRepPolicyRef const tLogPolicy, int tLogReplicationFactor );
virtual Reference<IPeekCursor> cloneNoMore();
virtual void setProtocolVersion( uint64_t version );
virtual Arena& arena();
virtual ArenaReader* reader();
void calcHasMessage();
void updateMessage();
void updateMessage(bool usePolicy);
virtual bool hasMessage();
virtual void nextMessage();
virtual StringRef getMessage();
virtual StringRef getMessageWithTags();
virtual const std::vector<Tag>& getTags();
virtual void advanceTo(LogMessageVersion n);
virtual Future<Void> getMore(int taskID = TaskTLogPeekReply);
virtual Future<Void> onFailed();
virtual bool isActive();
virtual bool isExhausted();
virtual LogMessageVersion version();
virtual Version popped();
virtual void addref() {
@ -364,41 +360,25 @@ struct ILogSystem {
UID randomID;
SetPeekCursor( std::vector<Reference<LogSet>> const& logSets, int bestSet, int bestServer, Tag tag, Version begin, Version end, bool parallelGetMore );
SetPeekCursor( std::vector<Reference<LogSet>> const& logSets, std::vector< std::vector< Reference<IPeekCursor> > > const& serverCursors, LogMessageVersion const& messageVersion, int bestSet, int bestServer, Optional<LogMessageVersion> nextVersion, bool useBestSet );
virtual Reference<IPeekCursor> cloneNoMore();
virtual void setProtocolVersion( uint64_t version );
virtual Arena& arena();
virtual ArenaReader* reader();
void calcHasMessage();
void updateMessage(int logIdx, bool usePolicy);
virtual bool hasMessage();
virtual void nextMessage();
virtual StringRef getMessage();
virtual StringRef getMessageWithTags();
virtual const std::vector<Tag>& getTags();
virtual void advanceTo(LogMessageVersion n);
virtual Future<Void> getMore(int taskID = TaskTLogPeekReply);
virtual Future<Void> onFailed();
virtual bool isActive();
virtual bool isExhausted();
virtual LogMessageVersion version();
virtual Version popped();
virtual void addref() {
@ -418,35 +398,20 @@ struct ILogSystem {
MultiCursor( std::vector<Reference<IPeekCursor>> cursors, std::vector<LogMessageVersion> epochEnds );
virtual Reference<IPeekCursor> cloneNoMore();
virtual void setProtocolVersion( uint64_t version );
virtual Arena& arena();
virtual ArenaReader* reader();
virtual bool hasMessage();
virtual void nextMessage();
virtual StringRef getMessage();
virtual StringRef getMessageWithTags();
virtual const std::vector<Tag>& getTags();
virtual void advanceTo(LogMessageVersion n);
virtual Future<Void> getMore(int taskID = TaskTLogPeekReply);
virtual Future<Void> onFailed();
virtual bool isActive();
virtual bool isExhausted();
virtual LogMessageVersion version();
virtual Version popped();
virtual void addref() {
@ -483,16 +448,22 @@ struct ILogSystem {
// Returns when the preceding changes are durable. (Later we will need multiple return signals for different durability levels)
// If the current epoch has ended, push will not return, and the pushed messages will not be visible in any subsequent epoch (but may become visible in this epoch)
//Future<PeekResults> peek( int64_t begin_epoch, int64_t begin_seq, int tag );
virtual Reference<IPeekCursor> peek( Version begin, Tag tag, bool parallelGetMore = false ) = 0;
virtual Reference<IPeekCursor> peek( UID dbgid, Version begin, Tag tag, bool parallelGetMore = false ) = 0;
// Returns (via cursor interface) a stream of messages with the given tag and message versions >= (begin, 0), ordered by message version
// If pop was previously or concurrently called with upTo > begin, the cursor may not return all such messages. In that case cursor->popped() will
// be greater than begin to reflect that.
virtual Reference<IPeekCursor> peekSingle( Version begin, Tag tag, vector<pair<Version,Tag>> history = vector<pair<Version,Tag>>() ) = 0;
virtual Reference<IPeekCursor> peek( UID dbgid, Version begin, std::vector<Tag> tags, bool parallelGetMore = false ) = 0;
// Same contract as peek(), but for a set of tags
virtual Reference<IPeekCursor> peekSingle( UID dbgid, Version begin, Tag tag, vector<pair<Version,Tag>> history = vector<pair<Version,Tag>>() ) = 0;
// Same contract as peek(), but blocks until the preferred log server(s) for the given tag are available (and is correspondingly less expensive)
virtual void pop( Version upTo, Tag tag ) = 0;
virtual Reference<IPeekCursor> peekLogRouter( UID dbgid, Version begin, Tag tag ) = 0;
// Same contract as peek(), but can only peek from the logs elected in the same generation.
// If the preferred log server is down, a different log from the same generation will merge results locally before sending them to the log router.
virtual void pop( Version upTo, Tag tag, Version knownCommittedVersion = 0, int8_t popLocality = tagLocalityInvalid ) = 0;
// Permits, but does not require, the log subsystem to strip `tag` from any or all messages with message versions < (upTo,0)
// The popping of any given message may be arbitrarily delayed.
@ -519,7 +490,7 @@ struct ILogSystem {
// Call only on an ILogSystem obtained from recoverAndEndEpoch()
// Returns the first unreadable version number of the recovered epoch (i.e. message version numbers < (get_end(), 0) will be readable)
virtual Future<Reference<ILogSystem>> newEpoch( struct RecruitFromConfigurationReply const& recr, Future<struct RecruitRemoteFromConfigurationReply> const& fRemoteWorkers, DatabaseConfiguration const& config, LogEpoch recoveryCount, int8_t primaryLocality, int8_t remoteLocality ) = 0;
virtual Future<Reference<ILogSystem>> newEpoch( struct RecruitFromConfigurationReply const& recr, Future<struct RecruitRemoteFromConfigurationReply> const& fRemoteWorkers, DatabaseConfiguration const& config, LogEpoch recoveryCount, int8_t primaryLocality, int8_t remoteLocality, std::vector<Tag> const& allTags ) = 0;
// Call only on an ILogSystem obtained from recoverAndEndEpoch()
// Returns an ILogSystem representing a new epoch immediately following this one. The new epoch is only provisional until the caller updates the coordinated DBCoreState
@ -535,8 +506,6 @@ struct ILogSystem {
virtual bool hasRemoteLogs() = 0;
virtual void addRemoteTags( int logSet, std::vector<Tag> const& originalTags, std::vector<int>& tags ) = 0;
virtual Tag getRandomRouterTag() = 0;
virtual void stopRejoins() = 0;
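The peek/pop comments above define the contract the rest of this patch relies on: peek returns a cursor over a tag's messages from a given version, pop permits (but does not require) the log to discard that tag's messages below a version, and a later peek surfaces a popped() value above the requested begin when a pop hid part of the range. A toy in-memory model of just that contract, not of any FDB cursor implementation:

#include <algorithm>
#include <cstdint>
#include <map>
#include <string>
#include <utility>
#include <vector>

using Version = int64_t;
using Tag = int;

// Toy log: per tag, a sorted map of version -> message.
struct ToyLog {
    std::map<Tag, std::map<Version, std::string>> messages;
    std::map<Tag, Version> poppedUpTo;

    void push(Version v, Tag tag, std::string msg) { messages[tag][v] = std::move(msg); }

    // pop(): the log MAY discard messages with version < upTo for this tag
    // (this toy always does).
    void pop(Version upTo, Tag tag) {
        poppedUpTo[tag] = std::max(poppedUpTo[tag], upTo);
        auto& m = messages[tag];
        m.erase(m.begin(), m.lower_bound(upTo));
    }

    struct Cursor {
        std::vector<std::pair<Version, std::string>> results;
        Version popped; // exceeds the requested begin when a pop hid part of the range
    };

    // peek(): everything with version >= begin that has not been popped.
    Cursor peek(Version begin, Tag tag) {
        Cursor c;
        c.popped = poppedUpTo[tag];
        for (auto it = messages[tag].lower_bound(begin); it != messages[tag].end(); ++it)
            c.results.push_back(*it);
        return c;
    }
};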

View File

@ -24,7 +24,7 @@
#include "TLogInterface.h"
#include "fdbrpc/ReplicationPolicy.h"
#include "DatabaseConfiguration.h"
#include "fdbclient/DatabaseConfiguration.h"
template <class Interface>
struct OptionalInterface {
@ -63,18 +63,20 @@ struct TLogSet {
int32_t tLogWriteAntiQuorum, tLogReplicationFactor;
std::vector< LocalityData > tLogLocalities; // Stores the localities of the log servers
IRepPolicyRef tLogPolicy;
int8_t locality;
bool isLocal;
int32_t hasBestPolicy;
int8_t locality;
Version startVersion;
TLogSet() : tLogWriteAntiQuorum(0), tLogReplicationFactor(0), isLocal(true), hasBestPolicy(HasBestPolicyId), locality(-99) {}
TLogSet() : tLogWriteAntiQuorum(0), tLogReplicationFactor(0), isLocal(true), hasBestPolicy(HasBestPolicyId), locality(tagLocalityInvalid), startVersion(invalidVersion) {}
std::string toString() const {
return format("anti: %d replication: %d local: %d best: %d routers: %d tLogs: %s locality: %d", tLogWriteAntiQuorum, tLogReplicationFactor, isLocal, hasBestPolicy, logRouters.size(), describe(tLogs).c_str(), locality);
}
bool operator == ( const TLogSet& rhs ) const {
if (tLogWriteAntiQuorum != rhs.tLogWriteAntiQuorum || tLogReplicationFactor != rhs.tLogReplicationFactor || isLocal != rhs.isLocal || hasBestPolicy != rhs.hasBestPolicy || tLogs.size() != rhs.tLogs.size() || locality != rhs.locality) {
if (tLogWriteAntiQuorum != rhs.tLogWriteAntiQuorum || tLogReplicationFactor != rhs.tLogReplicationFactor || isLocal != rhs.isLocal || hasBestPolicy != rhs.hasBestPolicy ||
startVersion != rhs.startVersion || tLogs.size() != rhs.tLogs.size() || locality != rhs.locality || logRouters.size() != rhs.logRouters.size()) {
return false;
}
if ((tLogPolicy && !rhs.tLogPolicy) || (!tLogPolicy && rhs.tLogPolicy) || (tLogPolicy && (tLogPolicy->info() != rhs.tLogPolicy->info()))) {
@ -85,11 +87,16 @@ struct TLogSet {
return false;
}
}
for(int j = 0; j < logRouters.size(); j++ ) {
if (logRouters[j].id() != rhs.logRouters[j].id() || logRouters[j].present() != rhs.logRouters[j].present() || ( logRouters[j].present() && logRouters[j].interf().commit.getEndpoint().token != rhs.logRouters[j].interf().commit.getEndpoint().token ) ) {
return false;
}
}
return true;
}
bool isEqualIds(TLogSet const& r) const {
if (tLogWriteAntiQuorum != r.tLogWriteAntiQuorum || tLogReplicationFactor != r.tLogReplicationFactor || isLocal != r.isLocal || hasBestPolicy != r.hasBestPolicy || tLogs.size() != r.tLogs.size() || locality != r.locality) {
if (tLogWriteAntiQuorum != r.tLogWriteAntiQuorum || tLogReplicationFactor != r.tLogReplicationFactor || isLocal != r.isLocal || hasBestPolicy != r.hasBestPolicy || startVersion != r.startVersion || tLogs.size() != r.tLogs.size() || locality != r.locality) {
return false;
}
if ((tLogPolicy && !r.tLogPolicy) || (!tLogPolicy && r.tLogPolicy) || (tLogPolicy && (tLogPolicy->info() != r.tLogPolicy->info()))) {
@ -105,22 +112,23 @@ struct TLogSet {
template <class Ar>
void serialize( Ar& ar ) {
ar & tLogs & logRouters & tLogWriteAntiQuorum & tLogReplicationFactor & tLogPolicy & tLogLocalities & isLocal & hasBestPolicy & locality;
ar & tLogs & logRouters & tLogWriteAntiQuorum & tLogReplicationFactor & tLogPolicy & tLogLocalities & isLocal & hasBestPolicy & locality & startVersion;
}
};
struct OldTLogConf {
std::vector<TLogSet> tLogs;
Version epochEnd;
int32_t logRouterTags;
OldTLogConf() : epochEnd(0) {}
OldTLogConf() : epochEnd(0), logRouterTags(0) {}
std::string toString() const {
return format("end: %d %s", epochEnd, describe(tLogs).c_str());
return format("end: %d tags: %d %s", epochEnd, logRouterTags, describe(tLogs).c_str());
}
bool operator == ( const OldTLogConf& rhs ) const {
return tLogs == rhs.tLogs && epochEnd == rhs.epochEnd;
return tLogs == rhs.tLogs && epochEnd == rhs.epochEnd && logRouterTags == rhs.logRouterTags;
}
bool isEqualIds(OldTLogConf const& r) const {
@ -137,21 +145,23 @@ struct OldTLogConf {
template <class Ar>
void serialize( Ar& ar ) {
ar & tLogs & epochEnd;
ar & tLogs & epochEnd & logRouterTags;
}
};
struct LogSystemConfig {
int logSystemType;
int32_t logSystemType;
std::vector<TLogSet> tLogs;
int32_t logRouterTags;
std::vector<OldTLogConf> oldTLogs;
int expectedLogSets;
int minRouters;
int32_t expectedLogSets;
UID recruitmentID;
bool stopped;
LogSystemConfig() : logSystemType(0), minRouters(0), expectedLogSets(0) {}
LogSystemConfig() : logSystemType(0), logRouterTags(0), expectedLogSets(0), stopped(false) {}
std::string toString() const {
return format("type: %d oldGenerations: %d %s", logSystemType, oldTLogs.size(), describe(tLogs).c_str());
return format("type: %d oldGenerations: %d tags: %d %s", logSystemType, oldTLogs.size(), logRouterTags, describe(tLogs).c_str());
}
std::vector<TLogInterface> allPresentLogs() const {
@ -193,7 +203,7 @@ struct LogSystemConfig {
bool operator == ( const LogSystemConfig& rhs ) const { return isEqual(rhs); }
bool isEqual(LogSystemConfig const& r) const {
return logSystemType == r.logSystemType && tLogs == r.tLogs && oldTLogs == r.oldTLogs && minRouters == r.minRouters && expectedLogSets == r.expectedLogSets;
return logSystemType == r.logSystemType && tLogs == r.tLogs && oldTLogs == r.oldTLogs && expectedLogSets == r.expectedLogSets && logRouterTags == r.logRouterTags && recruitmentID == r.recruitmentID && stopped == r.stopped;
}
bool isEqualIds(LogSystemConfig const& r) const {
@ -224,7 +234,7 @@ struct LogSystemConfig {
template <class Ar>
void serialize( Ar& ar ) {
ar & logSystemType & tLogs & oldTLogs & minRouters & expectedLogSets;
ar & logSystemType & tLogs & logRouterTags & oldTLogs & expectedLogSets & recruitmentID & stopped;
}
};
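Several structs in this header gain fields in lockstep (startVersion on TLogSet, logRouterTags on OldTLogConf and LogSystemConfig, recruitmentID and stopped on LogSystemConfig), and each new field is threaded through operator==, isEqualIds where relevant, and serialize. A toy illustration of that pattern, with an invented archive-style serializer and illustrative names:

#include <cstdint>

// Toy mirror of the TLogSet pattern above: a newly added field has to appear in
// equality and in the serializer (appended last, as the diff does) so that
// config comparisons and wire round trips stay consistent.
struct ToyTLogSet {
    int32_t tLogReplicationFactor = 0;
    int64_t startVersion = -1; // field added by this patch

    bool operator==(const ToyTLogSet& r) const {
        return tLogReplicationFactor == r.tLogReplicationFactor && startVersion == r.startVersion;
    }

    template <class Ar>
    void serialize(Ar& ar) {
        ar & tLogReplicationFactor & startVersion; // new field serialized after the old ones
    }
};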

View File

@ -42,7 +42,7 @@ public:
LogSystemDiskQueueAdapter( Reference<ILogSystem> logSystem, Tag tag, bool recover=true ) : logSystem(logSystem), tag(tag), enableRecovery(recover), recoveryLoc(1), recoveryQueueLoc(1), poppedUpTo(0), nextCommit(1), recoveryQueueDataSize(0) {
if (enableRecovery)
cursor = logSystem->peek( 0, tag, true );
cursor = logSystem->peek( UID(), 0, tag, true );
}
struct CommitMessage {

View File

@ -256,8 +256,17 @@ LogMessageVersion ILogSystem::ServerPeekCursor::version() { return messageVersio
Version ILogSystem::ServerPeekCursor::popped() { return poppedVersion; }
ILogSystem::MergedPeekCursor::MergedPeekCursor( std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> const& logServers, int bestServer, int readQuorum, Tag tag, Version begin, Version end, bool parallelGetMore )
: bestServer(bestServer), readQuorum(readQuorum), tag(tag), currentCursor(0), hasNextMessage(false), messageVersion(begin), randomID(g_random->randomUniqueID()) {
ILogSystem::MergedPeekCursor::MergedPeekCursor( vector< Reference<ILogSystem::IPeekCursor> > const& serverCursors, Version begin, bool collectTags )
: serverCursors(serverCursors), bestServer(-1), readQuorum(serverCursors.size()), tag(invalidTag), currentCursor(0), hasNextMessage(false),
messageVersion(begin), randomID(g_random->randomUniqueID()), tLogReplicationFactor(0), collectTags(collectTags) {
sortedVersions.resize(serverCursors.size());
}
ILogSystem::MergedPeekCursor::MergedPeekCursor( std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> const& logServers, int bestServer, int readQuorum, Tag tag, Version begin, Version end,
bool parallelGetMore, std::vector< LocalityData > const& tLogLocalities, IRepPolicyRef const tLogPolicy, int tLogReplicationFactor )
: bestServer(bestServer), readQuorum(readQuorum), tag(tag), currentCursor(0), hasNextMessage(false), messageVersion(begin), randomID(g_random->randomUniqueID()), tLogLocalities(tLogLocalities),
tLogPolicy(tLogPolicy), tLogReplicationFactor(tLogReplicationFactor), collectTags(false) {
for( int i = 0; i < logServers.size(); i++ ) {
Reference<ILogSystem::ServerPeekCursor> cursor( new ILogSystem::ServerPeekCursor( logServers[i], tag, begin, end, bestServer >= 0, parallelGetMore ) );
//TraceEvent("MPC_starting", randomID).detail("cursor", cursor->randomID).detail("end", end);
@ -266,8 +275,11 @@ ILogSystem::MergedPeekCursor::MergedPeekCursor( std::vector<Reference<AsyncVar<O
sortedVersions.resize(serverCursors.size());
}
ILogSystem::MergedPeekCursor::MergedPeekCursor( vector< Reference<ILogSystem::IPeekCursor> > const& serverCursors, LogMessageVersion const& messageVersion, int bestServer, int readQuorum, Optional<LogMessageVersion> nextVersion )
: serverCursors(serverCursors), bestServer(bestServer), readQuorum(readQuorum), currentCursor(0), hasNextMessage(false), messageVersion(messageVersion), nextVersion(nextVersion), randomID(g_random->randomUniqueID()) {
ILogSystem::MergedPeekCursor::MergedPeekCursor( vector< Reference<ILogSystem::IPeekCursor> > const& serverCursors, LogMessageVersion const& messageVersion, int bestServer, int readQuorum, Optional<LogMessageVersion> nextVersion,
std::vector< LocalityData > const& tLogLocalities, IRepPolicyRef const tLogPolicy, int tLogReplicationFactor )
: serverCursors(serverCursors), bestServer(bestServer), readQuorum(readQuorum), currentCursor(0), hasNextMessage(false), messageVersion(messageVersion), nextVersion(nextVersion),
randomID(g_random->randomUniqueID()), tLogLocalities(tLogLocalities), tLogPolicy(tLogPolicy), tLogReplicationFactor(tLogReplicationFactor), collectTags(false) {
sortedVersions.resize(serverCursors.size());
calcHasMessage();
}
@ -276,7 +288,7 @@ Reference<ILogSystem::IPeekCursor> ILogSystem::MergedPeekCursor::cloneNoMore() {
for( auto it : serverCursors ) {
cursors.push_back(it->cloneNoMore());
}
return Reference<ILogSystem::MergedPeekCursor>( new ILogSystem::MergedPeekCursor( cursors, messageVersion, bestServer, readQuorum, nextVersion ) );
return Reference<ILogSystem::MergedPeekCursor>( new ILogSystem::MergedPeekCursor( cursors, messageVersion, bestServer, readQuorum, nextVersion, tLogLocalities, tLogPolicy, tLogReplicationFactor ) );
}
void ILogSystem::MergedPeekCursor::setProtocolVersion( uint64_t version ) {
@ -310,10 +322,14 @@ void ILogSystem::MergedPeekCursor::calcHasMessage() {
}
hasNextMessage = false;
updateMessage();
updateMessage(false);
if(!hasNextMessage && tLogPolicy) {
updateMessage(true);
}
}
void ILogSystem::MergedPeekCursor::updateMessage() {
void ILogSystem::MergedPeekCursor::updateMessage(bool usePolicy) {
loop {
bool advancedPast = false;
sortedVersions.clear();
@ -323,14 +339,30 @@ void ILogSystem::MergedPeekCursor::updateMessage() {
sortedVersions.push_back(std::pair<LogMessageVersion, int>(serverCursor->version(), i));
}
std::nth_element(sortedVersions.begin(), sortedVersions.end()-readQuorum, sortedVersions.end());
messageVersion = sortedVersions[sortedVersions.size()-readQuorum].first;
if(usePolicy) {
ASSERT(tLogPolicy);
localityGroup.clear();
std::sort(sortedVersions.begin(), sortedVersions.end());
for(auto sortedVersion : sortedVersions) {
auto& locality = tLogLocalities[sortedVersion.second];
localityGroup.add(locality);
if( localityGroup.size() >= tLogReplicationFactor && localityGroup.validate(tLogPolicy) ) {
messageVersion = sortedVersion.first;
break;
}
}
} else {
std::nth_element(sortedVersions.begin(), sortedVersions.end()-readQuorum, sortedVersions.end());
messageVersion = sortedVersions[sortedVersions.size()-readQuorum].first;
}
for(int i = 0; i < serverCursors.size(); i++) {
auto& c = serverCursors[i];
auto start = c->version();
c->advanceTo(messageVersion);
if( start < messageVersion && messageVersion < c->version() ) {
if( start <= messageVersion && messageVersion < c->version() ) {
advancedPast = true;
TEST(true); //Merge peek cursor advanced past desired sequence
}
@ -340,13 +372,19 @@ void ILogSystem::MergedPeekCursor::updateMessage() {
break;
}
tags.clear();
for(int i = 0; i < serverCursors.size(); i++) {
auto& c = serverCursors[i];
ASSERT_WE_THINK( !c->hasMessage() || c->version() >= messageVersion ); // Seems like the loop above makes this unconditionally true
if (c->version() == messageVersion && c->hasMessage()) {
hasNextMessage = true;
currentCursor = i;
break;
if(!collectTags) {
break;
}
auto& addTags = c->getTags();
ASSERT(addTags.size() == 1);
tags.push_back(addTags[0]);
}
}
}
@ -365,10 +403,24 @@ void ILogSystem::MergedPeekCursor::nextMessage() {
StringRef ILogSystem::MergedPeekCursor::getMessage() { return serverCursors[currentCursor]->getMessage(); }
StringRef ILogSystem::MergedPeekCursor::getMessageWithTags() { return serverCursors[currentCursor]->getMessageWithTags(); }
StringRef ILogSystem::MergedPeekCursor::getMessageWithTags() {
if(collectTags) {
StringRef msg = serverCursors[currentCursor]->getMessage();
BinaryWriter messageWriter(Unversioned());
messageWriter << uint32_t(msg.size() + sizeof(uint32_t) + sizeof(uint16_t) + tags.size()*sizeof(Tag)) << serverCursors[currentCursor]->version().sub << uint16_t(tags.size());
for(auto& t : tags) {
messageWriter << t;
}
messageWriter.serializeBytes(msg);
return StringRef(messageArena, messageWriter.toStringRef());
}
return serverCursors[currentCursor]->getMessageWithTags();
}
const std::vector<Tag>& ILogSystem::MergedPeekCursor::getTags() {
if(collectTags) {
return tags;
}
return serverCursors[currentCursor]->getTags();
}
@ -393,8 +445,10 @@ ACTOR Future<Void> mergedPeekGetMore(ILogSystem::MergedPeekCursor* self, LogMess
}
self->calcHasMessage();
//TraceEvent("MPC_getMoreB", self->randomID).detail("hasMessage", self->hasMessage()).detail("start", startVersion.toString()).detail("seq", self->version().toString());
if (self->hasMessage() || self->version() > startVersion)
if (self->hasMessage() || self->version() > startVersion) {
self->messageArena = Arena();
return Void();
}
}
}
@ -426,8 +480,7 @@ bool ILogSystem::MergedPeekCursor::isActive() {
}
bool ILogSystem::MergedPeekCursor::isExhausted() {
ASSERT(false);
return false;
return serverCursors[currentCursor]->isExhausted();
}
LogMessageVersion ILogSystem::MergedPeekCursor::version() { return messageVersion; }
@ -453,9 +506,26 @@ ILogSystem::SetPeekCursor::SetPeekCursor( std::vector<Reference<LogSet>> const&
sortedVersions.resize(maxServers);
}
ILogSystem::SetPeekCursor::SetPeekCursor( std::vector<Reference<LogSet>> const& logSets, std::vector< std::vector< Reference<IPeekCursor> > > const& serverCursors, LogMessageVersion const& messageVersion, int bestSet, int bestServer,
Optional<LogMessageVersion> nextVersion, bool useBestSet ) : logSets(logSets), serverCursors(serverCursors), messageVersion(messageVersion), bestSet(bestSet), bestServer(bestServer), nextVersion(nextVersion), currentSet(bestSet), currentCursor(0),
hasNextMessage(false), useBestSet(useBestSet), randomID(g_random->randomUniqueID()) {
int maxServers = 0;
for( int i = 0; i < logSets.size(); i++ ) {
maxServers = std::max<int>(maxServers, serverCursors[i].size());
}
sortedVersions.resize(maxServers);
calcHasMessage();
}
Reference<ILogSystem::IPeekCursor> ILogSystem::SetPeekCursor::cloneNoMore() {
ASSERT(false); //not implemented
throw internal_error();
vector< vector< Reference<ILogSystem::IPeekCursor> > > cursors;
cursors.resize(logSets.size());
for( int i = 0; i < logSets.size(); i++ ) {
for( int j = 0; j < logSets[i]->logServers.size(); j++) {
cursors[i].push_back( serverCursors[i][j]->cloneNoMore() );
}
}
return Reference<ILogSystem::SetPeekCursor>( new ILogSystem::SetPeekCursor( logSets, cursors, messageVersion, bestSet, bestServer, nextVersion, useBestSet ) );
}
void ILogSystem::SetPeekCursor::setProtocolVersion( uint64_t version ) {
@ -563,7 +633,7 @@ void ILogSystem::SetPeekCursor::updateMessage(int logIdx, bool usePolicy) {
for (auto& c : cursors) {
auto start = c->version();
c->advanceTo(messageVersion);
if( start < messageVersion && messageVersion < c->version() ) {
if( start <= messageVersion && messageVersion < c->version() ) {
advancedPast = true;
TEST(true); //Merge peek cursor advanced past desired sequence
}
@ -701,8 +771,7 @@ bool ILogSystem::SetPeekCursor::isActive() {
}
bool ILogSystem::SetPeekCursor::isExhausted() {
ASSERT(false);
return false;
return serverCursors[currentSet][currentCursor]->isExhausted();
}
LogMessageVersion ILogSystem::SetPeekCursor::version() { return messageVersion; }
@ -782,7 +851,7 @@ bool ILogSystem::MultiCursor::isActive() {
}
bool ILogSystem::MultiCursor::isExhausted() {
return cursors.back()->isActive();
return cursors.back()->isExhausted();
}
LogMessageVersion ILogSystem::MultiCursor::version() {
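MergedPeekCursor::updateMessage(usePolicy) above now has two modes: the quorum mode keeps the old nth_element selection, while the policy mode sorts the per-server versions ascending and accumulates each server's locality into a LocalityGroup, stopping at the first version where the accumulated group satisfies the replication policy. A standalone sketch of the policy-mode selection, with a "k distinct zones" predicate standing in for IRepPolicyRef and the replication-factor check folded into it (names are illustrative):

#include <algorithm>
#include <cstdint>
#include <set>
#include <string>
#include <utility>
#include <vector>

using Version = int64_t;

// Stand-in for tLogPolicy: "replicated across at least k distinct zones".
struct AcrossZones {
    size_t k;
    bool validate(const std::set<std::string>& zones) const { return zones.size() >= k; }
};

// Mirrors the usePolicy branch of updateMessage: walk the (version, serverIndex)
// pairs from lowest to highest version, accumulating localities, and stop at the
// first version where the accumulated set satisfies the policy.
Version selectVersionWithPolicy(std::vector<std::pair<Version, int>> sortedVersions,
                                const std::vector<std::string>& zoneOfServer,
                                const AcrossZones& policy,
                                Version fallback) {
    std::sort(sortedVersions.begin(), sortedVersions.end());
    std::set<std::string> group; // LocalityGroup stand-in
    for (const auto& entry : sortedVersions) {
        group.insert(zoneOfServer[entry.second]);
        if (policy.validate(group))
            return entry.first;
    }
    return fallback; // policy not satisfiable with these cursors; the real code leaves messageVersion unchanged
}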

View File

@ -199,6 +199,7 @@ struct ProxyCommitData {
EventMetricHandle<SingleKeyMutation> singleKeyMutationEvent;
std::map<UID, Reference<StorageInfo>> storageCache;
std::map<Tag, Version> tag_popped;
//The tags related to a storage server rarely change, so we keep a vector of tags for each key range to be slightly more CPU efficient.
//When a tag related to a storage server does change, we empty out all of these vectors to signify they must be repopulated.
@ -446,7 +447,7 @@ ACTOR Future<Void> commitBatch(
for (int resolver = 0; resolver < resolution.size(); resolver++)
committed = committed && resolution[resolver].stateMutations[versionIndex][transactionIndex].committed;
if (committed)
applyMetadataMutations( self->dbgid, arena, resolution[0].stateMutations[versionIndex][transactionIndex].mutations, self->txnStateStore, NULL, &forceRecovery, self->logSystem, 0, &self->vecBackupKeys, &self->keyInfo, self->firstProxy ? &self->uid_applyMutationsData : NULL, self->commit, self->cx, &self->committedVersion, &self->storageCache );
applyMetadataMutations( self->dbgid, arena, resolution[0].stateMutations[versionIndex][transactionIndex].mutations, self->txnStateStore, NULL, &forceRecovery, self->logSystem, 0, &self->vecBackupKeys, &self->keyInfo, self->firstProxy ? &self->uid_applyMutationsData : NULL, self->commit, self->cx, &self->committedVersion, &self->storageCache, &self->tag_popped);
if( resolution[0].stateMutations[versionIndex][transactionIndex].mutations.size() && firstStateMutations ) {
ASSERT(committed);
@ -508,7 +509,7 @@ ACTOR Future<Void> commitBatch(
{
if (committed[t] == ConflictBatch::TransactionCommitted && (!locked || trs[t].isLockAware())) {
commitCount++;
applyMetadataMutations(self->dbgid, arena, trs[t].transaction.mutations, self->txnStateStore, &toCommit, &forceRecovery, self->logSystem, commitVersion+1, &self->vecBackupKeys, &self->keyInfo, self->firstProxy ? &self->uid_applyMutationsData : NULL, self->commit, self->cx, &self->committedVersion, &self->storageCache);
applyMetadataMutations(self->dbgid, arena, trs[t].transaction.mutations, self->txnStateStore, &toCommit, &forceRecovery, self->logSystem, commitVersion+1, &self->vecBackupKeys, &self->keyInfo, self->firstProxy ? &self->uid_applyMutationsData : NULL, self->commit, self->cx, &self->committedVersion, &self->storageCache, &self->tag_popped);
}
if(firstStateMutations) {
ASSERT(committed[t] == ConflictBatch::TransactionCommitted);
@ -762,7 +763,7 @@ ACTOR Future<Void> commitBatch(
}
}
LogSystemDiskQueueAdapter::CommitMessage msg = wait(storeCommits.back().first); // Should just be doing yields
state LogSystemDiskQueueAdapter::CommitMessage msg = wait(storeCommits.back().first); // Should just be doing yields
if (debugID.present())
g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "MasterProxyServer.commitBatch.AfterStoreCommits");
@ -777,14 +778,11 @@ ACTOR Future<Void> commitBatch(
firstMessage = false;
}
self->logSystem->pop(msg.popTo, txsTag);
if ( prevVersion && commitVersion - prevVersion < SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT/2 )
debug_advanceMaxCommittedVersion( UID(), commitVersion ); //< Is this valid?
//TraceEvent("ProxyPush", self->dbgid).detail("PrevVersion", prevVersion).detail("Version", commitVersion)
// .detail("TransactionsSubmitted", trs.size()).detail("TransactionsCommitted", commitCount)
// .detail("txsBytes", msg.message.size()).detail("TxsPopTo", msg.popTo);
// .detail("TransactionsSubmitted", trs.size()).detail("TransactionsCommitted", commitCount).detail("TxsPopTo", msg.popTo);
if ( prevVersion && commitVersion - prevVersion < SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT/2 )
debug_advanceMaxCommittedVersion(UID(), commitVersion);
@ -801,6 +799,8 @@ ACTOR Future<Void> commitBatch(
Void _ = wait(loggingComplete);
Void _ = wait(yield());
self->logSystem->pop(msg.popTo, txsTag);
/////// Phase 5: Replies (CPU bound; no particular order required, though ordered execution would be best for latency)
if ( prevVersion && commitVersion - prevVersion < SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT/2 )
debug_advanceMinCommittedVersion(UID(), commitVersion);
@ -1109,6 +1109,12 @@ ACTOR static Future<Void> readRequestServer(
usedTags.push_back(t.id);
}
}
for( auto& kv : commitData->txnStateStore->readRange(serverTagHistoryKeys).get() ) {
Tag t = decodeServerTagValue( kv.value );
if(t.locality == locality) {
usedTags.push_back(t.id);
}
}
std::sort(usedTags.begin(), usedTags.end());
int usedIdx = 0;
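For illustration, the hunk above folds historical server tags into usedTags before a new tag id is chosen; below is a small self-contained sketch of the sort-then-scan step that follows, with pickTag and plain int ids as stand-ins for the FDB Tag type:

#include <algorithm>
#include <cassert>
#include <vector>

// Hypothetical helper: given every tag id already in use (current and historical),
// return the smallest free id, mirroring the sort + usedIdx scan in the code above.
int pickTag(std::vector<int> usedTags) {
    std::sort(usedTags.begin(), usedTags.end());
    int candidate = 0;
    for (int used : usedTags) {
        if (used > candidate)
            break;              // found a gap below this used id
        candidate = used + 1;   // id is taken (duplicates from history are harmless)
    }
    return candidate;
}

int main() {
    assert(pickTag({0, 1, 1, 3}) == 2); // history may repeat ids; 2 is the first gap
    assert(pickTag({}) == 0);
    return 0;
}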
@ -1172,7 +1178,7 @@ ACTOR Future<Void> masterProxyServerCore(
//TraceEvent("ProxyInit2", proxy.id()).detail("LSEpoch", db->get().logSystemConfig.epoch).detail("Need", epoch);
Void _ = wait(db->onChange());
}
state Future<Void> dbInfoChange = db->onChange();
//TraceEvent("ProxyInit3", proxy.id());
commitData.resolvers = db->get().resolvers;
@ -1184,7 +1190,7 @@ ACTOR Future<Void> masterProxyServerCore(
commitData.logSystem = ILogSystem::fromServerDBInfo(proxy.id(), db->get());
commitData.logAdapter = new LogSystemDiskQueueAdapter(commitData.logSystem, txsTag, false);
commitData.txnStateStore = keyValueStoreLogSystem(commitData.logAdapter, proxy.id(), 2e9, true);
commitData.txnStateStore = keyValueStoreLogSystem(commitData.logAdapter, proxy.id(), 2e9, true, true);
onError = onError || commitData.logSystem->onError();
addActor.send(transactionStarter(proxy, master, db, addActor, &commitData));
@ -1199,6 +1205,15 @@ ACTOR Future<Void> masterProxyServerCore(
SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_BYTES_SCALE_BASE * pow(db->get().client.proxies.size(), SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_BYTES_SCALE_POWER)));
commitBatcher = batcher(batchedCommits, proxy.commit.getFuture(), SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_INTERVAL_FROM_IDLE, &commitBatchInterval, SERVER_KNOBS->MAX_COMMIT_BATCH_INTERVAL, SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_COUNT_MAX, commitBatchByteLimit, CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT, commitData.commitBatchStartNotifications, TaskProxyCommitBatcher, &commitData.stats.txnCommitIn);
loop choose{
when( Void _ = wait( dbInfoChange ) ) {
dbInfoChange = db->onChange();
if(db->get().master.id() == master.id() && db->get().recoveryState >= RecoveryState::RECOVERY_TRANSACTION) {
commitData.logSystem = ILogSystem::fromServerDBInfo(proxy.id(), db->get());
for(auto it : commitData.tag_popped) {
commitData.logSystem->pop(it.second, it.first);
}
}
}
when(Void _ = wait(onError)) {}
when(vector<CommitTransactionRequest> trs = waitNext(batchedCommits.getFuture())) {
//TraceEvent("MasterProxyCTR", proxy.id()).detail("CommitTransactions", trs.size()).detail("TransactionRate", transactionRate).detail("TransactionQueue", transactionQueue.size()).detail("ReleasedTransactionCount", transactionCount);
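For illustration, the when(dbInfoChange) clause above re-issues every pop the proxy remembers once it attaches to a rebuilt log system. Here is a hedged standalone sketch of that remember-and-replay idea, where LogSystem, Tag, and Version are simplified stand-ins rather than the FDB interfaces:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <map>

using Version = int64_t;
using Tag = int;

// Stand-in for a transaction log interface that can discard data up to a version for one tag.
struct LogSystem {
    void pop(Version to, Tag tag) { std::cout << "pop tag " << tag << " to " << to << "\n"; }
};

struct Proxy {
    std::map<Tag, Version> tag_popped; // highest version popped for each tag
    LogSystem* logs = nullptr;

    void pop(Version to, Tag tag) {
        tag_popped[tag] = std::max(tag_popped[tag], to); // remember the pop locally
        if (logs) logs->pop(to, tag);
    }

    // After recovery hands the proxy a new log system, replay every remembered pop so the
    // fresh logs can discard data the proxy already considers popped.
    void attach(LogSystem* newLogs) {
        logs = newLogs;
        for (auto& it : tag_popped)
            logs->pop(it.second, it.first);
    }
};

int main() {
    LogSystem oldLogs, newLogs;
    Proxy proxy;
    proxy.attach(&oldLogs);
    proxy.pop(100, 1);
    proxy.pop(250, 2);
    proxy.attach(&newLogs); // replays: pop tag 1 to 100, pop tag 2 to 250
    return 0;
}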
@ -1294,7 +1309,7 @@ ACTOR Future<Void> masterProxyServerCore(
Arena arena;
bool confChanges;
applyMetadataMutations(commitData.dbgid, arena, mutations, commitData.txnStateStore, NULL, &confChanges, Reference<ILogSystem>(), 0, &commitData.vecBackupKeys, &commitData.keyInfo, commitData.firstProxy ? &commitData.uid_applyMutationsData : NULL, commitData.commit, commitData.cx, &commitData.committedVersion, &commitData.storageCache, true);
applyMetadataMutations(commitData.dbgid, arena, mutations, commitData.txnStateStore, NULL, &confChanges, Reference<ILogSystem>(), 0, &commitData.vecBackupKeys, &commitData.keyInfo, commitData.firstProxy ? &commitData.uid_applyMutationsData : NULL, commitData.commit, commitData.cx, &commitData.committedVersion, &commitData.storageCache, &commitData.tag_popped, true );
}
auto lockedKey = commitData.txnStateStore->readValue(databaseLockedKey).get();

View File

@ -659,14 +659,15 @@ ACTOR Future<std::pair<Version, Tag>> addStorageServer( Database cx, StorageServ
state Future<Optional<Value>> fExclIP = tr.get(
StringRef(encodeExcludedServersKey( AddressExclusion( server.address().ip ))) );
state Future<Standalone<RangeResultRef>> fTags = tr.getRange( serverTagKeys, CLIENT_KNOBS->TOO_MANY, true);
state Future<Standalone<RangeResultRef>> fHistoryTags = tr.getRange( serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY, true);
Void _ = wait( success(fTagLocalities) && success(fv) && success(fExclProc) && success(fExclIP) && success(fTags) );
Void _ = wait( success(fTagLocalities) && success(fv) && success(fExclProc) && success(fExclIP) && success(fTags) && success(fHistoryTags) );
// If we have been added to the excluded state servers list, we have to fail
if (fExclProc.get().present() || fExclIP.get().present())
throw recruitment_failed();
if(fTagLocalities.get().more || fTags.get().more)
if(fTagLocalities.get().more || fTags.get().more || fHistoryTags.get().more)
ASSERT(false);
int8_t maxTagLocality = 0;
@ -697,6 +698,12 @@ ACTOR Future<std::pair<Version, Tag>> addStorageServer( Database cx, StorageServ
usedTags.push_back(t.id);
}
}
for(auto& it : fHistoryTags.get()) {
Tag t = decodeServerTagValue( it.value );
if(t.locality == locality) {
usedTags.push_back(t.id);
}
}
std::sort(usedTags.begin(), usedTags.end());
int usedIdx = 0;
@ -717,7 +724,6 @@ ACTOR Future<std::pair<Version, Tag>> addStorageServer( Database cx, StorageServ
KeyRange conflictRange = singleKeyRange(serverTagConflictKeyFor(tag));
tr.addReadConflictRange( conflictRange );
tr.addWriteConflictRange( conflictRange );
tr.atomicOp( serverMaxTagKeyFor(locality), serverTagMaxValue(tag), MutationRef::Max );
Void _ = wait( tr.commit() );
return std::make_pair(tr.getCommittedVersion(), tag);
@ -769,10 +775,11 @@ ACTOR Future<Void> removeStorageServer( Database cx, UID serverID, MoveKeysLock
} else {
state Future<Optional<Value>> fListKey = tr.get( serverListKeyFor(serverID) );
state Future<Standalone<RangeResultRef>> fTags = tr.getRange( serverTagKeys, CLIENT_KNOBS->TOO_MANY);
state Future<Standalone<RangeResultRef>> fTags = tr.getRange( serverTagKeys, CLIENT_KNOBS->TOO_MANY );
state Future<Standalone<RangeResultRef>> fHistoryTags = tr.getRange( serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY );
state Future<Standalone<RangeResultRef>> fTagLocalities = tr.getRange( tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY );
Void _ = wait( success(fListKey) && success(fTags) && success(fTagLocalities) );
Void _ = wait( success(fListKey) && success(fTags) && success(fHistoryTags) && success(fTagLocalities) );
if (!fListKey.get().present()) {
if (retry) {
@ -793,6 +800,10 @@ ACTOR Future<Void> removeStorageServer( Database cx, UID serverID, MoveKeysLock
allLocalities.insert(t.locality);
}
}
for(auto& it : fHistoryTags.get()) {
Tag t = decodeServerTagValue( it.value );
allLocalities.insert(t.locality);
}
if(locality >= 0 && !allLocalities.count(locality) ) {
for(auto& it : fTagLocalities.get()) {
@ -874,9 +885,6 @@ void seedShardServers(
tr.set(arena, serverTagKeyFor(servers[s].id()), serverTagValue(server_tag[servers[s].id()]));
tr.set(arena, serverListKeyFor(servers[s].id()), serverListValue(servers[s]));
}
for(auto it : dcId_locality) {
tr.set(arena, serverMaxTagKeyFor(it.second.locality), serverTagMaxValue(Tag(it.second.locality, it.second.id-1)));
}
std::vector<UID> serverIds;
for(int i=0;i<servers.size();i++)

View File

@ -475,10 +475,8 @@ namespace oldTLog {
TLogLockResult result;
result.end = stopVersion;
result.knownCommittedVersion = logData->knownCommittedVersion;
for( auto & tag : logData->tag_data )
result.tags.push_back( convertOldTag(tag.key) );
TraceEvent("TLogStop2", self->dbgid).detail("logId", logData->logId).detail("Ver", stopVersion).detail("isStopped", logData->stopped).detail("queueCommitted", logData->queueCommittedVersion.get()).detail("tags", describe(result.tags));
TraceEvent("TLogStop2", self->dbgid).detail("logId", logData->logId).detail("Ver", stopVersion).detail("isStopped", logData->stopped).detail("queueCommitted", logData->queueCommittedVersion.get());
reply.send( result );
@ -858,9 +856,9 @@ namespace oldTLog {
int32_t messageLength;
uint32_t subVersion;
rd >> messageLength >> subVersion;
messageLength += sizeof(uint16_t);
messages << messageLength << subVersion << uint16_t(0);
messageLength -= (sizeof(subVersion) + sizeof(uint16_t));
messageLength += sizeof(uint16_t) + sizeof(Tag);
messages << messageLength << subVersion << uint16_t(1) << req.tag;
messageLength -= (sizeof(subVersion) + sizeof(uint16_t) + sizeof(Tag));
messages.serializeBytes(rd.readBytes(messageLength), messageLength);
}
}
@ -934,9 +932,9 @@ namespace oldTLog {
int32_t messageLength;
uint32_t subVersion;
rd >> messageLength >> subVersion;
messageLength += sizeof(uint16_t);
messages << messageLength << subVersion << uint16_t(0);
messageLength -= (sizeof(subVersion) + sizeof(uint16_t));
messageLength += sizeof(uint16_t) + sizeof(Tag);
messages << messageLength << subVersion << uint16_t(1) << req.tag;
messageLength -= (sizeof(subVersion) + sizeof(uint16_t) + sizeof(Tag));
messages.serializeBytes(rd.readBytes(messageLength), messageLength);
}
}
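For illustration, both hunks above convert a legacy peek message (length, sub-version, payload) into the new framing that also carries a tag count and the tag itself, adjusting the length field to match. Below is a simplified standalone sketch of that re-framing, assuming raw little-endian field copies rather than flow's BinaryReader/BinaryWriter; the Tag struct here is a stand-in:

#include <cstddef>
#include <cstdint>
#include <vector>

struct Tag { int8_t locality; uint16_t id; }; // simplified stand-in; padding is ignored for the sketch

// Re-frame one legacy message: the old length covered subVersion + payload, the new length
// also covers a uint16_t tag count and one Tag, matching the arithmetic in the hunks above.
std::vector<uint8_t> reframe(uint32_t subVersion, const std::vector<uint8_t>& payload, Tag tag) {
    std::vector<uint8_t> out;
    auto put = [&out](const void* p, std::size_t n) {
        const uint8_t* b = static_cast<const uint8_t*>(p);
        out.insert(out.end(), b, b + n);
    };
    int32_t messageLength =
        int32_t(sizeof(subVersion) + sizeof(uint16_t) + sizeof(Tag) + payload.size());
    uint16_t tagCount = 1;
    put(&messageLength, sizeof(messageLength)); // adjusted length field
    put(&subVersion, sizeof(subVersion));
    put(&tagCount, sizeof(tagCount));           // new: number of tags that follow
    put(&tag, sizeof(tag));                     // new: the single tag for this message
    if (!payload.empty())
        put(payload.data(), payload.size());    // payload is copied through unchanged
    return out;
}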

View File

@ -283,8 +283,27 @@ ACTOR Future<bool> getStorageServersRecruiting( Database cx, Reference<AsyncVar<
}
}
ACTOR Future<Void> reconfigureAfter(Database cx, double time) {
Void _ = wait( delay(time) );
if(g_network->isSimulated()) {
TraceEvent(SevWarnAlways, "DisablingFearlessConfiguration");
g_simulator.hasRemoteReplication = false;
ConfigurationResult::Type _ = wait( changeConfig( cx, "remote_none" ) );
if (g_network->isSimulated() && g_simulator.extraDB) {
Reference<ClusterConnectionFile> extraFile(new ClusterConnectionFile(*g_simulator.extraDB));
Reference<Cluster> cluster = Cluster::createCluster(extraFile, -1);
Database extraDB = cluster->createDatabase(LiteralStringRef("DB")).get();
ConfigurationResult::Type _ = wait(changeConfig(extraDB, "remote_none"));
}
}
return Void();
}
ACTOR Future<Void> waitForQuietDatabase( Database cx, Reference<AsyncVar<ServerDBInfo>> dbInfo, std::string phase, int64_t dataInFlightGate = 2e6,
int64_t maxTLogQueueGate = 5e6, int64_t maxStorageServerQueueGate = 5e6, int64_t maxDataDistributionQueueSize = 0 ) {
state Future<Void> reconfig = reconfigureAfter(cx, 100 + (g_random->random01()*100));
TraceEvent(("QuietDatabase" + phase + "Begin").c_str());

View File

@ -24,7 +24,7 @@
#include "MasterInterface.h"
#include "TLogInterface.h"
#include "DatabaseConfiguration.h"
#include "fdbclient/DatabaseConfiguration.h"
Future<Void> rateKeeper(
Reference<AsyncVar<struct ServerDBInfo>> const& dbInfo,

View File

@ -704,7 +704,7 @@ StringRef StringRefOf(const char* s) {
void SimulationConfig::generateNormalConfig(int minimumReplication) {
set_config("new");
bool generateFearless = false; //FIXME g_random->random01() < 0.5;
bool generateFearless = g_random->random01() < 0.5;
datacenters = generateFearless ? 4 : g_random->randomInt( 1, 4 );
if (g_random->random01() < 0.25) db.desiredTLogCount = g_random->randomInt(1,7);
if (g_random->random01() < 0.25) db.masterProxyCount = g_random->randomInt(1,7);
@ -761,11 +761,11 @@ void SimulationConfig::generateNormalConfig(int minimumReplication) {
if(generateFearless || (datacenters == 2 && g_random->random01() < 0.5)) {
StatusObject primaryObj;
primaryObj["id"] = "0";
primaryObj["priority"] = 1;
primaryObj["priority"] = 0;
StatusObject remoteObj;
remoteObj["id"] = "1";
remoteObj["priority"] = 0;
remoteObj["priority"] = 1;
bool needsRemote = generateFearless;
if(generateFearless) {
@ -854,7 +854,6 @@ void SimulationConfig::generateNormalConfig(int minimumReplication) {
}
if (g_random->random01() < 0.25) db.remoteDesiredTLogCount = g_random->randomInt(1,7);
if (g_random->random01() < 0.25) db.desiredLogRouterCount = g_random->randomInt(1,7);
}
StatusArray regionArr;
@ -866,7 +865,10 @@ void SimulationConfig::generateNormalConfig(int minimumReplication) {
set_config("regions=" + json_spirit::write_string(json_spirit::mValue(regionArr), json_spirit::Output_options::none));
}
if(generateFearless) {
if(generateFearless && minimumReplication > 1) {
//low latency tests in fearless configurations need 4 machines per datacenter (3 for triple replication, 1 that is down during failures).
machine_count = 16;
} else if(generateFearless) {
machine_count = 12;
} else if(db.tLogPolicy && db.tLogPolicy->info() == "data_hall^2 x zoneid^2 x 1") {
machine_count = 9;
@ -1020,7 +1022,7 @@ void setupSimulatedSystem( vector<Future<Void>> *systemActors, std::string baseF
if(assignClasses) {
if(assignedMachines < 4)
processClass = ProcessClass((ProcessClass::ClassType) g_random->randomInt(0, 2), ProcessClass::CommandLineSource); //Unset or Storage
else if(assignedMachines == 4 && !g_simulator.hasRemoteReplication && !g_simulator.hasSatelliteReplication)
else if(assignedMachines == 4 && !simconfig.db.regions.size())
processClass = ProcessClass((ProcessClass::ClassType) (g_random->randomInt(0, 2) * ProcessClass::ResolutionClass), ProcessClass::CommandLineSource); //Unset or Resolution
else
processClass = ProcessClass((ProcessClass::ClassType) g_random->randomInt(0, 3), ProcessClass::CommandLineSource); //Unset, Storage, or Transaction

View File

@ -82,11 +82,10 @@ struct TLogRecoveryFinishedRequest {
struct TLogLockResult {
Version end;
Version knownCommittedVersion;
std::vector<Tag> tags;
template <class Ar>
void serialize( Ar& ar ) {
ar & end & knownCommittedVersion & tags;
ar & end & knownCommittedVersion;
}
};
@ -167,15 +166,16 @@ struct TLogPeekRequest {
struct TLogPopRequest {
Arena arena;
Version to;
Version knownCommittedVersion;
Tag tag;
ReplyPromise<Void> reply;
TLogPopRequest( Version to, Tag tag ) : to(to), tag(tag) {}
TLogPopRequest( Version to, Version knownCommittedVersion, Tag tag ) : to(to), knownCommittedVersion(knownCommittedVersion), tag(tag) {}
TLogPopRequest() {}
template <class Ar>
void serialize(Ar& ar) {
ar & arena & to & tag & reply;
ar & arena & to & knownCommittedVersion & tag & reply;
}
};

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -74,27 +74,37 @@ struct InitializeTLogRequest {
Version knownCommittedVersion;
LogEpoch epoch;
std::vector<Tag> recoverTags;
std::vector<Tag> allTags;
KeyValueStoreType storeType;
Optional<Tag> remoteTag;
Tag remoteTag;
int8_t locality;
bool isPrimary;
Version startVersion;
int logRouterTags;
ReplyPromise< struct TLogInterface > reply;
InitializeTLogRequest() {}
template <class Ar>
void serialize( Ar& ar ) {
ar & recruitmentID & recoverFrom & recoverAt & knownCommittedVersion & epoch & recoverTags & storeType & remoteTag & reply;
ar & recruitmentID & recoverFrom & recoverAt & knownCommittedVersion & epoch & recoverTags & allTags & storeType & remoteTag & locality & isPrimary & startVersion & logRouterTags & reply;
}
};
struct InitializeLogRouterRequest {
uint64_t recoveryCount;
int logSet;
Tag routerTag;
Version startVersion;
std::vector<LocalityData> tLogLocalities;
IRepPolicyRef tLogPolicy;
int32_t hasBestPolicy;
int8_t locality;
ReplyPromise<struct TLogInterface> reply;
template <class Ar>
void serialize(Ar& ar) {
ar & recoveryCount & routerTag & logSet & reply;
ar & recoveryCount & routerTag & startVersion & tLogLocalities & tLogPolicy & hasBestPolicy & locality & reply;
}
};
@ -136,12 +146,22 @@ struct InitializeResolverRequest {
}
};
struct InitializeStorageReply {
StorageServerInterface interf;
Version addedVersion;
template <class Ar>
void serialize(Ar& ar) {
ar & interf & addedVersion;
}
};
struct InitializeStorageRequest {
Tag seedTag; //< If this server will be passed to seedShardServers, this will be a tag, otherwise it is invalidTag
UID reqId;
UID interfaceId;
KeyValueStoreType storeType;
ReplyPromise< struct StorageServerInterface > reply;
ReplyPromise< InitializeStorageReply > reply;
template <class Ar>
void serialize( Ar& ar ) {
@ -270,15 +290,15 @@ class Database openDBOnServer( Reference<AsyncVar<ServerDBInfo>> const& db, int
Future<Void> extractClusterInterface( Reference<AsyncVar<Optional<struct ClusterControllerFullInterface>>> const& a, Reference<AsyncVar<Optional<struct ClusterInterface>>> const& b );
Future<Void> fdbd( Reference<ClusterConnectionFile> const&, LocalityData const& localities, ProcessClass const& processClass, std::string const& dataFolder, std::string const& coordFolder, int64_t const& memoryLimit, std::string const& metricsConnFile, std::string const& metricsPrefix );
Future<Void> workerServer( Reference<ClusterConnectionFile> const&, Reference<AsyncVar<Optional<struct ClusterControllerFullInterface>>> const& ccInterface, LocalityData const& localities, Reference<AsyncVar<ClusterControllerPriorityInfo>> const& asyncPriorityInfo, ProcessClass const& initialClass, std::string const& filename, int64_t const& memoryLimit, Future<Void> const& forceFailure, std::string const& metricsConnFile, std::string const& metricsPrefix );
Future<Void> clusterController( Reference<ClusterConnectionFile> const&, Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> const& currentCC, Reference<AsyncVar<ClusterControllerPriorityInfo>> const& asyncPriorityInfo );
Future<Void> workerServer( Reference<ClusterConnectionFile> const&, Reference<AsyncVar<Optional<struct ClusterControllerFullInterface>>> const& ccInterface, LocalityData const& localities, Reference<AsyncVar<ClusterControllerPriorityInfo>> const& asyncPriorityInfo, ProcessClass const& initialClass, std::string const& filename, int64_t const& memoryLimit, Future<Void> const& forceFailure, std::string const& metricsConnFile, std::string const& metricsPrefix, Promise<Void> const& recoveredDiskFiles );
Future<Void> clusterController( Reference<ClusterConnectionFile> const&, Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> const& currentCC, Reference<AsyncVar<ClusterControllerPriorityInfo>> const& asyncPriorityInfo, Future<Void> const& recoveredDiskFiles );
// These servers are started by workerServer
Future<Void> storageServer(
class IKeyValueStore* const& persistentData,
StorageServerInterface const& ssi,
Tag const& seedTag,
ReplyPromise<StorageServerInterface> const& recruitReply,
ReplyPromise<InitializeStorageReply> const& recruitReply,
Reference<AsyncVar<ServerDBInfo>> const& db,
std::string const& folder );
Future<Void> storageServer(

View File

@ -36,7 +36,6 @@
<ActorCompiler Include="Coordination.actor.cpp" />
<ActorCompiler Include="CoordinatedState.actor.cpp" />
<ActorCompiler Include="CoroFlow.actor.cpp" />
<ClCompile Include="DatabaseConfiguration.cpp" />
<ActorCompiler Include="MasterProxyServer.actor.cpp" />
<ActorCompiler Include="KeyValueStoreSQLite.actor.cpp" />
<ActorCompiler Include="LeaderElection.actor.cpp" />
@ -150,7 +149,6 @@
<ClInclude Include="CoordinatedState.h" />
<ClInclude Include="CoordinationInterface.h" />
<ClInclude Include="CoroFlow.h" />
<ClInclude Include="DatabaseConfiguration.h" />
<ClInclude Include="DataDistribution.h" />
<ClInclude Include="DBCoreState.h" />
<ClInclude Include="IDiskQueue.h" />

View File

@ -314,7 +314,6 @@
<ClInclude Include="ClusterRecruitmentInterface.h" />
<ClInclude Include="MasterInterface.h" />
<ClInclude Include="TLogInterface.h" />
<ClInclude Include="DatabaseConfiguration.h" />
<ClInclude Include="sqlite\sqlite3.h">
<Filter>sqlite</Filter>
</ClInclude>

View File

@ -185,6 +185,7 @@ struct MasterData : NonCopyable, ReferenceCounted<MasterData> {
IKeyValueStore* txnStateStore;
int64_t memoryLimit;
std::map<Optional<Value>,int8_t> dcId_locality;
std::vector<Tag> allTags;
int8_t getNextLocality() {
int8_t maxLocality = -1;
@ -207,7 +208,7 @@ struct MasterData : NonCopyable, ReferenceCounted<MasterData> {
ClusterControllerFullInterface clusterController; // If the cluster controller changes, this master will die, so this is immutable.
ReusableCoordinatedState cstate;
AsyncVar<bool> cstateUpdated;
Promise<Void> cstateUpdated;
Reference<AsyncVar<ServerDBInfo>> dbInfo;
int64_t registrationCount; // Number of different MasterRegistrationRequests sent to clusterController
@ -244,7 +245,6 @@ struct MasterData : NonCopyable, ReferenceCounted<MasterData> {
lastVersionTime(0),
txnStateStore(0),
memoryLimit(2e9),
cstateUpdated(false),
addActor(addActor),
hasConfiguration(false)
{
@ -292,7 +292,7 @@ ACTOR Future<Void> newTLogServers( Reference<MasterData> self, RecruitFromConfig
if(self->configuration.remoteTLogReplicationFactor > 0) {
state Optional<Key> remoteDcId = self->remoteDcIds.size() ? self->remoteDcIds[0] : Optional<Key>();
if( !self->dcId_locality.count(recr.dcId) ) {
TraceEvent(SevWarnAlways, "UnknownPrimaryDCID", self->dbgid).detail("found", self->dcId_locality.count(recr.dcId)).detail("primaryId", printable(recr.dcId));
TraceEvent(SevWarn, "UnknownPrimaryDCID", self->dbgid).detail("primaryId", printable(recr.dcId));
int8_t loc = self->getNextLocality();
Standalone<CommitTransactionRef> tr;
tr.set(tr.arena(), tagLocalityListKeyFor(recr.dcId), tagLocalityListValue(loc));
@ -301,7 +301,7 @@ ACTOR Future<Void> newTLogServers( Reference<MasterData> self, RecruitFromConfig
}
if( !self->dcId_locality.count(remoteDcId) ) {
TraceEvent(SevWarnAlways, "UnknownRemoteDCID", self->dbgid).detail("remoteFound", self->dcId_locality.count(remoteDcId)).detail("remoteId", printable(remoteDcId));
TraceEvent(SevWarn, "UnknownRemoteDCID", self->dbgid).detail("remoteId", printable(remoteDcId));
int8_t loc = self->getNextLocality();
Standalone<CommitTransactionRef> tr;
tr.set(tr.arena(), tagLocalityListKeyFor(remoteDcId), tagLocalityListValue(loc));
@ -309,12 +309,12 @@ ACTOR Future<Void> newTLogServers( Reference<MasterData> self, RecruitFromConfig
self->dcId_locality[remoteDcId] = loc;
}
Future<RecruitRemoteFromConfigurationReply> fRemoteWorkers = brokenPromiseToNever( self->clusterController.recruitRemoteFromConfiguration.getReply( RecruitRemoteFromConfigurationRequest( self->configuration, remoteDcId, recr.logRouterCount ) ) );
Future<RecruitRemoteFromConfigurationReply> fRemoteWorkers = brokenPromiseToNever( self->clusterController.recruitRemoteFromConfiguration.getReply( RecruitRemoteFromConfigurationRequest( self->configuration, remoteDcId, recr.tLogs.size() ) ) );
Reference<ILogSystem> newLogSystem = wait( oldLogSystem->newEpoch( recr, fRemoteWorkers, self->configuration, self->cstate.myDBState.recoveryCount + 1, self->dcId_locality[recr.dcId], self->dcId_locality[remoteDcId] ) );
Reference<ILogSystem> newLogSystem = wait( oldLogSystem->newEpoch( recr, fRemoteWorkers, self->configuration, self->cstate.myDBState.recoveryCount + 1, self->dcId_locality[recr.dcId], self->dcId_locality[remoteDcId], self->allTags ) );
self->logSystem = newLogSystem;
} else {
Reference<ILogSystem> newLogSystem = wait( oldLogSystem->newEpoch( recr, Never(), self->configuration, self->cstate.myDBState.recoveryCount + 1, tagLocalitySpecial, tagLocalitySpecial ) );
Reference<ILogSystem> newLogSystem = wait( oldLogSystem->newEpoch( recr, Never(), self->configuration, self->cstate.myDBState.recoveryCount + 1, tagLocalitySpecial, tagLocalitySpecial, self->allTags ) );
self->logSystem = newLogSystem;
}
return Void();
@ -338,7 +338,7 @@ ACTOR Future<Void> newSeedServers( Reference<MasterData> self, RecruitFromConfig
isr.reqId = g_random->randomUniqueID();
isr.interfaceId = g_random->randomUniqueID();
ErrorOr<StorageServerInterface> newServer = wait( recruits.storageServers[idx].storage.tryGetReply( isr ) );
ErrorOr<InitializeStorageReply> newServer = wait( recruits.storageServers[idx].storage.tryGetReply( isr ) );
if( newServer.isError() ) {
if( !newServer.isError( error_code_recruitment_failed ) && !newServer.isError( error_code_request_maybe_delivered ) )
@ -357,7 +357,7 @@ ACTOR Future<Void> newSeedServers( Reference<MasterData> self, RecruitFromConfig
tag.id++;
idx++;
servers->push_back( newServer.get() );
servers->push_back( newServer.get().interf );
}
}
@ -458,7 +458,7 @@ ACTOR Future<Void> updateRegistration( Reference<MasterData> self, Reference<ILo
TraceEvent("MasterUpdateRegistration", self->dbgid).detail("RecoveryCount", self->cstate.myDBState.recoveryCount).detail("logs", describe(logSystem->getLogSystemConfig().tLogs));
if (!self->cstateUpdated.get()) {
if (!self->cstateUpdated.isSet()) {
Void _ = wait(sendMasterRegistration(self.getPtr(), logSystem->getLogSystemConfig(), self->provisionalProxies, self->resolvers, self->cstate.myDBState.recoveryCount, self->cstate.prevDBState.getPriorCommittedLogServers() ));
} else {
updateLogsKey = updateLogsValue(self, cx);
@ -545,10 +545,16 @@ ACTOR Future<Void> recruitEverything( Reference<MasterData> self, vector<Storage
.detail("DesiredResolvers", self->configuration.getDesiredResolvers())
.detail("storeType", self->configuration.storageServerStoreType)
.trackLatest("MasterRecoveryState");
//FIXME: we only need log routers for the same locality as the master
int maxLogRouters = self->cstate.prevDBState.logRouterTags;
for(auto& old : self->cstate.prevDBState.oldTLogData) {
maxLogRouters = std::max(maxLogRouters, old.logRouterTags);
}
state RecruitFromConfigurationReply recruits = wait(
brokenPromiseToNever( self->clusterController.recruitFromConfiguration.getReply(
RecruitFromConfigurationRequest( self->configuration, self->lastEpochEnd==0 ) ) ) );
RecruitFromConfigurationRequest( self->configuration, self->lastEpochEnd==0, maxLogRouters ) ) ) );
self->primaryDcId.clear();
self->remoteDcIds.clear();
@ -585,7 +591,7 @@ ACTOR Future<Void> readTransactionSystemState( Reference<MasterData> self, Refer
// Recover transaction state store
if(self->txnStateStore) self->txnStateStore->close();
self->txnStateLogAdapter = openDiskQueueAdapter( oldLogSystem, txsTag );
self->txnStateStore = keyValueStoreLogSystem( self->txnStateLogAdapter, self->dbgid, self->memoryLimit, false );
self->txnStateStore = keyValueStoreLogSystem( self->txnStateLogAdapter, self->dbgid, self->memoryLimit, false, false );
// Versionstamped operations (particularly those applied from DR) define a minimum commit version
// that we may recover to, as they embed the version in user-readable data and require that no
@ -622,6 +628,20 @@ ACTOR Future<Void> readTransactionSystemState( Reference<MasterData> self, Refer
self->dcId_locality[decodeTagLocalityListKey(kv.key)] = decodeTagLocalityListValue(kv.value);
}
Standalone<VectorRef<KeyValueRef>> rawTags = wait( self->txnStateStore->readRange( serverTagKeys ) );
self->allTags.clear();
self->allTags.push_back(txsTag);
for(auto& kv : rawTags) {
self->allTags.push_back(decodeServerTagValue( kv.value ));
}
Standalone<VectorRef<KeyValueRef>> rawHistoryTags = wait( self->txnStateStore->readRange( serverTagHistoryKeys ) );
for(auto& kv : rawHistoryTags) {
self->allTags.push_back(decodeServerTagValue( kv.value ));
}
uniquify(self->allTags);
//auto kvs = self->txnStateStore->readRange( systemKeys );
//for( auto & kv : kvs.get() )
// TraceEvent("MasterRecoveredTXS", self->dbgid).detail("K", printable(kv.key)).detail("V", printable(kv.value));
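For illustration, the lines above have the recovering master collect every tag it must recover (the txs tag, the current server tags, and the historical server tags) and then deduplicate them before recruiting new TLogs. Assuming uniquify is a sort-plus-unique helper, a plain-C++ equivalent with int ids standing in for Tag would be:

#include <algorithm>
#include <vector>

// Hypothetical equivalent of building allTags: merge current and historical tag ids,
// then sort and drop duplicates so each tag is handed to the new logs exactly once.
std::vector<int> buildAllTags(const std::vector<int>& currentTags,
                              const std::vector<int>& historyTags, int txsTag) {
    std::vector<int> all;
    all.push_back(txsTag);
    all.insert(all.end(), currentTags.begin(), currentTags.end());
    all.insert(all.end(), historyTags.begin(), historyTags.end());
    std::sort(all.begin(), all.end());
    all.erase(std::unique(all.begin(), all.end()), all.end()); // the "uniquify" step
    return all;
}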
@ -967,6 +987,7 @@ static std::set<int> const& normalMasterErrors() {
s.insert( error_code_master_max_versions_in_flight );
s.insert( error_code_worker_removed );
s.insert( error_code_new_coordinators_timed_out );
s.insert( error_code_broken_promise );
}
return s;
}
@ -1006,7 +1027,7 @@ ACTOR Future<Void> rejoinRequestHandler( Reference<MasterData> self ) {
}
}
ACTOR Future<Void> trackTlogRecovery( Reference<MasterData> self, Reference<AsyncVar<Reference<ILogSystem>>> oldLogSystems ) {
ACTOR Future<Void> trackTlogRecovery( Reference<MasterData> self, Reference<AsyncVar<Reference<ILogSystem>>> oldLogSystems, Promise<Void> remoteRecovered ) {
state Future<Void> rejoinRequests = Never();
state DBRecoveryCount recoverCount = self->cstate.myDBState.recoveryCount + 1;
loop {
@ -1016,24 +1037,29 @@ ACTOR Future<Void> trackTlogRecovery( Reference<MasterData> self, Reference<Asyn
state Future<Void> changed = self->logSystem->onCoreStateChanged();
ASSERT( newState.tLogs[0].tLogWriteAntiQuorum == self->configuration.tLogWriteAntiQuorum && newState.tLogs[0].tLogReplicationFactor == self->configuration.tLogReplicationFactor );
state bool finalUpdate = !newState.oldTLogData.size() && newState.tLogs.size() == self->configuration.expectedLogSets(self->primaryDcId.size() ? self->primaryDcId[0] : Optional<Key>());
state bool allLogs = newState.tLogs.size() == self->configuration.expectedLogSets(self->primaryDcId.size() ? self->primaryDcId[0] : Optional<Key>());
state bool finalUpdate = !newState.oldTLogData.size() && allLogs;
Void _ = wait( self->cstate.write(newState, finalUpdate) );
self->logSystem->coreStateWritten(newState);
if(self->cstateUpdated.canBeSet()) {
self->cstateUpdated.send(Void());
}
if( finalUpdate ) {
self->recoveryState = RecoveryState::REMOTE_RECOVERED;
TraceEvent("MasterRecoveryState", self->dbgid)
.detail("StatusCode", RecoveryStatus::remote_recovered)
.detail("Status", RecoveryStatus::names[RecoveryStatus::remote_recovered])
.trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
self->logSystem->coreStateWritten(newState);
}
self->cstateUpdated.set(true);
self->registrationTrigger.trigger();
if(allLogs && remoteRecovered.canBeSet()) {
remoteRecovered.send(Void());
}
if( finalUpdate ) {
TraceEvent("MasterFullyRecovered", self->dbgid);
oldLogSystems->get()->stopRejoins();
rejoinRequests = rejoinRequestHandler(self);
return Void();
@ -1051,9 +1077,7 @@ ACTOR Future<Void> configurationMonitor( Reference<MasterData> self ) {
loop {
try {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
state Future<Standalone<RangeResultRef>> fresults = tr.getRange( configKeys, CLIENT_KNOBS->TOO_MANY );
Void _ = wait( success(fresults) );
Standalone<RangeResultRef> results = fresults.get();
Standalone<RangeResultRef> results = wait( tr.getRange( configKeys, CLIENT_KNOBS->TOO_MANY ) );
ASSERT( !results.more && results.size() < CLIENT_KNOBS->TOO_MANY );
DatabaseConfiguration conf;
@ -1169,14 +1193,6 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self ) {
tr.set(recoveryCommitRequest.arena, coordinatorsKey, self->coordinators.ccf->getConnectionString().toString());
tr.set(recoveryCommitRequest.arena, logsKey, self->logSystem->getLogsValue());
//FIXME: upgrade code for 4.4, remove for 4.5
tr.clear(recoveryCommitRequest.arena, KeyRangeRef(LiteralStringRef("\xff/status/"), LiteralStringRef("\xff/status0")));
tr.clear(recoveryCommitRequest.arena, KeyRangeRef(LiteralStringRef("\xff/backupstatus/"), LiteralStringRef("\xff/backupstatus0")));
tr.clear(recoveryCommitRequest.arena, KeyRangeRef(LiteralStringRef("\xff/backup-agent/"), LiteralStringRef("\xff/backup-agent0")));
tr.clear(recoveryCommitRequest.arena, KeyRangeRef(LiteralStringRef("\xff/db-backup-agent/"), LiteralStringRef("\xff/db-backup-agent0")));
tr.clear(recoveryCommitRequest.arena, KeyRangeRef(LiteralStringRef("\xff/cplog/"), LiteralStringRef("\xff/cplog0")));
tr.clear(recoveryCommitRequest.arena, KeyRangeRef(LiteralStringRef("\xff/bklog/"), LiteralStringRef("\xff/bklog0")));
applyMetadataMutations(self->dbgid, recoveryCommitRequest.arena, tr.mutations.slice(mmApplied, tr.mutations.size()), self->txnStateStore, NULL, NULL);
mmApplied = tr.mutations.size();
@ -1184,11 +1200,10 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self ) {
TraceEvent("MasterRecoveryCommit", self->dbgid);
state Future<ErrorOr<CommitID>> recoveryCommit = self->proxies[0].commit.tryGetReply(recoveryCommitRequest);
state Future<Void> tlogFailure = self->logSystem->onError();
state Future<Void> resolverFailure = waitResolverFailure( self->resolvers );
state Future<Void> proxyFailure = waitProxyFailure( self->proxies );
state Future<Void> providingVersions = provideVersions(self);
self->addActor.send( self->logSystem->onError() );
self->addActor.send( waitResolverFailure( self->resolvers ) );
self->addActor.send( waitProxyFailure( self->proxies ) );
self->addActor.send( provideVersions(self) );
self->addActor.send( reportErrors(updateRegistration(self, self->logSystem), "updateRegistration", self->dbgid) );
self->registrationTrigger.trigger();
@ -1197,7 +1212,7 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self ) {
// Wait for the recovery transaction to complete.
// SOMEDAY: For faster recovery, do this and setDBState asynchronously and don't wait for them
// unless we want to change TLogs
Void _ = wait((success(recoveryCommit) && sendInitialCommitToResolvers(self)) || tlogFailure || resolverFailure || proxyFailure );
Void _ = wait((success(recoveryCommit) && sendInitialCommitToResolvers(self)) );
if(recoveryCommit.isReady() && recoveryCommit.get().isError()) {
TEST(true); // Master recovery failed because of the initial commit failed
throw master_recovery_failed();
@ -1223,11 +1238,10 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self ) {
// we made to the new Tlogs (self->recoveryTransactionVersion), and only our own semi-commits can come between our
// first commit and the next new TLogs
self->addActor.send( trackTlogRecovery(self, oldLogSystems) );
state Promise<Void> remoteRecovered;
self->addActor.send( trackTlogRecovery(self, oldLogSystems, remoteRecovered) );
debug_advanceMaxCommittedVersion(UID(), self->recoveryTransactionVersion);
while(!self->cstateUpdated.get()) {
Void _ = wait(self->cstateUpdated.onChange());
}
Void _ = wait(self->cstateUpdated.getFuture());
debug_advanceMinCommittedVersion(UID(), self->recoveryTransactionVersion);
if( debugResult )
@ -1255,7 +1269,7 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self ) {
{
PromiseStream< std::pair<UID, Optional<StorageServerInterface>> > ddStorageServerChanges;
state double lastLimited = 0;
self->addActor.send( reportErrorsExcept( dataDistribution( self->dbInfo, self->myInterface, self->configuration, ddStorageServerChanges, self->logSystem, self->recoveryTransactionVersion, self->primaryDcId, self->remoteDcIds, &lastLimited ), "DataDistribution", self->dbgid, &normalMasterErrors() ) );
self->addActor.send( reportErrorsExcept( dataDistribution( self->dbInfo, self->myInterface, self->configuration, ddStorageServerChanges, self->logSystem, self->recoveryTransactionVersion, self->primaryDcId, self->remoteDcIds, &lastLimited, remoteRecovered.getFuture() ), "DataDistribution", self->dbgid, &normalMasterErrors() ) );
self->addActor.send( reportErrors( rateKeeper( self->dbInfo, ddStorageServerChanges, self->myInterface.getRateInfo.getFuture(), self->dbName, self->configuration, &lastLimited ), "Ratekeeper", self->dbgid) );
}
@ -1263,15 +1277,10 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self ) {
self->addActor.send( resolutionBalancing(self) );
self->addActor.send( changeCoordinators(self) );
state Future<Void> configMonitor = configurationMonitor( self );
self->addActor.send( configurationMonitor( self ) );
loop choose {
when( Void _ = wait( tlogFailure ) ) { throw internal_error(); }
when( Void _ = wait( proxyFailure ) ) { throw internal_error(); }
when( Void _ = wait( resolverFailure ) ) { throw internal_error(); }
when( Void _ = wait( providingVersions ) ) { throw internal_error(); }
when( Void _ = wait( configMonitor ) ) { throw internal_error(); }
}
Void _ = wait( Future<Void>(Never()) );
throw internal_error();
}
ACTOR Future<Void> masterServer( MasterInterface mi, Reference<AsyncVar<ServerDBInfo>> db, ServerCoordinators coordinators, LifetimeToken lifetime )
@ -1300,16 +1309,25 @@ ACTOR Future<Void> masterServer( MasterInterface mi, Reference<AsyncVar<ServerDB
when (Void _ = wait(collection) ) { ASSERT(false); throw internal_error(); }
}
} catch (Error& e) {
TEST(e.code() == error_code_master_tlog_failed); // Master: terminated because of a tLog failure
TEST(e.code() == error_code_master_proxy_failed); // Master: terminated because of a proxy failure
TEST(e.code() == error_code_master_resolver_failed); // Master: terminated because of a resolver failure
state Error err = e;
if(e.code() != error_code_actor_cancelled) {
Void _ = wait(delay(0.0));
}
if (normalMasterErrors().count(e.code()))
while(!self->addActor.isEmpty()) {
self->addActor.getFuture().pop();
}
TEST(err.code() == error_code_master_tlog_failed); // Master: terminated because of a tLog failure
TEST(err.code() == error_code_master_proxy_failed); // Master: terminated because of a proxy failure
TEST(err.code() == error_code_master_resolver_failed); // Master: terminated because of a resolver failure
if (normalMasterErrors().count(err.code()))
{
TraceEvent("MasterTerminated", mi.id()).error(e);
TraceEvent("MasterTerminated", mi.id()).error(err);
return Void();
}
throw;
throw err;
}
return Void();
}

View File

@ -264,6 +264,7 @@ public:
Tag tag;
vector<pair<Version,Tag>> history;
vector<pair<Version,Tag>> allHistory;
Version poppedAllAfter;
std::map<Version, Arena> freeable; // for each version, an Arena that must be held until that version is < oldestVersion
Arena lastArena;
@ -281,6 +282,11 @@ public:
void popVersion(Version v, bool popAllTags = false) {
if(logSystem) {
if(v > poppedAllAfter) {
popAllTags = true;
poppedAllAfter = std::numeric_limits<Version>::max();
}
vector<pair<Version,Tag>>* hist = &history;
vector<pair<Version,Tag>> allHistoryCopy;
if(popAllTags) {
@ -446,7 +452,8 @@ public:
shuttingDown(false), debug_inApplyUpdate(false), debug_lastValidateTime(0), watchBytes(0),
logProtocol(0), counters(this), tag(invalidTag), maxQueryQueue(0), thisServerID(ssi.id()),
readQueueSizeMetric(LiteralStringRef("StorageServer.ReadQueueSize")),
behind(false), byteSampleClears(false, LiteralStringRef("\xff\xff\xff")), noRecentUpdates(false), lastUpdate(now())
behind(false), byteSampleClears(false, LiteralStringRef("\xff\xff\xff")), noRecentUpdates(false),
lastUpdate(now()), poppedAllAfter(std::numeric_limits<Version>::max())
{
version.initMetric(LiteralStringRef("StorageServer.Version"), counters.cc.id);
oldestVersion.initMetric(LiteralStringRef("StorageServer.OldestVersion"), counters.cc.id);
@ -457,7 +464,7 @@ public:
newestDirtyVersion.insert(allKeys, invalidVersion);
addShard( ShardInfo::newNotAssigned( allKeys ) );
cx = openDBOnServer(db, TaskDefaultEndpoint, false, true);
cx = openDBOnServer(db, TaskDefaultEndpoint, true, true);
}
//~StorageServer() { fclose(log); }
@ -2335,7 +2342,13 @@ ACTOR Future<Void> update( StorageServer* data, bool* pReceivedUpdate )
state Reference<ILogSystem::IPeekCursor> cursor = data->logCursor;
//TraceEvent("SSUpdatePeeking", data->thisServerID).detail("MyVer", data->version.get()).detail("Epoch", data->updateEpoch).detail("Seq", data->updateSequence);
Void _ = wait( cursor->getMore() );
loop {
Void _ = wait( cursor->getMore() );
if(!cursor->isExhausted()) {
break;
}
}
if(cursor->popped() > 0)
throw worker_removed();
@ -2354,7 +2367,6 @@ ACTOR Future<Void> update( StorageServer* data, bool* pReceivedUpdate )
start = now();
state UpdateEagerReadInfo eager;
state FetchInjectionInfo fii;
state Version minNewOldestVersion = 0;
state Reference<ILogSystem::IPeekCursor> cloneCursor2;
loop{
@ -2420,7 +2432,7 @@ ACTOR Future<Void> update( StorageServer* data, bool* pReceivedUpdate )
data->updateEagerReads = &eager;
data->debug_inApplyUpdate = true;
StorageUpdater updater(data->lastVersionWithData, std::max( std::max(data->desiredOldestVersion.get(), data->oldestVersion.get()), minNewOldestVersion ), data->restoredVersion);
StorageUpdater updater(data->lastVersionWithData, std::max(data->desiredOldestVersion.get(), data->oldestVersion.get()), data->restoredVersion);
if (EXPENSIVE_VALIDATION) data->data().atLatest().validate();
validate(data);
@ -2471,8 +2483,11 @@ ACTOR Future<Void> update( StorageServer* data, bool* pReceivedUpdate )
}
}
if(ver != invalidVersion) data->lastVersionWithData = ver;
ver = cloneCursor2->version().version - 1;
if(ver != invalidVersion) {
data->lastVersionWithData = ver;
} else {
ver = cloneCursor2->version().version - 1;
}
if(injectedChanges) data->lastVersionWithData = ver;
data->updateEagerReads = NULL;
@ -3125,7 +3140,10 @@ ACTOR Future<Void> storageServerCore( StorageServer* self, StorageServerInterfac
if( self->db->get().recoveryState >= RecoveryState::FULLY_RECOVERED ) {
self->logSystem = ILogSystem::fromServerDBInfo( self->thisServerID, self->db->get() );
if (self->logSystem) {
self->logCursor = self->logSystem->peekSingle( self->version.get() + 1, self->tag, self->history );
if(self->logSystem->getLogSystemConfig().oldTLogs.size()) {
self->poppedAllAfter = self->logSystem->getLogSystemConfig().oldTLogs[0].epochEnd;
}
self->logCursor = self->logSystem->peekSingle( self->thisServerID, self->version.get() + 1, self->tag, self->history );
self->popVersion( self->durableVersion.get() + 1, true );
}
// If update() is waiting for results from the tlog, it might never get them, so needs to be cancelled. But if it is waiting later,
@ -3212,7 +3230,7 @@ bool storageServerTerminated(StorageServer& self, IKeyValueStore* persistentData
return false;
}
ACTOR Future<Void> storageServer( IKeyValueStore* persistentData, StorageServerInterface ssi, Tag seedTag, ReplyPromise<StorageServerInterface> recruitReply,
ACTOR Future<Void> storageServer( IKeyValueStore* persistentData, StorageServerInterface ssi, Tag seedTag, ReplyPromise<InitializeStorageReply> recruitReply,
Reference<AsyncVar<ServerDBInfo>> db, std::string folder )
{
state StorageServer self(persistentData, db, ssi);
@ -3221,19 +3239,22 @@ ACTOR Future<Void> storageServer( IKeyValueStore* persistentData, StorageServerI
self.folder = folder;
try {
self.storage.makeNewStorageServerDurable();
Void _ = wait( self.storage.commit() );
if (seedTag == invalidTag) {
std::pair<Version, Tag> verAndTag = wait( addStorageServer(self.cx, ssi) ); // Might throw recruitment_failed in case of simultaneous master failure
self.tag = verAndTag.second;
self.setInitialVersion( verAndTag.first-1 ); // FIXME: Can this be 0 now? Should we get a corresponding updatePos?
self.setInitialVersion( verAndTag.first-1 );
} else {
self.tag = seedTag;
}
self.storage.makeNewStorageServerDurable();
Void _ = wait( self.storage.commit() );
TraceEvent("StorageServerInit", ssi.id()).detail("Version", self.version.get()).detail("SeedTag", seedTag.toString());
recruitReply.send(ssi);
InitializeStorageReply rep;
rep.interf = ssi;
rep.addedVersion = self.version.get();
recruitReply.send(rep);
self.byteSampleRecovery = Void();
Void _ = wait( storageServerCore(&self, ssi) );
@ -3282,26 +3303,30 @@ ACTOR Future<Void> replaceInterface( StorageServer* self, StorageServerInterface
tr.setOption(FDBTransactionOptions::FIRST_IN_BATCH);
tr.set( serverTagKeyFor(ssi.id()), serverTagValue(rep.newTag.get()) );
tr.atomicOp( serverTagHistoryKeyFor(ssi.id()), serverTagValue(rep.tag), MutationRef::SetVersionstampedKey );
tr.atomicOp( serverMaxTagKeyFor(rep.newTag.get().locality), serverTagMaxValue(rep.newTag.get()), MutationRef::Max );
}
if(rep.history.size() && rep.history.back().first <= self->version.get()) {
if(rep.history.size() && rep.history.back().first < self->version.get()) {
tr.clear(serverTagHistoryRangeBefore(ssi.id(), self->version.get()));
}
choose {
when ( Void _ = wait( tr.commit() ) ) {
self->history = rep.history;
while(self->history.size() && self->history.back().first < self->version.get() ) {
self->history.pop_back();
}
if(rep.newTag.present()) {
self->tag = rep.newTag.get();
self->history.push_back(std::make_pair(tr.getCommittedVersion(), rep.tag));
self->history.insert(self->history.begin(), std::make_pair(tr.getCommittedVersion(), rep.tag));
} else {
self->tag = rep.tag;
}
self->allHistory = self->history;
TraceEvent("SSTag", self->thisServerID).detail("myTag", self->tag.toString());
for(auto it : self->history) {
TraceEvent("SSHistory", self->thisServerID).detail("ver", it.first).detail("tag", it.second.toString()).detail("myTag", self->tag.toString());
TraceEvent("SSHistory", self->thisServerID).detail("ver", it.first).detail("tag", it.second.toString());
}
if(self->history.size() && BUGGIFY) {

View File

@ -997,24 +997,6 @@ vector<TestSpec> readTests( ifstream& ifs ) {
return result;
}
ACTOR Future<Void> reconfigureAfter(Database cx, double time) {
Void _ = wait( delay(time) );
if(g_network->isSimulated()) {
TraceEvent(SevWarnAlways, "DisablingFearlessConfiguration");
g_simulator.hasRemoteReplication = false;
ConfigurationResult::Type _ = wait( changeConfig( cx, "remote_none" ) );
if (g_network->isSimulated() && g_simulator.extraDB) {
Reference<ClusterConnectionFile> extraFile(new ClusterConnectionFile(*g_simulator.extraDB));
Reference<Cluster> cluster = Cluster::createCluster(extraFile, -1);
Database extraDB = cluster->createDatabase(LiteralStringRef("DB")).get();
ConfigurationResult::Type _ = wait(changeConfig(extraDB, "remote_none"));
}
}
return Void();
}
ACTOR Future<Void> runTests( Reference<AsyncVar<Optional<struct ClusterControllerFullInterface>>> cc, Reference<AsyncVar<Optional<struct ClusterInterface>>> ci, vector< TesterInterface > testers, vector<TestSpec> tests, StringRef startingConfiguration, LocalityData locality ) {
state Standalone<StringRef> database = LiteralStringRef("DB");
state Database cx;
@ -1081,10 +1063,6 @@ ACTOR Future<Void> runTests( Reference<AsyncVar<Optional<struct ClusterControlle
}
}
if (useDB) {
state Future<Void> reconfig = reconfigureAfter(cx, 300 + (g_random->random01()*300));
}
TraceEvent("TestsExpectedToPass").detail("Count", tests.size());
state int idx = 0;
for(; idx < tests.size(); idx++ ) {
@ -1115,7 +1093,7 @@ ACTOR Future<Void> runTests( Reference<AsyncVar<Optional<struct ClusterControlle
Reference<AsyncVar<Optional<struct ClusterInterface>>> ci, vector<TestSpec> tests, test_location_t at,
int minTestersExpected, StringRef startingConfiguration, LocalityData locality ) {
state int flags = (at == TEST_ON_SERVERS ? 0 : GetWorkersRequest::TESTER_CLASS_ONLY) | GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY;
state Future<Void> testerTimeout = delay(60.0); // wait 60 sec for testers to show up
state Future<Void> testerTimeout = delay(600.0); // wait 600 sec for testers to show up
state vector<std::pair<WorkerInterface, ProcessClass>> workers;
loop {

View File

@ -479,7 +479,7 @@ ACTOR Future<Void> monitorServerDBInfo( Reference<AsyncVar<Optional<ClusterContr
}
ACTOR Future<Void> workerServer( Reference<ClusterConnectionFile> connFile, Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> ccInterface, LocalityData localities,
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo, ProcessClass initialClass, std::string folder, int64_t memoryLimit, std::string metricsConnFile, std::string metricsPrefix ) {
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo, ProcessClass initialClass, std::string folder, int64_t memoryLimit, std::string metricsConnFile, std::string metricsPrefix, Promise<Void> recoveredDiskFiles ) {
state PromiseStream< ErrorInfo > errors;
state Future<Void> handleErrors = workerHandleErrors( errors.getFuture() ); // Needs to be stopped last
state ActorCollection errorForwarders(false);
@ -487,7 +487,7 @@ ACTOR Future<Void> workerServer( Reference<ClusterConnectionFile> connFile, Refe
state double loggingDelay = SERVER_KNOBS->WORKER_LOGGING_INTERVAL;
state ActorCollection filesClosed(true);
state Promise<Void> stopping;
state WorkerCache<StorageServerInterface> storageCache;
state WorkerCache<InitializeStorageReply> storageCache;
state Reference<AsyncVar<ServerDBInfo>> dbInfo( new AsyncVar<ServerDBInfo>(ServerDBInfo(LiteralStringRef("DB"))) );
state Future<Void> metricsLogger;
state UID processIDUid;
@ -643,6 +643,8 @@ ACTOR Future<Void> workerServer( Reference<ClusterConnectionFile> connFile, Refe
startRole( interf.id(), interf.id(), "Worker", details );
Void _ = wait(waitForAll(recoveries));
recoveredDiskFiles.send(Void());
errorForwarders.add( registrationClient( ccInterface, interf, asyncPriorityInfo, initialClass ) );
TraceEvent("RecoveriesComplete", interf.id());
@ -759,7 +761,7 @@ ACTOR Future<Void> workerServer( Reference<ClusterConnectionFile> connFile, Refe
IKeyValueStore* data = openKVStore( req.storeType, filename, recruited.id(), memoryLimit );
Future<Void> kvClosed = data->onClosed();
filesClosed.add( kvClosed );
ReplyPromise<StorageServerInterface> storageReady = req.reply;
ReplyPromise<InitializeStorageReply> storageReady = req.reply;
storageCache.set( req.reqId, storageReady.getFuture() );
Future<Void> s = storageServer( data, recruited, req.seedTag, storageReady, dbInfo, folder );
s = handleIOErrors(s, data, recruited.id(), kvClosed);
@ -979,12 +981,13 @@ ACTOR Future<Void> fdbd(
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo(new AsyncVar<ClusterControllerPriorityInfo>(
ClusterControllerPriorityInfo(ProcessClass(processClass.classType(), ProcessClass::CommandLineSource).machineClassFitness(ProcessClass::ClusterController), false, ClusterControllerPriorityInfo::FitnessUnknown)));
vector<Future<Void>> v;
state Promise<Void> recoveredDiskFiles;
if ( coordFolder.size() )
v.push_back( fileNotFoundToNever( coordinationServer( coordFolder ) ) ); //SOMEDAY: remove the fileNotFound wrapper and make DiskQueue construction safe from errors setting up their files
v.push_back( reportErrors( processClass == ProcessClass::TesterClass ? monitorLeader( connFile, cc ) : clusterController( connFile, cc , asyncPriorityInfo ), "clusterController") );
v.push_back( reportErrors( processClass == ProcessClass::TesterClass ? monitorLeader( connFile, cc ) : clusterController( connFile, cc , asyncPriorityInfo, recoveredDiskFiles.getFuture()), "clusterController") );
v.push_back( reportErrors(extractClusterInterface( cc, ci ), "extractClusterInterface") );
v.push_back( reportErrors(failureMonitorClient( ci, true ), "failureMonitorClient") );
v.push_back( reportErrorsExcept(workerServer(connFile, cc, localities, asyncPriorityInfo, processClass, dataFolder, memoryLimit, metricsConnFile, metricsPrefix), "workerServer", UID(), &normalWorkerErrors()) );
v.push_back( reportErrorsExcept(workerServer(connFile, cc, localities, asyncPriorityInfo, processClass, dataFolder, memoryLimit, metricsConnFile, metricsPrefix, recoveredDiskFiles), "workerServer", UID(), &normalWorkerErrors()) );
state Future<Void> firstConnect = reportErrors( printOnFirstConnected(ci), "ClusterFirstConnectedError" );
Void _ = wait( quorum(v,1) );

View File

@ -344,7 +344,6 @@ struct BackupToDBUpgradeWorkload : TestWorkload {
ACTOR static Future<Void> _start(Database cx, BackupToDBUpgradeWorkload* self) {
state DatabaseBackupAgent backupAgent(cx);
state DatabaseBackupAgent restoreAgent(self->extraDB);
state Future<Void> disabler = disableConnectionFailuresAfter(300, "BackupToDBUpgradeStart");
state Standalone<VectorRef<KeyRangeRef>> prevBackupRanges;
state UID logUid;
state Version commitVersion;

View File

@ -61,9 +61,12 @@ struct ChangeConfigWorkload : TestWorkload {
state Database extraDB = cluster->createDatabase(LiteralStringRef("DB")).get();
Void _ = wait(delay(5*g_random->random01()));
if (self->configMode.size())
if (self->configMode.size()) {
ConfigurationResult::Type _ = wait(changeConfig(extraDB, self->configMode));
if (self->networkAddresses.size()) {
TraceEvent("WaitForReplicasExtra");
Void _ = wait( waitForFullReplication( extraDB ) );
TraceEvent("WaitForReplicasExtraEnd");
} if (self->networkAddresses.size()) {
if (self->networkAddresses == "auto")
CoordinatorsResult::Type _ = wait(changeQuorum(extraDB, autoQuorumChange()));
else
@ -83,8 +86,12 @@ struct ChangeConfigWorkload : TestWorkload {
Void _ = wait( self->extraDatabaseConfigure(self) );
}
if( self->configMode.size() )
if( self->configMode.size() ) {
ConfigurationResult::Type _ = wait( changeConfig( cx, self->configMode ) );
TraceEvent("WaitForReplicas");
Void _ = wait( waitForFullReplication( cx ) );
TraceEvent("WaitForReplicasEnd");
}
if( self->networkAddresses.size() ) {
if (self->networkAddresses == "auto")
CoordinatorsResult::Type _ = wait( changeQuorum( cx, autoQuorumChange() ) );

View File

@ -725,6 +725,20 @@ void forwardVector( Future<V> values, std::vector<Promise<T>> out ) {
out[i].send( in[i] );
}
ACTOR template <class T>
Future<Void> delayedAsyncVar( Reference<AsyncVar<T>> in, Reference<AsyncVar<T>> out, double time ) {
try {
loop {
Void _ = wait( delay( time ) );
out->set( in->get() );
Void _ = wait( in->onChange() );
}
} catch (Error& e) {
out->set( in->get() );
throw;
}
}
Future<bool> allTrue( const std::vector<Future<bool>>& all );
Future<Void> anyTrue( std::vector<Reference<AsyncVar<bool>>> const& input, Reference<AsyncVar<bool>> const& output );
Future<Void> cancelOnly( std::vector<Future<Void>> const& futures );
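For illustration, the delayedAsyncVar actor added above mirrors one AsyncVar into another after a fixed delay and, if cancelled, still flushes the latest value. A hedged flow-style usage sketch follows; it assumes the surrounding actor context, and the variable names are hypothetical:

// Hypothetical usage inside some actor: `out` lags `in` by five seconds.
Reference<AsyncVar<int>> in( new AsyncVar<int>(0) );
Reference<AsyncVar<int>> out( new AsyncVar<int>(0) );
Future<Void> mirror = delayedAsyncVar( in, out, 5.0 ); // on cancellation, out still receives in's latest value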
@ -849,6 +863,7 @@ Future<Void> quorum(std::vector<Future<T>> const& results, int n) {
ACTOR template <class T>
Future<Void> smartQuorum( std::vector<Future<T>> results, int required, double extraSeconds, int taskID = TaskDefaultDelay ) {
if (results.empty()) return Void();
Void _ = wait(quorum(results, required));
choose {
when (Void _ = wait(quorum(results, (int)results.size()))) {return Void();}