Merge pull request #135 from etschannen/release-5.1

Release 5.1
Evan Tschannen authored on 2018-04-10 16:27:29 -07:00, committed by GitHub
commit fcd0f96f07
10 changed files with 43 additions and 53 deletions

View File

@@ -59,7 +59,7 @@ The ``commit`` command commits the current transaction. Any sets or clears execu
configure
---------
The ``configure`` command changes the database configuration. Its syntax is ``configure [new] [single|double|triple|three_data_hall|multi_dc] [ssd|memory] [proxies=<N>] [resolvers=<N>] [logs=<N>]``.
The ``configure`` command changes the database configuration. Its syntax is ``configure [new] [single|double|triple|three_data_hall|three_datacenter] [ssd|memory] [proxies=<N>] [resolvers=<N>] [logs=<N>]``.
The ``new`` option, if present, initializes a new database with the given configuration rather than changing the configuration of an existing one. When ``new`` is used, both a redundancy mode and a storage engine must be specified.
@@ -72,7 +72,7 @@ Redundancy modes define storage requirements, required cluster size, and resilie
* ``double``
* ``triple``
* ``three_data_hall``
* ``multi_dc``
* ``three_datacenter``
For descriptions of redundancy modes, see :ref:`configuration-choosing-redundancy-mode`.
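
As a rough illustration of what the renamed mode means, the sketch below shows the replication parameters that ``three_datacenter`` resolves to in this commit's client code. It is a hand-written summary, not the real configForToken(): the header location is an assumption, and the storage-policy nesting is inferred from the ``dcid^3 x zoneid^2 x 1`` policy string that DatabaseConfiguration::toMap() now matches.

// Sketch only: the settings "three_datacenter" (and the old "multi_dc" alias) resolves to.
// Assumes fdbrpc/ReplicationPolicy.h, which defines IRepPolicyRef, PolicyAcross and PolicyOne.
std::string redundancy   = "6";   // six copies of each key range
std::string log_replicas = "4";   // four transaction log replicas
// Storage: 3 datacenters, 2 zones per datacenter, 1 copy per zone (nesting inferred, see above).
IRepPolicyRef storagePolicy = IRepPolicyRef(new PolicyAcross(3, "dcid",
	IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne())))
));
// Transaction logs: 2 datacenters, 2 zones in each, as in the configForToken() diff below.
IRepPolicyRef tLogPolicy = IRepPolicyRef(new PolicyAcross(2, "dcid",
	IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne())))
));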

View File

@ -28,7 +28,7 @@ The number of machines in each team is based on the replication mode; the total
Independence assumptions
========================
As a further refinement, FoundationDB can be made aware that certain machines might tend to fail together by specifying the locality of each process. For example, every machine in a rack might share a network and power connection. If either failed, then the entire rack of machines would fail. We use this knowledge when choosing teams, taking care not to place any two machines in a team that would have a tendency to fail together. Pieces of data can then be intelligently distributed across racks or even datacenters, so that characteristic multimachine failures (for example, based on rack configuration) do not cause service interruption or data loss. Our ``three_data_hall`` and ``multi_dc`` configurations use this technique to continuously operate through a failure of a data hall or datacenter respectively.
As a further refinement, FoundationDB can be made aware that certain machines might tend to fail together by specifying the locality of each process. For example, every machine in a rack might share a network and power connection. If either failed, then the entire rack of machines would fail. We use this knowledge when choosing teams, taking care not to place any two machines in a team that would have a tendency to fail together. Pieces of data can then be intelligently distributed across racks or even datacenters, so that characteristic multimachine failures (for example, based on rack configuration) do not cause service interruption or data loss. Our ``three_data_hall`` and ``three_datacenter`` configurations use this technique to continuously operate through a failure of a data hall or datacenter respectively.
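
The placement rule sketched in this paragraph maps directly onto a nested replication policy. Below is a minimal, hand-written restatement of the ``three_data_hall`` case, mirroring the policy construction in this commit's configForToken() change (the header is an assumption):

// three_data_hall, as constructed later in this commit (fdbrpc/ReplicationPolicy.h assumed):
// keep 3 copies of the data, each in a different data hall, and spread transaction logs
// across 2 data halls with 2 zones (e.g. racks) per hall, so losing any one data hall
// leaves both the data and the logs available.
IRepPolicyRef storagePolicy = IRepPolicyRef(new PolicyAcross(3, "data_hall", IRepPolicyRef(new PolicyOne())));
IRepPolicyRef tLogPolicy = IRepPolicyRef(new PolicyAcross(2, "data_hall",
	IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne())))
));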
Other types of failure
======================

View File

@@ -438,9 +438,9 @@ void initHelp() {
"clear a range of keys from the database",
"All keys between BEGINKEY (inclusive) and ENDKEY (exclusive) are cleared from the database. This command will succeed even if the specified range is empty, but may fail because of conflicts." ESCAPINGK);
helpMap["configure"] = CommandHelp(
"configure [new] <single|double|triple|three_data_hall|three_datacenter|multi_dc|ssd|memory|proxies=<PROXIES>|logs=<LOGS>|resolvers=<RESOLVERS>>*",
"configure [new] <single|double|triple|three_data_hall|three_datacenter|ssd|memory|proxies=<PROXIES>|logs=<LOGS>|resolvers=<RESOLVERS>>*",
"change database configuration",
"The `new' option, if present, initializes a new database with the given configuration rather than changing the configuration of an existing one. When used, both a redundancy mode and a storage engine must be specified.\n\nRedundancy mode:\n single - one copy of the data. Not fault tolerant.\n double - two copies of data (survive one failure).\n triple - three copies of data (survive two failures).\n three_data_hall - See the Admin Guide.\n three_datacenter - See the Admin Guide.\n multi_dc - See the Admin Guide.\n\nStorage engine:\n ssd - B-Tree storage engine optimized for solid state disks.\n memory - Durable in-memory storage engine for small datasets.\n\nproxies=<PROXIES>: Sets the desired number of proxies in the cluster. Must be at least 1, or set to -1 which restores the number of proxies to the default value.\n\nlogs=<LOGS>: Sets the desired number of log servers in the cluster. Must be at least 1, or set to -1 which restores the number of logs to the default value.\n\nresolvers=<RESOLVERS>: Sets the desired number of resolvers in the cluster. Must be at least 1, or set to -1 which restores the number of resolvers to the default value.\n\nSee the FoundationDB Administration Guide for more information.");
"The `new' option, if present, initializes a new database with the given configuration rather than changing the configuration of an existing one. When used, both a redundancy mode and a storage engine must be specified.\n\nRedundancy mode:\n single - one copy of the data. Not fault tolerant.\n double - two copies of data (survive one failure).\n triple - three copies of data (survive two failures).\n three_data_hall - See the Admin Guide.\n three_datacenter - See the Admin Guide.\n\nStorage engine:\n ssd - B-Tree storage engine optimized for solid state disks.\n memory - Durable in-memory storage engine for small datasets.\n\nproxies=<PROXIES>: Sets the desired number of proxies in the cluster. Must be at least 1, or set to -1 which restores the number of proxies to the default value.\n\nlogs=<LOGS>: Sets the desired number of log servers in the cluster. Must be at least 1, or set to -1 which restores the number of logs to the default value.\n\nresolvers=<RESOLVERS>: Sets the desired number of resolvers in the cluster. Must be at least 1, or set to -1 which restores the number of resolvers to the default value.\n\nSee the FoundationDB Administration Guide for more information.");
helpMap["coordinators"] = CommandHelp(
"coordinators auto|<ADDRESS>+ [description=new_cluster_description]",
"change cluster coordinators or description",
@@ -1984,7 +1984,7 @@ void onoff_generator(const char* text, const char *line, std::vector<std::string
}
void configure_generator(const char* text, const char *line, std::vector<std::string>& lc) {
const char* opts[] = {"new", "single", "double", "triple", "three_data_hall", "three_datacenter", "multi_dc", "ssd", "ssd-1", "ssd-2", "memory", "proxies=", "logs=", "resolvers=", NULL};
const char* opts[] = {"new", "single", "double", "triple", "three_data_hall", "three_datacenter", "ssd", "ssd-1", "ssd-2", "memory", "proxies=", "logs=", "resolvers=", NULL};
array_generator(text, line, opts, lc);
}

View File

@@ -105,25 +105,7 @@ std::map<std::string, std::string> configForToken( std::string const& mode ) {
redundancy="3";
log_replicas="3";
storagePolicy = tLogPolicy = IRepPolicyRef(new PolicyAcross(3, "zoneid", IRepPolicyRef(new PolicyOne())));
} else if(mode == "two_datacenter") {
redundancy="3";
log_replicas="3";
storagePolicy = tLogPolicy = IRepPolicyRef(new PolicyAcross(3, "zoneid", IRepPolicyRef(new PolicyOne())));
} else if(mode == "three_datacenter") {
redundancy="3";
log_replicas="3";
storagePolicy = tLogPolicy = IRepPolicyRef(new PolicyAnd({
IRepPolicyRef(new PolicyAcross(3, "dcid", IRepPolicyRef(new PolicyOne()))),
IRepPolicyRef(new PolicyAcross(3, "zoneid", IRepPolicyRef(new PolicyOne())))
}));
} else if(mode == "three_data_hall") {
redundancy="3";
log_replicas="4";
storagePolicy = IRepPolicyRef(new PolicyAcross(3, "data_hall", IRepPolicyRef(new PolicyOne())));
tLogPolicy = IRepPolicyRef(new PolicyAcross(2, "data_hall",
IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne())))
));
} else if(mode == "multi_dc") {
} else if(mode == "three_datacenter" || mode == "multi_dc") {
redundancy="6";
log_replicas="4";
storagePolicy = IRepPolicyRef(new PolicyAcross(3, "dcid",
@@ -132,6 +114,13 @@ std::map<std::string, std::string> configForToken( std::string const& mode ) {
tLogPolicy = IRepPolicyRef(new PolicyAcross(2, "dcid",
IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne())))
));
} else if(mode == "three_data_hall") {
redundancy="3";
log_replicas="4";
storagePolicy = IRepPolicyRef(new PolicyAcross(3, "data_hall", IRepPolicyRef(new PolicyOne())));
tLogPolicy = IRepPolicyRef(new PolicyAcross(2, "data_hall",
IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne())))
));
} else
redundancySpecified = false;
if (redundancySpecified) {
@@ -303,12 +292,9 @@ ConfigureAutoResult parseConfig( StatusObject const& status ) {
} else if( result.old_replication == "triple" || result.old_replication == "fast_recovery_triple" ) {
storage_replication = 3;
log_replication = 3;
} else if( result.old_replication == "two_datacenter" ) {
storage_replication = 3;
log_replication = 3;
} else if( result.old_replication == "three_datacenter" ) {
storage_replication = 3;
log_replication = 3;
storage_replication = 6;
log_replication = 4;
} else
return ConfigureAutoResult();

View File

@@ -687,7 +687,7 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
}
TEST(true); //did not find a healthy destination team on the first attempt
stuckCount++;
TraceEvent(stuckCount > 50 ? SevWarnAlways : SevWarn, "BestTeamStuck", masterId).detail("Count", stuckCount);
TraceEvent(stuckCount > 50 ? SevWarnAlways : SevWarn, "BestTeamStuck", masterId).detail("Count", stuckCount).suppressFor(1.0);
if(stuckCount > 50 && g_network->isSimulated()) { //FIXME: known bug in simulation we are suppressing
int unseed = noUnseed ? 0 : g_random->randomInt(0, 100001);
TraceEvent("ElapsedTime").detail("SimTime", now()).detail("RealTime", 0)

View File

@@ -85,14 +85,12 @@ std::map<std::string, std::string> DatabaseConfiguration::toMap() const {
result["redundancy_mode"] = "single";
else if( tLogReplicationFactor == 2 && durableStorageQuorum == 2 )
result["redundancy_mode"] = "double";
else if( tLogReplicationFactor == 3 && durableStorageQuorum == 3 && tlogInfo == "((dcid^3 x 1) & (zoneid^3 x 1))" && storageInfo == "((dcid^3 x 1) & (zoneid^3 x 1))" )
else if( tLogReplicationFactor == 4 && durableStorageQuorum == 6 && tlogInfo == "dcid^2 x zoneid^2 x 1" && storageInfo == "dcid^3 x zoneid^2 x 1" )
result["redundancy_mode"] = "three_datacenter";
else if( tLogReplicationFactor == 3 && durableStorageQuorum == 3 )
result["redundancy_mode"] = "triple";
else if( tLogReplicationFactor == 4 && durableStorageQuorum == 3 && tlogInfo == "data_hall^2 x zoneid^2 x 1" && storageInfo == "data_hall^3 x 1" )
result["redundancy_mode"] = "three_data_hall";
else if( tLogReplicationFactor == 4 && durableStorageQuorum == 6 && tlogInfo == "dcid^2 x zoneid^2 x 1" && storageInfo == "dcid^3 x zoneid^2 x 1" )
result["redundancy_mode"] = "multi_dc";
else
result["redundancy_mode"] = "custom";
} else

View File

@@ -318,11 +318,14 @@ ACTOR Future<Void> startMoveKeys( Database occ, KeyRange keys, vector<UID> serve
if (err.code() == error_code_move_to_removed_server)
throw;
Void _ = wait( tr.onError(e) );
TraceEvent(retries == 50 ? SevWarnAlways : SevWarn, "startMoveKeysRetrying", relocationIntervalId)
.detail("Keys", printable(keys))
.detail("BeginKey", printable(begin))
.detail("NumTries", retries)
.error(err);
if(retries%10 == 0) {
TraceEvent(retries == 50 ? SevWarnAlways : SevWarn, "startMoveKeysRetrying", relocationIntervalId)
.detail("Keys", printable(keys))
.detail("BeginKey", printable(begin))
.detail("NumTries", retries)
.error(err);
}
}
}
@@ -602,12 +605,15 @@ ACTOR Future<Void> finishMoveKeys( Database occ, KeyRange keys, vector<UID> dest
if (error.code() == error_code_actor_cancelled) throw;
state Error err = error;
Void _ = wait( tr.onError(error) );
TraceEvent(retries++ == 15 ? SevWarnAlways : SevWarn, "RelocateShard_finishMoveKeysRetrying", relocationIntervalId)
.error(err)
.detail("KeyBegin", printable(keys.begin))
.detail("KeyEnd", printable(keys.end))
.detail("IterationBegin", printable(begin))
.detail("IterationEnd", printable(endKey));
retries++;
if(retries%10 == 0) {
TraceEvent(retries == 20 ? SevWarnAlways : SevWarn, "RelocateShard_finishMoveKeysRetrying", relocationIntervalId)
.error(err)
.detail("KeyBegin", printable(keys.begin))
.detail("KeyEnd", printable(keys.end))
.detail("IterationBegin", printable(begin))
.detail("IterationEnd", printable(endKey));
}
}
}
}
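
Both hunks in this file apply the same idea: keep retrying the transaction, but only trace every tenth attempt, escalating the severity once the retry count reaches the threshold. A condensed, hand-written sketch of the pattern in flow-actor style; doOneAttempt() is a hypothetical placeholder for the real key-moving work:

state int retries = 0;
loop {
	try {
		Void _ = wait( doOneAttempt( tr ) );   // placeholder, not a real function
		break;
	} catch( Error& error ) {
		state Error err = error;               // save before waiting; locals do not survive waits
		Void _ = wait( tr.onError( error ) );
		retries++;
		if( retries % 10 == 0 ) {              // log only every 10th retry
			TraceEvent(retries == 50 ? SevWarnAlways : SevWarn, "startMoveKeysRetrying", relocationIntervalId)
				.detail("NumTries", retries)
				.error(err);
		}
	}
}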

View File

@@ -735,14 +735,10 @@ void SimulationConfig::generateNormalConfig(int minimumReplication) {
break;
}
case 3: {
if( datacenters == 1 ) {
if( datacenters <= 2 ) {
TEST( true ); // Simulated cluster running in triple redundancy mode
set_config("triple");
}
else if( datacenters == 2 ) {
TEST( true ); // Simulated cluster running in 2 datacenter mode
set_config("two_datacenter");
}
else if( datacenters == 3 ) {
TEST( true ); // Simulated cluster running in 3 data-hall mode
set_config("three_data_hall");

View File

@@ -573,8 +573,12 @@ struct ConsistencyCheckWorkload : TestWorkload
state int increment = (self->distributed && !self->firstClient) ? effectiveClientCount * self->shardSampleFactor : 1;
state Reference<IRateControl> rateLimiter = Reference<IRateControl>( new SpeedLimit(self->rateLimit, CLIENT_KNOBS->CONSISTENCY_CHECK_RATE_WINDOW) );
int64_t _dbSize = wait( self->getDatabaseSize( cx ) );
state double dbSize = _dbSize;
state double dbSize = 100e12;
if(g_network->isSimulated()) {
//This call will get all shard ranges in the database, which is too expensive on real clusters.
int64_t _dbSize = wait( self->getDatabaseSize( cx ) );
dbSize = _dbSize;
}
state vector<KeyRangeRef> ranges;
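
The guard above is the whole change: the exact size scan stays available where it is cheap (simulation), while real clusters fall back to a fixed 100 TB assumption rather than enumerating every shard range. Restated with comments as a sketch; the constant is simply what this commit picks, not a measured value:

// Default to a coarse 100 TB estimate so real clusters are never walked shard by shard.
state double dbSize = 100e12;
if( g_network->isSimulated() ) {
	// In simulation the database is tiny, so an exact measurement is affordable.
	int64_t _dbSize = wait( self->getDatabaseSize( cx ) );
	dbSize = _dbSize;
}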

View File

@@ -240,7 +240,7 @@ public:
try {
if (error) {
// Log the error...
TraceEvent(SevWarn, errContext, errID).detail("Message", error.value());
TraceEvent(SevWarn, errContext, errID).detail("Message", error.value()).suppressFor(1.0);
p.sendError( connection_failed() );
} else
p.send( Void() );