Merge pull request #585 from etschannen/feature-remote-logs
A variety of cleanup and test-strengthening commits
commit ad37b1693d
@@ -346,6 +346,7 @@
        },
        "cluster_controller_timestamp":1415650089,
        "protocol_version":"fdb00a400050001",
+       "connection_string":"a:a@127.0.0.1:4000",
        "full_replication":true,
        "configuration":{
            "log_anti_quorum":0,
@@ -17,6 +17,7 @@ Performance
 * Transaction logs do not copy mutations from previous generations of transaction logs. `(PR #339) <https://github.com/apple/foundationdb/pull/339>`_
 * Load balancing temporarily avoids communicating with storage servers that have fallen behind.
 * Avoid assigning storage servers responsibility for keys they do not have.
+* Clients optimistically assume the first leader reply from a coordinator is correct. `(PR #425) <https://github.com/apple/foundationdb/pull/425>`_

 Fixes
 -----
@@ -24,6 +25,8 @@ Fixes
* Not all endpoint failures were reported to the failure monitor.
* Watches registered on a lagging storage server would take a long time to trigger.
* The cluster controller would not start a new generation until it recovered its files from disk.
* Disk errors cause the server process to exit, preventing the process from being reused unless it can read its files from disk. `(PR #568) <https://github.com/apple/foundationdb/pull/568>`_

Status
------
@@ -40,7 +43,7 @@ Other Changes
-------------

* Does not support upgrades from any version older than 5.0.
* Renamed the trace log attribute ``logGroup`` to ``LogGroup``.
* Normalized the capitalization of trace event names and attributes. `(PR #455) <https://github.com/apple/foundationdb/pull/455>`_

Earlier release notes
---------------------
@@ -1114,8 +1114,8 @@ public:
		notEnoughLeft = !primaryProcessesLeft.validate(tLogPolicy) || !primaryProcessesLeft.validate(storagePolicy) || !remoteProcessesLeft.validate(tLogPolicy) || !remoteProcessesLeft.validate(storagePolicy);
	}
} else {
-	bool primarySatelliteTLogsDead = satelliteTLogWriteAntiQuorum ? !validateAllCombinations(badCombo, primarySatelliteProcessesDead, satelliteTLogPolicy, primarySatelliteLocalitiesLeft, satelliteTLogWriteAntiQuorum, false) : primarySatelliteProcessesDead.validate(satelliteTLogPolicy);
-	bool remoteSatelliteTLogsDead = satelliteTLogWriteAntiQuorum ? !validateAllCombinations(badCombo, remoteSatelliteProcessesDead, satelliteTLogPolicy, remoteSatelliteLocalitiesLeft, satelliteTLogWriteAntiQuorum, false) : remoteSatelliteProcessesDead.validate(satelliteTLogPolicy);
+	bool primarySatelliteTLogsDead = satelliteTLogWriteAntiQuorumFallback ? !validateAllCombinations(badCombo, primarySatelliteProcessesDead, satelliteTLogPolicyFallback, primarySatelliteLocalitiesLeft, satelliteTLogWriteAntiQuorumFallback, false) : primarySatelliteProcessesDead.validate(satelliteTLogPolicyFallback);
+	bool remoteSatelliteTLogsDead = satelliteTLogWriteAntiQuorumFallback ? !validateAllCombinations(badCombo, remoteSatelliteProcessesDead, satelliteTLogPolicyFallback, remoteSatelliteLocalitiesLeft, satelliteTLogWriteAntiQuorumFallback, false) : remoteSatelliteProcessesDead.validate(satelliteTLogPolicyFallback);

	if(usableRegions > 1) {
		notEnoughLeft = !primaryProcessesLeft.validate(tLogPolicy) || !primaryProcessesLeft.validate(remoteTLogPolicy) || !primaryProcessesLeft.validate(storagePolicy) || !primarySatelliteProcessesLeft.validate(satelliteTLogPolicy) || !remoteProcessesLeft.validate(tLogPolicy) || !remoteProcessesLeft.validate(remoteTLogPolicy) || !remoteProcessesLeft.validate(storagePolicy) || !remoteSatelliteProcessesLeft.validate(satelliteTLogPolicy);
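Note on the anti-quorum check above: with a write anti-quorum of `a`, a commit can succeed while any `a` satellite logs lag, so the dead-set test must hold across every choice of `a` extra processes, not just the literal dead set. The change swaps the fallback policy and anti-quorum into that same check. Below is a minimal stand-in sketch of the combination walk only, with a plain predicate in place of FoundationDB's IRepPolicyRef and locality classes:

#include <algorithm>
#include <functional>
#include <vector>

// Enumerate every way of dropping `antiQuorum` processes from the surviving
// set and require the policy predicate to hold for each remainder.
bool validateAllCombinations(const std::vector<int>& survivors, int antiQuorum,
                             const std::function<bool(const std::vector<int>&)>& policyHolds) {
	if (antiQuorum <= 0) return policyHolds(survivors);
	if (antiQuorum > (int)survivors.size()) return false; // assumed degenerate case
	std::vector<int> drop(survivors.size(), 0);
	std::fill(drop.end() - antiQuorum, drop.end(), 1); // mask: 1 = dropped
	do {
		std::vector<int> remaining;
		for (size_t i = 0; i < survivors.size(); i++)
			if (!drop[i]) remaining.push_back(survivors[i]);
		if (!policyHolds(remaining)) return false; // one bad combination is enough
	} while (std::next_permutation(drop.begin(), drop.end()));
	return true;
}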
@@ -286,7 +286,9 @@ public:
	Optional<Standalone<StringRef>> remoteDcId;
	bool hasSatelliteReplication;
	IRepPolicyRef satelliteTLogPolicy;
+	IRepPolicyRef satelliteTLogPolicyFallback;
	int32_t satelliteTLogWriteAntiQuorum;
+	int32_t satelliteTLogWriteAntiQuorumFallback;
	std::vector<Optional<Standalone<StringRef>>> primarySatelliteDcIds;
	std::vector<Optional<Standalone<StringRef>>> remoteSatelliteDcIds;
@@ -58,7 +58,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
	init( VERSIONS_PER_SECOND, 1e6 );
	init( MAX_VERSIONS_IN_FLIGHT, 100 * VERSIONS_PER_SECOND );
	init( MAX_VERSIONS_IN_FLIGHT_FORCED, 6e5 * VERSIONS_PER_SECOND ); //one week of versions
-	init( MAX_READ_TRANSACTION_LIFE_VERSIONS, 5 * VERSIONS_PER_SECOND ); if (randomize && BUGGIFY) MAX_READ_TRANSACTION_LIFE_VERSIONS=std::max<int>(1, 0.1 * VERSIONS_PER_SECOND); else if( randomize && BUGGIFY ) MAX_READ_TRANSACTION_LIFE_VERSIONS = 10 * VERSIONS_PER_SECOND;
+	init( MAX_READ_TRANSACTION_LIFE_VERSIONS, 5 * VERSIONS_PER_SECOND ); if (randomize && BUGGIFY) MAX_READ_TRANSACTION_LIFE_VERSIONS = VERSIONS_PER_SECOND; else if (randomize && BUGGIFY) MAX_READ_TRANSACTION_LIFE_VERSIONS = std::max<int>(1, 0.1 * VERSIONS_PER_SECOND); else if( randomize && BUGGIFY ) MAX_READ_TRANSACTION_LIFE_VERSIONS = 10 * VERSIONS_PER_SECOND;
	init( MAX_WRITE_TRANSACTION_LIFE_VERSIONS, 5 * VERSIONS_PER_SECOND ); if (randomize && BUGGIFY) MAX_WRITE_TRANSACTION_LIFE_VERSIONS=std::max<int>(1, 1 * VERSIONS_PER_SECOND);
	init( MAX_COMMIT_BATCH_INTERVAL, 0.5 ); if( randomize && BUGGIFY ) MAX_COMMIT_BATCH_INTERVAL = 2.0; // Each master proxy generates a CommitTransactionBatchRequest at least this often, so that versions always advance smoothly
	MAX_COMMIT_BATCH_INTERVAL = std::min(MAX_COMMIT_BATCH_INTERVAL, MAX_READ_TRANSACTION_LIFE_VERSIONS/double(2*VERSIONS_PER_SECOND)); // Ensure that the proxy commits 2 times every MAX_READ_TRANSACTION_LIFE_VERSIONS, otherwise the master will not give out versions fast enough
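For context, init() and BUGGIFY are FoundationDB macros: each knob gets a production default, and in simulation BUGGIFY occasionally overrides it with an extreme value to stress edge cases. The changed line adds a third randomization choice (MAX_READ_TRANSACTION_LIFE_VERSIONS = VERSIONS_PER_SECOND). A rough stand-in for the pattern, with plain C++ replacing the macros (the 0.25 firing probability here is arbitrary, not FDB's):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <random>

int main() {
	std::mt19937 rng(std::random_device{}());
	bool randomize = true; // simulation sets this; production leaves knobs at defaults
	auto buggify = [&] { return randomize && std::bernoulli_distribution(0.25)(rng); };

	const int64_t VERSIONS_PER_SECOND = 1000000;
	int64_t MAX_READ_TRANSACTION_LIFE_VERSIONS = 5 * VERSIONS_PER_SECOND; // default
	if (buggify())
		MAX_READ_TRANSACTION_LIFE_VERSIONS = VERSIONS_PER_SECOND;          // the new option
	else if (buggify())
		MAX_READ_TRANSACTION_LIFE_VERSIONS = std::max<int64_t>(1, VERSIONS_PER_SECOND / 10);
	else if (buggify())
		MAX_READ_TRANSACTION_LIFE_VERSIONS = 10 * VERSIONS_PER_SECOND;

	printf("MAX_READ_TRANSACTION_LIFE_VERSIONS = %lld\n",
	       (long long)MAX_READ_TRANSACTION_LIFE_VERSIONS);
}

The clamp on MAX_COMMIT_BATCH_INTERVAL at the end of the hunk is why the new low value matters: shrinking the read-transaction lifetime also forces proxies to commit more often.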
@@ -706,7 +706,7 @@ StringRef StringRefOf(const char* s) {
void SimulationConfig::generateNormalConfig(int minimumReplication) {
	set_config("new");
	bool generateFearless = g_random->random01() < 0.5;
-	datacenters = generateFearless ? 4 : g_random->randomInt( 1, 4 );
+	datacenters = generateFearless ? ( minimumReplication > 0 || g_random->random01() < 0.5 ? 4 : 6 ) : g_random->randomInt( 1, 4 );
	if (g_random->random01() < 0.25) db.desiredTLogCount = g_random->randomInt(1,7);
	if (g_random->random01() < 0.25) db.masterProxyCount = g_random->randomInt(1,7);
	if (g_random->random01() < 0.25) db.resolverCount = g_random->randomInt(1,7);
@@ -716,7 +716,7 @@ void SimulationConfig::generateNormalConfig(int minimumReplication) {
		set_config("memory");
	}

-	int replication_type = std::max(minimumReplication, std::min(g_random->randomInt(0,6), 3));
+	int replication_type = std::max(minimumReplication, datacenters > 4 ? g_random->randomInt(1,3) : std::min(g_random->randomInt(0,6), 3));
	switch (replication_type) {
	case 0: {
		TEST( true ); // Simulated cluster using custom redundancy mode
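A side note on the distribution being replaced: since g_random->randomInt(a,b) samples uniformly from [a,b), std::min(randomInt(0,6), 3) folds 3, 4, and 5 onto 3, so type 3 is chosen half the time, while the new datacenters > 4 branch limits six-DC clusters to types 1 and 2. A quick self-contained check of that folding, assuming only the uniform sampling just described:

#include <algorithm>
#include <cstdio>
#include <random>

int main() {
	std::mt19937 rng(42);
	auto randomInt = [&](int a, int b) { return std::uniform_int_distribution<>(a, b - 1)(rng); };
	int counts[4] = {0, 0, 0, 0};
	for (int i = 0; i < 600000; i++)
		counts[std::min(randomInt(0, 6), 3)]++; // 3, 4, 5 all land on type 3
	for (int t = 0; t < 4; t++)
		printf("replication_type %d: %.2f%%\n", t, 100.0 * counts[t] / 600000);
}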
@@ -787,6 +787,44 @@ void SimulationConfig::generateNormalConfig(int minimumReplication) {
		remoteSatelliteObj["satellite"] = 1;
		remoteDcArr.push_back(remoteSatelliteObj);

+		if(datacenters > 4) {
+			StatusObject primarySatelliteObjB;
+			primarySatelliteObjB["id"] = "4";
+			primarySatelliteObjB["priority"] = 1;
+			primarySatelliteObjB["satellite"] = 1;
+			primaryDcArr.push_back(primarySatelliteObjB);
+
+			StatusObject remoteSatelliteObjB;
+			remoteSatelliteObjB["id"] = "5";
+			remoteSatelliteObjB["priority"] = 1;
+			remoteSatelliteObjB["satellite"] = 1;
+			remoteDcArr.push_back(remoteSatelliteObjB);
+		}
+
+		if(datacenters > 4) {
+			//FIXME: we cannot use one satellite replication with more than one satellite per region because canKillProcesses does not respect usable_dcs
+			int satellite_replication_type = g_random->randomInt(0,3);
+			switch (satellite_replication_type) {
+			case 0: {
+				TEST( true ); // Simulated cluster using no satellite redundancy mode
+				break;
+			}
+			case 1: {
+				TEST( true ); // Simulated cluster using two satellite fast redundancy mode
+				primaryObj["satellite_redundancy_mode"] = "two_satellite_fast";
+				remoteObj["satellite_redundancy_mode"] = "two_satellite_fast";
+				break;
+			}
+			case 2: {
+				TEST( true ); // Simulated cluster using two satellite safe redundancy mode
+				primaryObj["satellite_redundancy_mode"] = "two_satellite_safe";
+				remoteObj["satellite_redundancy_mode"] = "two_satellite_safe";
+				break;
+			}
+			default:
+				ASSERT(false); // Programmer forgot to adjust cases.
+			}
+		} else {
		int satellite_replication_type = g_random->randomInt(0,5);
		switch (satellite_replication_type) {
		case 0: {
@@ -819,6 +857,7 @@ void SimulationConfig::generateNormalConfig(int minimumReplication) {
		default:
			ASSERT(false); // Programmer forgot to adjust cases.
		}
+		}

		if (g_random->random01() < 0.25) {
			int logs = g_random->randomInt(1,7);
@@ -826,7 +865,8 @@ void SimulationConfig::generateNormalConfig(int minimumReplication) {
		remoteObj["satellite_logs"] = logs;
	}

	if (g_random->random01() < 0.5) {
		//We cannot run with a remote DC when MAX_READ_TRANSACTION_LIFE_VERSIONS is too small, because the log routers will not be able to keep up.
		if (g_random->random01() < 0.25 || SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS < SERVER_KNOBS->VERSIONS_PER_SECOND) {
			TEST( true ); // Simulated cluster using one region
			needsRemote = false;
		} else {
@@ -834,7 +874,7 @@ void SimulationConfig::generateNormalConfig(int minimumReplication) {
			db.usableRegions = 2;
		}

-		int remote_replication_type = g_random->randomInt(0,5);
+		int remote_replication_type = g_random->randomInt(0, datacenters > 4 ? 4 : 5);
		switch (remote_replication_type) {
		case 0: {
			//FIXME: implement
@@ -940,12 +980,14 @@ void setupSimulatedSystem( vector<Future<Void>> *systemActors, std::string baseF
		g_simulator.primaryDcId = simconfig.db.regions[0].dcId;
		g_simulator.hasSatelliteReplication = simconfig.db.regions[0].satelliteTLogReplicationFactor > 0;
		if(simconfig.db.regions[0].satelliteTLogUsableDcsFallback > 0) {
			g_simulator.satelliteTLogPolicy = simconfig.db.regions[0].satelliteTLogPolicyFallback;
			g_simulator.satelliteTLogWriteAntiQuorum = simconfig.db.regions[0].satelliteTLogWriteAntiQuorumFallback;
			g_simulator.satelliteTLogPolicyFallback = simconfig.db.regions[0].satelliteTLogPolicyFallback;
			g_simulator.satelliteTLogWriteAntiQuorumFallback = simconfig.db.regions[0].satelliteTLogWriteAntiQuorumFallback;
		} else {
			g_simulator.satelliteTLogPolicyFallback = simconfig.db.regions[0].satelliteTLogPolicy;
			g_simulator.satelliteTLogWriteAntiQuorumFallback = simconfig.db.regions[0].satelliteTLogWriteAntiQuorum;
		}
		g_simulator.satelliteTLogPolicy = simconfig.db.regions[0].satelliteTLogPolicy;
		g_simulator.satelliteTLogWriteAntiQuorum = simconfig.db.regions[0].satelliteTLogWriteAntiQuorum;
	}

	for(auto s : simconfig.db.regions[0].satellites) {
		g_simulator.primarySatelliteDcIds.push_back(s.dcId);
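The fallback plumbing above is easier to see with the noise stripped away. One plausible reading, as a stand-in sketch with stub types (FDB's IRepPolicyRef and simulator globals are richer): when the region defines a fallback satellite configuration, the simulator records it so kill-safety checks can consult the weaker policy; otherwise the fallback slots simply mirror the primary satellite policy.

#include <string>

// Stub types; the real code uses IRepPolicyRef and int32_t anti-quorums.
struct Policy { std::string name; };
struct RegionConfig {
	int satelliteTLogUsableDcsFallback = 0;
	Policy satelliteTLogPolicy, satelliteTLogPolicyFallback;
	int satelliteTLogWriteAntiQuorum = 0, satelliteTLogWriteAntiQuorumFallback = 0;
};
struct Simulator {
	Policy satelliteTLogPolicy, satelliteTLogPolicyFallback;
	int satelliteTLogWriteAntiQuorum = 0, satelliteTLogWriteAntiQuorumFallback = 0;
};

void recordSatellitePolicies(Simulator& sim, const RegionConfig& region) {
	if (region.satelliteTLogUsableDcsFallback > 0) {
		// A distinct fallback mode exists; kill-safety checks must honor it.
		sim.satelliteTLogPolicyFallback = region.satelliteTLogPolicyFallback;
		sim.satelliteTLogWriteAntiQuorumFallback = region.satelliteTLogWriteAntiQuorumFallback;
	} else {
		// No fallback configured: mirror the primary satellite policy.
		sim.satelliteTLogPolicyFallback = region.satelliteTLogPolicy;
		sim.satelliteTLogWriteAntiQuorumFallback = region.satelliteTLogWriteAntiQuorum;
	}
	sim.satelliteTLogPolicy = region.satelliteTLogPolicy;
	sim.satelliteTLogWriteAntiQuorum = region.satelliteTLogWriteAntiQuorum;
}

int main() { Simulator s; recordSatellitePolicies(s, RegionConfig{}); }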
@@ -964,6 +1006,10 @@ void setupSimulatedSystem( vector<Future<Void>> *systemActors, std::string baseF
		}
	}

+	if(g_simulator.usableRegions < 2 || !g_simulator.hasSatelliteReplication) {
+		g_simulator.allowLogSetKills = false;
+	}
+
	ASSERT(g_simulator.storagePolicy && g_simulator.tLogPolicy);
	ASSERT(!g_simulator.hasSatelliteReplication || g_simulator.satelliteTLogPolicy);
	TraceEvent("SimulatorConfig").detail("ConfigString", printable(StringRef(startingConfigString)));
@@ -1741,6 +1741,7 @@ ACTOR Future<StatusReply> clusterGetStatus(
	state StatusObject data_overlay;

	statusObj["protocol_version"] = format("%llx", currentProtocolVersion);
+	statusObj["connection_string"] = coordinators.ccf->getConnectionString().toString();

	state Optional<DatabaseConfiguration> configuration;
	state Optional<bool> fullReplication;
@@ -379,8 +379,8 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {

	//only callable after getTagData returns a null reference
	Reference<TagData> createTagData(Tag tag, Version popped, bool nothingPersistent, bool poppedRecently, bool unpoppedRecovered) {
-		if(tag.locality != tagLocalityLogRouter && allTags.size() && !allTags.count(tag) && popped < recoveredAt) {
-			popped = recoveredAt;
+		if(tag.locality != tagLocalityLogRouter && allTags.size() && !allTags.count(tag) && popped <= recoveredAt) {
+			popped = recoveredAt + 1;
		}
		Reference<TagData> newTagData = Reference<TagData>( new TagData(tag, popped, nothingPersistent, poppedRecently, unpoppedRecovered) );
		int idx = tag.locality >= 0 ? 2*tag.locality : 1-(2*tag.locality);
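The two-character change above fixes an off-by-one: popped is the next version a tag still needs, so a tag created after recovery that was not among the recovered tags must start strictly after recoveredAt; starting at recoveredAt itself could re-deliver the final recovered version. A toy illustration of the boundary (hypothetical helper, not FDB code):

#include <cassert>
#include <cstdint>

using Version = int64_t;

// popped = first version the tag still needs. For a tag that did not exist
// at recovery time, anything at or before recoveredAt was already handled.
Version initialPopped(Version requested, Version recoveredAt, bool tagWasRecovered) {
	if (!tagWasRecovered && requested <= recoveredAt)
		return recoveredAt + 1; // old code used `< recoveredAt` and `recoveredAt`
	return requested;
}

int main() {
	const Version recoveredAt = 100;
	assert(initialPopped(100, recoveredAt, false) == 101); // the boundary the fix addresses
	assert(initialPopped(250, recoveredAt, false) == 250); // future requests unaffected
	assert(initialPopped(100, recoveredAt, true) == 100);  // recovered tags keep their value
}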
@@ -500,7 +500,6 @@ ACTOR Future<Standalone<CommitTransactionRef>> provisionalMaster( Reference<Mast
			when ( CommitTransactionRequest req = waitNext( parent->provisionalProxies[0].commit.getFuture() ) ) {
				req.reply.send(Never()); // don't reply (clients always get commit_unknown_result)
				auto t = &req.transaction;
-				TraceEvent("PM_CTC", parent->dbgid).detail("Snapshot", t->read_snapshot).detail("Now", parent->lastEpochEnd);
				if (t->read_snapshot == parent->lastEpochEnd && //< So no transactions can fall between the read snapshot and the recovery transaction this (might) be merged with
					// vvv and also the changes we will make in the recovery transaction (most notably to lastEpochEndKey) BEFORE we merge initialConfChanges won't conflict
					!std::any_of(t->read_conflict_ranges.begin(), t->read_conflict_ranges.end(), [](KeyRangeRef const& r){return r.contains(lastEpochEndKey);}))
@@ -641,7 +640,9 @@ ACTOR Future<Void> readTransactionSystemState( Reference<MasterData> self, Refer

	Standalone<VectorRef<KeyValueRef>> rawTags = wait( self->txnStateStore->readRange( serverTagKeys ) );
	self->allTags.clear();
	if(self->lastEpochEnd > 0) {
		self->allTags.push_back(txsTag);
	}
	for(auto& kv : rawTags) {
		self->allTags.push_back(decodeServerTagValue( kv.value ));
	}
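The rebuilt tag list above is what feeds the allTags.count(tag) check in createTagData: it collects every server tag from the transaction state store and, when recovering an existing database (lastEpochEnd > 0), seeds the list with txsTag. A stand-in sketch of that assembly with stub types (the real Tag and txsTag differ; the sentinel below is assumed):

#include <cstdint>
#include <vector>

// Stub tag; the real Tag carries a locality and an id.
struct Tag { int16_t locality = 0; uint16_t id = 0; };
const Tag txsTag{-2, 0}; // assumed sentinel, named after the real constant

std::vector<Tag> rebuildAllTags(bool recoveringExistingDatabase,
                                const std::vector<Tag>& decodedServerTags) {
	std::vector<Tag> allTags;
	if (recoveringExistingDatabase)        // lastEpochEnd > 0 in the code above
		allTags.push_back(txsTag);         // the transaction-state tag always exists
	for (const Tag& t : decodedServerTags) // one decodeServerTagValue(kv.value) per entry
		allTags.push_back(t);
	return allTags;
}

int main() { return rebuildAllTags(true, {}).size() == 1 ? 0 : 1; }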
@@ -147,6 +147,9 @@ struct ConfigureDatabaseWorkload : TestWorkload {
			//TraceEvent("ConfigureTestConfigureBegin").detail("NewConfig", newConfig);
			int redundancy = g_random->randomInt( 0, sizeof(redundancies)/sizeof(redundancies[0]));
			std::string config = redundancies[redundancy];
+			if(config == "triple" && g_simulator.physicalDatacenters > 4) {
+				config = "double";
+			}
			if(config == "triple" && g_simulator.physicalDatacenters == 3) {
				config = "three_data_hall";
			}
@@ -139,11 +139,11 @@ struct ConsistencyCheckWorkload : TestWorkload

	}

-	void testFailure(std::string message)
+	void testFailure(std::string message, bool isError = false)
	{
		success = false;

-		TraceEvent failEvent(failureIsError ? SevError : SevWarn, "TestFailure");
+		TraceEvent failEvent((failureIsError || isError) ? SevError : SevWarn, "TestFailure");
		if(performQuiescentChecks)
			failEvent.detail("Workload", "QuiescentCheck");
		else
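The new isError parameter lets an individual failure escalate to SevError even when the workload as a whole is not running in failureIsError mode; the two call sites changed below use it for genuine corruption signals. A compilable stand-in (stub severity and event, not the real TraceEvent class):

#include <cstdio>
#include <string>

enum Severity { SevWarn, SevError };

struct ConsistencyCheck {
	bool failureIsError = false; // workload-wide setting
	bool success = true;

	void testFailure(const std::string& message, bool isError = false) {
		success = false;
		Severity sev = (failureIsError || isError) ? SevError : SevWarn;
		printf("[%s] TestFailure: %s\n", sev == SevError ? "SevError" : "SevWarn",
		       message.c_str());
	}
};

int main() {
	ConsistencyCheck check;
	check.testFailure("Storage servers unavailable");          // SevWarn: may be transient
	check.testFailure("Data inconsistent", /*isError=*/true);  // always SevError
}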
@@ -430,7 +430,7 @@ struct ConsistencyCheckWorkload : TestWorkload
					{
						TraceEvent("ConsistencyCheck_InconsistentKeyServers").detail("StorageServer1", shards[i].second[firstValidStorageServer].id())
							.detail("StorageServer2", shards[i].second[j].id());
-						self->testFailure("Key servers inconsistent");
+						self->testFailure("Key servers inconsistent", true);
						return false;
					}
				}
@@ -834,7 +834,7 @@ struct ConsistencyCheckWorkload : TestWorkload
						.detail("ValueMismatchKey", printable(valueMismatchKey))
						.detail("MatchingKVPairs", matchingKVPairs);

-					self->testFailure("Data inconsistent");
+					self->testFailure("Data inconsistent", true);
					return false;
				}
			}
Diffs for three more files suppressed because one or more lines are too long.