diff --git a/documentation/StatusSchema.json b/documentation/StatusSchema.json index 5b25417ee9..47ae2229e9 100644 --- a/documentation/StatusSchema.json +++ b/documentation/StatusSchema.json @@ -284,6 +284,9 @@ "initializing_transaction_servers", "recovery_transaction", "writing_coordinated_state", + "accepting_commits", + "all_logs_recruited", + "storage_recovered", "fully_recovered" ] }, diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index 0d53c609ec..2dc8e322ad 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -735,7 +735,7 @@ void printStatus(StatusObjectReader statusObj, StatusClient::StatusLevel level, std::string description; if (recoveryState.get("name", name) && recoveryState.get("description", description) && - name != "fully_recovered" && name != "remote_recovered") + name != "accepting_commits" && name != "all_logs_recruited" && name != "storage_recovered" && name != "fully_recovered") { fatalRecoveryState = true; diff --git a/fdbclient/StatusClient.actor.cpp b/fdbclient/StatusClient.actor.cpp index 5ce2fa3ed2..594b10b00b 100644 --- a/fdbclient/StatusClient.actor.cpp +++ b/fdbclient/StatusClient.actor.cpp @@ -377,8 +377,9 @@ StatusObject getClientDatabaseStatus(StatusObjectReader client, StatusObjectRead try { // Lots of the JSON reads in this code could throw, and that's OK, isAvailable and isHealthy will be // at the states we want them to be in (currently) + std::string recoveryStateName = cluster.at("recovery_state.name").get_str(); isAvailable = client.at("coordinators.quorum_reachable").get_bool() - && ( cluster.at("recovery_state.name") == "fully_recovered" || "remote_recovered" ) + && ( recoveryStateName == "accepting_commits" || recoveryStateName == "all_logs_recruited" || recoveryStateName == "storage_recovered" || recoveryStateName == "fully_recovered" ) && cluster.at("database_available").get_bool(); if (isAvailable) diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index be4e2741a6..21543f417f 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -1117,7 +1117,7 @@ public: bool remoteSatelliteTLogsDead = satelliteTLogWriteAntiQuorum ? !validateAllCombinations(badCombo, remoteSatelliteProcessesDead, satelliteTLogPolicy, remoteSatelliteLocalitiesLeft, satelliteTLogWriteAntiQuorum, false) : remoteSatelliteProcessesDead.validate(satelliteTLogPolicy); notEnoughLeft = !primaryProcessesLeft.validate(tLogPolicy) || !primaryProcessesLeft.validate(storagePolicy) || !primarySatelliteProcessesLeft.validate(satelliteTLogPolicy) || !remoteProcessesLeft.validate(tLogPolicy) || !remoteProcessesLeft.validate(storagePolicy) || !remoteSatelliteProcessesLeft.validate(satelliteTLogPolicy); - if(usableRegions > 1) { + if(usableRegions > 1 && allowLogSetKills) { tooManyDead = ( primaryTLogsDead && primarySatelliteTLogsDead ) || ( remoteTLogsDead && remoteSatelliteTLogsDead ) || ( primaryTLogsDead && remoteTLogsDead ) || ( primaryProcessesDead.validate(storagePolicy) && remoteProcessesDead.validate(storagePolicy) ); } else { tooManyDead = primaryTLogsDead || remoteTLogsDead || primaryProcessesDead.validate(storagePolicy) || remoteProcessesDead.validate(storagePolicy); @@ -1266,7 +1266,7 @@ public: processesDead.push_back(processInfo); excluded++; } - else if (!processInfo->isCleared()) { + else if (processInfo->isCleared()) { processesDead.push_back(processInfo); cleared++; } diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index 64f0a788bb..bed0ea6a42 100644 --- a/fdbrpc/simulator.h +++ b/fdbrpc/simulator.h @@ -34,7 +34,7 @@ enum ClogMode { ClogDefault, ClogAll, ClogSend, ClogReceive }; class ISimulator : public INetwork { public: - ISimulator() : desiredCoordinators(1), physicalDatacenters(1), processesPerMachine(0), isStopped(false), lastConnectionFailure(0), connectionFailuresDisableDuration(0), speedUpSimulation(false), allSwapsDisabled(false), backupAgents(WaitForType), drAgents(WaitForType), extraDB(NULL) {} + ISimulator() : desiredCoordinators(1), physicalDatacenters(1), processesPerMachine(0), isStopped(false), lastConnectionFailure(0), connectionFailuresDisableDuration(0), speedUpSimulation(false), allSwapsDisabled(false), backupAgents(WaitForType), drAgents(WaitForType), extraDB(NULL), allowLogSetKills(true) {} // Order matters! enum KillType { KillInstantly, InjectFaults, RebootAndDelete, RebootProcessAndDelete, Reboot, RebootProcess, None }; @@ -77,8 +77,8 @@ public: bool isReliable() const { return !failed && fault_injection_p1 == 0 && fault_injection_p2 == 0; } bool isAvailable() const { return !isExcluded() && isReliable(); } - bool isExcluded() const { return !excluded; } - bool isCleared() const { return !cleared; } + bool isExcluded() const { return excluded; } + bool isCleared() const { return cleared; } // Returns true if the class represents an acceptable worker bool isAvailableClass() const { @@ -282,6 +282,7 @@ public: Optional> primaryDcId; IRepPolicyRef remoteTLogPolicy; int32_t usableRegions; + bool allowLogSetKills; Optional> remoteDcId; bool hasSatelliteReplication; IRepPolicyRef satelliteTLogPolicy; diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index 53c8ff2362..3e6364aa94 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -791,7 +791,7 @@ public: bool betterMasterExists() { ServerDBInfo dbi = db.serverInfo->get(); - if(dbi.recoveryState < RecoveryState::FULLY_RECOVERED) { + if(dbi.recoveryState < RecoveryState::ACCEPTING_COMMITS) { return false; } @@ -923,14 +923,14 @@ public: return false; RoleFitness oldRemoteTLogFit(remote_tlogs, ProcessClass::TLog); - RoleFitness newRemoteTLogFit((db.config.usableRegions > 1 && dbi.recoveryState == RecoveryState::REMOTE_RECOVERED) ? getWorkersForTlogs(db.config, db.config.getRemoteTLogReplicationFactor(), db.config.getDesiredRemoteLogs(), db.config.getRemoteTLogPolicy(), id_used, true, remoteDC) : remote_tlogs, ProcessClass::TLog); + RoleFitness newRemoteTLogFit((db.config.usableRegions > 1 && dbi.recoveryState == RecoveryState::FULLY_RECOVERED) ? getWorkersForTlogs(db.config, db.config.getRemoteTLogReplicationFactor(), db.config.getDesiredRemoteLogs(), db.config.getRemoteTLogPolicy(), id_used, true, remoteDC) : remote_tlogs, ProcessClass::TLog); if(oldRemoteTLogFit < newRemoteTLogFit) return false; int oldRouterCount = oldTLogFit.count * std::max(1, db.config.desiredLogRouterCount / std::max(1,oldTLogFit.count)); int newRouterCount = newTLogFit.count * std::max(1, db.config.desiredLogRouterCount / std::max(1,newTLogFit.count)); RoleFitness oldLogRoutersFit(log_routers, ProcessClass::LogRouter); - RoleFitness newLogRoutersFit((db.config.usableRegions > 1 && dbi.recoveryState == RecoveryState::REMOTE_RECOVERED) ? getWorkersForRoleInDatacenter( *remoteDC.begin(), ProcessClass::LogRouter, newRouterCount, db.config, id_used, Optional(), true ) : log_routers, ProcessClass::LogRouter); + RoleFitness newLogRoutersFit((db.config.usableRegions > 1 && dbi.recoveryState == RecoveryState::FULLY_RECOVERED) ? getWorkersForRoleInDatacenter( *remoteDC.begin(), ProcessClass::LogRouter, newRouterCount, db.config, id_used, Optional(), true ) : log_routers, ProcessClass::LogRouter); if(oldLogRoutersFit.count < oldRouterCount) { oldLogRoutersFit.worstFit = ProcessClass::NeverAssign; @@ -1555,7 +1555,7 @@ void clusterRegisterMaster( ClusterControllerData* self, RegisterMasterRequest c if ( req.configuration.present() ) { db->config = req.configuration.get(); - if ( req.recoveryState >= RecoveryState::FULLY_RECOVERED ) { + if ( req.recoveryState >= RecoveryState::ACCEPTING_COMMITS ) { self->gotFullyRecoveredConfig = true; db->fullyRecoveredConfig = req.configuration.get(); for ( auto& it : self->id_worker ) { @@ -2049,7 +2049,7 @@ ACTOR Future updateDatacenterVersionDifference( ClusterControllerData *sel state double lastLogTime = 0; loop { self->versionDifferenceUpdated = false; - if(self->db.serverInfo->get().recoveryState >= RecoveryState::FULLY_RECOVERED && self->db.config.usableRegions == 1) { + if(self->db.serverInfo->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS && self->db.config.usableRegions == 1) { self->versionDifferenceUpdated = true; self->datacenterVersionDifference = 0; Void _ = wait(self->db.serverInfo->onChange()); @@ -2058,7 +2058,7 @@ ACTOR Future updateDatacenterVersionDifference( ClusterControllerData *sel state Optional primaryLog; state Optional remoteLog; - if(self->db.serverInfo->get().recoveryState == RecoveryState::REMOTE_RECOVERED) { + if(self->db.serverInfo->get().recoveryState == RecoveryState::FULLY_RECOVERED) { for(auto& logSet : self->db.serverInfo->get().logSystemConfig.tLogs) { if(logSet.isLocal && logSet.locality != tagLocalitySatellite) { for(auto& tLog : logSet.tLogs) { @@ -2086,8 +2086,8 @@ ACTOR Future updateDatacenterVersionDifference( ClusterControllerData *sel state Future onChange = self->db.serverInfo->onChange(); loop { - state Future primaryMetrics = primaryLog.get().getQueuingMetrics.getReply( TLogQueuingMetricsRequest() ); - state Future remoteMetrics = remoteLog.get().getQueuingMetrics.getReply( TLogQueuingMetricsRequest() ); + state Future primaryMetrics = brokenPromiseToNever( primaryLog.get().getQueuingMetrics.getReply( TLogQueuingMetricsRequest() ) ); + state Future remoteMetrics = brokenPromiseToNever( remoteLog.get().getQueuingMetrics.getReply( TLogQueuingMetricsRequest() ) ); Void _ = wait( ( success(primaryMetrics) && success(remoteMetrics) ) || onChange ); if(onChange.isReady()) { diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp index a0f76fda5d..e45adc57b4 100644 --- a/fdbserver/LogRouter.actor.cpp +++ b/fdbserver/LogRouter.actor.cpp @@ -366,7 +366,7 @@ ACTOR Future logRouterCore( loop choose { when( Void _ = wait( dbInfoChange ) ) { dbInfoChange = db->onChange(); - logRouterData.allowPops = db->get().recoveryState == RecoveryState::REMOTE_RECOVERED; + logRouterData.allowPops = db->get().recoveryState == RecoveryState::FULLY_RECOVERED; logRouterData.logSystem->set(ILogSystem::fromServerDBInfo( logRouterData.dbgid, db->get(), true )); } when( TLogPeekRequest req = waitNext( interf.peekMessages.getFuture() ) ) { @@ -381,7 +381,7 @@ ACTOR Future logRouterCore( ACTOR Future checkRemoved(Reference> db, uint64_t recoveryCount, TLogInterface myInterface) { loop{ - bool isDisplaced = ( (db->get().recoveryCount > recoveryCount && db->get().recoveryState != RecoveryState::UNINITIALIZED) || (db->get().recoveryCount == recoveryCount && db->get().recoveryState == RecoveryState::REMOTE_RECOVERED) ); + bool isDisplaced = ( (db->get().recoveryCount > recoveryCount && db->get().recoveryState != RecoveryState::UNINITIALIZED) || (db->get().recoveryCount == recoveryCount && db->get().recoveryState == RecoveryState::FULLY_RECOVERED) ); if(isDisplaced) { for(auto& log : db->get().logSystemConfig.tLogs) { if( std::count( log.logRouters.begin(), log.logRouters.end(), myInterface.id() ) ) { diff --git a/fdbserver/LogSystem.h b/fdbserver/LogSystem.h index 55ae18c3c6..d26d7c8983 100644 --- a/fdbserver/LogSystem.h +++ b/fdbserver/LogSystem.h @@ -161,7 +161,18 @@ public: minUsed = std::min(minUsed, i); maxUsed = std::max(maxUsed, i); } - TraceEvent(((maxUsed - minUsed > 1) || (maxUsedBest - minUsedBest > 1)) ? (g_network->isSimulated() ? SevError : SevWarnAlways) : SevInfo, "CheckSatelliteTagLocations").detail("MinUsed", minUsed).detail("MaxUsed", maxUsed).detail("MinUsedBest", minUsedBest).detail("MaxUsedBest", maxUsedBest); + + bool foundDuplicate = false; + std::set> zones; + for(auto& loc : tLogLocalities) { + if(zones.count(loc.zoneId())) { + foundDuplicate = true; + break; + } + zones.insert(loc.zoneId()); + } + + TraceEvent(((maxUsed - minUsed > 1) || (maxUsedBest - minUsedBest > 1)) ? (g_network->isSimulated() && !foundDuplicate ? SevError : SevWarnAlways) : SevInfo, "CheckSatelliteTagLocations").detail("MinUsed", minUsed).detail("MaxUsed", maxUsed).detail("MinUsedBest", minUsedBest).detail("MaxUsedBest", maxUsedBest).detail("DuplicateZones", foundDuplicate); } int bestLocationFor( Tag tag ) { diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 79d009da91..e604bfe7fb 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -974,7 +974,7 @@ ACTOR static Future transactionStarter( otherProxies.push_back(mp); } - ASSERT(db->get().recoveryState >= RecoveryState::FULLY_RECOVERED); // else potentially we could return uncommitted read versions (since self->committedVersion is only a committed version if this recovery succeeds) + ASSERT(db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS); // else potentially we could return uncommitted read versions (since self->committedVersion is only a committed version if this recovery succeeds) TraceEvent("ProxyReadyForTxnStarts", proxy.id()); @@ -1247,7 +1247,7 @@ ACTOR Future masterProxyServerCore( const vector &trs = batchedRequests.first; int batchBytes = batchedRequests.second; //TraceEvent("MasterProxyCTR", proxy.id()).detail("CommitTransactions", trs.size()).detail("TransactionRate", transactionRate).detail("TransactionQueue", transactionQueue.size()).detail("ReleasedTransactionCount", transactionCount); - if (trs.size() || (db->get().recoveryState >= RecoveryState::FULLY_RECOVERED && now() - lastCommit >= SERVER_KNOBS->MAX_COMMIT_BATCH_INTERVAL)) { + if (trs.size() || (db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS && now() - lastCommit >= SERVER_KNOBS->MAX_COMMIT_BATCH_INTERVAL)) { lastCommit = now(); if (trs.size() || lastCommitComplete.isReady()) { diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 401c885d50..683d6c7269 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -286,11 +286,11 @@ ACTOR Future getStorageServersRecruiting( Database cx, Reference reconfigureAfter(Database cx, double time, Reference> dbInfo) { Void _ = wait( delay(time) ); - if(g_network->isSimulated()) { + if(g_network->isSimulated() && g_simulator.allowLogSetKills) { TraceEvent(SevWarnAlways, "DisablingFearlessConfiguration"); g_simulator.usableRegions = 1; ConfigurationResult::Type _ = wait( changeConfig( cx, "repopulate_anti_quorum=1" ) ); - while( dbInfo->get().recoveryState < RecoveryState::REMOTE_RECOVERED ) { + while( dbInfo->get().recoveryState < RecoveryState::STORAGE_RECOVERED ) { Void _ = wait( dbInfo->onChange() ); } ConfigurationResult::Type _ = wait( changeConfig( cx, "usable_regions=1" ) ); diff --git a/fdbserver/RecoveryState.h b/fdbserver/RecoveryState.h index 914c5e2559..9d710d2b1b 100644 --- a/fdbserver/RecoveryState.h +++ b/fdbserver/RecoveryState.h @@ -27,7 +27,7 @@ // RecoveryState and RecoveryStatus should probably be merged. The former is passed through ServerDBInfo and used for "real" decisions in the system; the latter // is slightly more detailed and is used by the status infrastructure. But I'm scared to make changes to the former so close to 1.0 release, so I'm making the latter. -enum class RecoveryState { UNINITIALIZED = 0, READING_CSTATE = 1, LOCKING_CSTATE = 2, RECRUITING = 3, RECOVERY_TRANSACTION = 4, WRITING_CSTATE = 5, FULLY_RECOVERED = 6, REMOTE_RECOVERED = 7 }; +enum class RecoveryState { UNINITIALIZED = 0, READING_CSTATE = 1, LOCKING_CSTATE = 2, RECRUITING = 3, RECOVERY_TRANSACTION = 4, WRITING_CSTATE = 5, ACCEPTING_COMMITS = 6, ALL_LOGS_RECRUITED = 7, STORAGE_RECOVERED = 8, FULLY_RECOVERED = 9 }; BINARY_SERIALIZABLE( RecoveryState ); namespace RecoveryStatus { @@ -43,8 +43,10 @@ namespace RecoveryStatus { initializing_transaction_servers, recovery_transaction, writing_coordinated_state, + accepting_commits, + all_logs_recruited, + storage_recovered, fully_recovered, - remote_recovered, END }; diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 680cc4e095..3ec215be06 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -37,7 +37,7 @@ const char* RecoveryStatus::names[] = { "reading_coordinated_state", "locking_coordinated_state", "locking_old_transaction_servers", "reading_transaction_system_state", "configuration_missing", "configuration_never_created", "configuration_invalid", "recruiting_transaction_servers", "initializing_transaction_servers", "recovery_transaction", - "writing_coordinated_state", "fully_recovered", "remote_recovered" + "writing_coordinated_state", "accepting_commits", "all_logs_recruited", "storage_recovered", "fully_recovered" }; static_assert( sizeof(RecoveryStatus::names) == sizeof(RecoveryStatus::names[0])*RecoveryStatus::END, "RecoveryStatus::names[] size" ); const char* RecoveryStatus::descriptions[] = { @@ -63,10 +63,14 @@ const char* RecoveryStatus::descriptions[] = { "Performing recovery transaction.", // writing_coordinated_state "Writing coordinated state. Verify that a majority of coordination server processes are active.", + // accepting_commits + "Accepting commits.", + // all_logs_recruited + "Accepting commits. All logs recruited.", + // storage_recovered + "Accepting commits. All storage servers are reading from the new logs.", // fully_recovered - "Recovery complete.", - // remote_recovered - "Remote recovery complete." + "Recovery complete." }; static_assert( sizeof(RecoveryStatus::descriptions) == sizeof(RecoveryStatus::descriptions[0])*RecoveryStatus::END, "RecoveryStatus::descriptions[] size" ); @@ -1396,7 +1400,7 @@ ACTOR static Future workloadStatusFetcher(Reference> db, std::unordered_map const& address_workers) { StatusArray oldTlogsArray; - if(db->get().recoveryState >= RecoveryState::FULLY_RECOVERED) { + if(db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS) { for(auto it : db->get().logSystemConfig.oldTLogs) { StatusObject statusObj; StatusArray logsObj; @@ -1780,7 +1784,7 @@ ACTOR Future clusterGetStatus( state std::vector workerStatuses = wait(getAll(futures2)); int oldLogFaultTolerance = 100; - if(db->get().recoveryState >= RecoveryState::FULLY_RECOVERED && db->get().logSystemConfig.oldTLogs.size() > 0) { + if(db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS && db->get().logSystemConfig.oldTLogs.size() > 0) { statusObj["old_logs"] = oldTlogFetcher(&oldLogFaultTolerance, db, address_workers); } diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index fda928622e..e6fc79709e 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1272,7 +1272,7 @@ ACTOR Future rejoinMasters( TLogData* self, TLogInterface tli, DBRecoveryC if(isPrimary) { isDisplaced = isDisplaced && inf.recoveryCount >= recoveryCount && inf.recoveryState != RecoveryState::UNINITIALIZED; } else { - isDisplaced = isDisplaced && ( ( inf.recoveryCount > recoveryCount && inf.recoveryState != RecoveryState::UNINITIALIZED ) || ( inf.recoveryCount == recoveryCount && inf.recoveryState == RecoveryState::REMOTE_RECOVERED ) ); + isDisplaced = isDisplaced && ( ( inf.recoveryCount > recoveryCount && inf.recoveryState != RecoveryState::UNINITIALIZED ) || ( inf.recoveryCount == recoveryCount && inf.recoveryState == RecoveryState::FULLY_RECOVERED ) ); } if(isDisplaced) { for(auto& log : inf.logSystemConfig.tLogs) { @@ -1383,7 +1383,7 @@ ACTOR Future serveTLogInterface( TLogData* self, TLogInterface tli, Refere when( Void _ = wait( dbInfoChange ) ) { dbInfoChange = self->dbInfo->onChange(); bool found = false; - if(self->dbInfo->get().recoveryState >= RecoveryState::FULLY_RECOVERED) { + if(self->dbInfo->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS) { for(auto& logs : self->dbInfo->get().logSystemConfig.tLogs) { if( std::count( logs.tLogs.begin(), logs.tLogs.end(), logData->logId ) ) { found = true; @@ -1871,7 +1871,7 @@ ACTOR Future updateLogSystem(TLogData* self, Reference logData, L logSystem->set(ILogSystem::fromLogSystemConfig( logData->logId, self->dbInfo->get().myLocality, self->dbInfo->get().logSystemConfig, false, true )); found = true; } - else if( self->dbInfo->get().recoveryState >= RecoveryState::FULLY_RECOVERED ) { + else if( self->dbInfo->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS ) { logSystem->set(ILogSystem::fromLogSystemConfig( logData->logId, self->dbInfo->get().myLocality, self->dbInfo->get().logSystemConfig, true )); found = true; } diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index f997c4f3e2..894d6e0429 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -1062,10 +1062,22 @@ ACTOR Future trackTlogRecovery( Reference self, ReferencerecoveryState = RecoveryState::REMOTE_RECOVERED; + self->recoveryState = RecoveryState::FULLY_RECOVERED; TraceEvent("MasterRecoveryState", self->dbgid) - .detail("StatusCode", RecoveryStatus::remote_recovered) - .detail("Status", RecoveryStatus::names[RecoveryStatus::remote_recovered]) + .detail("StatusCode", RecoveryStatus::fully_recovered) + .detail("Status", RecoveryStatus::names[RecoveryStatus::fully_recovered]) + .trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str()); + } else if( !newState.oldTLogData.size() && self->recoveryState < RecoveryState::STORAGE_RECOVERED ) { + self->recoveryState = RecoveryState::STORAGE_RECOVERED; + TraceEvent("MasterRecoveryState", self->dbgid) + .detail("StatusCode", RecoveryStatus::storage_recovered) + .detail("Status", RecoveryStatus::names[RecoveryStatus::storage_recovered]) + .trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str()); + } else if( allLogs && self->recoveryState < RecoveryState::ALL_LOGS_RECRUITED ) { + self->recoveryState = RecoveryState::ALL_LOGS_RECRUITED; + TraceEvent("MasterRecoveryState", self->dbgid) + .detail("StatusCode", RecoveryStatus::all_logs_recruited) + .detail("Status", RecoveryStatus::names[RecoveryStatus::all_logs_recruited]) .trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str()); } @@ -1093,12 +1105,16 @@ ACTOR Future configurationMonitor( Reference self ) { loop { try { tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - Standalone results = wait( tr.getRange( configKeys, CLIENT_KNOBS->TOO_MANY ) ); + Standalone results = wait( tr.getRange( configKeys, CLIENT_KNOBS->TOO_MANY ) ); ASSERT( !results.more && results.size() < CLIENT_KNOBS->TOO_MANY ); DatabaseConfiguration conf; conf.fromKeyValues((VectorRef) results); if(conf != self->configuration) { + if(self->recoveryState != RecoveryState::ALL_LOGS_RECRUITED && self->recoveryState != RecoveryState::FULLY_RECOVERED) { + throw master_recovery_failed(); + } + self->configuration = conf; self->registrationTrigger.trigger(); } @@ -1268,7 +1284,7 @@ ACTOR Future masterCore( Reference self ) { TraceEvent(recoveryInterval.end(), self->dbgid).detail("RecoveryTransactionVersion", self->recoveryTransactionVersion); - self->recoveryState = RecoveryState::FULLY_RECOVERED; + self->recoveryState = RecoveryState::ACCEPTING_COMMITS; double recoveryDuration = now() - recoverStartTime; TraceEvent((recoveryDuration > 4 && !g_network->isSimulated()) ? SevWarnAlways : SevInfo, "MasterRecoveryDuration", self->dbgid) @@ -1276,8 +1292,8 @@ ACTOR Future masterCore( Reference self ) { .trackLatest("MasterRecoveryDuration"); TraceEvent("MasterRecoveryState", self->dbgid) - .detail("StatusCode", RecoveryStatus::fully_recovered) - .detail("Status", RecoveryStatus::names[RecoveryStatus::fully_recovered]) + .detail("StatusCode", RecoveryStatus::accepting_commits) + .detail("Status", RecoveryStatus::names[RecoveryStatus::accepting_commits]) .detail("StoreType", self->configuration.storageServerStoreType) .detail("RecoveryDuration", recoveryDuration) .trackLatest("MasterRecoveryState"); diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index fc5056d593..3d4200e5b2 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -3188,7 +3188,7 @@ ACTOR Future storageServerCore( StorageServer* self, StorageServerInterfac when( Void _ = wait( dbInfoChange ) ) { TEST( self->logSystem ); // shardServer dbInfo changed dbInfoChange = self->db->onChange(); - if( self->db->get().recoveryState >= RecoveryState::FULLY_RECOVERED ) { + if( self->db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS ) { self->logSystem = ILogSystem::fromServerDBInfo( self->thisServerID, self->db->get() ); if (self->logSystem) { if(self->db->get().logSystemConfig.recoveredAt.present()) { diff --git a/fdbserver/workloads/RemoveServersSafely.actor.cpp b/fdbserver/workloads/RemoveServersSafely.actor.cpp index b4a0128f34..52349ddd80 100644 --- a/fdbserver/workloads/RemoveServersSafely.actor.cpp +++ b/fdbserver/workloads/RemoveServersSafely.actor.cpp @@ -53,6 +53,9 @@ struct RemoveServersSafelyWorkload : TestWorkload { kill1Timeout = getOption( options, LiteralStringRef("kill1Timeout"), 60.0 ); kill2Timeout = getOption( options, LiteralStringRef("kill2Timeout"), 6000.0 ); killProcesses = g_random->random01() < 0.5; + if(g_network->isSimulated()) { + g_simulator.allowLogSetKills = false; + } } virtual std::string description() { return "RemoveServersSafelyWorkload"; } @@ -231,7 +234,7 @@ struct RemoveServersSafelyWorkload : TestWorkload { for (auto processInfo : getServers()) { auto processNet = AddressExclusion(processInfo->address.ip, processInfo->address.port); // Mark all of the unavailable as dead - if (!processInfo->isAvailable()) + if (!processInfo->isAvailable() || processInfo->isCleared()) processesDead.push_back(processInfo); // Save all processes not specified within set else if (killAddrs.find(processNet) == killAddrs.end()) diff --git a/tests/fast/SidebandWithStatus.txt b/tests/fast/SidebandWithStatus.txt index 0656e0e777..3e96a0eeb0 100644 --- a/tests/fast/SidebandWithStatus.txt +++ b/tests/fast/SidebandWithStatus.txt @@ -5,7 +5,7 @@ testTitle=CloggedCausalConsistencyTest testName=Status testDuration=30.0 - schema={"cluster":{"layers":{"_valid":true,"_error":"some error description"},"datacenter_version_difference":0,"processes":{"$map":{"fault_domain":"0ccb4e0fdbdb5583010f6b77d9d10ece","class_source":{"$enum":["command_line","configure_auto","set_class"]},"class_type":{"$enum":["unset","storage","transaction","resolution","proxy","master","test"]},"roles":[{"query_queue_max":0,"data_lag":{"seconds":5.0,"versions":12341234},"input_bytes":{"hz":0.0,"counter":0,"roughness":0.0},"kvstore_used_bytes":12341234,"stored_bytes":12341234,"kvstore_free_bytes":12341234,"durable_bytes":{"hz":0.0,"counter":0,"roughness":0.0},"id":"eb84471d68c12d1d26f692a50000003f","data_version":12341234,"role":{"$enum":["master","proxy","log","storage","resolver","cluster_controller"]},"queue_disk_available_bytes":12341234,"kvstore_available_bytes":12341234,"queue_disk_total_bytes":12341234,"queue_disk_used_bytes":12341234,"queue_disk_free_bytes":12341234,"kvstore_total_bytes":12341234,"finished_queries":{"hz":0.0,"counter":0,"roughness":0.0}}],"locality":{"$map":"value"},"messages":[{"description":"abc","type":"x","name":{"$enum":["file_open_error","incorrect_cluster_file_contents","process_error","io_error","io_timeout","platform_error","storage_server_lagging","(other FDB error messages)"]},"raw_log_message":"","time":12345.12312}],"address":"1.2.3.4:1234","command_line":"-r simulation","disk":{"free_bytes":3451233456234,"reads":{"hz":0.0,"counter":0,"sectors":0},"busy":0.0,"writes":{"hz":0.0,"counter":0,"sectors":0},"total_bytes":123412341234},"version":"3.0.0","excluded":false,"memory":{"available_bytes":0,"unused_allocated_memory":0,"limit_bytes":0,"used_bytes":0},"machine_id":"0ccb4e0feddb5583010f6b77d9d10ece","uptime_seconds":1234.2345,"cpu":{"usage_cores":0.0},"network":{"megabits_sent":{"hz":0.0},"megabits_received":{"hz":0.0},"connections_closed":{"hz":0.0},"connection_errors":{"hz":0.0},"current_connections":0,"connections_established":{"hz":0.0}}}},"clients":{"count":1,"supported_versions":[{"count":1,"protocol_version":"fdb00a400050001","client_version":"3.0.0","source_version":"9430e1127b4991cbc5ab2b17f41cfffa5de07e9d","connected_clients":[{"log_group":"default","address":"127.0.0.1:9898"}]}]},"qos":{"limiting_version_lag_storage_server":0,"released_transactions_per_second":0,"transactions_per_second_limit":0,"limiting_queue_bytes_storage_server":0,"performance_limited_by":{"reason_server_id":"7f8d623d0cb9966e","description":"The database is not being saturated by the workload.","reason_id":0,"name":{"$enum":["workload","storage_server_write_queue_size","storage_server_write_bandwidth_mvcc","storage_server_readable_behind","log_server_mvcc_write_bandwidth","log_server_write_queue","storage_server_min_free_space","storage_server_min_free_space_ratio","log_server_min_free_space","log_server_min_free_space_ratio"]}},"worst_version_lag_storage_server":0,"worst_queue_bytes_log_server":460,"worst_queue_bytes_storage_server":0},"incompatible_connections":[],"full_replication":true,"database_locked":false,"generation":2,"data":{"least_operating_space_bytes_log_server":0,"average_partition_size_bytes":0,"state":{"healthy":true,"description":"","name":{"$enum":["initializing","missing_data","healing","healthy_repartitioning","healthy_removing_server","healthy_rebalancing","healthy"]},"min_replicas_remaining":0},"least_operating_space_ratio_storage_server":0.1,"max_machine_failures_without_losing_availability":0,"total_disk_used_bytes":0,"total_kv_size_bytes":0,"max_machine_failures_without_losing_data":0,"moving_data":{"in_queue_bytes":0,"total_written_bytes":0,"in_flight_bytes":0},"least_operating_space_bytes_storage_server":0,"partitions_count":2},"fault_tolerance":{"max_machine_failures_without_losing_availability":0,"max_machine_failures_without_losing_data":0},"messages":[{"reasons":[{"description":"Blah."}],"unreachable_processes":[{"address":"1.2.3.4:1234"}],"name":{"$enum":["unreachable_master_worker","unreadable_configuration","full_replication_timeout","client_issues","unreachable_processes","immediate_priority_transaction_start_probe_timeout","batch_priority_transaction_start_probe_timeout","transaction_start_probe_timeout","read_probe_timeout","commit_probe_timeout","storage_servers_error","status_incomplete","layer_status_incomplete","database_availability_timeout"]},"issues":[{"name":{"$enum":["incorrect_cluster_file_contents"]},"description":"Cluster file contents do not match current cluster connection string. Verify cluster file is writable and has not been overwritten externally."}],"description":"abc"}],"database_available":true,"recovery_state":{"required_proxies":1,"name":{"$enum":["reading_coordinated_state","locking_coordinated_state","locking_old_transaction_servers","reading_transaction_system_state","configuration_missing","configuration_never_created","configuration_invalid","recruiting_transaction_servers","initializing_transaction_servers","recovery_transaction","writing_coordinated_state","fully_recovered"]},"missing_logs":"7f8d623d0cb9966e","required_resolvers":1,"required_logs":3,"description":"Recovery complete."},"workload":{"operations":{"writes":{"hz":0.0,"counter":0,"roughness":0.0},"reads":{"hz":0.0,"counter":0,"roughness":0.0}},"keys":{"read":{"hz":0.0,"counter":0,"roughness":0.0}},"bytes":{"read":{"hz":0.0,"counter":0,"roughness":0.0},"written":{"hz":0.0,"counter":0,"roughness":0.0}},"transactions":{"started":{"hz":0.0,"counter":0,"roughness":0.0},"conflicted":{"hz":0.0,"counter":0,"roughness":0.0},"committed":{"hz":0.0,"counter":0,"roughness":0.0}}},"cluster_controller_timestamp":1415650089,"protocol_version":"fdb00a400050001","configuration":{"logs":2,"log_replicas":2,"storage_engine":{"$enum":["ssd","ssd-1","ssd-2","memory","custom"]},"excluded_servers":[{"address":"10.0.4.1"}],"remote_logs":5,"log_anti_quorum":0,"storage_replicas":1,"coordinators_count":1,"regions":[{"satellite_redundancy_mode":"one_satellite_single","satellite_anti_quorum":0,"satellite_usable_dcs":1,"datacenters":[{"priority":1,"satellite":1,"id":"mr"}],"satellite_log_policy":"(zoneid^3x1)","satellite_log_replicas":1,"satellite_logs":2}],"usable_regions":1,"redundancy_mode":"single","auto_logs":3,"proxies":5,"resolvers":1,"log_replication_policy":"(zoneid^3x1)","remote_redundancy_mode":"remote_single","repopulate_anti_quorum":1,"remote_log_replicas":3,"log_routers":10,"storage_replication_policy":"(zoneid^3x1)","auto_proxies":3,"auto_resolvers":1},"latency_probe":{"immediate_priority_transaction_start_seconds":0.0,"transaction_start_seconds":0.0,"batch_priority_transaction_start_seconds":0.0,"read_seconds":7,"commit_seconds":0.02},"machines":{"$map":{"network":{"megabits_sent":{"hz":0.0},"megabits_received":{"hz":0.0},"tcp_segments_retransmitted":{"hz":0.0}},"locality":{"$map":"value"},"memory":{"free_bytes":0,"committed_bytes":0,"total_bytes":0},"contributing_workers":4,"datacenter_id":"6344abf1813eb05b","excluded":false,"address":"1.2.3.4","machine_id":"6344abf1813eb05b","cpu":{"logical_core_utilization":0.4}}},"old_logs":[{"satellite_log_fault_tolerance":2,"logs":[{"healthy":true,"id":"7f8d623d0cb9966e","address":"1.2.3.4:1234"}],"satellite_log_write_anti_quorum":0,"remote_log_fault_tolerance":2,"log_fault_tolerance":2,"log_write_anti_quorum":0,"satellite_log_replication_factor":3,"remote_log_replication_factor":3,"log_replication_factor":3}]},"client":{"coordinators":{"coordinators":[{"reachable":true,"address":"127.0.0.1:4701"}],"quorum_reachable":true},"cluster_file":{"path":"/etc/foundationdb/fdb.cluster","up_to_date":true},"messages":[{"name":{"$enum":["inconsistent_cluster_file","unreachable_cluster_controller","no_cluster_controller","status_incomplete_client","status_incomplete_coordinators","status_incomplete_error","status_incomplete_timeout","status_incomplete_cluster","quorum_not_reachable"]},"description":"The cluster file is not up to date."}],"timestamp":1415650089,"database_status":{"available":true,"healthy":true}}} + schema={"cluster":{"layers":{"_valid":true,"_error":"some error description"},"datacenter_version_difference":0,"processes":{"$map":{"fault_domain":"0ccb4e0fdbdb5583010f6b77d9d10ece","class_source":{"$enum":["command_line","configure_auto","set_class"]},"class_type":{"$enum":["unset","storage","transaction","resolution","proxy","master","test"]},"roles":[{"query_queue_max":0,"data_lag":{"seconds":5.0,"versions":12341234},"input_bytes":{"hz":0.0,"counter":0,"roughness":0.0},"kvstore_used_bytes":12341234,"stored_bytes":12341234,"kvstore_free_bytes":12341234,"durable_bytes":{"hz":0.0,"counter":0,"roughness":0.0},"id":"eb84471d68c12d1d26f692a50000003f","data_version":12341234,"role":{"$enum":["master","proxy","log","storage","resolver","cluster_controller"]},"queue_disk_available_bytes":12341234,"kvstore_available_bytes":12341234,"queue_disk_total_bytes":12341234,"queue_disk_used_bytes":12341234,"queue_disk_free_bytes":12341234,"kvstore_total_bytes":12341234,"finished_queries":{"hz":0.0,"counter":0,"roughness":0.0}}],"locality":{"$map":"value"},"messages":[{"description":"abc","type":"x","name":{"$enum":["file_open_error","incorrect_cluster_file_contents","process_error","io_error","io_timeout","platform_error","storage_server_lagging","(other FDB error messages)"]},"raw_log_message":"","time":12345.12312}],"address":"1.2.3.4:1234","command_line":"-r simulation","disk":{"free_bytes":3451233456234,"reads":{"hz":0.0,"counter":0,"sectors":0},"busy":0.0,"writes":{"hz":0.0,"counter":0,"sectors":0},"total_bytes":123412341234},"version":"3.0.0","excluded":false,"memory":{"available_bytes":0,"unused_allocated_memory":0,"limit_bytes":0,"used_bytes":0},"machine_id":"0ccb4e0feddb5583010f6b77d9d10ece","uptime_seconds":1234.2345,"cpu":{"usage_cores":0.0},"network":{"megabits_sent":{"hz":0.0},"megabits_received":{"hz":0.0},"connections_closed":{"hz":0.0},"connection_errors":{"hz":0.0},"current_connections":0,"connections_established":{"hz":0.0}}}},"clients":{"count":1,"supported_versions":[{"count":1,"protocol_version":"fdb00a400050001","client_version":"3.0.0","source_version":"9430e1127b4991cbc5ab2b17f41cfffa5de07e9d","connected_clients":[{"log_group":"default","address":"127.0.0.1:9898"}]}]},"qos":{"limiting_version_lag_storage_server":0,"released_transactions_per_second":0,"transactions_per_second_limit":0,"limiting_queue_bytes_storage_server":0,"performance_limited_by":{"reason_server_id":"7f8d623d0cb9966e","description":"The database is not being saturated by the workload.","reason_id":0,"name":{"$enum":["workload","storage_server_write_queue_size","storage_server_write_bandwidth_mvcc","storage_server_readable_behind","log_server_mvcc_write_bandwidth","log_server_write_queue","storage_server_min_free_space","storage_server_min_free_space_ratio","log_server_min_free_space","log_server_min_free_space_ratio"]}},"worst_version_lag_storage_server":0,"worst_queue_bytes_log_server":460,"worst_queue_bytes_storage_server":0},"incompatible_connections":[],"full_replication":true,"database_locked":false,"generation":2,"data":{"least_operating_space_bytes_log_server":0,"average_partition_size_bytes":0,"state":{"healthy":true,"description":"","name":{"$enum":["initializing","missing_data","healing","healthy_repartitioning","healthy_removing_server","healthy_rebalancing","healthy"]},"min_replicas_remaining":0},"least_operating_space_ratio_storage_server":0.1,"max_machine_failures_without_losing_availability":0,"total_disk_used_bytes":0,"total_kv_size_bytes":0,"max_machine_failures_without_losing_data":0,"moving_data":{"in_queue_bytes":0,"total_written_bytes":0,"in_flight_bytes":0},"least_operating_space_bytes_storage_server":0,"partitions_count":2},"fault_tolerance":{"max_machine_failures_without_losing_availability":0,"max_machine_failures_without_losing_data":0},"messages":[{"reasons":[{"description":"Blah."}],"unreachable_processes":[{"address":"1.2.3.4:1234"}],"name":{"$enum":["unreachable_master_worker","unreadable_configuration","full_replication_timeout","client_issues","unreachable_processes","immediate_priority_transaction_start_probe_timeout","batch_priority_transaction_start_probe_timeout","transaction_start_probe_timeout","read_probe_timeout","commit_probe_timeout","storage_servers_error","status_incomplete","layer_status_incomplete","database_availability_timeout"]},"issues":[{"name":{"$enum":["incorrect_cluster_file_contents"]},"description":"Cluster file contents do not match current cluster connection string. Verify cluster file is writable and has not been overwritten externally."}],"description":"abc"}],"database_available":true,"recovery_state":{"required_proxies":1,"name":{"$enum":["reading_coordinated_state","locking_coordinated_state","locking_old_transaction_servers","reading_transaction_system_state","configuration_missing","configuration_never_created","configuration_invalid","recruiting_transaction_servers","initializing_transaction_servers","recovery_transaction","writing_coordinated_state","accepting_commits","all_logs_recruited","storage_recovered","fully_recovered"]},"missing_logs":"7f8d623d0cb9966e","required_resolvers":1,"required_logs":3,"description":"Recovery complete."},"workload":{"operations":{"writes":{"hz":0.0,"counter":0,"roughness":0.0},"reads":{"hz":0.0,"counter":0,"roughness":0.0}},"keys":{"read":{"hz":0.0,"counter":0,"roughness":0.0}},"bytes":{"read":{"hz":0.0,"counter":0,"roughness":0.0},"written":{"hz":0.0,"counter":0,"roughness":0.0}},"transactions":{"started":{"hz":0.0,"counter":0,"roughness":0.0},"conflicted":{"hz":0.0,"counter":0,"roughness":0.0},"committed":{"hz":0.0,"counter":0,"roughness":0.0}}},"cluster_controller_timestamp":1415650089,"protocol_version":"fdb00a400050001","configuration":{"logs":2,"log_replicas":2,"storage_engine":{"$enum":["ssd","ssd-1","ssd-2","memory","custom"]},"excluded_servers":[{"address":"10.0.4.1"}],"remote_logs":5,"log_anti_quorum":0,"storage_replicas":1,"coordinators_count":1,"regions":[{"satellite_redundancy_mode":"one_satellite_single","satellite_anti_quorum":0,"satellite_usable_dcs":1,"datacenters":[{"priority":1,"satellite":1,"id":"mr"}],"satellite_log_policy":"(zoneid^3x1)","satellite_log_replicas":1,"satellite_logs":2}],"usable_regions":1,"redundancy_mode":"single","auto_logs":3,"proxies":5,"resolvers":1,"log_replication_policy":"(zoneid^3x1)","remote_redundancy_mode":"remote_single","repopulate_anti_quorum":1,"remote_log_replicas":3,"log_routers":10,"storage_replication_policy":"(zoneid^3x1)","auto_proxies":3,"auto_resolvers":1},"latency_probe":{"immediate_priority_transaction_start_seconds":0.0,"transaction_start_seconds":0.0,"batch_priority_transaction_start_seconds":0.0,"read_seconds":7,"commit_seconds":0.02},"machines":{"$map":{"network":{"megabits_sent":{"hz":0.0},"megabits_received":{"hz":0.0},"tcp_segments_retransmitted":{"hz":0.0}},"locality":{"$map":"value"},"memory":{"free_bytes":0,"committed_bytes":0,"total_bytes":0},"contributing_workers":4,"datacenter_id":"6344abf1813eb05b","excluded":false,"address":"1.2.3.4","machine_id":"6344abf1813eb05b","cpu":{"logical_core_utilization":0.4}}},"old_logs":[{"satellite_log_fault_tolerance":2,"logs":[{"healthy":true,"id":"7f8d623d0cb9966e","address":"1.2.3.4:1234"}],"satellite_log_write_anti_quorum":0,"remote_log_fault_tolerance":2,"log_fault_tolerance":2,"log_write_anti_quorum":0,"satellite_log_replication_factor":3,"remote_log_replication_factor":3,"log_replication_factor":3}]},"client":{"coordinators":{"coordinators":[{"reachable":true,"address":"127.0.0.1:4701"}],"quorum_reachable":true},"cluster_file":{"path":"/etc/foundationdb/fdb.cluster","up_to_date":true},"messages":[{"name":{"$enum":["inconsistent_cluster_file","unreachable_cluster_controller","no_cluster_controller","status_incomplete_client","status_incomplete_coordinators","status_incomplete_error","status_incomplete_timeout","status_incomplete_cluster","quorum_not_reachable"]},"description":"The cluster file is not up to date."}],"timestamp":1415650089,"database_status":{"available":true,"healthy":true}}} testName=RandomClogging testDuration=30.0 diff --git a/tests/rare/LargeApiCorrectnessStatus.txt b/tests/rare/LargeApiCorrectnessStatus.txt index 32972c0c4c..ca9a8e5df4 100644 --- a/tests/rare/LargeApiCorrectnessStatus.txt +++ b/tests/rare/LargeApiCorrectnessStatus.txt @@ -24,5 +24,5 @@ testTitle=ApiCorrectnessTest testName=Status testDuration=30.0 - schema={"cluster":{"layers":{"_valid":true,"_error":"some error description"},"datacenter_version_difference":0,"processes":{"$map":{"fault_domain":"0ccb4e0fdbdb5583010f6b77d9d10ece","class_source":{"$enum":["command_line","configure_auto","set_class"]},"class_type":{"$enum":["unset","storage","transaction","resolution","proxy","master","test"]},"roles":[{"query_queue_max":0,"data_lag":{"seconds":5.0,"versions":12341234},"input_bytes":{"hz":0.0,"counter":0,"roughness":0.0},"kvstore_used_bytes":12341234,"stored_bytes":12341234,"kvstore_free_bytes":12341234,"durable_bytes":{"hz":0.0,"counter":0,"roughness":0.0},"id":"eb84471d68c12d1d26f692a50000003f","data_version":12341234,"role":{"$enum":["master","proxy","log","storage","resolver","cluster_controller"]},"queue_disk_available_bytes":12341234,"kvstore_available_bytes":12341234,"queue_disk_total_bytes":12341234,"queue_disk_used_bytes":12341234,"queue_disk_free_bytes":12341234,"kvstore_total_bytes":12341234,"finished_queries":{"hz":0.0,"counter":0,"roughness":0.0}}],"locality":{"$map":"value"},"messages":[{"description":"abc","type":"x","name":{"$enum":["file_open_error","incorrect_cluster_file_contents","process_error","io_error","io_timeout","platform_error","storage_server_lagging","(other FDB error messages)"]},"raw_log_message":"","time":12345.12312}],"address":"1.2.3.4:1234","command_line":"-r simulation","disk":{"free_bytes":3451233456234,"reads":{"hz":0.0,"counter":0,"sectors":0},"busy":0.0,"writes":{"hz":0.0,"counter":0,"sectors":0},"total_bytes":123412341234},"version":"3.0.0","excluded":false,"memory":{"available_bytes":0,"unused_allocated_memory":0,"limit_bytes":0,"used_bytes":0},"machine_id":"0ccb4e0feddb5583010f6b77d9d10ece","uptime_seconds":1234.2345,"cpu":{"usage_cores":0.0},"network":{"megabits_sent":{"hz":0.0},"megabits_received":{"hz":0.0},"connections_closed":{"hz":0.0},"connection_errors":{"hz":0.0},"current_connections":0,"connections_established":{"hz":0.0}}}},"clients":{"count":1,"supported_versions":[{"count":1,"protocol_version":"fdb00a400050001","client_version":"3.0.0","source_version":"9430e1127b4991cbc5ab2b17f41cfffa5de07e9d","connected_clients":[{"log_group":"default","address":"127.0.0.1:9898"}]}]},"qos":{"limiting_version_lag_storage_server":0,"released_transactions_per_second":0,"transactions_per_second_limit":0,"limiting_queue_bytes_storage_server":0,"performance_limited_by":{"reason_server_id":"7f8d623d0cb9966e","description":"The database is not being saturated by the workload.","reason_id":0,"name":{"$enum":["workload","storage_server_write_queue_size","storage_server_write_bandwidth_mvcc","storage_server_readable_behind","log_server_mvcc_write_bandwidth","log_server_write_queue","storage_server_min_free_space","storage_server_min_free_space_ratio","log_server_min_free_space","log_server_min_free_space_ratio"]}},"worst_version_lag_storage_server":0,"worst_queue_bytes_log_server":460,"worst_queue_bytes_storage_server":0},"incompatible_connections":[],"full_replication":true,"database_locked":false,"generation":2,"data":{"least_operating_space_bytes_log_server":0,"average_partition_size_bytes":0,"state":{"healthy":true,"description":"","name":{"$enum":["initializing","missing_data","healing","healthy_repartitioning","healthy_removing_server","healthy_rebalancing","healthy"]},"min_replicas_remaining":0},"least_operating_space_ratio_storage_server":0.1,"max_machine_failures_without_losing_availability":0,"total_disk_used_bytes":0,"total_kv_size_bytes":0,"max_machine_failures_without_losing_data":0,"moving_data":{"in_queue_bytes":0,"total_written_bytes":0,"in_flight_bytes":0},"least_operating_space_bytes_storage_server":0,"partitions_count":2},"fault_tolerance":{"max_machine_failures_without_losing_availability":0,"max_machine_failures_without_losing_data":0},"messages":[{"reasons":[{"description":"Blah."}],"unreachable_processes":[{"address":"1.2.3.4:1234"}],"name":{"$enum":["unreachable_master_worker","unreadable_configuration","full_replication_timeout","client_issues","unreachable_processes","immediate_priority_transaction_start_probe_timeout","batch_priority_transaction_start_probe_timeout","transaction_start_probe_timeout","read_probe_timeout","commit_probe_timeout","storage_servers_error","status_incomplete","layer_status_incomplete","database_availability_timeout"]},"issues":[{"name":{"$enum":["incorrect_cluster_file_contents"]},"description":"Cluster file contents do not match current cluster connection string. Verify cluster file is writable and has not been overwritten externally."}],"description":"abc"}],"database_available":true,"recovery_state":{"required_proxies":1,"name":{"$enum":["reading_coordinated_state","locking_coordinated_state","locking_old_transaction_servers","reading_transaction_system_state","configuration_missing","configuration_never_created","configuration_invalid","recruiting_transaction_servers","initializing_transaction_servers","recovery_transaction","writing_coordinated_state","fully_recovered"]},"missing_logs":"7f8d623d0cb9966e","required_resolvers":1,"required_logs":3,"description":"Recovery complete."},"workload":{"operations":{"writes":{"hz":0.0,"counter":0,"roughness":0.0},"reads":{"hz":0.0,"counter":0,"roughness":0.0}},"keys":{"read":{"hz":0.0,"counter":0,"roughness":0.0}},"bytes":{"read":{"hz":0.0,"counter":0,"roughness":0.0},"written":{"hz":0.0,"counter":0,"roughness":0.0}},"transactions":{"started":{"hz":0.0,"counter":0,"roughness":0.0},"conflicted":{"hz":0.0,"counter":0,"roughness":0.0},"committed":{"hz":0.0,"counter":0,"roughness":0.0}}},"cluster_controller_timestamp":1415650089,"protocol_version":"fdb00a400050001","configuration":{"logs":2,"log_replicas":2,"storage_engine":{"$enum":["ssd","ssd-1","ssd-2","memory","custom"]},"excluded_servers":[{"address":"10.0.4.1"}],"remote_logs":5,"log_anti_quorum":0,"storage_replicas":1,"coordinators_count":1,"regions":[{"satellite_redundancy_mode":"one_satellite_single","satellite_anti_quorum":0,"satellite_usable_dcs":1,"datacenters":[{"priority":1,"satellite":1,"id":"mr"}],"satellite_log_policy":"(zoneid^3x1)","satellite_log_replicas":1,"satellite_logs":2}],"usable_regions":1,"redundancy_mode":"single","auto_logs":3,"proxies":5,"resolvers":1,"log_replication_policy":"(zoneid^3x1)","remote_redundancy_mode":"remote_single","repopulate_anti_quorum":1,"remote_log_replicas":3,"log_routers":10,"storage_replication_policy":"(zoneid^3x1)","auto_proxies":3,"auto_resolvers":1},"latency_probe":{"immediate_priority_transaction_start_seconds":0.0,"transaction_start_seconds":0.0,"batch_priority_transaction_start_seconds":0.0,"read_seconds":7,"commit_seconds":0.02},"machines":{"$map":{"network":{"megabits_sent":{"hz":0.0},"megabits_received":{"hz":0.0},"tcp_segments_retransmitted":{"hz":0.0}},"locality":{"$map":"value"},"memory":{"free_bytes":0,"committed_bytes":0,"total_bytes":0},"contributing_workers":4,"datacenter_id":"6344abf1813eb05b","excluded":false,"address":"1.2.3.4","machine_id":"6344abf1813eb05b","cpu":{"logical_core_utilization":0.4}}},"old_logs":[{"satellite_log_fault_tolerance":2,"logs":[{"healthy":true,"id":"7f8d623d0cb9966e","address":"1.2.3.4:1234"}],"satellite_log_write_anti_quorum":0,"remote_log_fault_tolerance":2,"log_fault_tolerance":2,"log_write_anti_quorum":0,"satellite_log_replication_factor":3,"remote_log_replication_factor":3,"log_replication_factor":3}]},"client":{"coordinators":{"coordinators":[{"reachable":true,"address":"127.0.0.1:4701"}],"quorum_reachable":true},"cluster_file":{"path":"/etc/foundationdb/fdb.cluster","up_to_date":true},"messages":[{"name":{"$enum":["inconsistent_cluster_file","unreachable_cluster_controller","no_cluster_controller","status_incomplete_client","status_incomplete_coordinators","status_incomplete_error","status_incomplete_timeout","status_incomplete_cluster","quorum_not_reachable"]},"description":"The cluster file is not up to date."}],"timestamp":1415650089,"database_status":{"available":true,"healthy":true}}} + schema={"cluster":{"layers":{"_valid":true,"_error":"some error description"},"datacenter_version_difference":0,"processes":{"$map":{"fault_domain":"0ccb4e0fdbdb5583010f6b77d9d10ece","class_source":{"$enum":["command_line","configure_auto","set_class"]},"class_type":{"$enum":["unset","storage","transaction","resolution","proxy","master","test"]},"roles":[{"query_queue_max":0,"data_lag":{"seconds":5.0,"versions":12341234},"input_bytes":{"hz":0.0,"counter":0,"roughness":0.0},"kvstore_used_bytes":12341234,"stored_bytes":12341234,"kvstore_free_bytes":12341234,"durable_bytes":{"hz":0.0,"counter":0,"roughness":0.0},"id":"eb84471d68c12d1d26f692a50000003f","data_version":12341234,"role":{"$enum":["master","proxy","log","storage","resolver","cluster_controller"]},"queue_disk_available_bytes":12341234,"kvstore_available_bytes":12341234,"queue_disk_total_bytes":12341234,"queue_disk_used_bytes":12341234,"queue_disk_free_bytes":12341234,"kvstore_total_bytes":12341234,"finished_queries":{"hz":0.0,"counter":0,"roughness":0.0}}],"locality":{"$map":"value"},"messages":[{"description":"abc","type":"x","name":{"$enum":["file_open_error","incorrect_cluster_file_contents","process_error","io_error","io_timeout","platform_error","storage_server_lagging","(other FDB error messages)"]},"raw_log_message":"","time":12345.12312}],"address":"1.2.3.4:1234","command_line":"-r simulation","disk":{"free_bytes":3451233456234,"reads":{"hz":0.0,"counter":0,"sectors":0},"busy":0.0,"writes":{"hz":0.0,"counter":0,"sectors":0},"total_bytes":123412341234},"version":"3.0.0","excluded":false,"memory":{"available_bytes":0,"unused_allocated_memory":0,"limit_bytes":0,"used_bytes":0},"machine_id":"0ccb4e0feddb5583010f6b77d9d10ece","uptime_seconds":1234.2345,"cpu":{"usage_cores":0.0},"network":{"megabits_sent":{"hz":0.0},"megabits_received":{"hz":0.0},"connections_closed":{"hz":0.0},"connection_errors":{"hz":0.0},"current_connections":0,"connections_established":{"hz":0.0}}}},"clients":{"count":1,"supported_versions":[{"count":1,"protocol_version":"fdb00a400050001","client_version":"3.0.0","source_version":"9430e1127b4991cbc5ab2b17f41cfffa5de07e9d","connected_clients":[{"log_group":"default","address":"127.0.0.1:9898"}]}]},"qos":{"limiting_version_lag_storage_server":0,"released_transactions_per_second":0,"transactions_per_second_limit":0,"limiting_queue_bytes_storage_server":0,"performance_limited_by":{"reason_server_id":"7f8d623d0cb9966e","description":"The database is not being saturated by the workload.","reason_id":0,"name":{"$enum":["workload","storage_server_write_queue_size","storage_server_write_bandwidth_mvcc","storage_server_readable_behind","log_server_mvcc_write_bandwidth","log_server_write_queue","storage_server_min_free_space","storage_server_min_free_space_ratio","log_server_min_free_space","log_server_min_free_space_ratio"]}},"worst_version_lag_storage_server":0,"worst_queue_bytes_log_server":460,"worst_queue_bytes_storage_server":0},"incompatible_connections":[],"full_replication":true,"database_locked":false,"generation":2,"data":{"least_operating_space_bytes_log_server":0,"average_partition_size_bytes":0,"state":{"healthy":true,"description":"","name":{"$enum":["initializing","missing_data","healing","healthy_repartitioning","healthy_removing_server","healthy_rebalancing","healthy"]},"min_replicas_remaining":0},"least_operating_space_ratio_storage_server":0.1,"max_machine_failures_without_losing_availability":0,"total_disk_used_bytes":0,"total_kv_size_bytes":0,"max_machine_failures_without_losing_data":0,"moving_data":{"in_queue_bytes":0,"total_written_bytes":0,"in_flight_bytes":0},"least_operating_space_bytes_storage_server":0,"partitions_count":2},"fault_tolerance":{"max_machine_failures_without_losing_availability":0,"max_machine_failures_without_losing_data":0},"messages":[{"reasons":[{"description":"Blah."}],"unreachable_processes":[{"address":"1.2.3.4:1234"}],"name":{"$enum":["unreachable_master_worker","unreadable_configuration","full_replication_timeout","client_issues","unreachable_processes","immediate_priority_transaction_start_probe_timeout","batch_priority_transaction_start_probe_timeout","transaction_start_probe_timeout","read_probe_timeout","commit_probe_timeout","storage_servers_error","status_incomplete","layer_status_incomplete","database_availability_timeout"]},"issues":[{"name":{"$enum":["incorrect_cluster_file_contents"]},"description":"Cluster file contents do not match current cluster connection string. Verify cluster file is writable and has not been overwritten externally."}],"description":"abc"}],"database_available":true,"recovery_state":{"required_proxies":1,"name":{"$enum":["reading_coordinated_state","locking_coordinated_state","locking_old_transaction_servers","reading_transaction_system_state","configuration_missing","configuration_never_created","configuration_invalid","recruiting_transaction_servers","initializing_transaction_servers","recovery_transaction","writing_coordinated_state","accepting_commits","all_logs_recruited","storage_recovered","fully_recovered"]},"missing_logs":"7f8d623d0cb9966e","required_resolvers":1,"required_logs":3,"description":"Recovery complete."},"workload":{"operations":{"writes":{"hz":0.0,"counter":0,"roughness":0.0},"reads":{"hz":0.0,"counter":0,"roughness":0.0}},"keys":{"read":{"hz":0.0,"counter":0,"roughness":0.0}},"bytes":{"read":{"hz":0.0,"counter":0,"roughness":0.0},"written":{"hz":0.0,"counter":0,"roughness":0.0}},"transactions":{"started":{"hz":0.0,"counter":0,"roughness":0.0},"conflicted":{"hz":0.0,"counter":0,"roughness":0.0},"committed":{"hz":0.0,"counter":0,"roughness":0.0}}},"cluster_controller_timestamp":1415650089,"protocol_version":"fdb00a400050001","configuration":{"logs":2,"log_replicas":2,"storage_engine":{"$enum":["ssd","ssd-1","ssd-2","memory","custom"]},"excluded_servers":[{"address":"10.0.4.1"}],"remote_logs":5,"log_anti_quorum":0,"storage_replicas":1,"coordinators_count":1,"regions":[{"satellite_redundancy_mode":"one_satellite_single","satellite_anti_quorum":0,"satellite_usable_dcs":1,"datacenters":[{"priority":1,"satellite":1,"id":"mr"}],"satellite_log_policy":"(zoneid^3x1)","satellite_log_replicas":1,"satellite_logs":2}],"usable_regions":1,"redundancy_mode":"single","auto_logs":3,"proxies":5,"resolvers":1,"log_replication_policy":"(zoneid^3x1)","remote_redundancy_mode":"remote_single","repopulate_anti_quorum":1,"remote_log_replicas":3,"log_routers":10,"storage_replication_policy":"(zoneid^3x1)","auto_proxies":3,"auto_resolvers":1},"latency_probe":{"immediate_priority_transaction_start_seconds":0.0,"transaction_start_seconds":0.0,"batch_priority_transaction_start_seconds":0.0,"read_seconds":7,"commit_seconds":0.02},"machines":{"$map":{"network":{"megabits_sent":{"hz":0.0},"megabits_received":{"hz":0.0},"tcp_segments_retransmitted":{"hz":0.0}},"locality":{"$map":"value"},"memory":{"free_bytes":0,"committed_bytes":0,"total_bytes":0},"contributing_workers":4,"datacenter_id":"6344abf1813eb05b","excluded":false,"address":"1.2.3.4","machine_id":"6344abf1813eb05b","cpu":{"logical_core_utilization":0.4}}},"old_logs":[{"satellite_log_fault_tolerance":2,"logs":[{"healthy":true,"id":"7f8d623d0cb9966e","address":"1.2.3.4:1234"}],"satellite_log_write_anti_quorum":0,"remote_log_fault_tolerance":2,"log_fault_tolerance":2,"log_write_anti_quorum":0,"satellite_log_replication_factor":3,"remote_log_replication_factor":3,"log_replication_factor":3}]},"client":{"coordinators":{"coordinators":[{"reachable":true,"address":"127.0.0.1:4701"}],"quorum_reachable":true},"cluster_file":{"path":"/etc/foundationdb/fdb.cluster","up_to_date":true},"messages":[{"name":{"$enum":["inconsistent_cluster_file","unreachable_cluster_controller","no_cluster_controller","status_incomplete_client","status_incomplete_coordinators","status_incomplete_error","status_incomplete_timeout","status_incomplete_cluster","quorum_not_reachable"]},"description":"The cluster file is not up to date."}],"timestamp":1415650089,"database_status":{"available":true,"healthy":true}}} diff --git a/tests/slow/DDBalanceAndRemoveStatus.txt b/tests/slow/DDBalanceAndRemoveStatus.txt index d098c41b69..0a47f52ef1 100644 --- a/tests/slow/DDBalanceAndRemoveStatus.txt +++ b/tests/slow/DDBalanceAndRemoveStatus.txt @@ -43,5 +43,5 @@ testTitle=DDBalance_test testName=Status testDuration=30.0 - schema={"cluster":{"layers":{"_valid":true,"_error":"some error description"},"datacenter_version_difference":0,"processes":{"$map":{"fault_domain":"0ccb4e0fdbdb5583010f6b77d9d10ece","class_source":{"$enum":["command_line","configure_auto","set_class"]},"class_type":{"$enum":["unset","storage","transaction","resolution","proxy","master","test"]},"roles":[{"query_queue_max":0,"data_lag":{"seconds":5.0,"versions":12341234},"input_bytes":{"hz":0.0,"counter":0,"roughness":0.0},"kvstore_used_bytes":12341234,"stored_bytes":12341234,"kvstore_free_bytes":12341234,"durable_bytes":{"hz":0.0,"counter":0,"roughness":0.0},"id":"eb84471d68c12d1d26f692a50000003f","data_version":12341234,"role":{"$enum":["master","proxy","log","storage","resolver","cluster_controller"]},"queue_disk_available_bytes":12341234,"kvstore_available_bytes":12341234,"queue_disk_total_bytes":12341234,"queue_disk_used_bytes":12341234,"queue_disk_free_bytes":12341234,"kvstore_total_bytes":12341234,"finished_queries":{"hz":0.0,"counter":0,"roughness":0.0}}],"locality":{"$map":"value"},"messages":[{"description":"abc","type":"x","name":{"$enum":["file_open_error","incorrect_cluster_file_contents","process_error","io_error","io_timeout","platform_error","storage_server_lagging","(other FDB error messages)"]},"raw_log_message":"","time":12345.12312}],"address":"1.2.3.4:1234","command_line":"-r simulation","disk":{"free_bytes":3451233456234,"reads":{"hz":0.0,"counter":0,"sectors":0},"busy":0.0,"writes":{"hz":0.0,"counter":0,"sectors":0},"total_bytes":123412341234},"version":"3.0.0","excluded":false,"memory":{"available_bytes":0,"unused_allocated_memory":0,"limit_bytes":0,"used_bytes":0},"machine_id":"0ccb4e0feddb5583010f6b77d9d10ece","uptime_seconds":1234.2345,"cpu":{"usage_cores":0.0},"network":{"megabits_sent":{"hz":0.0},"megabits_received":{"hz":0.0},"connections_closed":{"hz":0.0},"connection_errors":{"hz":0.0},"current_connections":0,"connections_established":{"hz":0.0}}}},"clients":{"count":1,"supported_versions":[{"count":1,"protocol_version":"fdb00a400050001","client_version":"3.0.0","source_version":"9430e1127b4991cbc5ab2b17f41cfffa5de07e9d","connected_clients":[{"log_group":"default","address":"127.0.0.1:9898"}]}]},"qos":{"limiting_version_lag_storage_server":0,"released_transactions_per_second":0,"transactions_per_second_limit":0,"limiting_queue_bytes_storage_server":0,"performance_limited_by":{"reason_server_id":"7f8d623d0cb9966e","description":"The database is not being saturated by the workload.","reason_id":0,"name":{"$enum":["workload","storage_server_write_queue_size","storage_server_write_bandwidth_mvcc","storage_server_readable_behind","log_server_mvcc_write_bandwidth","log_server_write_queue","storage_server_min_free_space","storage_server_min_free_space_ratio","log_server_min_free_space","log_server_min_free_space_ratio"]}},"worst_version_lag_storage_server":0,"worst_queue_bytes_log_server":460,"worst_queue_bytes_storage_server":0},"incompatible_connections":[],"full_replication":true,"database_locked":false,"generation":2,"data":{"least_operating_space_bytes_log_server":0,"average_partition_size_bytes":0,"state":{"healthy":true,"description":"","name":{"$enum":["initializing","missing_data","healing","healthy_repartitioning","healthy_removing_server","healthy_rebalancing","healthy"]},"min_replicas_remaining":0},"least_operating_space_ratio_storage_server":0.1,"max_machine_failures_without_losing_availability":0,"total_disk_used_bytes":0,"total_kv_size_bytes":0,"max_machine_failures_without_losing_data":0,"moving_data":{"in_queue_bytes":0,"total_written_bytes":0,"in_flight_bytes":0},"least_operating_space_bytes_storage_server":0,"partitions_count":2},"fault_tolerance":{"max_machine_failures_without_losing_availability":0,"max_machine_failures_without_losing_data":0},"messages":[{"reasons":[{"description":"Blah."}],"unreachable_processes":[{"address":"1.2.3.4:1234"}],"name":{"$enum":["unreachable_master_worker","unreadable_configuration","full_replication_timeout","client_issues","unreachable_processes","immediate_priority_transaction_start_probe_timeout","batch_priority_transaction_start_probe_timeout","transaction_start_probe_timeout","read_probe_timeout","commit_probe_timeout","storage_servers_error","status_incomplete","layer_status_incomplete","database_availability_timeout"]},"issues":[{"name":{"$enum":["incorrect_cluster_file_contents"]},"description":"Cluster file contents do not match current cluster connection string. Verify cluster file is writable and has not been overwritten externally."}],"description":"abc"}],"database_available":true,"recovery_state":{"required_proxies":1,"name":{"$enum":["reading_coordinated_state","locking_coordinated_state","locking_old_transaction_servers","reading_transaction_system_state","configuration_missing","configuration_never_created","configuration_invalid","recruiting_transaction_servers","initializing_transaction_servers","recovery_transaction","writing_coordinated_state","fully_recovered"]},"missing_logs":"7f8d623d0cb9966e","required_resolvers":1,"required_logs":3,"description":"Recovery complete."},"workload":{"operations":{"writes":{"hz":0.0,"counter":0,"roughness":0.0},"reads":{"hz":0.0,"counter":0,"roughness":0.0}},"keys":{"read":{"hz":0.0,"counter":0,"roughness":0.0}},"bytes":{"read":{"hz":0.0,"counter":0,"roughness":0.0},"written":{"hz":0.0,"counter":0,"roughness":0.0}},"transactions":{"started":{"hz":0.0,"counter":0,"roughness":0.0},"conflicted":{"hz":0.0,"counter":0,"roughness":0.0},"committed":{"hz":0.0,"counter":0,"roughness":0.0}}},"cluster_controller_timestamp":1415650089,"protocol_version":"fdb00a400050001","configuration":{"logs":2,"log_replicas":2,"storage_engine":{"$enum":["ssd","ssd-1","ssd-2","memory","custom"]},"excluded_servers":[{"address":"10.0.4.1"}],"remote_logs":5,"log_anti_quorum":0,"storage_replicas":1,"coordinators_count":1,"regions":[{"satellite_redundancy_mode":"one_satellite_single","satellite_anti_quorum":0,"satellite_usable_dcs":1,"datacenters":[{"priority":1,"satellite":1,"id":"mr"}],"satellite_log_policy":"(zoneid^3x1)","satellite_log_replicas":1,"satellite_logs":2}],"usable_regions":1,"redundancy_mode":"single","auto_logs":3,"proxies":5,"resolvers":1,"log_replication_policy":"(zoneid^3x1)","remote_redundancy_mode":"remote_single","repopulate_anti_quorum":1,"remote_log_replicas":3,"log_routers":10,"storage_replication_policy":"(zoneid^3x1)","auto_proxies":3,"auto_resolvers":1},"latency_probe":{"immediate_priority_transaction_start_seconds":0.0,"transaction_start_seconds":0.0,"batch_priority_transaction_start_seconds":0.0,"read_seconds":7,"commit_seconds":0.02},"machines":{"$map":{"network":{"megabits_sent":{"hz":0.0},"megabits_received":{"hz":0.0},"tcp_segments_retransmitted":{"hz":0.0}},"locality":{"$map":"value"},"memory":{"free_bytes":0,"committed_bytes":0,"total_bytes":0},"contributing_workers":4,"datacenter_id":"6344abf1813eb05b","excluded":false,"address":"1.2.3.4","machine_id":"6344abf1813eb05b","cpu":{"logical_core_utilization":0.4}}},"old_logs":[{"satellite_log_fault_tolerance":2,"logs":[{"healthy":true,"id":"7f8d623d0cb9966e","address":"1.2.3.4:1234"}],"satellite_log_write_anti_quorum":0,"remote_log_fault_tolerance":2,"log_fault_tolerance":2,"log_write_anti_quorum":0,"satellite_log_replication_factor":3,"remote_log_replication_factor":3,"log_replication_factor":3}]},"client":{"coordinators":{"coordinators":[{"reachable":true,"address":"127.0.0.1:4701"}],"quorum_reachable":true},"cluster_file":{"path":"/etc/foundationdb/fdb.cluster","up_to_date":true},"messages":[{"name":{"$enum":["inconsistent_cluster_file","unreachable_cluster_controller","no_cluster_controller","status_incomplete_client","status_incomplete_coordinators","status_incomplete_error","status_incomplete_timeout","status_incomplete_cluster","quorum_not_reachable"]},"description":"The cluster file is not up to date."}],"timestamp":1415650089,"database_status":{"available":true,"healthy":true}}} + schema={"cluster":{"layers":{"_valid":true,"_error":"some error description"},"datacenter_version_difference":0,"processes":{"$map":{"fault_domain":"0ccb4e0fdbdb5583010f6b77d9d10ece","class_source":{"$enum":["command_line","configure_auto","set_class"]},"class_type":{"$enum":["unset","storage","transaction","resolution","proxy","master","test"]},"roles":[{"query_queue_max":0,"data_lag":{"seconds":5.0,"versions":12341234},"input_bytes":{"hz":0.0,"counter":0,"roughness":0.0},"kvstore_used_bytes":12341234,"stored_bytes":12341234,"kvstore_free_bytes":12341234,"durable_bytes":{"hz":0.0,"counter":0,"roughness":0.0},"id":"eb84471d68c12d1d26f692a50000003f","data_version":12341234,"role":{"$enum":["master","proxy","log","storage","resolver","cluster_controller"]},"queue_disk_available_bytes":12341234,"kvstore_available_bytes":12341234,"queue_disk_total_bytes":12341234,"queue_disk_used_bytes":12341234,"queue_disk_free_bytes":12341234,"kvstore_total_bytes":12341234,"finished_queries":{"hz":0.0,"counter":0,"roughness":0.0}}],"locality":{"$map":"value"},"messages":[{"description":"abc","type":"x","name":{"$enum":["file_open_error","incorrect_cluster_file_contents","process_error","io_error","io_timeout","platform_error","storage_server_lagging","(other FDB error messages)"]},"raw_log_message":"","time":12345.12312}],"address":"1.2.3.4:1234","command_line":"-r simulation","disk":{"free_bytes":3451233456234,"reads":{"hz":0.0,"counter":0,"sectors":0},"busy":0.0,"writes":{"hz":0.0,"counter":0,"sectors":0},"total_bytes":123412341234},"version":"3.0.0","excluded":false,"memory":{"available_bytes":0,"unused_allocated_memory":0,"limit_bytes":0,"used_bytes":0},"machine_id":"0ccb4e0feddb5583010f6b77d9d10ece","uptime_seconds":1234.2345,"cpu":{"usage_cores":0.0},"network":{"megabits_sent":{"hz":0.0},"megabits_received":{"hz":0.0},"connections_closed":{"hz":0.0},"connection_errors":{"hz":0.0},"current_connections":0,"connections_established":{"hz":0.0}}}},"clients":{"count":1,"supported_versions":[{"count":1,"protocol_version":"fdb00a400050001","client_version":"3.0.0","source_version":"9430e1127b4991cbc5ab2b17f41cfffa5de07e9d","connected_clients":[{"log_group":"default","address":"127.0.0.1:9898"}]}]},"qos":{"limiting_version_lag_storage_server":0,"released_transactions_per_second":0,"transactions_per_second_limit":0,"limiting_queue_bytes_storage_server":0,"performance_limited_by":{"reason_server_id":"7f8d623d0cb9966e","description":"The database is not being saturated by the workload.","reason_id":0,"name":{"$enum":["workload","storage_server_write_queue_size","storage_server_write_bandwidth_mvcc","storage_server_readable_behind","log_server_mvcc_write_bandwidth","log_server_write_queue","storage_server_min_free_space","storage_server_min_free_space_ratio","log_server_min_free_space","log_server_min_free_space_ratio"]}},"worst_version_lag_storage_server":0,"worst_queue_bytes_log_server":460,"worst_queue_bytes_storage_server":0},"incompatible_connections":[],"full_replication":true,"database_locked":false,"generation":2,"data":{"least_operating_space_bytes_log_server":0,"average_partition_size_bytes":0,"state":{"healthy":true,"description":"","name":{"$enum":["initializing","missing_data","healing","healthy_repartitioning","healthy_removing_server","healthy_rebalancing","healthy"]},"min_replicas_remaining":0},"least_operating_space_ratio_storage_server":0.1,"max_machine_failures_without_losing_availability":0,"total_disk_used_bytes":0,"total_kv_size_bytes":0,"max_machine_failures_without_losing_data":0,"moving_data":{"in_queue_bytes":0,"total_written_bytes":0,"in_flight_bytes":0},"least_operating_space_bytes_storage_server":0,"partitions_count":2},"fault_tolerance":{"max_machine_failures_without_losing_availability":0,"max_machine_failures_without_losing_data":0},"messages":[{"reasons":[{"description":"Blah."}],"unreachable_processes":[{"address":"1.2.3.4:1234"}],"name":{"$enum":["unreachable_master_worker","unreadable_configuration","full_replication_timeout","client_issues","unreachable_processes","immediate_priority_transaction_start_probe_timeout","batch_priority_transaction_start_probe_timeout","transaction_start_probe_timeout","read_probe_timeout","commit_probe_timeout","storage_servers_error","status_incomplete","layer_status_incomplete","database_availability_timeout"]},"issues":[{"name":{"$enum":["incorrect_cluster_file_contents"]},"description":"Cluster file contents do not match current cluster connection string. Verify cluster file is writable and has not been overwritten externally."}],"description":"abc"}],"database_available":true,"recovery_state":{"required_proxies":1,"name":{"$enum":["reading_coordinated_state","locking_coordinated_state","locking_old_transaction_servers","reading_transaction_system_state","configuration_missing","configuration_never_created","configuration_invalid","recruiting_transaction_servers","initializing_transaction_servers","recovery_transaction","writing_coordinated_state","accepting_commits","all_logs_recruited","storage_recovered","fully_recovered"]},"missing_logs":"7f8d623d0cb9966e","required_resolvers":1,"required_logs":3,"description":"Recovery complete."},"workload":{"operations":{"writes":{"hz":0.0,"counter":0,"roughness":0.0},"reads":{"hz":0.0,"counter":0,"roughness":0.0}},"keys":{"read":{"hz":0.0,"counter":0,"roughness":0.0}},"bytes":{"read":{"hz":0.0,"counter":0,"roughness":0.0},"written":{"hz":0.0,"counter":0,"roughness":0.0}},"transactions":{"started":{"hz":0.0,"counter":0,"roughness":0.0},"conflicted":{"hz":0.0,"counter":0,"roughness":0.0},"committed":{"hz":0.0,"counter":0,"roughness":0.0}}},"cluster_controller_timestamp":1415650089,"protocol_version":"fdb00a400050001","configuration":{"logs":2,"log_replicas":2,"storage_engine":{"$enum":["ssd","ssd-1","ssd-2","memory","custom"]},"excluded_servers":[{"address":"10.0.4.1"}],"remote_logs":5,"log_anti_quorum":0,"storage_replicas":1,"coordinators_count":1,"regions":[{"satellite_redundancy_mode":"one_satellite_single","satellite_anti_quorum":0,"satellite_usable_dcs":1,"datacenters":[{"priority":1,"satellite":1,"id":"mr"}],"satellite_log_policy":"(zoneid^3x1)","satellite_log_replicas":1,"satellite_logs":2}],"usable_regions":1,"redundancy_mode":"single","auto_logs":3,"proxies":5,"resolvers":1,"log_replication_policy":"(zoneid^3x1)","remote_redundancy_mode":"remote_single","repopulate_anti_quorum":1,"remote_log_replicas":3,"log_routers":10,"storage_replication_policy":"(zoneid^3x1)","auto_proxies":3,"auto_resolvers":1},"latency_probe":{"immediate_priority_transaction_start_seconds":0.0,"transaction_start_seconds":0.0,"batch_priority_transaction_start_seconds":0.0,"read_seconds":7,"commit_seconds":0.02},"machines":{"$map":{"network":{"megabits_sent":{"hz":0.0},"megabits_received":{"hz":0.0},"tcp_segments_retransmitted":{"hz":0.0}},"locality":{"$map":"value"},"memory":{"free_bytes":0,"committed_bytes":0,"total_bytes":0},"contributing_workers":4,"datacenter_id":"6344abf1813eb05b","excluded":false,"address":"1.2.3.4","machine_id":"6344abf1813eb05b","cpu":{"logical_core_utilization":0.4}}},"old_logs":[{"satellite_log_fault_tolerance":2,"logs":[{"healthy":true,"id":"7f8d623d0cb9966e","address":"1.2.3.4:1234"}],"satellite_log_write_anti_quorum":0,"remote_log_fault_tolerance":2,"log_fault_tolerance":2,"log_write_anti_quorum":0,"satellite_log_replication_factor":3,"remote_log_replication_factor":3,"log_replication_factor":3}]},"client":{"coordinators":{"coordinators":[{"reachable":true,"address":"127.0.0.1:4701"}],"quorum_reachable":true},"cluster_file":{"path":"/etc/foundationdb/fdb.cluster","up_to_date":true},"messages":[{"name":{"$enum":["inconsistent_cluster_file","unreachable_cluster_controller","no_cluster_controller","status_incomplete_client","status_incomplete_coordinators","status_incomplete_error","status_incomplete_timeout","status_incomplete_cluster","quorum_not_reachable"]},"description":"The cluster file is not up to date."}],"timestamp":1415650089,"database_status":{"available":true,"healthy":true}}}