Improved error messages for the snapshot command
This commit is contained in:
parent
e0824f4915
commit
5d87443323
|
@ -172,7 +172,6 @@ public:
|
|||
Counter transactionsMaybeCommitted;
|
||||
Counter transactionsResourceConstrained;
|
||||
Counter transactionsProcessBehind;
|
||||
Counter transactionWaitsForFullRecovery;
|
||||
|
||||
ContinuousSample<double> latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit, bytesPerCommit;
|
||||
|
||||
|
|
|
@ -519,7 +519,7 @@ DatabaseContext::DatabaseContext(
|
|||
transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionsCommitStarted("CommitStarted", cc),
|
||||
transactionsCommitCompleted("CommitCompleted", cc), transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc),
|
||||
transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc),
|
||||
transactionsProcessBehind("ProcessBehind", cc), transactionWaitsForFullRecovery("WaitsForFullRecovery", cc), outstandingWatches(0),
|
||||
transactionsProcessBehind("ProcessBehind", cc), outstandingWatches(0),
|
||||
latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvCacheInsertLocation(0),
|
||||
healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0), internal(internal)
|
||||
{
|
||||
|
@ -548,7 +548,7 @@ DatabaseContext::DatabaseContext( const Error &err ) : deferredError(err), cc("T
|
|||
transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionsCommitStarted("CommitStarted", cc),
|
||||
transactionsCommitCompleted("CommitCompleted", cc), transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc),
|
||||
transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc),
|
||||
transactionsProcessBehind("ProcessBehind", cc), transactionWaitsForFullRecovery("WaitsForFullRecovery", cc), latencies(1000), readLatencies(1000), commitLatencies(1000),
|
||||
transactionsProcessBehind("ProcessBehind", cc), latencies(1000), readLatencies(1000), commitLatencies(1000),
|
||||
GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000),
|
||||
internal(false) {}
|
||||
|
||||
|
@ -2705,10 +2705,7 @@ ACTOR static Future<Void> tryCommit( Database cx, Reference<TransactionLogInfo>
|
|||
if (e.code() != error_code_transaction_too_old
|
||||
&& e.code() != error_code_not_committed
|
||||
&& e.code() != error_code_database_locked
|
||||
&& e.code() != error_code_proxy_memory_limit_exceeded
|
||||
&& e.code() != error_code_transaction_not_permitted
|
||||
&& e.code() != error_code_cluster_not_fully_recovered
|
||||
&& e.code() != error_code_txn_exec_log_anti_quorum)
|
||||
&& e.code() != error_code_proxy_memory_limit_exceeded)
|
||||
TraceEvent(SevError, "TryCommitError").error(e);
|
||||
if (trLogInfo)
|
||||
trLogInfo->addLog(FdbClientLogEvents::EventCommitError(startTime, static_cast<int>(e.code()), req));
|
||||
|
@ -3115,8 +3112,7 @@ Future<Void> Transaction::onError( Error const& e ) {
|
|||
e.code() == error_code_commit_unknown_result ||
|
||||
e.code() == error_code_database_locked ||
|
||||
e.code() == error_code_proxy_memory_limit_exceeded ||
|
||||
e.code() == error_code_process_behind ||
|
||||
e.code() == error_code_cluster_not_fully_recovered)
|
||||
e.code() == error_code_process_behind)
|
||||
{
|
||||
if(e.code() == error_code_not_committed)
|
||||
++cx->transactionsNotCommitted;
|
||||
|
@ -3126,9 +3122,6 @@ Future<Void> Transaction::onError( Error const& e ) {
|
|||
++cx->transactionsResourceConstrained;
|
||||
if (e.code() == error_code_process_behind)
|
||||
++cx->transactionsProcessBehind;
|
||||
if (e.code() == error_code_cluster_not_fully_recovered) {
|
||||
++cx->transactionWaitsForFullRecovery;
|
||||
}
|
||||
|
||||
double backoff = getBackoff(e.code());
|
||||
reset();
|
||||
|
|
|
@ -4147,7 +4147,7 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
|
|||
std::vector<Future<Void>> disablePops;
|
||||
for (const auto & tlog : tlogs) {
|
||||
disablePops.push_back(
|
||||
transformErrors(throwErrorOr(tlog.disablePopRequest.tryGetReply(TLogDisablePopRequest(snapReq.snapUID))), operation_failed())
|
||||
transformErrors(throwErrorOr(tlog.disablePopRequest.tryGetReply(TLogDisablePopRequest(snapReq.snapUID))), snap_disable_tlog_pop_failed())
|
||||
);
|
||||
}
|
||||
wait(waitForAll(disablePops));
|
||||
|
@ -4156,14 +4156,14 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
|
|||
.detail("SnapPayload", snapReq.snapPayload)
|
||||
.detail("SnapUID", snapReq.snapUID);
|
||||
// snap local storage nodes
|
||||
std::vector<WorkerInterface> storageWorkers = wait(getStorageWorkers(cx, db, true /* localOnly */));
|
||||
std::vector<WorkerInterface> storageWorkers = wait(transformErrors(getStorageWorkers(cx, db, true /* localOnly */), snap_storage_failed()));
|
||||
TraceEvent("SnapDataDistributor_GotStorageWorkers")
|
||||
.detail("SnapPayload", snapReq.snapPayload)
|
||||
.detail("SnapUID", snapReq.snapUID);
|
||||
std::vector<Future<Void>> storageSnapReqs;
|
||||
for (const auto & worker : storageWorkers) {
|
||||
storageSnapReqs.push_back(
|
||||
transformErrors(throwErrorOr(worker.workerSnapReq.tryGetReply(WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("storage")))), operation_failed())
|
||||
transformErrors(throwErrorOr(worker.workerSnapReq.tryGetReply(WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("storage")))), snap_storage_failed())
|
||||
);
|
||||
}
|
||||
wait(waitForAll(storageSnapReqs));
|
||||
|
@ -4175,7 +4175,7 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
|
|||
std::vector<Future<Void>> tLogSnapReqs;
|
||||
for (const auto & tlog : tlogs) {
|
||||
tLogSnapReqs.push_back(
|
||||
transformErrors(throwErrorOr(tlog.snapRequest.tryGetReply(TLogSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("tlog")))), operation_failed())
|
||||
transformErrors(throwErrorOr(tlog.snapRequest.tryGetReply(TLogSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("tlog")))), snap_tlog_failed())
|
||||
);
|
||||
}
|
||||
wait(waitForAll(tLogSnapReqs));
|
||||
|
@ -4187,7 +4187,7 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
|
|||
std::vector<Future<Void>> enablePops;
|
||||
for (const auto & tlog : tlogs) {
|
||||
enablePops.push_back(
|
||||
transformErrors(throwErrorOr(tlog.enablePopRequest.tryGetReply(TLogEnablePopRequest(snapReq.snapUID))), operation_failed())
|
||||
transformErrors(throwErrorOr(tlog.enablePopRequest.tryGetReply(TLogEnablePopRequest(snapReq.snapUID))), snap_enable_tlog_pop_failed())
|
||||
);
|
||||
}
|
||||
wait(waitForAll(enablePops));
|
||||
|
@ -4203,18 +4203,36 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
|
|||
std::vector<Future<Void>> coordSnapReqs;
|
||||
for (const auto & worker : coordWorkers) {
|
||||
coordSnapReqs.push_back(
|
||||
transformErrors(throwErrorOr(worker.workerSnapReq.tryGetReply(WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("coord")))), operation_failed())
|
||||
transformErrors(throwErrorOr(worker.workerSnapReq.tryGetReply(WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("coord")))), snap_coord_failed())
|
||||
);
|
||||
}
|
||||
wait(waitForAll(coordSnapReqs));
|
||||
TraceEvent("SnapDataDistributor_AfterSnapCoords")
|
||||
.detail("SnapPayload", snapReq.snapPayload)
|
||||
.detail("SnapUID", snapReq.snapUID);
|
||||
} catch (Error& e) {
|
||||
} catch (Error& err) {
|
||||
state Error e = err;
|
||||
TraceEvent("SnapDataDistributor_SnapReqExit")
|
||||
.detail("SnapPayload", snapReq.snapPayload)
|
||||
.detail("SnapUID", snapReq.snapUID)
|
||||
.error(e, true /*includeCancelled */);
|
||||
if (e.code() == error_code_snap_storage_failed
|
||||
|| e.code() == error_code_snap_tlog_failed
|
||||
|| e.code() == error_code_operation_cancelled) {
|
||||
// enable tlog pop on local tlog nodes
|
||||
std::vector<TLogInterface> tlogs = db->get().logSystemConfig.allLocalLogs(false);
|
||||
try {
|
||||
std::vector<Future<Void>> enablePops;
|
||||
for (const auto & tlog : tlogs) {
|
||||
enablePops.push_back(
|
||||
transformErrors(throwErrorOr(tlog.enablePopRequest.tryGetReply(TLogEnablePopRequest(snapReq.snapUID))), snap_enable_tlog_pop_failed())
|
||||
);
|
||||
}
|
||||
wait(waitForAll(enablePops));
|
||||
} catch (Error& error) {
|
||||
TraceEvent(SevDebug, "IgnoreEnableTLogPopFailure");
|
||||
}
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
return Void();
|
||||
|
@ -4235,7 +4253,7 @@ ACTOR Future<Void> ddSnapCreate(DistributorSnapRequest snapReq, Reference<AsyncV
|
|||
TraceEvent("SnapDDCreateDBInfoChanged")
|
||||
.detail("SnapPayload", snapReq.snapPayload)
|
||||
.detail("SnapUID", snapReq.snapUID);
|
||||
snapReq.reply.sendError(operation_failed());
|
||||
snapReq.reply.sendError(snap_with_recovery_unsupported());
|
||||
}
|
||||
when (wait(ddSnapCreateCore(snapReq, db))) {
|
||||
TraceEvent("SnapDDCreateSuccess")
|
||||
|
|
|
@ -1466,7 +1466,7 @@ ACTOR Future<Void> proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co
|
|||
TraceEvent("SnapMasterProxy_WhiteListCheckFailed")
|
||||
.detail("SnapPayload", snapReq.snapPayload)
|
||||
.detail("SnapUID", snapReq.snapUID);
|
||||
throw transaction_not_permitted();
|
||||
throw snap_path_not_whitelisted();
|
||||
}
|
||||
// db fully recovered check
|
||||
if (commitData->db->get().recoveryState != RecoveryState::FULLY_RECOVERED) {
|
||||
|
@ -1478,7 +1478,7 @@ ACTOR Future<Void> proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co
|
|||
TraceEvent("SnapMasterProxy_ClusterNotFullyRecovered")
|
||||
.detail("SnapPayload", snapReq.snapPayload)
|
||||
.detail("SnapUID", snapReq.snapUID);
|
||||
throw cluster_not_fully_recovered();
|
||||
throw snap_not_fully_recovered_unsupported();
|
||||
}
|
||||
|
||||
auto result =
|
||||
|
@ -1493,7 +1493,7 @@ ACTOR Future<Void> proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co
|
|||
TraceEvent("SnapMasterProxy_LogAnitQuorumNotSupported")
|
||||
.detail("SnapPayload", snapReq.snapPayload)
|
||||
.detail("SnapUID", snapReq.snapUID);
|
||||
throw txn_exec_log_anti_quorum();
|
||||
throw snap_log_anti_quorum_unsupported();
|
||||
}
|
||||
|
||||
// send a snap request to DD
|
||||
|
|
|
@ -211,7 +211,7 @@ public: // workload functions
|
|||
wait(status);
|
||||
break;
|
||||
} catch (Error& e) {
|
||||
if (e.code() == error_code_txn_exec_log_anti_quorum) {
|
||||
if (e.code() == error_code_snap_log_anti_quorum_unsupported) {
|
||||
snapFailed = true;
|
||||
break;
|
||||
}
|
||||
|
@ -298,12 +298,12 @@ public: // workload functions
|
|||
wait(status);
|
||||
break;
|
||||
} catch (Error& e) {
|
||||
if (e.code() == error_code_cluster_not_fully_recovered ||
|
||||
e.code() == error_code_txn_exec_log_anti_quorum) {
|
||||
if (e.code() == error_code_snap_not_fully_recovered_unsupported ||
|
||||
e.code() == error_code_snap_log_anti_quorum_unsupported) {
|
||||
snapFailed = true;
|
||||
break;
|
||||
}
|
||||
if (e.code() == error_code_transaction_not_permitted) {
|
||||
if (e.code() == error_code_snap_path_not_whitelisted) {
|
||||
testedFailure = true;
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -65,9 +65,6 @@ ERROR( lookup_failed, 1041, "DNS lookup failed" )
|
|||
ERROR( proxy_memory_limit_exceeded, 1042, "Proxy commit memory limit exceeded" )
|
||||
ERROR( shutdown_in_progress, 1043, "Operation no longer supported due to shutdown" )
|
||||
ERROR( serialization_failed, 1044, "Failed to deserialize an object" )
|
||||
ERROR( transaction_not_permitted, 1045, "Operation not permitted")
|
||||
ERROR( cluster_not_fully_recovered, 1046, "Cluster not fully recovered")
|
||||
ERROR( txn_exec_log_anti_quorum, 1047, "Execute Transaction not supported when log anti quorum is configured")
|
||||
ERROR( connection_unreferenced, 1048, "No peer references for connection" )
|
||||
ERROR( connection_idle, 1049, "Connection closed after idle timeout" )
|
||||
ERROR( disk_adapter_reset, 1050, "The disk queue adpater reset" )
|
||||
|
@ -206,6 +203,17 @@ ERROR( key_not_found, 2400, "Expected key is missing")
|
|||
ERROR( json_malformed, 2401, "JSON string was malformed")
|
||||
ERROR( json_eof_expected, 2402, "JSON string did not terminate where expected")
|
||||
|
||||
// 2500 - disk snapshot based backup errors
|
||||
ERROR( snap_disable_tlog_pop_failed, 2500, "Snapshot error")
|
||||
ERROR( snap_storage_failed, 2501, "Failed to snapshot storage nodes")
|
||||
ERROR( snap_tlog_failed, 2502, "Failed to snapshot TLog nodes")
|
||||
ERROR( snap_coord_failed, 2503, "Failed to snapshot coordinator nodes")
|
||||
ERROR( snap_enable_tlog_pop_failed, 2504, "Snapshot error")
|
||||
ERROR( snap_path_not_whitelisted, 2505, "Snapshot create binary path not whitelisted")
|
||||
ERROR( snap_not_fully_recovered_unsupported, 2506, "Unsupported when the cluster is not fully recovered")
|
||||
ERROR( snap_log_anti_quorum_unsupported, 2507, "Unsupported when log anti quorum is configured")
|
||||
ERROR( snap_with_recovery_unsupported, 2508, "Cluster recovery during snapshot operation not supported")
|
||||
|
||||
// 4xxx Internal errors (those that should be generated only by bugs) are decimal 4xxx
|
||||
ERROR( unknown_error, 4000, "An unknown error occurred" ) // C++ exception not of type Error
|
||||
ERROR( internal_error, 4100, "An internal error occurred" )
|
||||
|
|
Loading…
Reference in New Issue