Improve error messages for the snapshot command

This commit is contained in:
sramamoorthy 2019-08-23 11:56:06 -07:00
parent e0824f4915
commit 5d87443323
6 changed files with 48 additions and 30 deletions

View File

@ -172,7 +172,6 @@ public:
Counter transactionsMaybeCommitted;
Counter transactionsResourceConstrained;
Counter transactionsProcessBehind;
Counter transactionWaitsForFullRecovery;
ContinuousSample<double> latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit, bytesPerCommit;

View File

@ -519,7 +519,7 @@ DatabaseContext::DatabaseContext(
transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionsCommitStarted("CommitStarted", cc),
transactionsCommitCompleted("CommitCompleted", cc), transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc),
transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc),
transactionsProcessBehind("ProcessBehind", cc), transactionWaitsForFullRecovery("WaitsForFullRecovery", cc), outstandingWatches(0),
transactionsProcessBehind("ProcessBehind", cc), outstandingWatches(0),
latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvCacheInsertLocation(0),
healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0), internal(internal)
{
@ -548,7 +548,7 @@ DatabaseContext::DatabaseContext( const Error &err ) : deferredError(err), cc("T
transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionsCommitStarted("CommitStarted", cc),
transactionsCommitCompleted("CommitCompleted", cc), transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc),
transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc),
transactionsProcessBehind("ProcessBehind", cc), transactionWaitsForFullRecovery("WaitsForFullRecovery", cc), latencies(1000), readLatencies(1000), commitLatencies(1000),
transactionsProcessBehind("ProcessBehind", cc), latencies(1000), readLatencies(1000), commitLatencies(1000),
GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000),
internal(false) {}
@ -2705,10 +2705,7 @@ ACTOR static Future<Void> tryCommit( Database cx, Reference<TransactionLogInfo>
if (e.code() != error_code_transaction_too_old
&& e.code() != error_code_not_committed
&& e.code() != error_code_database_locked
&& e.code() != error_code_proxy_memory_limit_exceeded
&& e.code() != error_code_transaction_not_permitted
&& e.code() != error_code_cluster_not_fully_recovered
&& e.code() != error_code_txn_exec_log_anti_quorum)
&& e.code() != error_code_proxy_memory_limit_exceeded)
TraceEvent(SevError, "TryCommitError").error(e);
if (trLogInfo)
trLogInfo->addLog(FdbClientLogEvents::EventCommitError(startTime, static_cast<int>(e.code()), req));
@ -3115,8 +3112,7 @@ Future<Void> Transaction::onError( Error const& e ) {
e.code() == error_code_commit_unknown_result ||
e.code() == error_code_database_locked ||
e.code() == error_code_proxy_memory_limit_exceeded ||
e.code() == error_code_process_behind ||
e.code() == error_code_cluster_not_fully_recovered)
e.code() == error_code_process_behind)
{
if(e.code() == error_code_not_committed)
++cx->transactionsNotCommitted;
@ -3126,9 +3122,6 @@ Future<Void> Transaction::onError( Error const& e ) {
++cx->transactionsResourceConstrained;
if (e.code() == error_code_process_behind)
++cx->transactionsProcessBehind;
if (e.code() == error_code_cluster_not_fully_recovered) {
++cx->transactionWaitsForFullRecovery;
}
double backoff = getBackoff(e.code());
reset();

View File

@ -4147,7 +4147,7 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
std::vector<Future<Void>> disablePops;
for (const auto & tlog : tlogs) {
disablePops.push_back(
transformErrors(throwErrorOr(tlog.disablePopRequest.tryGetReply(TLogDisablePopRequest(snapReq.snapUID))), operation_failed())
transformErrors(throwErrorOr(tlog.disablePopRequest.tryGetReply(TLogDisablePopRequest(snapReq.snapUID))), snap_disable_tlog_pop_failed())
);
}
wait(waitForAll(disablePops));
@ -4156,14 +4156,14 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
// snap local storage nodes
std::vector<WorkerInterface> storageWorkers = wait(getStorageWorkers(cx, db, true /* localOnly */));
std::vector<WorkerInterface> storageWorkers = wait(transformErrors(getStorageWorkers(cx, db, true /* localOnly */), snap_storage_failed()));
TraceEvent("SnapDataDistributor_GotStorageWorkers")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
std::vector<Future<Void>> storageSnapReqs;
for (const auto & worker : storageWorkers) {
storageSnapReqs.push_back(
transformErrors(throwErrorOr(worker.workerSnapReq.tryGetReply(WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("storage")))), operation_failed())
transformErrors(throwErrorOr(worker.workerSnapReq.tryGetReply(WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("storage")))), snap_storage_failed())
);
}
wait(waitForAll(storageSnapReqs));
@ -4175,7 +4175,7 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
std::vector<Future<Void>> tLogSnapReqs;
for (const auto & tlog : tlogs) {
tLogSnapReqs.push_back(
transformErrors(throwErrorOr(tlog.snapRequest.tryGetReply(TLogSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("tlog")))), operation_failed())
transformErrors(throwErrorOr(tlog.snapRequest.tryGetReply(TLogSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("tlog")))), snap_tlog_failed())
);
}
wait(waitForAll(tLogSnapReqs));
@ -4187,7 +4187,7 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
std::vector<Future<Void>> enablePops;
for (const auto & tlog : tlogs) {
enablePops.push_back(
transformErrors(throwErrorOr(tlog.enablePopRequest.tryGetReply(TLogEnablePopRequest(snapReq.snapUID))), operation_failed())
transformErrors(throwErrorOr(tlog.enablePopRequest.tryGetReply(TLogEnablePopRequest(snapReq.snapUID))), snap_enable_tlog_pop_failed())
);
}
wait(waitForAll(enablePops));
@ -4203,18 +4203,36 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
std::vector<Future<Void>> coordSnapReqs;
for (const auto & worker : coordWorkers) {
coordSnapReqs.push_back(
transformErrors(throwErrorOr(worker.workerSnapReq.tryGetReply(WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("coord")))), operation_failed())
transformErrors(throwErrorOr(worker.workerSnapReq.tryGetReply(WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("coord")))), snap_coord_failed())
);
}
wait(waitForAll(coordSnapReqs));
TraceEvent("SnapDataDistributor_AfterSnapCoords")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
} catch (Error& e) {
} catch (Error& err) {
state Error e = err;
TraceEvent("SnapDataDistributor_SnapReqExit")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID)
.error(e, true /*includeCancelled */);
if (e.code() == error_code_snap_storage_failed
|| e.code() == error_code_snap_tlog_failed
|| e.code() == error_code_operation_cancelled) {
// enable tlog pop on local tlog nodes
std::vector<TLogInterface> tlogs = db->get().logSystemConfig.allLocalLogs(false);
try {
std::vector<Future<Void>> enablePops;
for (const auto & tlog : tlogs) {
enablePops.push_back(
transformErrors(throwErrorOr(tlog.enablePopRequest.tryGetReply(TLogEnablePopRequest(snapReq.snapUID))), snap_enable_tlog_pop_failed())
);
}
wait(waitForAll(enablePops));
} catch (Error& error) {
TraceEvent(SevDebug, "IgnoreEnableTLogPopFailure");
}
}
throw e;
}
return Void();
@ -4235,7 +4253,7 @@ ACTOR Future<Void> ddSnapCreate(DistributorSnapRequest snapReq, Reference<AsyncV
TraceEvent("SnapDDCreateDBInfoChanged")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
snapReq.reply.sendError(operation_failed());
snapReq.reply.sendError(snap_with_recovery_unsupported());
}
when (wait(ddSnapCreateCore(snapReq, db))) {
TraceEvent("SnapDDCreateSuccess")

View File

@ -1466,7 +1466,7 @@ ACTOR Future<Void> proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co
TraceEvent("SnapMasterProxy_WhiteListCheckFailed")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
throw transaction_not_permitted();
throw snap_path_not_whitelisted();
}
// db fully recovered check
if (commitData->db->get().recoveryState != RecoveryState::FULLY_RECOVERED) {
@ -1478,7 +1478,7 @@ ACTOR Future<Void> proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co
TraceEvent("SnapMasterProxy_ClusterNotFullyRecovered")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
throw cluster_not_fully_recovered();
throw snap_not_fully_recovered_unsupported();
}
auto result =
@ -1493,7 +1493,7 @@ ACTOR Future<Void> proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co
TraceEvent("SnapMasterProxy_LogAnitQuorumNotSupported")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
throw txn_exec_log_anti_quorum();
throw snap_log_anti_quorum_unsupported();
}
// send a snap request to DD

View File

@ -211,7 +211,7 @@ public: // workload functions
wait(status);
break;
} catch (Error& e) {
if (e.code() == error_code_txn_exec_log_anti_quorum) {
if (e.code() == error_code_snap_log_anti_quorum_unsupported) {
snapFailed = true;
break;
}
@ -298,12 +298,12 @@ public: // workload functions
wait(status);
break;
} catch (Error& e) {
if (e.code() == error_code_cluster_not_fully_recovered ||
e.code() == error_code_txn_exec_log_anti_quorum) {
if (e.code() == error_code_snap_not_fully_recovered_unsupported ||
e.code() == error_code_snap_log_anti_quorum_unsupported) {
snapFailed = true;
break;
}
if (e.code() == error_code_transaction_not_permitted) {
if (e.code() == error_code_snap_path_not_whitelisted) {
testedFailure = true;
break;
}

View File

@ -65,9 +65,6 @@ ERROR( lookup_failed, 1041, "DNS lookup failed" )
ERROR( proxy_memory_limit_exceeded, 1042, "Proxy commit memory limit exceeded" )
ERROR( shutdown_in_progress, 1043, "Operation no longer supported due to shutdown" )
ERROR( serialization_failed, 1044, "Failed to deserialize an object" )
ERROR( transaction_not_permitted, 1045, "Operation not permitted")
ERROR( cluster_not_fully_recovered, 1046, "Cluster not fully recovered")
ERROR( txn_exec_log_anti_quorum, 1047, "Execute Transaction not supported when log anti quorum is configured")
ERROR( connection_unreferenced, 1048, "No peer references for connection" )
ERROR( connection_idle, 1049, "Connection closed after idle timeout" )
ERROR( disk_adapter_reset, 1050, "The disk queue adpater reset" )
@ -206,6 +203,17 @@ ERROR( key_not_found, 2400, "Expected key is missing")
ERROR( json_malformed, 2401, "JSON string was malformed")
ERROR( json_eof_expected, 2402, "JSON string did not terminate where expected")
// 2500 - disk snapshot based backup errors
ERROR( snap_disable_tlog_pop_failed, 2500, "Snapshot error")
ERROR( snap_storage_failed, 2501, "Failed to snapshot storage nodes")
ERROR( snap_tlog_failed, 2502, "Failed to snapshot TLog nodes")
ERROR( snap_coord_failed, 2503, "Failed to snapshot coordinator nodes")
ERROR( snap_enable_tlog_pop_failed, 2504, "Snapshot error")
ERROR( snap_path_not_whitelisted, 2505, "Snapshot create binary path not whitelisted")
ERROR( snap_not_fully_recovered_unsupported, 2506, "Unsupported when the cluster is not fully recovered")
ERROR( snap_log_anti_quorum_unsupported, 2507, "Unsupported when log anti quorum is configured")
ERROR( snap_with_recovery_unsupported, 2508, "Cluster recovery during snapshot operation not supported")
// 4xxx Internal errors (those that should be generated only by bugs) are decimal 4xxx
ERROR( unknown_error, 4000, "An unknown error occurred" ) // C++ exception not of type Error
ERROR( internal_error, 4100, "An internal error occurred" )