Merge pull request #6896 from sfc-gh-mpilman/features/validate-trace-events-in-simulation

Reduce excessive tracing and fail after 1M traces
2022-04-25 18:02:59 -07:00 · 2022-04-25 18:02:59 -07:00 · fe338ced8f
parent ec95fb7e74 cbe4a873d2
commit fe338ced8f
5 changed files with 36 additions and 21 deletions
--- a/fdbclient/NativeAPI.actor.cpp
+++ b/fdbclient/NativeAPI.actor.cpp
@ -265,11 +265,11 @@ void DatabaseContext::getLatestCommitVersions(const Reference<LocationInfo>& loc
 void updateCachedReadVersionShared(double t, Version v, DatabaseSharedState* p) {
 	MutexHolder mutex(p->mutexLock);
 	if (v >= p->grvCacheSpace.cachedReadVersion) {
-		TraceEvent(SevDebug, "CacheReadVersionUpdate")
-		    .detail("Version", v)
-		    .detail("CurTime", t)
-		    .detail("LastVersion", p->grvCacheSpace.cachedReadVersion)
-		    .detail("LastTime", p->grvCacheSpace.lastGrvTime);
+		//TraceEvent(SevDebug, "CacheReadVersionUpdate")
+		//    .detail("Version", v)
+		//    .detail("CurTime", t)
+		//    .detail("LastVersion", p->grvCacheSpace.cachedReadVersion)
+		//    .detail("LastTime", p->grvCacheSpace.lastGrvTime);
 		p->grvCacheSpace.cachedReadVersion = v;
 		if (t > p->grvCacheSpace.lastGrvTime) {
 			p->grvCacheSpace.lastGrvTime = t;
@ -282,11 +282,11 @@ void DatabaseContext::updateCachedReadVersion(double t, Version v) {
 		return updateCachedReadVersionShared(t, v, sharedStatePtr);
 	}
 	if (v >= cachedReadVersion) {
-		TraceEvent(SevDebug, "CachedReadVersionUpdate")
-		    .detail("Version", v)
-		    .detail("GrvStartTime", t)
-		    .detail("LastVersion", cachedReadVersion)
-		    .detail("LastTime", lastGrvTime);
+		//TraceEvent(SevDebug, "CachedReadVersionUpdate")
+		//    .detail("Version", v)
+		//    .detail("GrvStartTime", t)
+		//    .detail("LastVersion", cachedReadVersion)
+		//    .detail("LastTime", lastGrvTime);
 		cachedReadVersion = v;
 		// Since the time is based on the start of the request, it's possible that we
 		// get a newer version with an older time.
--- a/fdbserver/workloads/Cycle.actor.cpp
+++ b/fdbserver/workloads/Cycle.actor.cpp
@ -106,6 +106,7 @@ struct CycleWorkload : TestWorkload {
 				state Transaction tr(cx);
 				if (deterministicRandom()->random01() >= self->traceParentProbability) {
 					state Span span("CycleClient"_loc);
+					// TraceEvent("CycleTracingTransaction", span.context).log();
 					TraceEvent("CycleTracingTransaction", span.context).log();
 					tr.setOption(FDBTransactionOptions::SPAN_PARENT,
 					             BinaryWriter::toValue(span.context, Unversioned()));
--- a/flow/Knobs.cpp
+++ b/flow/Knobs.cpp
@ -217,6 +217,7 @@ void FlowKnobs::initialize(Randomize randomize, IsSimulated isSimulated) {
 	init( MAX_TRACE_EVENT_LENGTH,                             4000 ); // If the value of this is changed, the corresponding default in Trace.cpp should be changed as well
 	init( ALLOCATION_TRACING_ENABLED,                         true );
 	init( SIM_SPEEDUP_AFTER_SECONDS,                           450 );
+	init( MAX_TRACE_LINES,                               1'000'000 );
 	init( CODE_COV_TRACE_EVENT_SEVERITY,                        10 ); // Code coverage TraceEvent severity level

 	//TDMetrics
--- a/flow/Knobs.h
+++ b/flow/Knobs.h
@ -271,6 +271,7 @@ public:
 	double MAX_BUGGIFIED_DELAY;
 	int SIM_CONNECT_ERROR_MODE;
 	double SIM_SPEEDUP_AFTER_SECONDS;
+	int MAX_TRACE_LINES;

 	// Tracefiles
 	int ZERO_LENGTH_FILE_PAD;
--- a/flow/Trace.cpp
+++ b/flow/Trace.cpp
@ -53,6 +53,8 @@
 // 2. To avoid a historically documented but unknown crash that occurs when logging allocations
 //    during an open trace event
 thread_local int g_allocation_tracing_disabled = 1;
+unsigned tracedLines = 0;
+thread_local int failedLineOverflow = 0;

 ITraceLogIssuesReporter::~ITraceLogIssuesReporter() {}

@ -394,17 +396,22 @@ public:
 		eventBuffer.push_back(fields);
 		bufferLength += fields.sizeBytes();

-		// If we have queued up a large number of events in simulation, then throw an error. This makes it easier to
-		// diagnose cases where we get stuck in a loop logging trace events that eventually runs out of memory.
-		// Without this we would never see any trace events from that loop, and it would be more difficult to identify
-		// where the process is actually stuck.
-		if (g_network && g_network->isSimulated() && bufferLength > 1e8) {
-			fprintf(stderr, "Trace log buffer overflow\n");
-			fprintf(stderr, "Last event: %s\n", fields.toString().c_str());
-			// Setting this to 0 avoids a recurse from the assertion trace event and also prevents a situation where
-			// we roll the trace log only to log the single assertion event when using --crash.
-			bufferLength = 0;
-			ASSERT(false);
+		if (g_network && g_network->isSimulated()) {
+			// Throw an error if we have queued up a large number of events in simulation. This makes it easier to
+			// diagnose cases where we get stuck in a loop logging trace events that eventually runs out of memory.
+			// Without this we would never see any trace events from that loop, and it would be more difficult to
+			// identify where the process is actually stuck.
+			if (bufferLength > 1e8) {
+				fprintf(stderr, "Trace log buffer overflow\n");
+				fprintf(stderr, "Last event: %s\n", fields.toString().c_str());
+				// Setting this to 0 avoids a recurse from the assertion trace event and also prevents a situation where
+				// we roll the trace log only to log the single assertion event when using --crash.
+				bufferLength = 0;
+				ASSERT(false);
+			}
+			if (++tracedLines > FLOW_KNOBS->MAX_TRACE_LINES && failedLineOverflow == 0) {
+				failedLineOverflow = 1; // we only want to do this once
+			}
 		}

 		if (trackError) {
@ -1265,6 +1272,11 @@ void BaseTraceEvent::log() {

 BaseTraceEvent::~BaseTraceEvent() {
 	log();
+	if (failedLineOverflow == 1) {
+		failedLineOverflow = 2;
+		TraceEvent(SevError, "TracedTooManyLines").log();
+		crashAndDie();
+	}
 }

 thread_local bool BaseTraceEvent::networkThread = false;