Merge pull request #6896 from sfc-gh-mpilman/features/validate-trace-events-in-simulation

Reduce excessive tracing and fail after 1M traces
This commit is contained in:
Steve Atherton 2022-04-25 18:02:59 -07:00 committed by GitHub
commit fe338ced8f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 36 additions and 21 deletions

View File

@ -265,11 +265,11 @@ void DatabaseContext::getLatestCommitVersions(const Reference<LocationInfo>& loc
void updateCachedReadVersionShared(double t, Version v, DatabaseSharedState* p) {
MutexHolder mutex(p->mutexLock);
if (v >= p->grvCacheSpace.cachedReadVersion) {
TraceEvent(SevDebug, "CacheReadVersionUpdate")
.detail("Version", v)
.detail("CurTime", t)
.detail("LastVersion", p->grvCacheSpace.cachedReadVersion)
.detail("LastTime", p->grvCacheSpace.lastGrvTime);
//TraceEvent(SevDebug, "CacheReadVersionUpdate")
// .detail("Version", v)
// .detail("CurTime", t)
// .detail("LastVersion", p->grvCacheSpace.cachedReadVersion)
// .detail("LastTime", p->grvCacheSpace.lastGrvTime);
p->grvCacheSpace.cachedReadVersion = v;
if (t > p->grvCacheSpace.lastGrvTime) {
p->grvCacheSpace.lastGrvTime = t;
@ -282,11 +282,11 @@ void DatabaseContext::updateCachedReadVersion(double t, Version v) {
return updateCachedReadVersionShared(t, v, sharedStatePtr);
}
if (v >= cachedReadVersion) {
TraceEvent(SevDebug, "CachedReadVersionUpdate")
.detail("Version", v)
.detail("GrvStartTime", t)
.detail("LastVersion", cachedReadVersion)
.detail("LastTime", lastGrvTime);
//TraceEvent(SevDebug, "CachedReadVersionUpdate")
// .detail("Version", v)
// .detail("GrvStartTime", t)
// .detail("LastVersion", cachedReadVersion)
// .detail("LastTime", lastGrvTime);
cachedReadVersion = v;
// Since the time is based on the start of the request, it's possible that we
// get a newer version with an older time.

View File

@ -106,6 +106,7 @@ struct CycleWorkload : TestWorkload {
state Transaction tr(cx);
if (deterministicRandom()->random01() >= self->traceParentProbability) {
state Span span("CycleClient"_loc);
// TraceEvent("CycleTracingTransaction", span.context).log();
TraceEvent("CycleTracingTransaction", span.context).log();
tr.setOption(FDBTransactionOptions::SPAN_PARENT,
BinaryWriter::toValue(span.context, Unversioned()));

View File

@ -217,6 +217,7 @@ void FlowKnobs::initialize(Randomize randomize, IsSimulated isSimulated) {
init( MAX_TRACE_EVENT_LENGTH, 4000 ); // If the value of this is changed, the corresponding default in Trace.cpp should be changed as well
init( ALLOCATION_TRACING_ENABLED, true );
init( SIM_SPEEDUP_AFTER_SECONDS, 450 );
init( MAX_TRACE_LINES, 1'000'000 );
init( CODE_COV_TRACE_EVENT_SEVERITY, 10 ); // Code coverage TraceEvent severity level
//TDMetrics

View File

@ -271,6 +271,7 @@ public:
double MAX_BUGGIFIED_DELAY;
int SIM_CONNECT_ERROR_MODE;
double SIM_SPEEDUP_AFTER_SECONDS;
int MAX_TRACE_LINES;
// Tracefiles
int ZERO_LENGTH_FILE_PAD;

View File

@ -53,6 +53,8 @@
// 2. To avoid a historically documented but unknown crash that occurs when logging allocations
// during an open trace event
thread_local int g_allocation_tracing_disabled = 1;
unsigned tracedLines = 0;
thread_local int failedLineOverflow = 0;
ITraceLogIssuesReporter::~ITraceLogIssuesReporter() {}
@ -394,17 +396,22 @@ public:
eventBuffer.push_back(fields);
bufferLength += fields.sizeBytes();
// If we have queued up a large number of events in simulation, then throw an error. This makes it easier to
// diagnose cases where we get stuck in a loop logging trace events that eventually runs out of memory.
// Without this we would never see any trace events from that loop, and it would be more difficult to identify
// where the process is actually stuck.
if (g_network && g_network->isSimulated() && bufferLength > 1e8) {
fprintf(stderr, "Trace log buffer overflow\n");
fprintf(stderr, "Last event: %s\n", fields.toString().c_str());
// Setting this to 0 avoids a recurse from the assertion trace event and also prevents a situation where
// we roll the trace log only to log the single assertion event when using --crash.
bufferLength = 0;
ASSERT(false);
if (g_network && g_network->isSimulated()) {
// Throw an error if we have queued up a large number of events in simulation. This makes it easier to
// diagnose cases where we get stuck in a loop logging trace events that eventually runs out of memory.
// Without this we would never see any trace events from that loop, and it would be more difficult to
// identify where the process is actually stuck.
if (bufferLength > 1e8) {
fprintf(stderr, "Trace log buffer overflow\n");
fprintf(stderr, "Last event: %s\n", fields.toString().c_str());
// Setting this to 0 avoids a recurse from the assertion trace event and also prevents a situation where
// we roll the trace log only to log the single assertion event when using --crash.
bufferLength = 0;
ASSERT(false);
}
if (++tracedLines > FLOW_KNOBS->MAX_TRACE_LINES && failedLineOverflow == 0) {
failedLineOverflow = 1; // we only want to do this once
}
}
if (trackError) {
@ -1265,6 +1272,11 @@ void BaseTraceEvent::log() {
BaseTraceEvent::~BaseTraceEvent() {
log();
if (failedLineOverflow == 1) {
failedLineOverflow = 2;
TraceEvent(SevError, "TracedTooManyLines").log();
crashAndDie();
}
}
thread_local bool BaseTraceEvent::networkThread = false;