Fix ConsistencyCheck_DataInconsistent failure in simulation

In the KillRegion workload, a forced recovery can intentionally kill a region
and recover from another region with data loss. It's possible that when the
ConsistencyScan (CS) role is checking a shard from two storage servers, the
one from the failed region returned first with a higher version, and then that
SS was killed. Later, CS got data from the second SS in a different region,
where the forced-recovery rollback had happened, so a lower version value was
returned. In this case, even though the data is inconsistent, we shouldn't
fail the test.
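
In code form, the intended decision looks roughly like the following minimal
sketch. The ProcessInfo struct and shouldFailOnMismatch() below are illustrative
stand-ins, assuming only that the simulator marks killed processes with a
failed flag; they are not the actual FDB types.

    // Sketch only: illustrative names, not the actual FDB interfaces.
    #include <iostream>

    struct ProcessInfo {
        bool failed = false; // set by the simulator when the process is killed
    };

    // Report a mismatch as a test failure only when it cannot be explained by a
    // killed region: outside simulation every mismatch counts, inside simulation
    // it counts only if both replicas that were read are still live.
    bool shouldFailOnMismatch(bool isSimulated, const ProcessInfo& a, const ProcessInfo& b) {
        return !isSimulated || (!a.failed && !b.failed);
    }

    int main() {
        ProcessInfo live, killed;
        killed.failed = true;
        std::cout << shouldFailOnMismatch(true, live, killed) << "\n"; // 0: retry instead of failing
        std::cout << shouldFailOnMismatch(true, live, live) << "\n";   // 1: genuine inconsistency
    }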

Reproduction:
  commit: 8e0c9eab8 on release-7.3 branch
  seed: -f ./tests/fast/KillRegionCycle.toml -s 2713020850 -b on
  build: gcc
Jingyu Zhou 2023-05-17 22:41:14 -07:00
parent e25b9ff686
commit eff338f7bd
1 changed file with 27 additions and 1 deletion


@@ -23,6 +23,7 @@
 #include "flow/IRandom.h"
 #include "flow/IndexedSet.h"
 #include "fdbrpc/FailureMonitor.h"
+#include "fdbrpc/SimulatorProcessInfo.h"
 #include "fdbrpc/Smoother.h"
 #include "fdbrpc/simulator.h"
 #include "fdbclient/DatabaseContext.h"
@@ -811,7 +812,31 @@ ACTOR Future<Void> checkDataConsistency(Database cx,
                  g_simulator->tssMode != ISimulator::TSSMode::EnabledDropMutations) ||
                 (!storageServerInterfaces[j].isTss() &&
                  !storageServerInterfaces[firstValidServer].isTss())) {
-                testFailure("Data inconsistent", performQuiescentChecks, success, true);
+                // It's possible that the storage servers are inconsistent in the KillRegion
+                // workload, where a forced recovery is performed. The killed storage server
+                // in the killed region returned first with a higher version, and a later
+                // response from a different region returned a lower version (because of
+                // the rollback of the forced recovery). In this case, we should not fail
+                // the test. So we double check that both processes are live.
+                if (!g_network->isSimulated() ||
+                    (!g_simulator->getProcessByAddress(storageServerInterfaces[j].address())
+                          ->failed &&
+                     !g_simulator
+                          ->getProcessByAddress(
+                              storageServerInterfaces[firstValidServer].address())
+                          ->failed)) {
+                    testFailure("Data inconsistent", performQuiescentChecks, success, true);
+                } else {
+                    // If the storage servers are not live, we should retry.
+                    TraceEvent("ConsistencyCheck_StorageServerUnavailable")
+                        .detail("StorageServer0",
+                                storageServerInterfaces[firstValidServer].id())
+                        .detail("StorageServer1", storageServerInterfaces[j].id())
+                        .detail("ShardBegin", printable(range.begin))
+                        .detail("ShardEnd", printable(range.end));
+                    *success = false;
+                    return Void();
+                }
             }
         }
     }
@@ -1037,6 +1062,7 @@ ACTOR Future<Void> checkDataConsistency(Database cx,
                         failureIsError);
         }
+        TraceEvent("ConsistencyCheck_CheckSplits").detail("Range", range).detail("CanSplit", canSplit);
         // Check if the storage server returns split point for the shard. There are cases where
         // the split point returned by storage server is discarded because it's an unfair split.
         // See splitStorageMetrics() in NativeAPI.actor.cpp.
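
For intuition about the "unfair split" mentioned in the comment above, here is
a toy sketch under an assumed size-fraction criterion; the real decision lives
in splitStorageMetrics() in NativeAPI.actor.cpp and uses its own knobs, so the
function and threshold below are purely illustrative.

    // Toy sketch only: an assumed notion of an "unfair" split, not the actual
    // splitStorageMetrics() logic or its real thresholds.
    #include <cstdint>

    // A proposed split point divides a shard into leftBytes and rightBytes.
    // Discard the split (return false) when one side would be left with less
    // than an assumed minimum fraction of the shard's total bytes.
    bool keepSplitPoint(int64_t leftBytes, int64_t rightBytes) {
        const double minFraction = 0.1; // assumed threshold for illustration
        const double total = double(leftBytes) + double(rightBytes);
        if (total <= 0)
            return false;
        const double leftShare = double(leftBytes) / total;
        return leftShare >= minFraction && leftShare <= 1.0 - minFraction;
    }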