Fix ConsistencyCheck_DataInconsistent failure in simulation
In the KillRegion workload, a forced recovery can intentionally kill a region
and recover from another region with data loss. It's possible that, while the
ConsistencyScan (CS) role is checking a shard on two storage servers, the SS
from the failed region responds first with a higher version and is then killed.
Later, CS gets data for the same shard from the second SS in a different region,
where the forced-recovery rollback has already happened, so a lower-version
value is returned. In this case, even though the data looks inconsistent, we
shouldn't fail the test.
Reproduction:
commit: 8e0c9eab8 (release-7.3 branch)
seed: -f ./tests/fast/KillRegionCycle.toml -s 2713020850 -b on
build: gcc
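For reference, a seed like this can typically be replayed by invoking the simulator directly with the same arguments (assuming an fdbserver binary built from the commit above; the exact binary path depends on the local build):

    fdbserver -r simulation -f ./tests/fast/KillRegionCycle.toml -s 2713020850 -b on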
parent e25b9ff686
commit eff338f7bd
@@ -23,6 +23,7 @@
#include "flow/IRandom.h"
#include "flow/IndexedSet.h"
#include "fdbrpc/FailureMonitor.h"
#include "fdbrpc/SimulatorProcessInfo.h"
#include "fdbrpc/Smoother.h"
#include "fdbrpc/simulator.h"
#include "fdbclient/DatabaseContext.h"
@@ -811,7 +812,31 @@ ACTOR Future<Void> checkDataConsistency(Database cx,
                 g_simulator->tssMode != ISimulator::TSSMode::EnabledDropMutations) ||
                (!storageServerInterfaces[j].isTss() &&
                 !storageServerInterfaces[firstValidServer].isTss())) {
                // It's possible that the storage servers are inconsistent in the KillRegion
                // workload, where a forced recovery is performed. The killed storage server
                // in the killed region returned first with a higher version, and a later
                // response from a different region returned a lower version (because of the
                // rollback of the forced recovery). In this case, we should not fail the
                // test, so we double-check that both processes are live.
                if (!g_network->isSimulated() ||
                    (!g_simulator->getProcessByAddress(storageServerInterfaces[j].address())->failed &&
                     !g_simulator->getProcessByAddress(storageServerInterfaces[firstValidServer].address())
                          ->failed)) {
                    testFailure("Data inconsistent", performQuiescentChecks, success, true);
                } else {
                    // If the storage servers are not live, we should retry.
                    TraceEvent("ConsistencyCheck_StorageServerUnavailable")
                        .detail("StorageServer0", storageServerInterfaces[firstValidServer].id())
                        .detail("StorageServer1", storageServerInterfaces[j].id())
                        .detail("ShardBegin", printable(range.begin))
                        .detail("ShardEnd", printable(range.end));
                    *success = false;
                    return Void();
                }
            }
        }
    }
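Condensed, the intent of the new guard in the hunk above is roughly the sketch below. isStorageServerAlive is a hypothetical helper introduced only for illustration; the actual patch inlines the g_simulator checks as shown:

    // Only report "Data inconsistent" when the mismatch cannot be explained by a
    // process that the simulator killed while the shard was being checked.
    bool isStorageServerAlive(const StorageServerInterface& ssi) {
        return !g_simulator->getProcessByAddress(ssi.address())->failed;
    }

    if (!g_network->isSimulated() ||
        (isStorageServerAlive(storageServerInterfaces[j]) &&
         isStorageServerAlive(storageServerInterfaces[firstValidServer]))) {
        // Real cluster, or both replicas still live in simulation: genuine bug.
        testFailure("Data inconsistent", performQuiescentChecks, success, true);
    } else {
        // A replica was killed (e.g. by KillRegion); don't fail the test,
        // mark this pass unsuccessful and let the caller retry.
        *success = false;
        return Void();
    }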
@@ -1037,6 +1062,7 @@ ACTOR Future<Void> checkDataConsistency(Database cx,
                                        failureIsError);
            }

            TraceEvent("ConsistencyCheck_CheckSplits").detail("Range", range).detail("CanSplit", canSplit);
            // Check if the storage server returns split point for the shard. There are cases where
            // the split point returned by storage server is discarded because it's an unfair split.
            // See splitStorageMetrics() in NativeAPI.actor.cpp.