Merge pull request #2846 from xumengpanda/mengxu/fr-add-attrition-to-test-PR

Performant restore [21/xx]: Enable assassination workload in restore test
This commit is contained in:
Jingyu Zhou 2020-03-23 13:52:01 -07:00 committed by GitHub
commit 9a50458a64
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 46 additions and 24 deletions

View File

@ -32,6 +32,7 @@
#include "fdbclient/MutationList.h"
#include "fdbclient/BackupContainer.h"
#include "fdbrpc/IAsyncFile.h"
#include "fdbrpc/simulator.h"
#include "flow/genericactors.actor.h"
#include "flow/Hash3.h"
#include "flow/ActorCollection.h"
@ -297,11 +298,30 @@ ACTOR Future<Void> _restoreWorker(Database cx, LocalityData locality) {
state Future<Void> myWork = Never();
state Reference<AsyncVar<RestoreWorkerInterface>> leader =
Reference<AsyncVar<RestoreWorkerInterface>>(new AsyncVar<RestoreWorkerInterface>());
state RestoreWorkerInterface myWorkerInterf;
myWorkerInterf.initEndpoints();
state Reference<RestoreWorkerData> self = Reference<RestoreWorkerData>(new RestoreWorkerData());
myWorkerInterf.initEndpoints();
self->workerID = myWorkerInterf.id();
// Protect the restore worker from being killed in simulation.
// TODO: Remove this protection once restore can tolerate worker failures.
if (g_network->isSimulated()) {
auto addresses = g_simulator.getProcessByAddress(myWorkerInterf.address())->addresses;
g_simulator.protectedAddresses.insert(addresses.address);
if (addresses.secondaryAddress.present()) {
g_simulator.protectedAddresses.insert(addresses.secondaryAddress.get());
}
ISimulator::ProcessInfo* p = g_simulator.getProcessByAddress(myWorkerInterf.address());
TraceEvent("ProtectRestoreWorker")
.detail("Address", addresses.toString())
.detail("IsReliable", p->isReliable())
.detail("ReliableInfo", p->getReliableInfo())
.backtrace();
ASSERT(p->isReliable());
}
TraceEvent("FastRestoreWorkerKnobs", myWorkerInterf.id())
.detail("FailureTimeout", SERVER_KNOBS->FASTRESTORE_FAILURE_TIMEOUT)
.detail("HeartBeat", SERVER_KNOBS->FASTRESTORE_HEARTBEAT_INTERVAL)

View File

@ -33,18 +33,19 @@ testTitle=BackupAndParallelRestoreWithAtomicOp
; meanDelay=90.0
; testDuration=90.0
; Do NOT consider machine crash yet
; testName=Attrition
; machinesToKill=10
; machinesToLeave=3
; reboot=true
; testDuration=90.0
; Do NOT kill restore worker processes yet
; Kill other processes to ensure restore works when the FDB cluster has faults
testName=Attrition
machinesToKill=10
machinesToLeave=3
reboot=true
testDuration=90.0
; testName=Attrition
; machinesToKill=10
; machinesToLeave=3
; reboot=true
; testDuration=90.0
testName=Attrition
machinesToKill=10
machinesToLeave=3
reboot=true
testDuration=90.0
; Disable buggify for parallel restore
;buggify=on

View File

@ -55,18 +55,19 @@ testTitle=BackupAndRestore
; meanDelay=90.0
; testDuration=90.0
; Do NOT consider machine crash yet
; testName=Attrition
; machinesToKill=10
; machinesToLeave=3
; reboot=true
; testDuration=90.0
; Do NOT kill restore worker processes yet
; Kill other processes to ensure restore works when the FDB cluster has faults
testName=Attrition
machinesToKill=10
machinesToLeave=3
reboot=true
testDuration=90.0
; testName=Attrition
; machinesToKill=10
; machinesToLeave=3
; reboot=true
; testDuration=90.0
testName=Attrition
machinesToKill=10
machinesToLeave=3
reboot=true
testDuration=90.0
; Disable buggify for parallel restore
;buggify=off