Update ChangeConfig to test coordinators special keys

This commit is contained in:
Chaoguang Lin 2021-02-16 13:01:37 -08:00
parent 3def9731a4
commit 731ee8a121
5 changed files with 177 additions and 73 deletions

View File

@ -1009,7 +1009,8 @@ ACTOR Future<std::vector<NetworkAddress>> getCoordinators( Database cx ) {
}
}
ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr, Reference<IQuorumChange> change, std::vector<NetworkAddress>* desiredCoordinators, int* retries, int* notEnoughMachineResults) {
ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr, Reference<IQuorumChange> change,
std::vector<NetworkAddress>* desiredCoordinators) {
tr->setOption( FDBTransactionOptions::LOCK_AWARE );
tr->setOption( FDBTransactionOptions::USE_PROVISIONAL_PROXIES );
tr->setOption( FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE );
@ -1028,24 +1029,19 @@ ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr,
*desiredCoordinators = _desiredCoordinators;
}
if(result == CoordinatorsResult::NOT_ENOUGH_MACHINES && *notEnoughMachineResults < 1) {
//we could get not_enough_machines if we happen to see the database while the cluster controller is updating the worker list, so make sure it happens twice before returning a failure
(*notEnoughMachineResults)++;
wait( delay(1.0) );
tr->reset();
return Optional<CoordinatorsResult>();
}
if (result != CoordinatorsResult::SUCCESS)
return result;
if (!desiredCoordinators->size())
return CoordinatorsResult::INVALID_NETWORK_ADDRESSES;
std::sort(desiredCoordinators->begin(), desiredCoordinators->end());
std::string newName = change->getDesiredClusterKeyName();
if (newName.empty()) newName = old.clusterKeyName().toString();
if ( old.coordinators() == *desiredCoordinators && old.clusterKeyName() == newName)
return *retries ? CoordinatorsResult::SUCCESS : CoordinatorsResult::SAME_NETWORK_ADDRESSES;
return CoordinatorsResult::SAME_NETWORK_ADDRESSES;
state ClusterConnectionString conn( *desiredCoordinators, StringRef( newName + ':' + deterministicRandom()->randomAlphaNumeric( 32 ) ) );
@ -1061,12 +1057,6 @@ ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr,
}
}
TraceEvent("AttemptingQuorumChange")
.detail("FromCS", old.toString())
.detail("ToCS", conn.toString())
.detail("OldClusterDescription", old.clusterKeyName())
.detail("NewClusterDescription", conn.clusterKeyName());
vector<Future<Optional<LeaderInfo>>> leaderServers;
ClientCoordinators coord( Reference<ClusterConnectionFile>( new ClusterConnectionFile( conn ) ) );
for( int i = 0; i < coord.clientLeaderServers.size(); i++ )

View File

@ -144,7 +144,8 @@ struct IQuorumChange : ReferenceCounted<IQuorumChange> {
};
// Change to use the given set of coordination servers
ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr, Reference<IQuorumChange> change, std::vector<NetworkAddress>* desiredCoordinators, int* retries, int* notEnoughMachineResults);
ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr, Reference<IQuorumChange> change,
std::vector<NetworkAddress>* desiredCoordinators);
ACTOR Future<CoordinatorsResult> changeQuorum(Database cx, Reference<IQuorumChange> change);
Reference<IQuorumChange> autoQuorumChange(int desired = -1);
Reference<IQuorumChange> noQuorumChange();

View File

@ -1411,9 +1411,7 @@ ACTOR static Future<Optional<std::string>> coordinatorsCommitActor(ReadYourWrite
state std::vector<NetworkAddress> addressesVec;
state std::vector<std::string> process_address_strs;
state Optional<std::string> msg;
state int retry = 0;
state int index;
state int notEnoughMachineResults;
state bool parse_error = false;
// check update for cluster_description
@ -1485,14 +1483,14 @@ ACTOR static Future<Optional<std::string>> coordinatorsCommitActor(ReadYourWrite
.detail("Auto", auto_option.first)
.detail("Description", entry.first ? entry.second.get().toString() : "");
Optional<CoordinatorsResult> r =
wait(changeQuorumChecker(&ryw->getTransaction(), change, &addressesVec, &retry, &notEnoughMachineResults));
Optional<CoordinatorsResult> r = wait(changeQuorumChecker(&ryw->getTransaction(), change, &addressesVec));
TraceEvent(SevDebug, "SKSChangeCoordinatorsFinish")
.detail("Result", r.present() ? int(r.get()) : -1); // -1 means success
if (r.present()) {
auto res = r.get();
std::string error_msg;
bool retriable = false;
if (res == CoordinatorsResult::INVALID_NETWORK_ADDRESSES) {
error_msg = "The specified network addresses are invalid";
} else if (res == CoordinatorsResult::SAME_NETWORK_ADDRESSES) {
@ -1505,6 +1503,7 @@ ACTOR static Future<Optional<std::string>> coordinatorsCommitActor(ReadYourWrite
error_msg = "The database is in an unexpected state from which changing coordinators might be unsafe";
} else if (res == CoordinatorsResult::COORDINATOR_UNREACHABLE) {
error_msg = "One of the specified coordinators is unreachable";
retriable = true;
} else if (res == CoordinatorsResult::NOT_ENOUGH_MACHINES) {
error_msg = "Too few fdbserver machines to provide coordination at the current redundancy level";
} else if (res == CoordinatorsResult::SUCCESS) {
@ -1512,7 +1511,7 @@ ACTOR static Future<Optional<std::string>> coordinatorsCommitActor(ReadYourWrite
} else {
ASSERT(false);
}
msg = ManagementAPIError::toJsonString(false, "coordinators", error_msg);
msg = ManagementAPIError::toJsonString(retriable, "coordinators", error_msg);
}
return msg;
}

View File

@ -24,6 +24,7 @@
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbrpc/simulator.h"
#include "fdbclient/Schemas.h"
#include "flow/actorcompiler.h" // This must be the last #include.
struct ChangeConfigWorkload : TestWorkload {
@ -65,9 +66,9 @@ struct ChangeConfigWorkload : TestWorkload {
TraceEvent("WaitForReplicasExtraEnd");
} if (self->networkAddresses.size()) {
if (self->networkAddresses == "auto")
wait(success(changeQuorum(extraDB, autoQuorumChange())));
wait(CoordinatorsChangeActor(extraDB, self, true));
else
wait(success(changeQuorum(extraDB, specifiedQuorumChange(NetworkAddress::parseList(self->networkAddresses)))));
wait(CoordinatorsChangeActor(extraDB, self));
}
wait(delay(5*deterministicRandom()->random01()));
}
@ -91,9 +92,9 @@ struct ChangeConfigWorkload : TestWorkload {
}
if( self->networkAddresses.size() ) {
if (self->networkAddresses == "auto")
wait(success( changeQuorum( cx, autoQuorumChange() ) ));
wait(CoordinatorsChangeActor(cx, self, true));
else
wait(success( changeQuorum( cx, specifiedQuorumChange(NetworkAddress::parseList( self->networkAddresses )) ) ));
wait(CoordinatorsChangeActor(cx, self));
}
if(!extraConfigureBefore) {
@ -102,6 +103,95 @@ struct ChangeConfigWorkload : TestWorkload {
return Void();
}
// Requests a coordinator change by writing the "processes" key under the special-key-space
// management "coordinators" command prefix and committing, then validates the JSON error message
// that the commit reports.
//   cx         - database the ReadYourWritesTransaction is created on.
//   self       - workload instance; self->networkAddresses is used verbatim as the new coordinator
//                list when autoChange is false.
//   autoChange - when true, the desired coordinators are first obtained from autoQuorumChange()
//                and joined into a comma separated string.
// Returns Void() after a special_keys_api_failure has been received and its message validated, or
// early from the auto-discovery phase when the change cannot or need not be attempted.
ACTOR static Future<Void> CoordinatorsChangeActor(Database cx, ChangeConfigWorkload* self,
bool autoChange = false) {
state ReadYourWritesTransaction tr(cx);
state int notEnoughMachineResults = 0; // Retry for the second time if we first get this result
// state std::vector<NetworkAddress> desiredCoordinators; // the desired coordinators' network addresses
state std::string desiredCoordinatorsKey; // comma separated
if (autoChange) { // if auto, we first get the desired addresses, which is not changed in the following retries
// Phase 1 (auto only): read the current coordinators and compute the desired address list.
loop {
try {
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
Optional<Value> currentKey = wait(tr.get(coordinatorsKey));
if (!currentKey.present()) return Void(); // Someone deleted this key entirely?
ClusterConnectionString old(currentKey.get().toString());
// Give up if the stored cluster key name no longer matches the one in our connection file.
if (cx->getConnectionFile() && old.clusterKeyName().toString() !=
cx->getConnectionFile()->getConnectionString().clusterKeyName())
return Void(); // Someone changed the "name" of the database??
// result is re-initialized to SUCCESS on every iteration; it is passed to
// getDesiredCoordinators below (presumably as an out-parameter -- TODO confirm against IQuorumChange).
state CoordinatorsResult result = CoordinatorsResult::SUCCESS;
// Only ask for the desired coordinators while the key string is still empty, so that a list
// obtained on an earlier iteration is reused rather than recomputed.
if (!desiredCoordinatorsKey.size()) {
std::vector<NetworkAddress> _desiredCoordinators =
wait(autoQuorumChange()->getDesiredCoordinators(
&tr.getTransaction(), old.coordinators(),
Reference<ClusterConnectionFile>(new ClusterConnectionFile(old)), result));
// Join the addresses with "," (no leading or trailing separator).
for (const auto& address : _desiredCoordinators) {
desiredCoordinatorsKey += desiredCoordinatorsKey.size() ? "," : "";
desiredCoordinatorsKey += address.toString();
}
}
if (result == CoordinatorsResult::NOT_ENOUGH_MACHINES && notEnoughMachineResults < 1) {
// we could get not_enough_machines if we happen to see the database while the cluster
// controller is updating the worker list, so make sure it happens twice before returning a
// failure
notEnoughMachineResults++;
wait(delay(1.0));
tr.reset();
continue;
}
// Any other non-success result ends the actor without attempting the change.
if (result != CoordinatorsResult::SUCCESS) return Void();
// Start phase 2 from a fresh transaction.
tr.reset();
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
} else {
// Manual mode: use the configured address string as-is.
desiredCoordinatorsKey = self->networkAddresses;
}
// Phase 2: write the desired coordinators through the special key space and commit.
loop {
try {
tr.setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
tr.set(LiteralStringRef("processes")
.withPrefix(SpecialKeySpace::getManagementApiCommandPrefix("coordinators")),
Value(desiredCoordinatorsKey));
TraceEvent(SevDebug, "CoordinatorsChangeBeforeCommit")
.detail("Auto", autoChange)
.detail("NewCoordinatorsKey", describe(desiredCoordinatorsKey));
wait(tr.commit());
// The commit is asserted never to return normally; the outcome is handled in the catch below.
ASSERT(false);
} catch (Error& e) {
// Copy the error into actor state because it is needed after the wait() calls below.
state Error err(e);
if (e.code() == error_code_special_keys_api_failure) {
// Fetch the error message from the first key of the ERRORMSG module range.
Optional<Value> errorMsg =
wait(tr.get(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::ERRORMSG).begin));
ASSERT(errorMsg.present());
std::string errorStr;
auto valueObj = readJSONStrictly(errorMsg.get().toString()).get_obj();
auto schema = readJSONStrictly(JSONSchemas::managementApiErrorSchema.toString()).get_obj();
// Log the reported message, then validate it against the
// special_key_space_management_api_error_msg schema
TraceEvent(SevDebug, "CoordinatorsChangeError")
.detail("Auto", autoChange)
.detail("ErrorMessage", valueObj["message"].get_str());
ASSERT(schemaMatch(schema, valueObj, errorStr, SevError, true));
ASSERT(valueObj["command"].get_str() == "coordinators");
// Any schema-valid "coordinators" message ends the loop; the "retriable" field is not inspected here.
break;
} else {
wait(tr.onError(err));
}
// Only reached after tr.onError() returns; pause briefly before the next attempt.
wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
}
}
return Void();
}
};
WorkloadFactory<ChangeConfigWorkload> ChangeConfigWorkloadFactory("ChangeConfig");

View File

@ -27,6 +27,8 @@
#include "fdbclient/SpecialKeySpace.actor.h"
#include "fdbserver/TesterInterface.actor.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "flow/Knobs.h"
#include "flow/Trace.h"
#include "flow/actorcompiler.h"
struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
@ -440,11 +442,12 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
wait(tx->getRange(KeyRangeRef(LiteralStringRef("\xff\xff/worker_interfaces/"),
LiteralStringRef("\xff\xff/worker_interfaces0")),
CLIENT_KNOBS->TOO_MANY));
// We should have at least 1 process in the cluster
ASSERT(result.size());
state KeyValueRef entry = deterministicRandom()->randomChoice(result);
Optional<Value> singleRes = wait(tx->get(entry.key));
ASSERT(singleRes.present() && singleRes.get() == entry.value);
// Note: there's a possibility that we get zero workers
if (result.size()) {
state KeyValueRef entry = deterministicRandom()->randomChoice(result);
Optional<Value> singleRes = wait(tx->get(entry.key));
ASSERT(singleRes.present() && singleRes.get() == entry.value);
}
tx->reset();
} catch (Error& e) {
wait(tx->onError(e));
@ -763,7 +766,11 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
Key("process/class_source/" + address)
.withPrefix(
SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::CONFIGURATION).begin)));
ASSERT(class_source.present() && class_source.get() == LiteralStringRef("set_class"));
TraceEvent(SevDebug, "SetClassSourceDebug")
.detail("Present", class_source.present())
.detail("ClassSource", class_source.present() ? class_source.get().toString() : "__Nothing");
// Very rarely, we get an empty worker list, thus no class_source data
if (class_source.present()) ASSERT(class_source.get() == LiteralStringRef("set_class"));
tx->reset();
} else {
// If no worker process returned, skip the test
@ -968,11 +975,10 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
.withPrefix(SpecialKeySpace::getManagementApiCommandPrefix("coordinators")),
Value(new_cluster_description));
wait(tx->commit());
tx->reset();
break;
ASSERT(false);
} catch (Error& e) {
TraceEvent(SevDebug, "CoordinatorsManualChange").error(e);
// if we repeat doing the change, we will get this error:
// if we repeat doing the change, we will get the error:
// CoordinatorsResult::SAME_NETWORK_ADDRESSES
if (e.code() == error_code_special_keys_api_failure) {
Optional<Value> errorMsg =
@ -983,16 +989,21 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
auto schema = readJSONStrictly(JSONSchemas::managementApiErrorSchema.toString()).get_obj();
// special_key_space_management_api_error_msg schema validation
ASSERT(schemaMatch(schema, valueObj, errorStr, SevError, true));
ASSERT(valueObj["command"].get_str() == "coordinators" &&
!valueObj["retriable"].get_bool());
ASSERT(valueObj["message"].get_str() ==
"No change (existing configuration satisfies request)");
tx->reset();
break;
TraceEvent(SevDebug, "CoordinatorsManualChange")
.detail("ErrorMessage", valueObj["message"].get_str());
ASSERT(valueObj["command"].get_str() == "coordinators");
if (valueObj["retriable"].get_bool()) { // coordinators not reachable, retry
tx->reset();
} else {
ASSERT(valueObj["message"].get_str() ==
"No change (existing configuration satisfies request)");
tx->reset();
break;
}
} else {
wait(tx->onError(e));
wait(delay(1.0));
}
wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
}
}
// change successful, now check it is already changed
@ -1010,43 +1021,56 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
new_coordinator_process == address_str);
}
// verify the cluster description
TraceEvent(SevDebug, "CoordinatorsManualChange")
.detail("NewClsuterDescription", cs.clusterKeyName());
ASSERT(new_cluster_description == cs.clusterKeyName().toString());
tx->reset();
} catch (Error& e) {
wait(tx->onError(e));
wait(delay(1.0));
wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
}
}
}
// test coordinators' "auto" option
loop {
try {
tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
tx->set(SpecialKeySpace::getManagementApiCommandOptionSpecialKey("coordinators", "auto"), ValueRef());
wait(tx->commit()); // if an "auto" change happened, the commit may or may not succeed
tx->reset();
} catch (Error& e) {
TraceEvent(SevDebug, "CoordinatorsAutoChange").error(e);
// if we repeat doing "auto" change, we will get this error: CoordinatorsResult::SAME_NETWORK_ADDRESSES
if (e.code() == error_code_special_keys_api_failure) {
Optional<Value> errorMsg =
wait(tx->get(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::ERRORMSG).begin));
ASSERT(errorMsg.present());
std::string errorStr;
auto valueObj = readJSONStrictly(errorMsg.get().toString()).get_obj();
auto schema = readJSONStrictly(JSONSchemas::managementApiErrorSchema.toString()).get_obj();
// special_key_space_management_api_error_msg schema validation
TraceEvent(SevDebug, "CoordinatorsAutoChange").detail("SKSErrorMessage", valueObj["message"].get_str());
ASSERT(schemaMatch(schema, valueObj, errorStr, SevError, true));
ASSERT(valueObj["command"].get_str() == "coordinators" && !valueObj["retriable"].get_bool());
// ASSERT(valueObj["message"].get_str() == "No change (existing configuration satisfies request)");
tx->reset();
break;
} else {
wait(tx->onError(e));
wait(delay(1.0));
// change back to original settings
loop {
try {
std::string new_processes_key;
tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
for (const auto& address : old_coordinators_processes) {
new_processes_key += new_processes_key.size() ? "," : "";
new_processes_key += address;
}
tx->set(LiteralStringRef("processes")
.withPrefix(SpecialKeySpace::getManagementApiCommandPrefix("coordinators")),
Value(new_processes_key));
wait(tx->commit());
ASSERT(false);
} catch (Error& e) {
TraceEvent(SevDebug, "CoordinatorsManualChangeRevert").error(e);
if (e.code() == error_code_special_keys_api_failure) {
Optional<Value> errorMsg =
wait(tx->get(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::ERRORMSG).begin));
ASSERT(errorMsg.present());
std::string errorStr;
auto valueObj = readJSONStrictly(errorMsg.get().toString()).get_obj();
auto schema = readJSONStrictly(JSONSchemas::managementApiErrorSchema.toString()).get_obj();
// special_key_space_management_api_error_msg schema validation
ASSERT(schemaMatch(schema, valueObj, errorStr, SevError, true));
TraceEvent(SevDebug, "CoordinatorsManualChangeRevert")
.detail("ErrorMessage", valueObj["message"].get_str());
ASSERT(valueObj["command"].get_str() == "coordinators");
if (valueObj["retriable"].get_bool()) {
tx->reset();
} else if (valueObj["message"].get_str() ==
"No change (existing configuration satisfies request)") {
tx->reset();
break;
} else {
TraceEvent(SevError, "CoordinatorsManualChangeRevert")
.detail("UnexpectedError", valueObj["message"].get_str());
throw special_keys_api_failure();
}
} else {
wait(tx->onError(e));
}
wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
}
}
}
}