foundationdb/fdbserver/CoordinatedState.actor.cpp

364 lines
14 KiB
C++

/*
* CoordinatedState.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/ClusterConnectionMemoryRecord.h"
#include "fdbserver/CoordinatedState.h"
#include "fdbserver/CoordinationInterface.h"
#include "fdbserver/Knobs.h"
#include "flow/ActorCollection.h"
#include "fdbserver/LeaderElection.h"
#include "flow/actorcompiler.h" // has to be last include
ACTOR Future<GenerationRegReadReply> waitAndSendRead(RequestStream<GenerationRegReadRequest> to,
GenerationRegReadRequest req) {
if (SERVER_KNOBS->BUGGIFY_ALL_COORDINATION || BUGGIFY)
wait(delay(SERVER_KNOBS->BUGGIFIED_EVENTUAL_CONSISTENCY * deterministicRandom()->random01()));
state GenerationRegReadReply reply = wait(retryBrokenPromise(to, req));
if (SERVER_KNOBS->BUGGIFY_ALL_COORDINATION || BUGGIFY)
wait(delay(SERVER_KNOBS->BUGGIFIED_EVENTUAL_CONSISTENCY * deterministicRandom()->random01()));
return reply;
}
ACTOR Future<UniqueGeneration> waitAndSendWrite(RequestStream<GenerationRegWriteRequest> to,
GenerationRegWriteRequest req) {
if (SERVER_KNOBS->BUGGIFY_ALL_COORDINATION || BUGGIFY)
wait(delay(SERVER_KNOBS->BUGGIFIED_EVENTUAL_CONSISTENCY * deterministicRandom()->random01()));
state UniqueGeneration reply = wait(retryBrokenPromise(to, req));
if (SERVER_KNOBS->BUGGIFY_ALL_COORDINATION || BUGGIFY)
wait(delay(SERVER_KNOBS->BUGGIFIED_EVENTUAL_CONSISTENCY * deterministicRandom()->random01()));
return reply;
}
ACTOR Future<GenerationRegReadReply> emptyToNever(Future<GenerationRegReadReply> f) {
state GenerationRegReadReply r = wait(f);
if (r.gen.generation == 0)
wait(Future<Void>(Never()));
return r;
}
ACTOR Future<GenerationRegReadReply> nonemptyToNever(Future<GenerationRegReadReply> f) {
state GenerationRegReadReply r = wait(f);
if (r.gen.generation != 0)
wait(Future<Void>(Never()));
return r;
}
struct CoordinatedStateImpl {
ServerCoordinators coordinators;
int stage;
UniqueGeneration gen;
uint64_t conflictGen;
bool doomed;
ActorCollection ac; // Errors are not reported
bool initial;
CoordinatedStateImpl(ServerCoordinators const& c)
: coordinators(c), stage(0), conflictGen(0), doomed(false), ac(false), initial(false) {}
uint64_t getConflict() { return conflictGen; }
bool isDoomed(GenerationRegReadReply const& rep) {
return rep.gen > gen // setExclusive is doomed, because there was a write at least started at a higher
// generation, which means a read completed at that higher generation
// || rep.rgen > gen // setExclusive isn't absolutely doomed, but it may/probably will fail
;
}
ACTOR static Future<Value> read(CoordinatedStateImpl* self) {
ASSERT(self->stage == 0);
{
self->stage = 1;
GenerationRegReadReply rep = wait(self->replicatedRead(
self, GenerationRegReadRequest(self->coordinators.clusterKey, UniqueGeneration())));
self->conflictGen = std::max(self->conflictGen, std::max(rep.gen.generation, rep.rgen.generation)) + 1;
self->gen = UniqueGeneration(self->conflictGen, deterministicRandom()->randomUniqueID());
}
{
self->stage = 2;
GenerationRegReadReply rep =
wait(self->replicatedRead(self, GenerationRegReadRequest(self->coordinators.clusterKey, self->gen)));
self->stage = 3;
self->conflictGen = std::max(self->conflictGen, std::max(rep.gen.generation, rep.rgen.generation));
if (self->isDoomed(rep))
self->doomed = true;
self->initial = rep.gen.generation == 0;
self->stage = 4;
return rep.value.present() ? rep.value.get() : Value();
}
}
ACTOR static Future<Void> onConflict(CoordinatedStateImpl* self) {
ASSERT(self->stage == 4);
if (self->doomed)
return Void();
loop {
wait(delay(SERVER_KNOBS->COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL));
GenerationRegReadReply rep = wait(self->replicatedRead(
self, GenerationRegReadRequest(self->coordinators.clusterKey, UniqueGeneration())));
if (self->stage > 4)
break;
self->conflictGen = std::max(self->conflictGen, std::max(rep.gen.generation, rep.rgen.generation));
if (self->isDoomed(rep))
return Void();
}
wait(Future<Void>(Never()));
return Void();
}
ACTOR static Future<Void> setExclusive(CoordinatedStateImpl* self, Value v) {
ASSERT(self->stage == 4);
self->stage = 5;
UniqueGeneration wgen = wait(self->replicatedWrite(
self, GenerationRegWriteRequest(KeyValueRef(self->coordinators.clusterKey, v), self->gen)));
self->stage = 6;
TraceEvent("CoordinatedStateSet")
.detail("Gen", self->gen.generation)
.detail("Wgen", wgen.generation)
.detail("Genu", self->gen.uid)
.detail("Wgenu", wgen.uid)
.detail("Cgen", self->conflictGen);
if (wgen == self->gen)
return Void();
else {
self->conflictGen = std::max(self->conflictGen, wgen.generation);
throw coordinated_state_conflict();
}
}
ACTOR static Future<GenerationRegReadReply> replicatedRead(CoordinatedStateImpl* self,
GenerationRegReadRequest req) {
state std::vector<GenerationRegInterface>& replicas = self->coordinators.stateServers;
state std::vector<Future<GenerationRegReadReply>> rep_empty_reply;
state std::vector<Future<GenerationRegReadReply>> rep_reply;
for (int i = 0; i < replicas.size(); i++) {
Future<GenerationRegReadReply> reply =
waitAndSendRead(replicas[i].read, GenerationRegReadRequest(req.key, req.gen));
rep_empty_reply.push_back(nonemptyToNever(reply));
rep_reply.push_back(emptyToNever(reply));
self->ac.add(success(reply));
}
state Future<Void> majorityEmpty =
quorum(rep_empty_reply,
(replicas.size() + 1) / 2); // enough empty to ensure we cannot achieve a majority non-empty
wait(quorum(rep_reply, replicas.size() / 2 + 1) || majorityEmpty);
if (majorityEmpty.isReady()) {
int best = -1;
for (int i = 0; i < rep_empty_reply.size(); i++)
if (rep_empty_reply[i].isReady() && !rep_empty_reply[i].isError()) {
if (best < 0 || rep_empty_reply[i].get().rgen > rep_empty_reply[best].get().rgen)
best = i;
}
ASSERT(best >= 0);
auto result = rep_empty_reply[best].get();
return result;
} else {
int best = -1;
for (int i = 0; i < rep_reply.size(); i++)
if (rep_reply[i].isReady() && !rep_reply[i].isError()) {
if (best < 0 || rep_reply[i].get().gen > rep_reply[best].get().gen ||
(rep_reply[i].get().gen == rep_reply[best].get().gen &&
rep_reply[i].get().rgen > rep_reply[best].get().rgen))
best = i;
}
ASSERT(best >= 0);
auto result = rep_reply[best].get();
return result;
}
}
ACTOR static Future<UniqueGeneration> replicatedWrite(CoordinatedStateImpl* self, GenerationRegWriteRequest req) {
state std::vector<GenerationRegInterface>& replicas = self->coordinators.stateServers;
state std::vector<Future<UniqueGeneration>> wrep_reply;
for (int i = 0; i < replicas.size(); i++) {
Future<UniqueGeneration> reply =
waitAndSendWrite(replicas[i].write, GenerationRegWriteRequest(req.kv, req.gen));
wrep_reply.push_back(reply);
self->ac.add(success(reply));
}
wait(quorum(wrep_reply, self->initial ? replicas.size() : replicas.size() / 2 + 1));
UniqueGeneration maxGen;
for (int i = 0; i < wrep_reply.size(); i++)
if (wrep_reply[i].isReady())
maxGen = std::max(maxGen, wrep_reply[i].get());
return maxGen;
}
};
CoordinatedState::CoordinatedState(ServerCoordinators const& coord)
: impl(std::make_unique<CoordinatedStateImpl>(coord)) {}
CoordinatedState::~CoordinatedState() = default;
Future<Value> CoordinatedState::read() {
return CoordinatedStateImpl::read(impl.get());
}
Future<Void> CoordinatedState::onConflict() {
return CoordinatedStateImpl::onConflict(impl.get());
}
Future<Void> CoordinatedState::setExclusive(Value v) {
return CoordinatedStateImpl::setExclusive(impl.get(), v);
}
uint64_t CoordinatedState::getConflict() {
return impl->getConflict();
}
struct MovableValue {
enum MoveState { MaybeTo = 1, Active = 2, MovingFrom = 3 };
Value value;
int32_t mode;
Optional<Value> other; // a cluster connection string
MovableValue() : mode(Active) {}
MovableValue(Value const& v, int mode, Optional<Value> other = Optional<Value>())
: value(v), mode(mode), other(other) {}
// To change this serialization, ProtocolVersion::MovableCoordinatedStateV2 must be updated, and downgrades need to
// be considered
template <class Ar>
void serialize(Ar& ar) {
ASSERT(ar.protocolVersion().hasMovableCoordinatedState());
serializer(ar, value, mode, other);
}
};
struct MovableCoordinatedStateImpl {
ServerCoordinators coordinators;
CoordinatedState cs;
Optional<Value> lastValue, // The value passed to setExclusive()
lastCSValue; // The value passed to cs.setExclusive()
MovableCoordinatedStateImpl(ServerCoordinators const& c) : coordinators(c), cs(c) {}
ACTOR static Future<Value> read(MovableCoordinatedStateImpl* self) {
state MovableValue moveState;
Value rawValue = wait(self->cs.read());
if (rawValue.size()) {
BinaryReader r(rawValue, IncludeVersion());
if (!r.protocolVersion().hasMovableCoordinatedState()) {
// Old coordinated state, not a MovableValue
moveState.value = rawValue;
} else
r >> moveState;
}
// SOMEDAY: If moveState.mode == MovingFrom, read (without locking) old state and assert that it corresponds
// with our state and is ReallyTo(coordinators)
if (moveState.mode == MovableValue::MaybeTo) {
TEST(true); // Maybe moveto state
ASSERT(moveState.other.present());
wait(self->moveTo(
self, &self->cs, ClusterConnectionString(moveState.other.get().toString()), moveState.value));
}
return moveState.value;
}
Future<Void> onConflict() { return cs.onConflict(); }
Future<Void> setExclusive(Value v) {
lastValue = v;
lastCSValue = BinaryWriter::toValue(MovableValue(v, MovableValue::Active),
IncludeVersion(ProtocolVersion::withMovableCoordinatedStateV2()));
return cs.setExclusive(lastCSValue.get());
}
ACTOR static Future<Void> move(MovableCoordinatedStateImpl* self, ClusterConnectionString nc) {
// Call only after setExclusive returns. Attempts to move the coordinated state
// permanently to the new ServerCoordinators, which must be uninitialized. Returns when the process has
// reached the point where a leader elected by the new coordinators should be doing the rest of the work
// (and therefore the caller should die).
state CoordinatedState cs(self->coordinators);
state CoordinatedState nccs(ServerCoordinators(makeReference<ClusterConnectionMemoryRecord>(nc)));
state Future<Void> creationTimeout = delay(30);
ASSERT(self->lastValue.present() && self->lastCSValue.present());
TraceEvent("StartMove").detail("ConnectionString", nc.toString());
choose {
when(wait(creationTimeout)) { throw new_coordinators_timed_out(); }
when(Value ncInitialValue = wait(nccs.read())) {
ASSERT(!ncInitialValue.size()); // The new coordinators must be uninitialized!
}
}
TraceEvent("FinishedRead").detail("ConnectionString", nc.toString());
choose {
when(wait(creationTimeout)) { throw new_coordinators_timed_out(); }
when(wait(nccs.setExclusive(
BinaryWriter::toValue(MovableValue(self->lastValue.get(),
MovableValue::MovingFrom,
self->coordinators.ccr->getConnectionString().toString()),
IncludeVersion(ProtocolVersion::withMovableCoordinatedStateV2()))))) {}
}
if (BUGGIFY)
wait(delay(5));
Value oldQuorumState = wait(cs.read());
if (oldQuorumState != self->lastCSValue.get()) {
TEST(true); // Quorum change aborted by concurrent write to old coordination state
TraceEvent("QuorumChangeAbortedByConcurrency").log();
throw coordinated_state_conflict();
}
wait(self->moveTo(self, &cs, nc, self->lastValue.get()));
throw coordinators_changed();
}
ACTOR static Future<Void> moveTo(MovableCoordinatedStateImpl* self,
CoordinatedState* coordinatedState,
ClusterConnectionString nc,
Value value) {
wait(coordinatedState->setExclusive(
BinaryWriter::toValue(MovableValue(value, MovableValue::MaybeTo, nc.toString()),
IncludeVersion(ProtocolVersion::withMovableCoordinatedStateV2()))));
if (BUGGIFY)
wait(delay(5));
// SOMEDAY: If we are worried about someone magically getting the new cluster ID and interfering, do a second
// cs.setExclusive( encode( ReallyTo, ... ) )
TraceEvent("ChangingQuorum").detail("ConnectionString", nc.toString());
wait(changeLeaderCoordinators(self->coordinators, StringRef(nc.toString())));
TraceEvent("ChangedQuorum").detail("ConnectionString", nc.toString());
throw coordinators_changed();
}
};
MovableCoordinatedState& MovableCoordinatedState::operator=(MovableCoordinatedState&&) = default;
MovableCoordinatedState::MovableCoordinatedState(class ServerCoordinators const& coord)
: impl(std::make_unique<MovableCoordinatedStateImpl>(coord)) {}
MovableCoordinatedState::~MovableCoordinatedState() = default;
Future<Value> MovableCoordinatedState::read() {
return MovableCoordinatedStateImpl::read(impl.get());
}
Future<Void> MovableCoordinatedState::onConflict() {
return impl->onConflict();
}
Future<Void> MovableCoordinatedState::setExclusive(Value v) {
return impl->setExclusive(v);
}
Future<Void> MovableCoordinatedState::move(ClusterConnectionString const& nc) {
return MovableCoordinatedStateImpl::move(impl.get(), nc);
}