Merge pull request #2281 from fzhjon/include-failed-ss
Allow re-inclusion of servers marked as failed
This commit is contained in:
commit
a3ef3199c6
|
@ -213,6 +213,10 @@ To temporarily or permanently remove one or more machines from a FoundationDB cl
|
|||
|
||||
If you interrupt the exclude command with Ctrl-C after seeing the "waiting for state to be removed" message, the exclusion work will continue in the background. Repeating the command will continue waiting for the exclusion to complete. To reverse the effect of the ``exclude`` command, use the ``include`` command.
|
||||
|
||||
Excluding a server with the ``failed`` flag will shut it down immediately; the cluster assumes that the server has already become unrecoverable or unreachable, and will not attempt to move the data on the machine away. This may break the guarantee required to maintain the configured redundancy mode. The guarantee is checked internally, and the command may be denied if it would be violated. This safety check can be bypassed by using the command ``exclude FORCE failed``.
|
||||
|
||||
If you want to add a new machine that has the same address as a server previously marked as failed, you can allow it to join by using the ``include failed`` command.
|
||||
|
||||
4) On each removed machine, stop the FoundationDB server and prevent it from starting at the next boot. Follow the :ref:`instructions for your platform <administration-running-foundationdb>`. For example, on Ubuntu::
|
||||
|
||||
user@host3$ sudo service foundationdb stop
|
||||
|
@ -222,7 +226,7 @@ To temporarily or permanently remove one or more machines from a FoundationDB cl
|
|||
|
||||
6) You can optionally :ref:`uninstall <administration-removing>` the FoundationDB server package entirely and/or delete database files on removed servers.
|
||||
|
||||
7) If you ever want to add a removed machine back to the cluster, you will have to take it off the excluded servers list to which it was added in step 3. This can be done using the ``include`` command of ``fdbcli``. Typing ``exclude`` with no parameters will tell you the current list of excluded machines.
|
||||
7) If you ever want to add a removed machine back to the cluster, you will have to take it off the excluded servers list to which it was added in step 3. This can be done using the ``include`` command of ``fdbcli``. To re-include a server that was marked as failed, use the ``include failed`` command of ``fdbcli`` instead. Typing ``exclude`` with no parameters will tell you the current list of excluded and failed machines.
|
||||
|
||||
Moving a cluster
|
||||
================
|
||||
|
|
|
@ -128,10 +128,12 @@ For more information on setting the cluster description, see :ref:`configuration
|
|||
exclude
|
||||
-------
|
||||
|
||||
The ``exclude`` command excludes servers from the database. Its syntax is ``exclude <ADDRESS...>``. If no addresses are specified, the command provides the set of excluded servers.
|
||||
The ``exclude`` command excludes servers from the database or marks them as failed. Its syntax is ``exclude [failed] <ADDRESS...>``. If no addresses are specified, the command provides the set of excluded and failed servers.
|
||||
|
||||
For each IP address or IP:port pair in ``<ADDRESS...>``, the command adds the address to the set of excluded servers. It then waits until all database state has been safely moved off the specified servers.
|
||||
|
||||
If the ``failed`` keyword is specified, each address is marked as failed and added to the set of failed servers. In this case the command does not wait for the database state to be moved off the specified servers.
|
||||
|
||||
For more information on excluding servers, see :ref:`removing-machines-from-a-cluster`.
|
||||
|
||||
exit
|
||||
|
@ -213,9 +215,13 @@ The following options are available for use with the ``option`` command:
|
|||
include
|
||||
-------
|
||||
|
||||
The ``include`` command permits previously excluded servers to rejoin the database. Its syntax is ``include all|<ADDRESS...>``.
|
||||
The ``include`` command permits previously excluded or failed servers to rejoin the database. Its syntax is ``include [failed] all|<ADDRESS...>``.
|
||||
|
||||
If ``all`` is specified, the excluded servers list is cleared.
|
||||
The ``failed`` keyword is required if the servers were previously marked as failed rather than excluded.
|
||||
|
||||
If ``all`` is specified, the excluded servers list is cleared. This will not clear the failed servers list.
|
||||
|
||||
If ``failed all`` or ``all failed`` is specified, the failed servers list is cleared. This will not clear the excluded servers list.
|
||||
|
||||
For each IP address or IP:port pair in ``<ADDRESS...>``, the command removes any matching exclusions from the excluded servers list, or from the failed servers list if the ``failed`` keyword is specified. (A specified IP will match all ``IP:*`` exclusion entries.)
|
||||
|
||||
|
|
|
@ -1998,10 +1998,14 @@ ACTOR Future<bool> coordinators( Database db, std::vector<StringRef> tokens, boo
|
|||
|
||||
ACTOR Future<bool> include( Database db, std::vector<StringRef> tokens ) {
|
||||
std::vector<AddressExclusion> addresses;
|
||||
if (tokens.size() == 2 && tokens[1] == LiteralStringRef("all"))
|
||||
addresses.push_back( AddressExclusion() );
|
||||
else {
|
||||
for(auto t = tokens.begin()+1; t != tokens.end(); ++t) {
|
||||
bool failed = false;
|
||||
bool all = false;
|
||||
for (auto t = tokens.begin() + 1; t != tokens.end(); ++t) {
|
||||
if (*t == LiteralStringRef("all")) {
|
||||
all = true;
|
||||
} else if (*t == LiteralStringRef("failed")) {
|
||||
failed = true;
|
||||
} else {
|
||||
auto a = AddressExclusion::parse( *t );
|
||||
if (!a.isValid()) {
|
||||
printf("ERROR: '%s' is not a valid network endpoint address\n", t->toString().c_str());
|
||||
|
@ -2012,8 +2016,13 @@ ACTOR Future<bool> include( Database db, std::vector<StringRef> tokens ) {
|
|||
addresses.push_back( a );
|
||||
}
|
||||
}
|
||||
|
||||
wait( makeInterruptable(includeServers(db, addresses)) );
|
||||
if (all) {
|
||||
std::vector<AddressExclusion> includeAll;
|
||||
includeAll.push_back(AddressExclusion());
|
||||
wait(makeInterruptable(includeServers(db, includeAll, failed)));
|
||||
} else {
|
||||
wait(makeInterruptable(includeServers(db, addresses, failed)));
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
|
|
|
@ -1229,12 +1229,9 @@ ACTOR Future<Void> excludeServers(Database cx, vector<AddressExclusion> servers,
|
|||
}
|
||||
}
|
||||
|
||||
ACTOR Future<Void> includeServers( Database cx, vector<AddressExclusion> servers ) {
|
||||
state bool includeAll = false;
|
||||
ACTOR Future<Void> includeServers(Database cx, vector<AddressExclusion> servers, bool failed) {
|
||||
state Transaction tr(cx);
|
||||
state Key versionKey = BinaryWriter::toValue(deterministicRandom()->randomUniqueID(),Unversioned());
|
||||
state std::string excludeVersionKey = deterministicRandom()->randomUniqueID().toString();
|
||||
|
||||
state std::string versionKey = deterministicRandom()->randomUniqueID().toString();
|
||||
loop {
|
||||
try {
|
||||
tr.setOption( FDBTransactionOptions::ACCESS_SYSTEM_KEYS );
|
||||
|
@ -1244,13 +1241,21 @@ ACTOR Future<Void> includeServers( Database cx, vector<AddressExclusion> servers
|
|||
|
||||
// includeServers might be used in an emergency transaction, so make sure it is retry-self-conflicting and CAUSAL_WRITE_RISKY
|
||||
tr.setOption( FDBTransactionOptions::CAUSAL_WRITE_RISKY );
|
||||
tr.addReadConflictRange( singleKeyRange(excludedServersVersionKey) );
|
||||
tr.set( excludedServersVersionKey, excludeVersionKey );
|
||||
if (failed) {
|
||||
tr.addReadConflictRange(singleKeyRange(failedServersVersionKey));
|
||||
tr.set(failedServersVersionKey, versionKey);
|
||||
} else {
|
||||
tr.addReadConflictRange(singleKeyRange(excludedServersVersionKey));
|
||||
tr.set(excludedServersVersionKey, versionKey);
|
||||
}
|
||||
|
||||
for(auto& s : servers ) {
|
||||
if (!s.isValid()) {
|
||||
tr.clear( excludedServersKeys );
|
||||
includeAll = true;
|
||||
if (failed) {
|
||||
tr.clear(failedServersKeys);
|
||||
} else {
|
||||
tr.clear(excludedServersKeys);
|
||||
}
|
||||
} else if (s.isWholeMachine()) {
|
||||
// Eliminate both any ip-level exclusion (1.2.3.4) and any
|
||||
// port-level exclusions (1.2.3.4:5)
|
||||
|
@ -1260,15 +1265,19 @@ ACTOR Future<Void> includeServers( Database cx, vector<AddressExclusion> servers
|
|||
//
|
||||
// This is why we now make two clears: first only of the ip
|
||||
// address, the second will delete all ports.
|
||||
auto addr = encodeExcludedServersKey(s);
|
||||
auto addr = failed ? encodeFailedServersKey(s) : encodeExcludedServersKey(s);
|
||||
tr.clear(singleKeyRange(addr));
|
||||
tr.clear(KeyRangeRef(addr + ':', addr + char(':' + 1)));
|
||||
} else {
|
||||
tr.clear( encodeExcludedServersKey(s) );
|
||||
if (failed) {
|
||||
tr.clear(encodeFailedServersKey(s));
|
||||
} else {
|
||||
tr.clear(encodeExcludedServersKey(s));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TraceEvent("IncludeServersCommit").detail("Servers", describe(servers));
|
||||
TraceEvent("IncludeServersCommit").detail("Servers", describe(servers)).detail("Failed", failed);
|
||||
|
||||
wait( tr.commit() );
|
||||
return Void();
|
||||
|
|
|
@ -146,7 +146,7 @@ ACTOR Future<Void> excludeServers( Database cx, vector<AddressExclusion> serve
|
|||
|
||||
// Remove the given servers from the exclusion list. A NetworkAddress with a port of 0 means all servers on the given IP. A NetworkAddress() means
|
||||
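// all servers (don't exclude anything). If failed is true, the servers are removed from the failed servers list instead of the exclusion list.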
|
||||
ACTOR Future<Void> includeServers( Database cx, vector<AddressExclusion> servers );
|
||||
ACTOR Future<Void> includeServers(Database cx, vector<AddressExclusion> servers, bool failed = false);
|
||||
|
||||
// Set the process class of processes with the given address. A NetworkAddress with a port of 0 means all servers on the given IP.
|
||||
ACTOR Future<Void> setClass( Database cx, AddressExclusion server, ProcessClass processClass );
|
||||
|
|
Loading…
Reference in New Issue