update to latest version

chaoguang 2019-06-03 16:49:19 -07:00
commit 66811b7bd2
51 changed files with 2443 additions and 142 deletions

View File

@ -23,6 +23,6 @@
FDBLibTLS_BUILD_SOURCES +=
FDBLibTLS_CFLAGS := -fPIC -I/usr/local/include -I$(BOOSTDIR) -I. -DUSE_UCONTEXT
FDBLibTLS_CFLAGS := -fPIC -I/usr/local/include -isystem$(BOOSTDIR) -I. -DUSE_UCONTEXT
lib/libFDBLibTLS.a: bin/coverage.FDBLibTLS.xml

View File

@ -41,6 +41,7 @@ package directory
import (
"errors"
"github.com/apple/foundationdb/bindings/go/src/fdb"
"github.com/apple/foundationdb/bindings/go/src/fdb/subspace"
)
@ -54,6 +55,18 @@ const (
_MICROVERSION int32 = 0
)
var (
// ErrDirAlreadyExists is returned when trying to create a directory that already exists.
ErrDirAlreadyExists = errors.New("the directory already exists")
// ErrDirNotExists is returned when opening or listing a directory that does not exist.
ErrDirNotExists = errors.New("the directory does not exist")
// ErrParentDirDoesNotExist is returned when opening a directory and one or more
// parent directories in the path do not exist.
ErrParentDirDoesNotExist = errors.New("the parent directory does not exist")
)
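// As a hypothetical usage sketch (the path and variable names below are
// illustrative, not part of this commit), callers can now compare against
// these exported sentinel errors instead of matching error strings:
//
//	ds, err := directory.Open(db, []string{"app", "users"}, nil)
//	if err == directory.ErrDirNotExists {
//		ds, err = directory.CreateOrOpen(db, []string{"app", "users"}, nil)
//	}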
// Directory represents a subspace of keys in a FoundationDB database,
// identified by a hierarchical path.
type Directory interface {
@ -69,8 +82,9 @@ type Directory interface {
CreateOrOpen(t fdb.Transactor, path []string, layer []byte) (DirectorySubspace, error)
// Open opens the directory specified by path (relative to this Directory),
// and returns the directory and its contents as a DirectorySubspace (or an
// error if the directory does not exist).
// and returns the directory and its contents as a DirectorySubspace (or ErrDirNotExists
// if the directory does not exist, or ErrParentDirDoesNotExist if one of the parent
// directories in the path does not exist).
//
// If the byte slice layer is specified, it is compared against the layer
// specified when the directory was created, and an error is returned if
@ -79,7 +93,7 @@ type Directory interface {
// Create creates a directory specified by path (relative to this
// Directory), and returns the directory and its contents as a
// DirectorySubspace (or an error if the directory already exists).
// DirectorySubspace (or ErrDirAlreadyExists if the directory already exists).
//
// If the byte slice layer is specified, it is recorded as the layer and
// will be checked when opening the directory in the future.

View File

@ -99,7 +99,7 @@ func (dl directoryLayer) createOrOpen(rtr fdb.ReadTransaction, tr *fdb.Transacti
}
if !allowOpen {
return nil, errors.New("the directory already exists")
return nil, ErrDirAlreadyExists
}
if layer != nil {
@ -112,7 +112,7 @@ func (dl directoryLayer) createOrOpen(rtr fdb.ReadTransaction, tr *fdb.Transacti
}
if !allowCreate {
return nil, errors.New("the directory does not exist")
return nil, ErrDirNotExists
}
if e := dl.checkVersion(rtr, tr); e != nil {
@ -161,7 +161,7 @@ func (dl directoryLayer) createOrOpen(rtr fdb.ReadTransaction, tr *fdb.Transacti
}
if parentNode == nil {
return nil, errors.New("the parent directory does not exist")
return nil, ErrParentDirDoesNotExist
}
node := dl.nodeWithPrefix(prefix)
@ -254,7 +254,7 @@ func (dl directoryLayer) List(rt fdb.ReadTransactor, path []string) ([]string, e
node := dl.find(rtr, path).prefetchMetadata(rtr)
if !node.exists() {
return nil, errors.New("the directory does not exist")
return nil, ErrDirNotExists
}
if node.isInPartition(nil, true) {

View File

@ -0,0 +1,91 @@
## FDB Backup Data Format
### Introduction
This document describes the data format of the files generated by the FoundationDB (FDB) backup procedure.
The target readers who may benefit from reading this document are:
* those who make changes to the current backup or restore procedure;
* those who write tools to digest the backup data for analytical purposes;
* those who want to understand the internals of how backup and restore work.
The description of the backup data format is based on FDB 5.2 through FDB 6.1. The backup data format may change after FDB 6.1, although this is unlikely.
### Files generated by backup
The backup procedure generates two types of files: range files and log files.
* A range file describes key-value pairs in a range at the version when the backup procedure takes a snapshot of the range. Different range files have data for different ranges at different versions.
* A log file describes the mutations recorded from version v<sub>1</sub> to version v<sub>2</sub> during the backup procedure.
With the key-value pairs in the range files and the mutations in the log files, the restore procedure can restore the database to a consistent state at a user-provided version v<sub>k</sub>, provided the backup data is restorable at v<sub>k</sub>. (The details of determining whether a set of backup data is restorable at a version are out of the scope of this document and can be found in [backup.md](https://github.com/xumengpanda/foundationdb/blob/cd873831ecd18653c5bf459d6f72d14a99b619c4/design/backup.md).)
### Filename conventions
The backup files are saved in a directory (i.e., URL) specified by the user. Within that directory, the range files are in the `snapshots` folder and the log files are in the `logs` folder.
The convention for the range filename is `snapshots/snapshot,beginVersion,beginVersion,blockSize`, where `beginVersion` is the version at which the key-value pairs in the range file were recorded, and `blockSize` is the size of the data blocks in the range file.
The convention for the log filename is `logs/versionPrefix/log,beginVersion,endVersion,randomUID,blockSize`, where `versionPrefix` is a 2-level path (`x/y`) under which `beginVersion` falls, such that `x/y/*` contains 10^smallestBucket possible versions; `randomUID` is a random UID; `beginVersion` and `endVersion` are the version range (left inclusive, right exclusive) during which the mutations were recorded; and `blockSize` is the data block size in the log file.
We will use an example to explain what each field in the range and log filename means.
Suppose under the backup directory, we have a range file `snapshots/snapshot,78994177,78994177,97` and a log file `logs/0000/0000/log,78655645,98655645,149a0bdfedecafa2f648219d5eba816e,1048576`.
The range file's name tells us that all key-value pairs decoded from the file are the contents of the database at version `78994177`, and that the data block size is `97` bytes.
The log file's name tells us that the mutations in the log file were recorded in the database during the version range `[78655645,98655645)`, and that the data block size is `1048576` bytes.
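As a rough illustration of these conventions, the sketch below (Go; not part of the FDB codebase, and the helper name is ours) splits a log filename into its comma-separated fields:

```go
package main

import (
	"fmt"
	"strings"
)

// parseLogFilename splits a log filename of the form
// logs/<x>/<y>/log,beginVersion,endVersion,randomUID,blockSize
// into its comma-separated fields. Illustrative sketch only.
func parseLogFilename(name string) (beginVersion, endVersion, randomUID, blockSize string) {
	base := name[strings.LastIndex(name, "/")+1:] // e.g. "log,78655645,98655645,<uid>,1048576"
	f := strings.Split(base, ",")
	return f[1], f[2], f[3], f[4]
}

func main() {
	b, e, uid, bs := parseLogFilename(
		"logs/0000/0000/log,78655645,98655645,149a0bdfedecafa2f648219d5eba816e,1048576")
	fmt.Println(b, e, uid, bs)
	// Output: 78655645 98655645 149a0bdfedecafa2f648219d5eba816e 1048576
}
```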
### Data format in a range file
A range file can have one to many data blocks. Each data block has a set of key-value pairs.
A data block is encoded as follows: `Header startKey k1v1 k2v2 Padding`.
Example:
The client code writes keys in this sequence:
```
a c d e f g h i j z
```
The backup procedure records the key-value pairs from the database into a range file:
```
H = header    P = padding    a...z = keys    v = value    | = block boundary

Encoded file:  H a cv dv ev P | H e ev fv gv hv P | H h hv iv jv z
Decoded in blocks yields:
  Block 1: range [a, e) with kv pairs cv, dv
  Block 2: range [e, h) with kv pairs ev, fv, gv
  Block 3: range [h, z) with kv pairs hv, iv, jv
```
NOTE: All blocks except for the final block will have one last value which will not be used. This isn't actually a waste: if the next KV pair wouldn't fit within the block after that value, the space from the final key to the next 1MB boundary would just be padding anyway.
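To make the block-boundary rule concrete, here is a sketch in Go (not FDB code; it assumes each block has already been decoded into its ordered kv pairs, as in the example above) of how decoded blocks can be merged:

```go
package backupformat

type kv struct{ key, value []byte }

// mergeBlocks concatenates the kv pairs of decoded range-file blocks.
// In every block except the last one, the final pair only marks the
// (exclusive) end of the block's range, so its value is discarded.
func mergeBlocks(blocks [][]kv) []kv {
	var out []kv
	for i, b := range blocks {
		if i < len(blocks)-1 && len(b) > 0 {
			b = b[:len(b)-1] // drop the unused delimiter pair
		}
		out = append(out, b...)
	}
	return out
}
```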
The code related to how a range file is written is in the `struct RangeFileWriter` in `namespace fileBackup`.
The code that decodes a range block is in `ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<IAsyncFile> file, int64_t offset, int len)`.
### Data format in a log file
A log file can have one to many data blocks.
Each block is encoded as `Header, [Param1, Param2]... padding`.
The first 32 bits of `Param1` and of `Param2` specify the lengths of `Param1` and `Param2`, respectively.
`Param1` specifies the version at which the mutations happened;
`Param2` encodes the group of mutations that happened at that version.
Note that if the group of mutations is bigger than the block size, the mutation group will be split across multiple data blocks.
For example, we may get `[Param1, Param2_part0]`, `[Param1, Param2_part1]`. By concatenating `Param2_part0` and `Param2_part1`, we get the group of all mutations that happened at the version specified in `Param1`.
The encoding format for `Param1` is as follows:
`hashValue|commitVersion|part`,
where `hashValue` is the hash of the `commitVersion`, `commitVersion` is the version at which the mutations in the `Param2`(s) were taken, and `part` is the part number used when we need to concatenate multiple `Param2` fragments to get the group of all mutations.
`hashValue` takes 8 bits, `commitVersion` takes 64 bits, and `part` takes 32 bits.
Note that when concatenating the partial groups of mutations in multiple `Param2` fragments to recover the full group, the part numbers must be consecutive.
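Given that layout (1-byte hash, 8-byte commit version, 4-byte part, with integers big endian as described in the Endianness section below), a decoder sketch might look like the following. This is an illustration under those assumptions, not the actual FDB decoder:

```go
package backupformat

import "encoding/binary"

// decodeParam1 splits Param1 into hashValue, commitVersion and part.
// Illustrative sketch only; assumes big-endian integer encoding.
func decodeParam1(p []byte) (hashValue uint8, commitVersion uint64, part uint32) {
	hashValue = p[0]
	commitVersion = binary.BigEndian.Uint64(p[1:9])
	part = binary.BigEndian.Uint32(p[9:13])
	return
}
```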
The encoding format for the group of mutations, which is `Param2` or the concatenation of multiple `Param2` fragments when a group is split across blocks, is as follows:
`length_of_the_mutation_group | encoded_mutation_1 | … | encoded_mutation_k`.
Each `encoded_mutation_i` is encoded as follows:
`type|kLen|vLen|Key|Value`,
where `type` is the mutation type (such as Set or Clear), `kLen` and `vLen` are the lengths of the key and value in the mutation, and `Key` and `Value` are the serialized key and value of the mutation.
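As a sketch of this layout in Go (not the actual FDB decoder; it assumes `type`, `kLen` and `vLen` are 32-bit big-endian integers, and that the leading length field counts the bytes of the encoded mutations that follow):

```go
package backupformat

import "encoding/binary"

type mutation struct {
	mutType    uint32
	key, value []byte
}

// decodeMutationGroup decodes length_of_the_mutation_group followed by
// mutations of the form type|kLen|vLen|Key|Value. Illustrative sketch.
func decodeMutationGroup(p []byte) []mutation {
	groupLen := binary.BigEndian.Uint32(p[0:4])
	body := p[4 : 4+groupLen]
	var muts []mutation
	for len(body) >= 12 {
		mutType := binary.BigEndian.Uint32(body[0:4])
		kLen := binary.BigEndian.Uint32(body[4:8])
		vLen := binary.BigEndian.Uint32(body[8:12])
		key := body[12 : 12+kLen]
		value := body[12+kLen : 12+kLen+vLen]
		muts = append(muts, mutation{mutType, key, value})
		body = body[12+kLen+vLen:]
	}
	return muts
}
```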
The code related to how a log file is written is in the `struct LogFileWriter` in `namespace fileBackup`.
The code that decodes a mutation block is in `ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeLogFileBlock(Reference<IAsyncFile> file, int64_t offset, int len)`.
### Endianness
When the restore procedure decodes a serialized integer from a backup file, it needs to convert the serialized value from big endian to little endian.
The reason is as follows: when the backup procedure transfers the data to the remote blob store, the backup data is encoded in big endian; however, FoundationDB currently only runs on little-endian machines. Since endianness affects the interpretation of an integer, we must perform the endianness conversion.
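For instance, assembling a big-endian 64-bit integer byte by byte yields the correct value regardless of the host's endianness, which is what `binary.BigEndian` in the sketches above does internally:

```go
package backupformat

// bigEndian64 interprets the first 8 bytes of b as a big-endian integer,
// independent of the host machine's endianness.
func bigEndian64(b []byte) uint64 {
	var v uint64
	for i := 0; i < 8; i++ {
		v = v<<8 | uint64(b[i])
	}
	return v
}
```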

View File

@ -530,7 +530,7 @@ The second feature is the ability to add one or more synchronous replicas of the
An example configuration would be four total datacenters, two on the east coast, two on the west coast, with a preference for fast write latencies from the west coast. One datacenter on each coast would be sized to store a full copy of the data. The second datacenter on each coast would only have a few FoundationDB processes.
While everything is healthy, writes need to be made durable in both west coast datacenters before a commit can succeed. The geographic proximity of the two datacenters minimizes the additional commit latency. Reads can be served from either region, and clients can get data from whichever region is closer. Getting a read version from the each coast region will still require communicating with a west coast datacenter. Clients can cache read versions if they can tolerate reading stale data to avoid waiting on read versions.
While everything is healthy, writes need to be made durable in both west coast datacenters before a commit can succeed. The geographic proximity of the two datacenters minimizes the additional commit latency. Reads can be served from either region, and clients can get data from whichever region is closer. Getting a read version from the east coast region will still require communicating with a west coast datacenter. Clients can cache read versions if they can tolerate reading stale data to avoid waiting on read versions.
If either west coast datacenter fails, the last few mutations will be propagated from the remaining west coast datacenter to the east coast. At this point, FoundationDB will start accepting commits on the east coast. Once the west coast comes back online, the system will automatically start copying all the data that was committed to the east coast back to the west coast replica. Once the west coast has caught up, the system will automatically switch back to accepting writes from the west coast again.
@ -615,7 +615,7 @@ The number of replicas in each region is controlled by redundancy level. For exa
Asymmetric configurations
-------------------------
The fact that satellite policies are configured per region allows for asymmetric configurations. For example, FoudnationDB can have a three datacenter setup where there are two datacenters on the west coast (WC1, WC2) and one datacenter on the east coast (EC1). The west coast region can be set as the preferred active region by setting the priority of its primary datacenter higher than the east coast datacenter. The west coast region should have a satellite policy configured, so that when it is active, FoundationDB is making mutations durable in both west coast datacenters. In the rare event that one of the west coast datacenter have failed, FoundationDB will fail over to the east coast datacenter. Because this region does not a satellite datacenter, the mutations will only be made durable in one datacenter while the transaction subsystem is located here. However this is justifiable because the region will only be active if a datacenter has already been lost.
The fact that satellite policies are configured per region allows for asymmetric configurations. For example, FoundationDB can have a three datacenter setup where there are two datacenters on the west coast (WC1, WC2) and one datacenter on the east coast (EC1). The west coast region can be set as the preferred active region by setting the priority of its primary datacenter higher than the east coast datacenter. The west coast region should have a satellite policy configured, so that when it is active, FoundationDB is making mutations durable in both west coast datacenters. In the rare event that one of the west coast datacenters has failed, FoundationDB will fail over to the east coast datacenter. Because this region does not have a satellite datacenter, the mutations will only be made durable in one datacenter while the transaction subsystem is located there. However, this is justifiable because the region will only be active if a datacenter has already been lost.
This is the region configuration that implements the example::
@ -669,7 +669,7 @@ To configure an existing database to regions, do the following steps:
4. Configure ``usable_regions=2``. This will cause the cluster to start copying data between the regions.
5. Watch ``status`` and wait until data movement is complete. This will mean signal that the remote datacenter has a full replica of all of the data in the database.
5. Watch ``status`` and wait until data movement is complete. This will signal that the remote datacenter has a full replica of all of the data in the database.
6. Change the region configuration to have a non-negative priority for the primary datacenters in both regions. This will enable automatic failover between regions.
@ -680,7 +680,7 @@ When a primary datacenter fails, the cluster will go into a degraded state. It w
.. warning:: While a datacenter has failed, the maximum write throughput of the cluster will be roughly 1/3 of normal performance. This is because the transaction logs need to store all of the mutations being committed, so that once the other datacenter comes back online, it can replay history to catch back up.
To drop the dead datacenter do the follow steps:
To drop the dead datacenter do the following steps:
1. Configure the region configuration so that the dead datacenter has a negative priority.

View File

@ -40,7 +40,7 @@ Promises and futures can be used within a single process, but their real strengt
wait()
------
At the point when a receiver holding a ``Future<T>`` needs the ``T`` to continue computation, it invokes the ``wait()`` statement with the ``Future<T>`` as its parameter. The ``wait()`` statement allows the calling actor to pause execution until the value of the future is set, returning a value of type ``T`` During the wait, other actors can continue execution, providing asynchronous concurrency within a single process.
At the point when a receiver holding a ``Future<T>`` needs the ``T`` to continue computation, it invokes the ``wait()`` statement with the ``Future<T>`` as its parameter. The ``wait()`` statement allows the calling actor to pause execution until the value of the future is set, returning a value of type ``T``. During the wait, other actors can continue execution, providing asynchronous concurrency within a single process.
ACTOR
-----
@ -154,5 +154,5 @@ Some preprocessor definitions will not fix all issues though. When programming f
foo([x]() { x->bar(); })
}
- state variables in don't follow the normal scoping rules. So in flow a state variable can be defined in a inner scope and later it can be used in the outer scope. In order to not break compilation in IDE-mode, always define state variables in the outermost scope they will be used.
- state variables in flow don't follow the normal scoping rules. So in flow a state variable can be defined in an inner scope and later it can be used in the outer scope. In order to not break compilation in IDE-mode, always define state variables in the outermost scope in which they will be used.

View File

@ -470,6 +470,10 @@ void initHelp() {
"include all|<ADDRESS>*",
"permit previously-excluded servers to rejoin the database",
"If `all' is specified, the excluded servers list is cleared.\n\nFor each IP address or IP:port pair in <ADDRESS>*, removes any matching exclusions from the excluded servers list. (A specified IP will match all IP:* exclusion entries)");
helpMap["snapshot"] = CommandHelp(
"snapshot <BINARY-PATH>:<ARG1=VAL1>,<ARG2=VAL2>,...",
"snapshot the database",
"invokes binary provided in binary-path with the arg,value pairs on TLog, Storage and Coordinators nodes. UID is a reserved ARG key.");
helpMap["setclass"] = CommandHelp(
"setclass <ADDRESS> <unset|storage|transaction|default>",
"change the class of a process",
@ -2121,6 +2125,11 @@ ACTOR Future<bool> exclude( Database db, std::vector<StringRef> tokens, Referenc
return false;
}
ACTOR Future<bool> createSnapshot(Database db, StringRef snapCmd) {
wait(makeInterruptable(mgmtSnapCreate(db, snapCmd)));
return false;
}
ACTOR Future<bool> setClass( Database db, std::vector<StringRef> tokens ) {
if( tokens.size() == 1 ) {
vector<ProcessData> _workers = wait( makeInterruptable(getWorkers(db)) );
@ -2720,6 +2729,17 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
continue;
}
if (tokencmp(tokens[0], "snapshot")) {
if (tokens.size() != 2) {
printUsage(tokens[0]);
is_error = true;
} else {
bool err = wait(createSnapshot(db, tokens[1]));
if (err) is_error = true;
}
continue;
}
if (tokencmp(tokens[0], "setclass")) {
if (tokens.size() != 3 && tokens.size() != 1) {
printUsage(tokens[0]);

View File

@ -44,7 +44,8 @@ static const char* typeString[] = { "SetValue",
"ByteMax",
"MinV2",
"AndV2",
"CompareAndClear" };
"CompareAndClear",
"Exec" };
struct MutationRef {
static const int OVERHEAD_BYTES = 12; //12 is the size of Header in MutationList entries
@ -70,6 +71,9 @@ struct MutationRef {
MinV2,
AndV2,
CompareAndClear,
// ExecOp is always set with FIRST_IN_BATCH option to quickly identify
// the op in a transaction batch while parsing it in TLog
Exec,
MAX_ATOMIC_OP
};
// This is stored this way for serialization purposes.

View File

@ -148,6 +148,7 @@ public:
int64_t transactionsMaybeCommitted;
int64_t transactionsResourceConstrained;
int64_t transactionsProcessBehind;
int64_t transactionWaitsForFullRecovery;
ContinuousSample<double> latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit, bytesPerCommit;
int outstandingWatches;

View File

@ -1474,6 +1474,29 @@ ACTOR Future<Void> waitForExcludedServers( Database cx, vector<AddressExclusion>
}
}
ACTOR Future<Void> mgmtSnapCreate(Database cx, StringRef snapCmd) {
state int retryCount = 0;
loop {
state UID snapUID = deterministicRandom()->randomUniqueID();
try {
wait(snapCreate(cx, snapCmd, snapUID));
printf("Snapshots tagged with UID: %s, check logs for status\n", snapUID.toString().c_str());
TraceEvent("SnapCreateSucceeded").detail("snapUID", snapUID);
break;
} catch (Error& e) {
++retryCount;
TraceEvent(retryCount > 3 ? SevWarn : SevInfo, "SnapCreateFailed").error(e);
if (retryCount > 3) {
fprintf(stderr, "Snapshot create failed, %d (%s)."
" Please cleanup any instance level snapshots created.\n", e.code(), e.what());
throw;
}
}
}
return Void();
}
ACTOR Future<Void> waitForFullReplication( Database cx ) {
state ReadYourWritesTransaction tr(cx);
loop {

View File

@ -191,5 +191,9 @@ ACTOR Future<std::vector<NetworkAddress>> getCoordinators( Database cx );
void schemaCoverage( std::string const& spath, bool covered=true );
bool schemaMatch( json_spirit::mValue const& schema, json_spirit::mValue const& result, std::string& errorStr, Severity sev=SevError, bool checkCoverage=false, std::string path = std::string(), std::string schema_path = std::string() );
// execute payload in 'snapCmd' on all the coordinators, TLogs and
// storage nodes
ACTOR Future<Void> mgmtSnapCreate(Database cx, StringRef snapCmd);
#include "flow/unactorcompiler.h"
#endif

View File

@ -49,6 +49,7 @@ struct MasterProxyInterface {
RequestStream< struct GetRawCommittedVersionRequest > getRawCommittedVersion;
RequestStream< struct TxnStateRequest > txnState;
RequestStream<struct ExecRequest> execReq;
RequestStream< struct GetHealthMetricsRequest > getHealthMetrics;
@ -62,7 +63,7 @@ struct MasterProxyInterface {
void serialize(Archive& ar) {
serializer(ar, locality, provisional, commit, getConsistentReadVersion, getKeyServersLocations,
waitFailure, getStorageServerRejoinInfo, getRawCommittedVersion,
txnState, getHealthMetrics);
txnState, getHealthMetrics, execReq);
}
void initEndpoints() {
@ -298,4 +299,21 @@ struct GetHealthMetricsRequest
}
};
struct ExecRequest
{
constexpr static FileIdentifier file_identifier = 22403900;
Arena arena;
StringRef execPayload;
ReplyPromise<Void> reply;
Optional<UID> debugID;
explicit ExecRequest(Optional<UID> const& debugID = Optional<UID>()) : debugID(debugID) {}
explicit ExecRequest(StringRef exec, Optional<UID> debugID = Optional<UID>()) : execPayload(exec), debugID(debugID) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, execPayload, reply, arena, debugID);
}
};
#endif

View File

@ -1964,7 +1964,7 @@ Future<Standalone<RangeResultRef>> getRange( Database const& cx, Future<Version>
}
Transaction::Transaction( Database const& cx )
: cx(cx), info(cx->taskID), backoff(CLIENT_KNOBS->DEFAULT_BACKOFF), committedVersion(invalidVersion), versionstampPromise(Promise<Standalone<StringRef>>()), options(cx), numErrors(0), trLogInfo(createTrLogInfoProbabilistically(cx))
: cx(cx), info(cx->taskID), backoff(CLIENT_KNOBS->DEFAULT_BACKOFF), committedVersion(invalidVersion), versionstampPromise(Promise<Standalone<StringRef>>()), options(cx), numErrors(0), numRetries(0), trLogInfo(createTrLogInfoProbabilistically(cx))
{
setPriority(GetReadVersionRequest::PRIORITY_DEFAULT);
}
@ -1987,6 +1987,7 @@ void Transaction::operator=(Transaction&& r) BOOST_NOEXCEPT {
info = r.info;
backoff = r.backoff;
numErrors = r.numErrors;
numRetries = r.numRetries;
committedVersion = r.committedVersion;
versionstampPromise = std::move(r.versionstampPromise);
watches = r.watches;
@ -2287,6 +2288,45 @@ void Transaction::atomicOp(const KeyRef& key, const ValueRef& operand, MutationR
TEST(true); //NativeAPI atomic operation
}
ACTOR Future<Void> executeCoordinators(DatabaseContext* cx, StringRef execPayload, Optional<UID> debugID) {
try {
if (debugID.present()) {
g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.executeCoordinators.Before");
}
state ExecRequest req(execPayload, debugID);
if (debugID.present()) {
g_traceBatch.addEvent("TransactionDebug", debugID.get().first(),
"NativeAPI.executeCoordinators.Inside loop");
}
wait(loadBalance(cx->getMasterProxies(false), &MasterProxyInterface::execReq, req, cx->taskID));
if (debugID.present())
g_traceBatch.addEvent("TransactionDebug", debugID.get().first(),
"NativeAPI.executeCoordinators.After");
return Void();
} catch (Error& e) {
TraceEvent("NativeAPI.executeCoordinatorsError").error(e);
throw;
}
}
void Transaction::execute(const KeyRef& cmdType, const ValueRef& cmdPayload) {
TraceEvent("Execute operation").detail("Key", cmdType.toString()).detail("Value", cmdPayload.toString());
if (cmdType.size() > CLIENT_KNOBS->KEY_SIZE_LIMIT) throw key_too_large();
if (cmdPayload.size() > CLIENT_KNOBS->VALUE_SIZE_LIMIT) throw value_too_large();
auto& req = tr;
// Helps with quickly finding the exec op in a tlog batch
setOption(FDBTransactionOptions::FIRST_IN_BATCH);
auto& t = req.transaction;
auto r = singleKeyRange(cmdType, req.arena);
auto v = ValueRef(req.arena, cmdPayload);
t.mutations.push_back(req.arena, MutationRef(MutationRef::Exec, r.begin, v));
}
void Transaction::clear( const KeyRangeRef& range, bool addConflictRange ) {
auto &req = tr;
auto &t = req.transaction;
@ -2364,6 +2404,10 @@ TransactionOptions::TransactionOptions(Database const& cx) {
if (BUGGIFY) {
commitOnFirstProxy = true;
}
maxRetries = cx->transactionMaxRetries;
if (maxRetries == -1) {
maxRetries = 10;
}
}
TransactionOptions::TransactionOptions() {
@ -2373,11 +2417,19 @@ TransactionOptions::TransactionOptions() {
void TransactionOptions::reset(Database const& cx) {
double oldMaxBackoff = maxBackoff;
double oldMaxRetries = maxRetries;
memset(this, 0, sizeof(*this));
maxBackoff = cx->apiVersionAtLeast(610) ? oldMaxBackoff : cx->transactionMaxBackoff;
maxRetries = oldMaxRetries;
lockAware = cx->lockAware;
}
void Transaction::onErrorReset() {
int32_t oldNumRetries = numRetries;
reset();
numRetries = oldNumRetries;
}
void Transaction::reset() {
tr = CommitTransactionRequest();
readVersion = Future<Version>();
@ -2654,7 +2706,13 @@ ACTOR static Future<Void> tryCommit( Database cx, Reference<TransactionLogInfo>
// The user needs to be informed that we aren't sure whether the commit happened. Standard retry loops retry it anyway (relying on transaction idempotence) but a client might do something else.
throw commit_unknown_result();
} else {
if (e.code() != error_code_transaction_too_old && e.code() != error_code_not_committed && e.code() != error_code_database_locked && e.code() != error_code_proxy_memory_limit_exceeded)
if (e.code() != error_code_transaction_too_old
&& e.code() != error_code_not_committed
&& e.code() != error_code_database_locked
&& e.code() != error_code_proxy_memory_limit_exceeded
&& e.code() != error_code_transaction_not_permitted
&& e.code() != error_code_cluster_not_fully_recovered
&& e.code() != error_code_txn_exec_log_anti_quorum)
TraceEvent(SevError, "TryCommitError").error(e);
if (trLogInfo)
trLogInfo->addLog(FdbClientLogEvents::EventCommitError(startTime, static_cast<int>(e.code()), req));
@ -2765,6 +2823,7 @@ ACTOR Future<Void> commitAndWatch(Transaction *self) {
}
self->versionstampPromise.sendError(transaction_invalid_version());
//self->onErrorReset();
self->reset();
}
@ -3024,6 +3083,9 @@ Future<Standalone<StringRef>> Transaction::getVersionstamp() {
}
Future<Void> Transaction::onError( Error const& e ) {
if (numRetries < std::numeric_limits<int>::max()) {
numRetries++;
}
if (e.code() == error_code_success)
{
return client_invalid_operation();
@ -3032,7 +3094,8 @@ Future<Void> Transaction::onError( Error const& e ) {
e.code() == error_code_commit_unknown_result ||
e.code() == error_code_database_locked ||
e.code() == error_code_proxy_memory_limit_exceeded ||
e.code() == error_code_process_behind)
e.code() == error_code_process_behind ||
e.code() == error_code_cluster_not_fully_recovered)
{
if(e.code() == error_code_not_committed)
cx->transactionsNotCommitted++;
@ -3042,9 +3105,15 @@ Future<Void> Transaction::onError( Error const& e ) {
cx->transactionsResourceConstrained++;
if (e.code() == error_code_process_behind)
cx->transactionsProcessBehind++;
if (e.code() == error_code_cluster_not_fully_recovered) {
cx->transactionWaitsForFullRecovery++;
if (numRetries > options.maxRetries) {
return e;
}
}
double backoff = getBackoff(e.code());
reset();
onErrorReset();
return delay( backoff, info.taskID );
}
if (e.code() == error_code_transaction_too_old ||
@ -3056,7 +3125,7 @@ Future<Void> Transaction::onError( Error const& e ) {
cx->transactionsFutureVersions++;
double maxBackoff = options.maxBackoff;
reset();
onErrorReset();
return delay( std::min(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, maxBackoff), info.taskID );
}
@ -3260,3 +3329,102 @@ void enableClientInfoLogging() {
networkOptions.logClientInfo = true;
TraceEvent(SevInfo, "ClientInfoLoggingEnabled");
}
ACTOR Future<Void> snapCreate(Database inputCx, StringRef snapCmd, UID snapUID) {
state Transaction tr(inputCx);
state DatabaseContext* cx = inputCx.getPtr();
// remember the client ID before the snap operation
state UID preSnapClientUID = cx->clientInfo->get().id;
TraceEvent("SnapCreateEnter")
.detail("SnapCmd", snapCmd.toString())
.detail("UID", snapUID)
.detail("PreSnapClientUID", preSnapClientUID);
StringRef snapCmdArgs = snapCmd;
StringRef snapCmdPart = snapCmdArgs.eat(":");
state Standalone<StringRef> snapUIDRef(snapUID.toString());
state Standalone<StringRef> snapPayloadRef = snapCmdPart
.withSuffix(LiteralStringRef(":uid="))
.withSuffix(snapUIDRef)
.withSuffix(LiteralStringRef(","))
.withSuffix(snapCmdArgs);
state Standalone<StringRef>
tLogCmdPayloadRef = LiteralStringRef("empty-binary:uid=").withSuffix(snapUIDRef);
// disable popping of TLog
tr.reset();
loop {
try {
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.execute(execDisableTLogPop, tLogCmdPayloadRef);
wait(timeoutError(tr.commit(), 10));
break;
} catch (Error& e) {
TraceEvent("DisableTLogPopFailed").error(e);
wait(tr.onError(e));
}
}
TraceEvent("SnapCreateAfterLockingTLogs").detail("UID", snapUID);
// snap the storage and TLogs.
// If we retried the command below in failure cases with the same snapUID,
// snapCreate could end up creating multiple snapshots with the same name,
// which would need additional handling. Hence we fail on error and let the
// caller retry with a different snapUID.
tr.reset();
try {
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.execute(execSnap, snapPayloadRef);
wait(tr.commit());
} catch (Error& e) {
TraceEvent("SnapCreateErroSnapTLogStorage").error(e);
throw;
}
TraceEvent("SnapCreateAfterSnappingTLogStorage").detail("UID", snapUID);
if (BUGGIFY) {
int32_t toDelay = deterministicRandom()->randomInt(1, 30);
wait(delay(toDelay));
}
// enable popping of the TLog
tr.reset();
loop {
try {
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.execute(execEnableTLogPop, tLogCmdPayloadRef);
wait(tr.commit());
break;
} catch (Error& e) {
TraceEvent("EnableTLogPopFailed").error(e);
wait(tr.onError(e));
}
}
TraceEvent("SnapCreateAfterUnlockingTLogs").detail("UID", snapUID);
// snap the coordinators
try {
Future<Void> exec = executeCoordinators(cx, snapPayloadRef, snapUID);
wait(timeoutError(exec, 5.0));
} catch (Error& e) {
TraceEvent("SnapCreateErrorSnapCoords").error(e);
throw;
}
TraceEvent("SnapCreateAfterSnappingCoords").detail("UID", snapUID);
// if the client IDs did not change then we have a clean snapshot
UID postSnapClientUID = cx->clientInfo->get().id;
if (preSnapClientUID != postSnapClientUID) {
TraceEvent("UID mismatch")
.detail("SnapPreSnapClientUID", preSnapClientUID)
.detail("SnapPostSnapClientUID", postSnapClientUID);
throw coordinators_changed();
}
TraceEvent("SnapCreateComplete").detail("UID", snapUID);
return Void();
}

View File

@ -141,6 +141,7 @@ struct StorageMetrics;
struct TransactionOptions {
double maxBackoff;
uint32_t maxRetries;
uint32_t getReadVersionFlags;
uint32_t customTransactionSizeLimit;
bool checkWritesEnabled : 1;
@ -259,6 +260,14 @@ public:
// If checkWriteConflictRanges is true, existing write conflict ranges will be searched for this key
void set( const KeyRef& key, const ValueRef& value, bool addConflictRange = true );
void atomicOp( const KeyRef& key, const ValueRef& value, MutationRef::Type operationType, bool addConflictRange = true );
// The execute operation is similar to set, but the command will reach
// one of the proxies, all the TLogs, and all the storage nodes.
// Instead of setting a key and value on the DB, it executes the command
// that is passed in the value field.
// - cmdType can be used for logging purposes
// - cmdPayload contains the details of the command to be executed:
// format of the cmdPayload : <binary-path>:<arg1=val1>,<arg2=val2>...
void execute(const KeyRef& cmdType, const ValueRef& cmdPayload);
void clear( const KeyRangeRef& range, bool addConflictRange = true );
void clear( const KeyRef& key, bool addConflictRange = true );
Future<Void> commit(); // Throws not_committed or commit_unknown_result errors in normal operation
@ -278,6 +287,7 @@ public:
void operator=(Transaction&& r) BOOST_NOEXCEPT;
void reset();
void onErrorReset();
void fullReset();
double getBackoff(int errCode);
void debugTransaction(UID dID) { info.debugID = dID; }
@ -288,6 +298,7 @@ public:
TransactionInfo info;
int numErrors;
int numRetries;
std::vector<Reference<Watch>> watches;
@ -324,5 +335,9 @@ std::string unprintable( const std::string& );
int64_t extractIntOption( Optional<StringRef> value, int64_t minValue = std::numeric_limits<int64_t>::min(), int64_t maxValue = std::numeric_limits<int64_t>::max() );
// Takes a snapshot of the cluster, specifically the following persistent
// states: coordinator, TLog and storage state
ACTOR Future<Void> snapCreate(Database cx, StringRef snapCmd, UID snapUID);
#include "flow/unactorcompiler.h"
#endif

View File

@ -36,6 +36,14 @@ const KeyRef keyServersEnd = keyServersKeys.end;
const KeyRangeRef keyServersKeyServersKeys ( LiteralStringRef("\xff/keyServers/\xff/keyServers/"), LiteralStringRef("\xff/keyServers/\xff/keyServers0"));
const KeyRef keyServersKeyServersKey = keyServersKeyServersKeys.begin;
// list of reserved exec commands
const StringRef execSnap = LiteralStringRef("snap"); // snapshot persistent state of
// storage, TLog and coordinated state
const StringRef execDisableTLogPop = LiteralStringRef("\xff/TLogDisablePop"); // disable pop on TLog
const StringRef execEnableTLogPop = LiteralStringRef("\xff/TLogEnablePop"); // enable pop on TLog
// used to communicate snap failures between TLog and SnapTest Workload, used only in simulator
const StringRef snapTestFailStatus = LiteralStringRef("\xff/SnapTestFailStatus/");
const Key keyServersKey( const KeyRef& k ) {
return k.withPrefix( keyServersPrefix );
}

View File

@ -282,6 +282,10 @@ extern const KeyRef healthyZoneKey;
const Value healthyZoneValue( StringRef const& zoneId, Version version );
std::pair<Key,Version> decodeHealthyZoneValue( ValueRef const& );
extern const StringRef execSnap;
extern const StringRef execDisableTLogPop;
extern const StringRef execEnableTLogPop;
extern const StringRef snapTestFailStatus;
// All mutations done to this range are blindly copied into txnStateStore.
// Used to create artifically large txnStateStore instances in testing.

View File

@ -22,7 +22,7 @@
fdbrpc_BUILD_SOURCES += fdbrpc/libeio/eio.c
fdbrpc_CFLAGS := -I$(BOOSTDIR) -I. -Ifdbrpc/libeio -DUSE_UCONTEXT
fdbrpc_CFLAGS := -isystem$(BOOSTDIR) -I. -Ifdbrpc/libeio -DUSE_UCONTEXT
fdbrpc_LDFLAGS :=
ifeq ($(PLATFORM),osx)

View File

@ -17,6 +17,8 @@ set(FDBSERVER_SRCS
DBCoreState.h
DiskQueue.actor.cpp
fdbserver.actor.cpp
FDBExecHelper.actor.cpp
FDBExecHelper.actor.h
IDiskQueue.h
IKeyValueStore.h
IPager.h
@ -152,6 +154,7 @@ set(FDBSERVER_SRCS
workloads/Serializability.actor.cpp
workloads/Sideband.actor.cpp
workloads/SlowTaskWorkload.actor.cpp
workloads/SnapTest.actor.cpp
workloads/StatusWorkload.actor.cpp
workloads/Storefront.actor.cpp
workloads/StreamingRead.actor.cpp

View File

@ -40,6 +40,9 @@ struct ConflictBatch {
TransactionConflict = 0,
TransactionTooOld,
TransactionCommitted,
TransactionNotPermitted,
TransactionNotFullyRecovered,
TransactionExecLogAntiQuorum,
};
void addTransaction( const CommitTransactionRef& transaction );
@ -62,4 +65,4 @@ private:
void addConflictRanges(Version now, std::vector< std::pair<StringRef,StringRef> >::iterator begin, std::vector< std::pair<StringRef,StringRef> >::iterator end, class SkipList* part);
};
#endif
#endif

View File

@ -0,0 +1,225 @@
#define BOOST_SYSTEM_NO_LIB
#define BOOST_DATE_TIME_NO_LIB
#define BOOST_REGEX_NO_LIB
#include <boost/process.hpp>
#include "fdbserver/FDBExecHelper.actor.h"
#include "flow/Trace.h"
#include "flow/flow.h"
#if defined(CMAKE_BUILD) || !defined(WIN32)
#include "versions.h"
#endif
#include "flow/actorcompiler.h" // This must be the last #include.
ExecCmdValueString::ExecCmdValueString(StringRef pCmdValueString) {
cmdValueString = pCmdValueString;
parseCmdValue();
}
void ExecCmdValueString::setCmdValueString(StringRef pCmdValueString) {
// reset everything
binaryPath = StringRef();
keyValueMap.clear();
// set the new cmdValueString
cmdValueString = pCmdValueString;
// parse it out
parseCmdValue();
}
StringRef ExecCmdValueString::getCmdValueString() {
	return cmdValueString;
}
StringRef ExecCmdValueString::getBinaryPath() {
return binaryPath;
}
VectorRef<StringRef> ExecCmdValueString::getBinaryArgs() {
return binaryArgs;
}
StringRef ExecCmdValueString::getBinaryArgValue(StringRef key) {
StringRef res;
if (keyValueMap.find(key) != keyValueMap.end()) {
res = keyValueMap[key];
}
return res;
}
void ExecCmdValueString::parseCmdValue() {
StringRef param = this->cmdValueString;
// get the binary path
this->binaryPath = param.eat(LiteralStringRef(":"));
// no arguments provided
if (param == StringRef()) {
return;
}
// extract the arguments
while (param != StringRef()) {
StringRef token = param.eat(LiteralStringRef(","));
this->binaryArgs.push_back(this->binaryArgs.arena(), token);
StringRef key = token.eat(LiteralStringRef("="));
keyValueMap.insert(std::make_pair(key, token));
}
return;
}
void ExecCmdValueString::dbgPrint() {
auto te = TraceEvent("ExecCmdValueString");
te.detail("CmdValueString", cmdValueString.toString());
te.detail("BinaryPath", binaryPath.toString());
int i = 0;
for (auto elem : binaryArgs) {
te.detail(format("Arg", ++i).c_str(), elem.toString());
}
return;
}
ACTOR Future<int> spawnProcess(std::string binPath, std::vector<std::string> paramList, double maxWaitTime, bool isSync)
{
state std::string argsString;
for (auto const& elem : paramList) {
argsString += elem + ",";
}
TraceEvent("SpawnProcess").detail("Cmd", binPath).detail("Args", argsString);
state int err = 0;
state double runTime = 0;
state boost::process::child c(binPath, boost::process::args(paramList),
boost::process::std_err > boost::process::null);
// for async calls in simulator, always delay by a fixed time, otherwise
// the predictability of the simulator breaks
if (!isSync && g_network->isSimulated()) {
wait(delay(deterministicRandom()->random01()));
}
if (!isSync && !g_network->isSimulated()) {
while (c.running() && runTime <= maxWaitTime) {
wait(delay(0.1));
runTime += 0.1;
}
} else {
if (g_network->isSimulated()) {
// to keep the simulator deterministic, wait till the process exits,
// hence giving a large wait time
c.wait_for(std::chrono::hours(24));
ASSERT(!c.running());
} else {
int maxWaitTimeInt = static_cast<int>(maxWaitTime + 1.0);
c.wait_for(std::chrono::seconds(maxWaitTimeInt));
}
}
if (c.running()) {
TraceEvent(SevWarnAlways, "ChildTermination")
.detail("Cmd", binPath)
.detail("Args", argsString);
c.terminate();
err = -1;
if (!c.wait_for(std::chrono::seconds(1))) {
TraceEvent(SevWarnAlways, "SpawnProcessFailedToExit")
.detail("Cmd", binPath)
.detail("Args", argsString);
}
} else {
err = c.exit_code();
}
TraceEvent("SpawnProcess")
.detail("Cmd", binPath)
.detail("Error", err);
return err;
}
ACTOR Future<int> execHelper(ExecCmdValueString* execArg, std::string folder, std::string role) {
state StringRef uidStr = execArg->getBinaryArgValue(LiteralStringRef("uid"));
state int err = 0;
state Future<int> cmdErr;
if (!g_network->isSimulated()) {
// get bin path
auto snapBin = execArg->getBinaryPath();
auto dataFolder = "path=" + folder;
std::vector<std::string> paramList;
paramList.push_back(snapBin.toString());
// get user passed arguments
auto listArgs = execArg->getBinaryArgs();
for (auto elem : listArgs) {
paramList.push_back(elem.toString());
}
// get additional arguments
paramList.push_back(dataFolder);
const char* version = FDB_VT_VERSION;
std::string versionString = "version=";
versionString += version;
paramList.push_back(versionString);
paramList.push_back(role);
cmdErr = spawnProcess(snapBin.toString(), paramList, 3.0, false /*isSync*/);
wait(success(cmdErr));
err = cmdErr.get();
} else {
// copy the files
state std::string folderFrom = folder + "/.";
state std::string folderTo = folder + "-snap-" + uidStr.toString();
std::vector<std::string> paramList;
std::string mkdirBin = "/bin/mkdir";
paramList.push_back(folderTo);
cmdErr = spawnProcess(mkdirBin, paramList, 3.0, false /*isSync*/);
wait(success(cmdErr));
err = cmdErr.get();
if (err == 0) {
std::vector<std::string> paramList;
std::string cpBin = "/bin/cp";
paramList.push_back("-a");
paramList.push_back(folderFrom);
paramList.push_back(folderTo);
cmdErr = spawnProcess(cpBin, paramList, 3.0, true /*isSync*/);
wait(success(cmdErr));
err = cmdErr.get();
}
}
return err;
}
std::map<NetworkAddress, std::set<UID>> execOpsInProgress;
bool isExecOpInProgress(UID execUID) {
NetworkAddress addr = g_network->getLocalAddress();
return (execOpsInProgress[addr].find(execUID) != execOpsInProgress[addr].end());
}
void setExecOpInProgress(UID execUID) {
NetworkAddress addr = g_network->getLocalAddress();
ASSERT(execOpsInProgress[addr].find(execUID) == execOpsInProgress[addr].end());
execOpsInProgress[addr].insert(execUID);
return;
}
void clearExecOpInProgress(UID execUID) {
NetworkAddress addr = g_network->getLocalAddress();
ASSERT(execOpsInProgress[addr].find(execUID) != execOpsInProgress[addr].end());
execOpsInProgress[addr].erase(execUID);
return;
}
std::map<NetworkAddress, std::set<UID>> tLogsAlive;
void registerTLog(UID uid) {
NetworkAddress addr = g_network->getLocalAddress();
tLogsAlive[addr].insert(uid);
}
void unregisterTLog(UID uid) {
NetworkAddress addr = g_network->getLocalAddress();
if (tLogsAlive[addr].find(uid) != tLogsAlive[addr].end()) {
tLogsAlive[addr].erase(uid);
}
}
bool isTLogInSameNode() {
NetworkAddress addr = g_network->getLocalAddress();
return tLogsAlive[addr].size() >= 1;
}

View File

@ -0,0 +1,70 @@
#pragma once
#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_EXEC_HELPER_ACTOR_G_H)
#define FDBSERVER_EXEC_HELPER_ACTOR_G_H
#include "fdbserver/FDBExecHelper.actor.g.h"
#elif !defined(FDBSERVER_EXEC_HELPER_ACTOR_H)
#define FDBSERVER_EXEC_HELPER_ACTOR_H
#include <string>
#include <vector>
#include <map>
#include "flow/Arena.h"
#include "flow/flow.h"
#include "flow/actorcompiler.h"
// execute/snapshot command takes two arguments: <param1> <param2>
// param1 - represents the command type/name
// param2 - takes a binary path followed by a set of arguments in the following
// format <binary-path>:<key1=val1>,<key2=val2>...
// this class will abstract the format and give functions to get various pieces
// of information
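// For example (hypothetical values, following the format above), the
// cmdValueString
//     /bin/snap_create.sh:uid=da1b3c,timeout=60
// has binary path "/bin/snap_create.sh", binary args ["uid=da1b3c",
// "timeout=60"], and getBinaryArgValue("uid") would return "da1b3c".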
class ExecCmdValueString {
public: // ctor & dtor
ExecCmdValueString() {}
explicit ExecCmdValueString(StringRef cmdValueString);
public: // interfaces
StringRef getBinaryPath();
VectorRef<StringRef> getBinaryArgs();
StringRef getBinaryArgValue(StringRef key);
void setCmdValueString(StringRef cmdValueString);
StringRef getCmdValueString(void);
public: // helper functions
void dbgPrint();
private: // functions
void parseCmdValue();
private: // data
Standalone<StringRef> cmdValueString;
Standalone<VectorRef<StringRef>> binaryArgs;
StringRef binaryPath;
std::map<StringRef, StringRef> keyValueMap;
};
// FIXME: move this function to a common location
// spawns the process pointed to by `binPath` with the arguments provided in `paramList`;
// if the spawned process takes more than `maxWaitTime` then it will be killed;
// if isSync is set to true then the process will be executed synchronously
ACTOR Future<int> spawnProcess(std::string binPath, std::vector<std::string> paramList, double maxWaitTime, bool isSync);
// helper to run all the work related to running the exec command
ACTOR Future<int> execHelper(ExecCmdValueString* execArg, std::string folder, std::string role);
// returns true if the execUID op is in progress
bool isExecOpInProgress(UID execUID);
// adds the execUID op to the list of ops in progress
void setExecOpInProgress(UID execUID);
// clears the execUID op from the list of ops in progress
void clearExecOpInProgress(UID execUID);
// registers a non-stopped TLog instance
void registerTLog(UID uid);
// unregisters a stopped TLog instance
void unregisterTLog(UID uid);
// checks if there is any non-stopped TLog instance on this node
bool isTLogInSameNode();
#endif

View File

@ -78,6 +78,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( DISK_QUEUE_MAX_TRUNCATE_BYTES, 2<<30 ); if ( randomize && BUGGIFY ) DISK_QUEUE_MAX_TRUNCATE_BYTES = 0;
init( TLOG_DEGRADED_DELAY_COUNT, 5 );
init( TLOG_DEGRADED_DURATION, 5.0 );
init( TLOG_IGNORE_POP_AUTO_ENABLE_DELAY, 300.0 );
// Data distribution queue
init( HEALTH_POLL_TIME, 1.0 );

View File

@ -316,6 +316,7 @@ public:
int64_t TLOG_SPILL_THRESHOLD;
int64_t TLOG_HARD_LIMIT_BYTES;
int64_t TLOG_RECOVER_MEMORY_LIMIT;
double TLOG_IGNORE_POP_AUTO_ENABLE_DELAY;
double MAX_TRANSACTIONS_PER_BYTE;

View File

@ -231,7 +231,8 @@ public:
return resultEntries.size() == 0;
}
void getPushLocations( std::vector<Tag> const& tags, std::vector<int>& locations, int locationOffset ) {
void getPushLocations(std::vector<Tag> const& tags, std::vector<int>& locations, int locationOffset,
bool allLocations = false) {
if(locality == tagLocalitySatellite) {
for(auto& t : tags) {
if(t == txsTag || t.locality == tagLocalityLogRouter) {
@ -248,9 +249,17 @@ public:
alsoServers.clear();
resultEntries.clear();
for(auto& t : tags) {
if(locality == tagLocalitySpecial || t.locality == locality || t.locality < 0) {
newLocations.push_back(bestLocationFor(t));
if (allLocations) {
// special handling for allLocations
TraceEvent("AllLocationsSet");
for (int i = 0; i < logServers.size(); i++) {
newLocations.push_back(i);
}
} else {
for (auto& t : tags) {
if (locality == tagLocalitySpecial || t.locality == locality || t.locality < 0) {
newLocations.push_back(bestLocationFor(t));
}
}
}
@ -690,7 +699,7 @@ struct ILogSystem {
virtual Future<Void> onLogSystemConfigChange() = 0;
// Returns when the log system configuration has changed due to a tlog rejoin.
virtual void getPushLocations( std::vector<Tag> const& tags, std::vector<int>& locations ) = 0;
virtual void getPushLocations(std::vector<Tag> const& tags, std::vector<int>& locations, bool allLocations = false) = 0;
virtual bool hasRemoteLogs() = 0;
@ -733,7 +742,7 @@ struct CompareFirst {
struct LogPushData : NonCopyable {
// Log subsequences have to start at 1 (the MergedPeekCursor relies on this to make sure we never have !hasMessage() in the middle of data for a version)
explicit LogPushData(Reference<ILogSystem> logSystem) : logSystem(logSystem), subsequence(1) {
explicit LogPushData(Reference<ILogSystem> logSystem) : logSystem(logSystem), subsequence(1), hasExecOp(false) {
for(auto& log : logSystem->getLogSystemConfig().tLogs) {
if(log.isLocal) {
for(int i = 0; i < log.tLogs.size(); i++) {
@ -776,7 +785,7 @@ struct LogPushData : NonCopyable {
}
template <class T>
void addTypedMessage( T const& item ) {
void addTypedMessage(T const& item, bool allLocations = false) {
prev_tags.clear();
if(logSystem->hasRemoteLogs()) {
prev_tags.push_back( logSystem->getRandomRouterTag() );
@ -785,8 +794,8 @@ struct LogPushData : NonCopyable {
prev_tags.push_back(tag);
}
msg_locations.clear();
logSystem->getPushLocations( prev_tags, msg_locations );
logSystem->getPushLocations(prev_tags, msg_locations, allLocations);
uint32_t subseq = this->subsequence++;
for(int loc : msg_locations) {
// FIXME: memcpy after the first time
@ -805,6 +814,10 @@ struct LogPushData : NonCopyable {
return messagesWriter[loc].toValue();
}
void setHasExecOp() { hasExecOp = true; }
bool getHasExecOp() { return hasExecOp; }
private:
Reference<ILogSystem> logSystem;
std::vector<Tag> next_message_tags;
@ -812,6 +825,7 @@ private:
std::vector<BinaryWriter> messagesWriter;
std::vector<int> msg_locations;
uint32_t subsequence;
bool hasExecOp;
};
#endif

View File

@ -34,6 +34,7 @@
#include "fdbclient/Notified.h"
#include "fdbclient/KeyRangeMap.h"
#include "fdbserver/ConflictSet.h"
#include "fdbclient/SystemData.h"
#include "flow/Stats.h"
#include "fdbserver/ApplyMetadataMutation.h"
#include "fdbserver/RecoveryState.h"
@ -41,6 +42,10 @@
#include "fdbclient/Atomic.h"
#include "flow/TDMetric.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
#include "fdbclient/DatabaseConfiguration.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/Knobs.h"
#include "fdbserver/FDBExecHelper.actor.h"
struct ProxyStats {
CounterCollection cc;
@ -220,6 +225,7 @@ struct ProxyCommitData {
RequestStream<GetReadVersionRequest> getConsistentReadVersion;
RequestStream<CommitTransactionRequest> commit;
Database cx;
Reference<AsyncVar<ServerDBInfo>> db;
EventMetricHandle<SingleKeyMutation> singleKeyMutationEvent;
std::map<UID, Reference<StorageInfo>> storageCache;
@ -227,6 +233,7 @@ struct ProxyCommitData {
Deque<std::pair<Version, Version>> txsPopVersions;
Version lastTxsPop;
bool popRemoteTxs;
vector<Standalone<StringRef>> whitelistedBinPathVec;
Optional<LatencyBandConfig> latencyBandConfig;
@ -256,7 +263,7 @@ struct ProxyCommitData {
lastVersionTime(0), commitVersionRequestNumber(1), mostRecentProcessedRequestNumber(0),
getConsistentReadVersion(getConsistentReadVersion), commit(commit), lastCoalesceTime(0),
localCommitBatchesStarted(0), locked(false), commitBatchInterval(SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_INTERVAL_MIN),
firstProxy(firstProxy), cx(openDBOnServer(db, TaskDefaultEndpoint, true, true)),
firstProxy(firstProxy), cx(openDBOnServer(db, TaskDefaultEndpoint, true, true)), db(db),
singleKeyMutationEvent(LiteralStringRef("SingleKeyMutation")), commitBatchesMemBytesCount(0), lastTxsPop(0)
{}
};
@ -408,6 +415,34 @@ ACTOR Future<Void> commitBatcher(ProxyCommitData *commitData, PromiseStream<std:
}
}
void createWhitelistBinPathVec(const std::string& binPath, vector<Standalone<StringRef>>& binPathVec) {
TraceEvent(SevDebug, "BinPathConverter").detail("Input", binPath);
StringRef input(binPath);
while (input != StringRef()) {
StringRef token = input.eat(LiteralStringRef(","));
if (token != StringRef()) {
const uint8_t* ptr = token.begin();
while (ptr != token.end() && *ptr == ' ') {
ptr++;
}
if (ptr != token.end()) {
Standalone<StringRef> newElement(token.substr(ptr - token.begin()));
TraceEvent(SevDebug, "BinPathItem").detail("Element", newElement);
binPathVec.push_back(newElement);
}
}
}
return;
}
bool isWhitelisted(const vector<Standalone<StringRef>>& binPathVec, StringRef binPath) {
TraceEvent("BinPath").detail("Value", binPath);
for (const auto& item : binPathVec) {
TraceEvent("Element").detail("Value", item);
}
return std::find(binPathVec.begin(), binPathVec.end(), binPath) != binPathVec.end();
}
ACTOR Future<Void> commitBatch(
ProxyCommitData* self,
vector<CommitTransactionRequest> trs,
@ -727,10 +762,98 @@ ACTOR Future<Void> commitBatch(
toCommit.addTags(allSources);
}
toCommit.addTypedMessage(m);
}
else
UNREACHABLE();
} else if (m.type == MutationRef::Exec) {
state std::string param2 = m.param2.toString();
state ExecCmdValueString execArg(param2);
execArg.dbgPrint();
state StringRef binPath = execArg.getBinaryPath();
state StringRef uidStr = execArg.getBinaryArgValue(LiteralStringRef("uid"));
auto result =
self->txnStateStore->readValue(LiteralStringRef("log_anti_quorum").withPrefix(configKeysPrefix)).get();
state int logAntiQuorum = 0;
if (result.present()) {
logAntiQuorum = atoi(result.get().toString().c_str());
}
if (m.param1 != execDisableTLogPop
&& m.param1 != execEnableTLogPop
&& !isWhitelisted(self->whitelistedBinPathVec, binPath)) {
TraceEvent("ExecTransactionNotPermitted")
.detail("TransactionNum", transactionNum);
committed[transactionNum] = ConflictBatch::TransactionNotPermitted;
} else if (self->db->get().recoveryState != RecoveryState::FULLY_RECOVERED) {
// The cluster is not fully recovered and needs TLogs
// from the previous generation for full recovery.
// Currently, snapshotting an old TLog generation is not
// supported, hence we fail the snapshot request until
// the cluster is fully recovered.
TraceEvent("ExecTransactionNotFullyRecovered")
.detail("TransactionNum", transactionNum);
committed[transactionNum] = ConflictBatch::TransactionNotFullyRecovered;
} else if (logAntiQuorum > 0) {
// exec op is not supported when logAntiQuorum is configured
// FIXME: Add support for exec ops in the presence of log anti quorum
TraceEvent("ExecOpNotSupportedWithLogAntiQuorum")
.detail("LogAntiQuorum", logAntiQuorum)
.detail("TransactionNum", transactionNum);
committed[transactionNum] = ConflictBatch::TransactionExecLogAntiQuorum;
} else {
// Send the ExecOp to
// - all the storage nodes in a single region and
// - only to storage nodes in local region in multi-region setup
// step 1: get the DatabaseConfiguration
auto result =
self->txnStateStore->readValue(LiteralStringRef("usable_regions").withPrefix(configKeysPrefix)).get();
ASSERT(result.present());
state int usableRegions = atoi(result.get().toString().c_str());
// step 2: find the tag.id from locality info of the master
auto localityKey =
self->txnStateStore->readValue(tagLocalityListKeyFor(self->master.locality.dcId())).get();
int8_t locality = tagLocalityInvalid;
if (usableRegions > 1) {
if (!localityKey.present()) {
TraceEvent(SevError, "LocalityKeyNotPresentForMasterDCID");
ASSERT(localityKey.present());
}
locality = decodeTagLocalityListValue(localityKey.get());
}
std::set<Tag> allSources;
auto& m = (*pMutations)[mutationNum];
if (debugMutation("ProxyCommit", commitVersion, m))
TraceEvent("ProxyCommitTo", self->dbgid)
.detail("To", "all sources")
.detail("Mutation", m.toString())
.detail("Version", commitVersion);
std::vector<Tag> localTags;
auto tagKeys = self->txnStateStore->readRange(serverTagKeys).get();
for( auto& kv : tagKeys ) {
Tag t = decodeServerTagValue( kv.value );
if ((usableRegions > 1 && t.locality == locality)
|| (usableRegions == 1)) {
localTags.push_back(t);
}
allSources.insert(localTags.begin(), localTags.end());
}
auto te1 = TraceEvent("ProxyCommitTo", self->dbgid);
te1.detail("To", "all sources");
te1.detail("UidStr", uidStr);
te1.detail("Mutation", m.toString());
te1.detail("Version", commitVersion);
te1.detail("NumTags", allSources.size());
for (auto& tag : allSources) {
toCommit.addTag(tag);
}
toCommit.addTypedMessage(m, true /* allLocations */);
toCommit.setHasExecOp();
}
} else
UNREACHABLE();
// Check on backing up key, if backup ranges are defined and a normal key
@ -948,7 +1071,15 @@ ACTOR Future<Void> commitBatch(
else if (committed[t] == ConflictBatch::TransactionTooOld) {
trs[t].reply.sendError(transaction_too_old());
}
else {
else if (committed[t] == ConflictBatch::TransactionNotPermitted) {
trs[t].reply.sendError(transaction_not_permitted());
}
else if (committed[t] == ConflictBatch::TransactionNotFullyRecovered) {
trs[t].reply.sendError(cluster_not_fully_recovered());
}
else if (committed[t] == ConflictBatch::TransactionExecLogAntiQuorum) {
trs[t].reply.sendError(txn_exec_log_anti_quorum());
} else {
trs[t].reply.sendError(not_committed());
}
@ -1329,12 +1460,12 @@ ACTOR Future<Void> healthMetricsRequestServer(MasterProxyInterface proxy, GetHea
}
}
ACTOR Future<Void> monitorRemoteCommitted(ProxyCommitData* self, Reference<AsyncVar<ServerDBInfo>> db) {
ACTOR Future<Void> monitorRemoteCommitted(ProxyCommitData* self) {
loop {
wait(delay(0)); //allow this actor to be cancelled if we are removed after db changes.
state Optional<std::vector<OptionalInterface<TLogInterface>>> remoteLogs;
if(db->get().recoveryState >= RecoveryState::ALL_LOGS_RECRUITED) {
for(auto& logSet : db->get().logSystemConfig.tLogs) {
if(self->db->get().recoveryState >= RecoveryState::ALL_LOGS_RECRUITED) {
for(auto& logSet : self->db->get().logSystemConfig.tLogs) {
if(!logSet.isLocal) {
remoteLogs = logSet.tLogs;
for(auto& tLog : logSet.tLogs) {
@ -1349,12 +1480,12 @@ ACTOR Future<Void> monitorRemoteCommitted(ProxyCommitData* self, Reference<Async
}
if(!remoteLogs.present()) {
wait(db->onChange());
wait(self->db->onChange());
continue;
}
self->popRemoteTxs = true;
state Future<Void> onChange = db->onChange();
state Future<Void> onChange = self->db->onChange();
loop {
state std::vector<Future<TLogQueuingMetricsReply>> replies;
for(auto &it : remoteLogs.get()) {
@ -1392,7 +1523,8 @@ ACTOR Future<Void> masterProxyServerCore(
Reference<AsyncVar<ServerDBInfo>> db,
LogEpoch epoch,
Version recoveryTransactionVersion,
bool firstProxy)
bool firstProxy,
std::string whitelistBinPaths)
{
state ProxyCommitData commitData(proxy.id(), master, proxy.getConsistentReadVersion, recoveryTransactionVersion, proxy.commit, db, firstProxy);
@ -1416,31 +1548,32 @@ ACTOR Future<Void> masterProxyServerCore(
//TraceEvent("ProxyInit1", proxy.id());
// Wait until we can load the "real" logsystem, since we don't support switching them currently
while (!(db->get().master.id() == master.id() && db->get().recoveryState >= RecoveryState::RECOVERY_TRANSACTION)) {
while (!(commitData.db->get().master.id() == master.id() && commitData.db->get().recoveryState >= RecoveryState::RECOVERY_TRANSACTION)) {
//TraceEvent("ProxyInit2", proxy.id()).detail("LSEpoch", db->get().logSystemConfig.epoch).detail("Need", epoch);
wait(db->onChange());
wait(commitData.db->onChange());
}
state Future<Void> dbInfoChange = db->onChange();
state Future<Void> dbInfoChange = commitData.db->onChange();
//TraceEvent("ProxyInit3", proxy.id());
commitData.resolvers = db->get().resolvers;
commitData.resolvers = commitData.db->get().resolvers;
ASSERT(commitData.resolvers.size() != 0);
auto rs = commitData.keyResolvers.modify(allKeys);
for(auto r = rs.begin(); r != rs.end(); ++r)
r->value().emplace_back(0,0);
commitData.logSystem = ILogSystem::fromServerDBInfo(proxy.id(), db->get(), false, addActor);
commitData.logSystem = ILogSystem::fromServerDBInfo(proxy.id(), commitData.db->get(), false, addActor);
commitData.logAdapter = new LogSystemDiskQueueAdapter(commitData.logSystem, txsTag, Reference<AsyncVar<PeekSpecialInfo>>(), false);
commitData.txnStateStore = keyValueStoreLogSystem(commitData.logAdapter, proxy.id(), 2e9, true, true, true);
createWhitelistBinPathVec(whitelistBinPaths, commitData.whitelistedBinPathVec);
// ((SERVER_MEM_LIMIT * COMMIT_BATCHES_MEM_FRACTION_OF_TOTAL) / COMMIT_BATCHES_MEM_TO_TOTAL_MEM_SCALE_FACTOR) is only an approximate formula for limiting the memory used.
// COMMIT_BATCHES_MEM_TO_TOTAL_MEM_SCALE_FACTOR is an estimate based on experiments, not an accurate value.
state int64_t commitBatchesMemoryLimit = std::min(SERVER_KNOBS->COMMIT_BATCHES_MEM_BYTES_HARD_LIMIT, static_cast<int64_t>((SERVER_KNOBS->SERVER_MEM_LIMIT * SERVER_KNOBS->COMMIT_BATCHES_MEM_FRACTION_OF_TOTAL) / SERVER_KNOBS->COMMIT_BATCHES_MEM_TO_TOTAL_MEM_SCALE_FACTOR));
TraceEvent(SevInfo, "CommitBatchesMemoryLimit").detail("BytesLimit", commitBatchesMemoryLimit);
addActor.send(monitorRemoteCommitted(&commitData, db));
addActor.send(transactionStarter(proxy, db, addActor, &commitData, &healthMetricsReply, &detailedHealthMetricsReply));
addActor.send(monitorRemoteCommitted(&commitData));
addActor.send(transactionStarter(proxy, commitData.db, addActor, &commitData, &healthMetricsReply, &detailedHealthMetricsReply));
addActor.send(readRequestServer(proxy, &commitData));
addActor.send(rejoinServer(proxy, &commitData));
addActor.send(healthMetricsRequestServer(proxy, &healthMetricsReply, &detailedHealthMetricsReply));
@ -1451,21 +1584,21 @@ ACTOR Future<Void> masterProxyServerCore(
int commitBatchByteLimit =
(int)std::min<double>(SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_BYTES_MAX,
std::max<double>(SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_BYTES_MIN,
SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_BYTES_SCALE_BASE * pow(db->get().client.proxies.size(), SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_BYTES_SCALE_POWER)));
SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_BYTES_SCALE_BASE * pow(commitData.db->get().client.proxies.size(), SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_BYTES_SCALE_POWER)));
commitBatcherActor = commitBatcher(&commitData, batchedCommits, proxy.commit.getFuture(), commitBatchByteLimit, commitBatchesMemoryLimit);
loop choose{
when( wait( dbInfoChange ) ) {
dbInfoChange = db->onChange();
if(db->get().master.id() == master.id() && db->get().recoveryState >= RecoveryState::RECOVERY_TRANSACTION) {
commitData.logSystem = ILogSystem::fromServerDBInfo(proxy.id(), db->get(), false, addActor);
dbInfoChange = commitData.db->onChange();
if(commitData.db->get().master.id() == master.id() && commitData.db->get().recoveryState >= RecoveryState::RECOVERY_TRANSACTION) {
commitData.logSystem = ILogSystem::fromServerDBInfo(proxy.id(), commitData.db->get(), false, addActor);
for(auto it : commitData.tag_popped) {
commitData.logSystem->pop(it.second, it.first);
}
commitData.logSystem->pop(commitData.lastTxsPop, txsTag, 0, tagLocalityRemoteLog);
}
Optional<LatencyBandConfig> newLatencyBandConfig = db->get().latencyBandConfig;
Optional<LatencyBandConfig> newLatencyBandConfig = commitData.db->get().latencyBandConfig;
if(newLatencyBandConfig.present() != commitData.latencyBandConfig.present()
|| (newLatencyBandConfig.present() && newLatencyBandConfig.get().grvConfig != commitData.latencyBandConfig.get().grvConfig))
@ -1498,7 +1631,7 @@ ACTOR Future<Void> masterProxyServerCore(
const vector<CommitTransactionRequest> &trs = batchedRequests.first;
int batchBytes = batchedRequests.second;
//TraceEvent("MasterProxyCTR", proxy.id()).detail("CommitTransactions", trs.size()).detail("TransactionRate", transactionRate).detail("TransactionQueue", transactionQueue.size()).detail("ReleasedTransactionCount", transactionCount);
if (trs.size() || (db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS && now() - lastCommit >= SERVER_KNOBS->MAX_COMMIT_BATCH_INTERVAL)) {
if (trs.size() || (commitData.db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS && now() - lastCommit >= SERVER_KNOBS->MAX_COMMIT_BATCH_INTERVAL)) {
lastCommit = now();
if (trs.size() || lastCommitComplete.isReady()) {
@ -1517,6 +1650,63 @@ ACTOR Future<Void> masterProxyServerCore(
rep.version = commitData.committedVersion.get();
req.reply.send(rep);
}
when(ExecRequest _execReq = waitNext(proxy.execReq.getFuture())) {
state ExecRequest execReq = _execReq;
if (execReq.debugID.present())
g_traceBatch.addEvent("TransactionDebug", execReq.debugID.get().first(),
"MasterProxyServer.masterProxyServerCore."
"ExecRequest");
TraceEvent("ExecRequest").detail("Payload", execReq.execPayload.toString());
// get the list of coordinators
state Optional<Value> coordinators = commitData.txnStateStore->readValue(coordinatorsKey).get();
state std::vector<NetworkAddress> coordinatorsAddr =
ClusterConnectionString(coordinators.get().toString()).coordinators();
state std::set<NetworkAddress> coordinatorsAddrSet;
for (int i = 0; i < coordinatorsAddr.size(); i++) {
TraceEvent(SevDebug, "CoordinatorAddress").detail("Addr", coordinatorsAddr[i]);
coordinatorsAddrSet.insert(coordinatorsAddr[i]);
}
// get the list of workers
state std::vector<WorkerDetails> workers =
wait(commitData.db->get().clusterInterface.getWorkers.getReply(GetWorkersRequest()));
// send the exec command to the workers that are coordinators
state vector<Future<Void>> execCoords;
for (int i = 0; i < workers.size(); i++) {
NetworkAddress primary = workers[i].interf.address();
Optional<NetworkAddress> secondary = workers[i].interf.tLog.getEndpoint().addresses.secondaryAddress;
if (coordinatorsAddrSet.find(primary) != coordinatorsAddrSet.end()
|| (secondary.present() && (coordinatorsAddrSet.find(secondary.get()) != coordinatorsAddrSet.end()))) {
TraceEvent("ExecReqToCoordinator")
.detail("PrimaryWorkerAddr", primary)
.detail("SecondaryWorkerAddr", secondary);
execCoords.push_back(brokenPromiseToNever(workers[i].interf.execReq.getReply(ExecuteRequest(execReq.execPayload))));
}
}
if (execCoords.empty()) {
TraceEvent(SevDebug, "CoordinatorWorkersNotFound");
execReq.reply.sendError(operation_failed());
} else {
try {
wait(timeoutError(waitForAll(execCoords), 10.0));
int numSucc = 0;
for (auto item : execCoords) {
if (item.isValid() && item.isReady()) {
++numSucc;
}
}
bool succ = (numSucc >= ((execCoords.size() + 1) / 2));
if (succ) {
	execReq.reply.send(Void());
} else {
	execReq.reply.sendError(operation_failed());
}
} catch (Error& e) {
TraceEvent("WaitingForAllExecCoords").error(e);
execReq.reply.sendError(broken_promise());
}
}
}
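// Two pieces of the ExecRequest path above restate cleanly in isolation: a
// worker counts as a coordinator if either of its addresses is in the
// coordinator set, and the request succeeds once at least half of the
// contacted coordinators (computed as (n + 1) / 2 in integer arithmetic)
// reply. A hedged out-of-line sketch; the helper names are hypothetical and
// addresses are modeled as strings:

#include <cstddef>
#include <optional>
#include <set>
#include <string>

// A worker is a coordinator if its primary or secondary address appears in
// the coordinator set.
bool isCoordinator(const std::set<std::string>& coordinators,
                   const std::string& primary,
                   const std::optional<std::string>& secondary) {
	return coordinators.count(primary) > 0 ||
	       (secondary && coordinators.count(*secondary) > 0);
}

// Success criterion used above: at least (n + 1) / 2 replies, so 2 of 3
// coordinators or 3 of 5 must acknowledge the exec payload.
bool execSucceeded(size_t numSucc, size_t numCoordinators) {
	return numSucc >= (numCoordinators + 1) / 2;
}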
when(TxnStateRequest req = waitNext(proxy.txnState.getFuture())) {
state ReplyPromise<Void> reply = req.reply;
if(req.last) maxSequence = req.sequence + 1;
@ -1601,10 +1791,11 @@ ACTOR Future<Void> checkRemoved(Reference<AsyncVar<ServerDBInfo>> db, uint64_t r
ACTOR Future<Void> masterProxyServer(
MasterProxyInterface proxy,
InitializeMasterProxyRequest req,
Reference<AsyncVar<ServerDBInfo>> db)
Reference<AsyncVar<ServerDBInfo>> db,
std::string whitelistBinPaths)
{
try {
state Future<Void> core = masterProxyServerCore(proxy, req.master, db, req.recoveryCount, req.recoveryTransactionVersion, req.firstProxy);
state Future<Void> core = masterProxyServerCore(proxy, req.master, db, req.recoveryCount, req.recoveryTransactionVersion, req.firstProxy, whitelistBinPaths);
loop choose{
when(wait(core)) { return Void(); }
when(wait(checkRemoved(db, req.recoveryCount, proxy))) {}

View File

@ -24,6 +24,7 @@
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/Notified.h"
#include "fdbclient/KeyRangeMap.h"
#include "fdbclient/RunTransaction.actor.h"
#include "fdbclient/SystemData.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/TLogInterface.h"
@ -38,6 +39,7 @@
#include "fdbserver/LogSystem.h"
#include "fdbserver/WaitFailure.h"
#include "fdbserver/RecoveryState.h"
#include "fdbserver/FDBExecHelper.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
using std::pair;
@ -251,6 +253,7 @@ struct TLogData : NonCopyable {
AsyncVar<bool> largeDiskQueueCommitBytes; //becomes true when diskQueueCommitBytes is greater than MAX_QUEUE_COMMIT_BYTES
Reference<AsyncVar<ServerDBInfo>> dbInfo;
Database cx;
NotifiedVersion queueCommitEnd;
Version queueCommitBegin;
@ -274,15 +277,27 @@ struct TLogData : NonCopyable {
FlowLock concurrentLogRouterReads;
FlowLock persistentDataCommitLock;
bool ignorePopRequest; // ignore pop request from storage servers
double ignorePopDeadline; // time until which the ignorePopRequest will be honored
std::string ignorePopUid; // set by the caller that sets ignorePopRequest; used to
                          // validate ownership so that only the matching caller
                          // can unset it
std::string dataFolder; // folder where data is stored
std::map<Tag, Version> toBePopped; // map of Tag->Version for all the pops
// that came when ignorePopRequest was set
Reference<AsyncVar<bool>> degraded;
TLogData(UID dbgid, IKeyValueStore* persistentData, IDiskQueue * persistentQueue, Reference<AsyncVar<ServerDBInfo>> dbInfo, Reference<AsyncVar<bool>> degraded)
TLogData(UID dbgid, IKeyValueStore* persistentData, IDiskQueue * persistentQueue, Reference<AsyncVar<ServerDBInfo>> dbInfo, Reference<AsyncVar<bool>> degraded, std::string folder)
: dbgid(dbgid), instanceID(deterministicRandom()->randomUniqueID().first()),
persistentData(persistentData), rawPersistentQueue(persistentQueue), persistentQueue(new TLogQueue(persistentQueue, dbgid)),
dbInfo(dbInfo), degraded(degraded), queueCommitBegin(0), queueCommitEnd(0),
diskQueueCommitBytes(0), largeDiskQueueCommitBytes(false), bytesInput(0), bytesDurable(0), overheadBytesInput(0), overheadBytesDurable(0),
concurrentLogRouterReads(SERVER_KNOBS->CONCURRENT_LOG_ROUTER_READS)
concurrentLogRouterReads(SERVER_KNOBS->CONCURRENT_LOG_ROUTER_READS),
ignorePopRequest(false), ignorePopDeadline(), ignorePopUid(), dataFolder(folder), toBePopped()
{
cx = openDBOnServer(dbInfo, TaskDefaultEndpoint, true, true);
}
};
@ -416,13 +431,15 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
UID recruitmentID;
std::set<Tag> allTags;
Future<Void> terminated;
FlowLock execOpLock;
bool execOpCommitInProgress;
explicit LogData(TLogData* tLogData, TLogInterface interf, Tag remoteTag, bool isPrimary, int logRouterTags, UID recruitmentID, std::vector<Tag> tags) : tLogData(tLogData), knownCommittedVersion(0), logId(interf.id()),
cc("TLog", interf.id().toString()), bytesInput("BytesInput", cc), bytesDurable("BytesDurable", cc), remoteTag(remoteTag), isPrimary(isPrimary), logRouterTags(logRouterTags), recruitmentID(recruitmentID),
logSystem(new AsyncVar<Reference<ILogSystem>>()), logRouterPoppedVersion(0), durableKnownCommittedVersion(0), minKnownCommittedVersion(0), allTags(tags.begin(), tags.end()), terminated(tLogData->terminated.getFuture()),
// These are initialized differently on init() or recovery
recoveryCount(), stopped(false), initialized(false), queueCommittingVersion(0), newPersistentDataVersion(invalidVersion), unrecoveredBefore(1), recoveredAt(1), unpoppedRecoveredTags(0),
logRouterPopToVersion(0), locality(tagLocalityInvalid)
logRouterPopToVersion(0), locality(tagLocalityInvalid), execOpCommitInProgress(false)
{
startRole(Role::TRANSACTION_LOG, interf.id(), UID());
@ -519,6 +536,7 @@ ACTOR Future<Void> tLogLock( TLogData* self, ReplyPromise< TLogLockResult > repl
TEST( !logData->stopped );
TraceEvent("TLogStop", logData->logId).detail("Ver", stopVersion).detail("IsStopped", logData->stopped).detail("QueueCommitted", logData->queueCommittedVersion.get());
unregisterTLog(logData->logId);
logData->stopped = true;
if(!logData->recoveryComplete.isSet()) {
@ -886,14 +904,28 @@ std::deque<std::pair<Version, LengthPrefixedStringRef>> & getVersionMessages( Re
return tagData->versionMessages;
};
ACTOR Future<Void> tLogPop( TLogData* self, TLogPopRequest req, Reference<LogData> logData ) {
state Version upTo = req.to;
int8_t tagLocality = req.tag.locality;
ACTOR Future<Void> tLogPopCore( TLogData* self, Tag inputTag, Version to, Reference<LogData> logData ) {
if (self->ignorePopRequest && inputTag != txsTag) {
TraceEvent("IgnoringPopRequest").detail("IgnorePopDeadline", self->ignorePopDeadline);
if (self->toBePopped.find(inputTag) == self->toBePopped.end()
|| to > self->toBePopped[inputTag]) {
self->toBePopped[inputTag] = to;
}
// add the pop to the toBePopped map
TraceEvent(SevDebug, "IgnoringPopRequest")
.detail("IgnorePopDeadline", self->ignorePopDeadline)
.detail("Tag", inputTag.toString())
.detail("Version", to);
return Void();
}
state Version upTo = to;
int8_t tagLocality = inputTag.locality;
if (logData->logSystem->get().isValid() && logData->logSystem->get()->isPseudoLocality(tagLocality)) {
upTo = logData->logSystem->get()->popPseudoLocalityTag(tagLocality, req.to);
upTo = logData->logSystem->get()->popPseudoLocalityTag(tagLocality, to);
tagLocality = tagLocalityLogRouter;
}
state Tag tag(tagLocality, req.tag.id);
state Tag tag(tagLocality, inputTag.id);
auto tagData = logData->getTagData(tag);
if (!tagData) {
tagData = logData->createTagData(tag, upTo, true, true, false);
@ -914,7 +946,34 @@ ACTOR Future<Void> tLogPop( TLogData* self, TLogPopRequest req, Reference<LogDat
wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskTLogPop));
//TraceEvent("TLogPop", self->dbgid).detail("Tag", tag.toString()).detail("To", upTo);
}
return Void();
}
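The pseudo-locality branch above rewrites the pop to the log-router locality and lets popPseudoLocalityTag pick the version the shared tag may safely be popped to. A hedged model of that bookkeeping, assuming (based on the surrounding code, not confirmed by it) that the underlying tag may only be popped to the minimum version across its pseudo-locality consumers:

#include <algorithm>
#include <map>

struct PseudoPopTracker {
	std::map<int /*pseudoLocality*/, long /*poppedVersion*/> popped;

	// Record a pop for one pseudo locality and return the version the shared
	// underlying tag may be popped to: the minimum over all consumers.
	long pop(int pseudoLocality, long version) {
		long& v = popped[pseudoLocality];
		v = std::max(v, version);
		long minVersion = v;
		for (const auto& kv : popped) minVersion = std::min(minVersion, kv.second);
		return minVersion;
	}
};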
ACTOR Future<Void> tLogPop( TLogData* self, TLogPopRequest req, Reference<LogData> logData ) {
// timeout check for ignorePopRequest
if (self->ignorePopRequest && (g_network->now() > self->ignorePopDeadline)) {
TraceEvent("EnableTLogPlayAllIgnoredPops");
// use toBePopped and issue all the pops
state std::map<Tag, Version>::iterator it;
state vector<Future<Void>> ignoredPops;
self->ignorePopRequest = false;
self->ignorePopUid = "";
self->ignorePopDeadline = 0.0;
for (it = self->toBePopped.begin(); it != self->toBePopped.end(); it++) {
TraceEvent("PlayIgnoredPop")
.detail("Tag", it->first.toString())
.detail("Version", it->second);
ignoredPops.push_back(tLogPopCore(self, it->first, it->second, logData));
}
self->toBePopped.clear();
wait(waitForAll(ignoredPops));
TraceEvent("ResetIgnorePopRequest")
.detail("Now", g_network->now())
.detail("IgnorePopRequest", self->ignorePopRequest)
.detail("IgnorePopDeadline", self->ignorePopDeadline);
}
wait(tLogPopCore(self, req.tag, req.to, logData));
req.reply.send(Void());
return Void();
}
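The gating across tLogPop and tLogPopCore amounts to: while pops are disabled, coalesce requests per tag keeping only the highest version; once re-enabled (explicitly, or when the deadline passes) replay the deferred map. A hedged, self-contained model of that state; PopGate is a hypothetical name and tags are modeled as strings:

#include <map>
#include <string>

struct PopGate {
	bool ignorePops = false;
	double deadline = 0.0;
	std::map<std::string /*tag*/, long /*version*/> deferred;

	// Returns true if the pop was deferred rather than applied; deferred pops
	// keep only the largest requested version per tag.
	bool offer(const std::string& tag, long version, double now) {
		if (ignorePops && now <= deadline) {
			long& v = deferred[tag];
			if (version > v) v = version;
			return true;
		}
		return false;
	}
};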
@ -1218,6 +1277,207 @@ ACTOR Future<Void> commitQueue( TLogData* self ) {
}
}
void execProcessingHelper(TLogData* self,
Reference<LogData> logData,
TLogCommitRequest* req,
Standalone<VectorRef<Tag>>* execTags,
ExecCmdValueString* execArg,
StringRef* execCmd,
Version* execVersion,
vector<Future<Void>>* snapFailKeySetters,
vector<Future<Void>>* ignoredPops)
{
// Inspect the messages to find whether there is an Exec type and process it.
// Messages are prefixed by their length, and each field is likewise
// length-prefixed.
uint8_t type = MutationRef::MAX_ATOMIC_OP;
StringRef param2;
ArenaReader rd(req->arena, req->messages, Unversioned());
int32_t messageLength, rawLength;
uint16_t tagCount;
uint32_t sub;
while (!rd.empty()) {
Tag tmpTag;
bool hasTxsTag = false;
rd.checkpoint();
rd >> messageLength >> sub >> tagCount;
for (int i = 0; i < tagCount; i++) {
rd >> tmpTag;
if (tmpTag == txsTag) {
hasTxsTag = true;
}
execTags->push_back(execTags->arena(), tmpTag);
}
if (!hasTxsTag) {
rd >> type;
if (type == MutationRef::Exec) {
break;
}
}
rawLength = messageLength + sizeof(messageLength);
rd.rewind();
rd.readBytes(rawLength);
}
int32_t len = 0;
if (type == MutationRef::Exec) {
// get param1
rd >> len;
*execCmd = StringRef((uint8_t const*)rd.readBytes(len), len);
// get param2
rd >> len;
param2 = StringRef((uint8_t const*)rd.readBytes(len), len);
TraceEvent(SevDebug, "TLogExecCommandType", self->dbgid)
.detail("Value", execCmd->toString())
.detail("Version", req->version);
execArg->setCmdValueString(param2);
execArg->dbgPrint();
StringRef uidStr = execArg->getBinaryArgValue(LiteralStringRef("uid"));
if (!execCmd->startsWith(LiteralStringRef("\xff"))) {
*execVersion = req->version;
}
if (*execCmd == execSnap) {
// validation check specific to snap request
std::string reason;
if (!self->ignorePopRequest) {
*execVersion = invalidVersion;
reason = "SnapFailIgnorePopNotSet";
} else if (uidStr.toString() != self->ignorePopUid) {
*execVersion = invalidVersion;
reason = "SnapFailedDisableTLogUidMismatch";
}
if (*execVersion == invalidVersion) {
TraceEvent(SevWarn, "TLogSnapFailed")
.detail("IgnorePopUid", self->ignorePopUid)
.detail("IgnorePopRequest", self->ignorePopRequest)
.detail("Reason", reason)
.detail("Version", req->version);
TraceEvent("ExecCmdSnapCreate")
.detail("Uid", uidStr.toString())
.detail("Status", -1)
.detail("Tag", logData->allTags.begin()->toString())
.detail("Role", "TLog")
.detail("Version", req->version);
if (g_network->isSimulated()) {
// write SnapFailedTLog.$UID
Standalone<StringRef> keyStr = snapTestFailStatus.withSuffix(uidStr);
Standalone<StringRef> valStr = LiteralStringRef("Success");
TraceEvent(SevDebug, "TLogKeyStr").detail("Value", keyStr);
snapFailKeySetters->push_back(runRYWTransaction(self->cx, [=](Reference<ReadYourWritesTransaction> tr) -> Future<Void>
{ tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->set(keyStr, valStr); return Void(); }));
}
}
}
if (*execCmd == execDisableTLogPop) {
self->ignorePopRequest = true;
if (self->ignorePopUid != "") {
TraceEvent(SevWarn, "TLogPopDisableonDisable")
.detail("IgnorePopUid", self->ignorePopUid)
.detail("UidStr", uidStr.toString())
.detail("Version", req->version);
}
self->ignorePopUid = uidStr.toString();
self->ignorePopDeadline = g_network->now() + SERVER_KNOBS->TLOG_IGNORE_POP_AUTO_ENABLE_DELAY;
TraceEvent("TLogExecCmdPopDisable")
.detail("ExecCmd", execCmd->toString())
.detail("UidStr", uidStr.toString())
.detail("IgnorePopUid", self->ignorePopUid)
.detail("IgnporePopRequest", self->ignorePopRequest)
.detail("IgnporePopDeadline", self->ignorePopDeadline)
.detail("Version", req->version);
}
if (*execCmd == execEnableTLogPop) {
if (self->ignorePopUid != uidStr.toString()) {
TraceEvent(SevWarn, "TLogPopDisableEnableUidMismatch")
.detail("IgnorePopUid", self->ignorePopUid)
.detail("UidStr", uidStr.toString())
.detail("Version", req->version);
}
TraceEvent("EnableTLogPlayAllIgnoredPops2");
// use toBePopped and issue all the pops
std::map<Tag, Version>::iterator it;
self->ignorePopRequest = false;
self->ignorePopDeadline = 0.0;
self->ignorePopUid = "";
for (it = self->toBePopped.begin(); it != self->toBePopped.end(); it++) {
TraceEvent("PlayIgnoredPop")
.detail("Tag", it->first.toString())
.detail("Version", it->second);
ignoredPops->push_back(tLogPopCore(self, it->first, it->second, logData));
}
self->toBePopped.clear();
TraceEvent("TLogExecCmdPopEnable")
.detail("ExecCmd", execCmd->toString())
.detail("UidStr", uidStr.toString())
.detail("IgnorePopUid", self->ignorePopUid)
.detail("IgnporePopRequest", self->ignorePopRequest)
.detail("IgnporePopDeadline", self->ignorePopDeadline)
.detail("Version", req->version);
}
}
}
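The scan loop in execProcessingHelper assumes this framing for each message: an int32 length (counting everything after the length field itself), a uint32 subsequence, a uint16 tag count, that many tags, then the mutation type and payload. A hedged sketch of skipping one message under that assumed framing:

#include <cstddef>
#include <cstdint>
#include <cstring>

// The length prefix counts the bytes that follow it, so the full wire size of
// a message is sizeof(int32_t) + messageLength, mirroring the rewind/readBytes
// arithmetic in execProcessingHelper.
size_t messageWireSize(const uint8_t* buf) {
	int32_t messageLength;
	std::memcpy(&messageLength, buf, sizeof(messageLength));
	return sizeof(messageLength) + static_cast<size_t>(messageLength);
}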
ACTOR Future<Void> tLogSnapHelper(TLogData* self,
Reference<LogData> logData,
ExecCmdValueString* execArg,
Version version,
Version execVersion,
StringRef execCmd,
Standalone<VectorRef<Tag>> execTags)
{
state int err = 0;
state StringRef uidStr = execArg->getBinaryArgValue(LiteralStringRef("uid"));
state UID execUID = UID::fromString(uidStr.toString());
state bool otherRoleExeced = false;
// TLog is special: we need to snap at the execVersion.
// Storage servers on the same node should not initiate a snap before the
// TLog does, since that would make the snap version at the TLog unpredictable.
ASSERT(!isExecOpInProgress(execUID));
if (!otherRoleExeced) {
setExecOpInProgress(execUID);
int tmpErr = wait(execHelper(execArg, self->dataFolder, "role=tlog"));
err = tmpErr;
clearExecOpInProgress(execUID);
}
TraceEvent("TLogCommitExecTraceTLog")
.detail("UidStr", uidStr.toString())
.detail("Status", err)
.detail("Tag", logData->allTags.begin()->toString())
.detail("OldTagSize", logData->allTags.size())
.detail("Role", "TLog");
// print the detailed status message
for (int i = 0; i < execTags.size(); i++) {
Version poppedTagVersion = -1;
auto tagv = logData->getTagData(execTags[i]);
if (!tagv) {
continue;
}
poppedTagVersion = tagv->popped;
TraceEvent te = TraceEvent(SevDebug, "TLogExecTraceDetailed");
te.detail("Uid", uidStr.toString());
te.detail("Status", err);
te.detail("Role", "TLog");
te.detail("ExecCmd", execCmd.toString());
te.detail("Param2", execArg->getCmdValueString().toString());
te.detail("Tag", tagv->tag.toString());
te.detail("Version", version);
te.detail("PoppedTagVersion", poppedTagVersion);
te.detail("PersistentDataVersion", logData->persistentDataVersion);
te.detail("PersistentDatadurableVersion", logData->persistentDataDurableVersion);
te.detail("QueueCommittedVersion", logData->queueCommittedVersion.get());
te.detail("IgnorePopUid", self->ignorePopUid);
}
return Void();
}
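The validation in execProcessingHelper and the in-progress bookkeeping in tLogSnapHelper reduce to two rules: a snap is only valid while pops are disabled by the same UID that is requesting the snap, and at most one exec op may run per UID at a time. A hedged restatement in plain C++ (inside an actor, anything held across a wait point must be a state variable, so a destructor-based guard like this would not carry over directly); the names snapAllowed and ExecRegistry are hypothetical:

#include <set>
#include <string>

// Rule 1: snap is allowed only when pops were disabled first, by the same UID.
bool snapAllowed(bool popsDisabled, const std::string& disableUid, const std::string& snapUid) {
	return popsDisabled && disableUid == snapUid;
}

// Rule 2: at most one exec op in flight per UID.
struct ExecRegistry {
	std::set<std::string> inProgress;
	bool tryBegin(const std::string& uid) { return inProgress.insert(uid).second; }
	void end(const std::string& uid) { inProgress.erase(uid); }
};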
ACTOR Future<Void> tLogCommit(
TLogData* self,
TLogCommitRequest req,
@ -1252,22 +1512,59 @@ ACTOR Future<Void> tLogCommit(
wait( delayJittered(.005, TaskTLogCommit) );
}
// While an exec op is being committed, no new transactions will be admitted.
// This property is useful for snapshot-style operations that want to take a
// snapshot of the disk image at a particular version (no data from future
// versions should be included).
// NOTE: execOpCommitInProgress will not be set for exec commands which
// start with \xff
state bool execOpLockTaken = false;
if (logData->execOpCommitInProgress) {
wait(logData->execOpLock.take());
execOpLockTaken = true;
}
if(logData->stopped) {
req.reply.sendError( tlog_stopped() );
return Void();
}
if (logData->version.get() == req.prevVersion) { // Not a duplicate (check relies on no waiting between here and self->version.set() below!)
state Version execVersion = invalidVersion;
state ExecCmdValueString execArg;
state TLogQueueEntryRef qe;
state StringRef execCmd;
state Standalone<VectorRef<Tag>> execTags;
state vector<Future<Void>> snapFailKeySetters;
state vector<Future<Void>> playIgnoredPops;
if (logData->version.get() == req.prevVersion) { // Not a duplicate (check relies on the critical section between here and self->version.set() below!)
if(req.debugID.present())
g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.Before");
if (req.hasExecOp) {
execProcessingHelper(self, logData, &req, &execTags, &execArg, &execCmd, &execVersion, &snapFailKeySetters, &playIgnoredPops);
if (execVersion != invalidVersion) {
TraceEvent(SevDebug, "SettingExecOpCommit")
.detail("LogId", logData->logId)
.detail("ExecVersion", execVersion)
.detail("Version", req.version);
logData->execOpCommitInProgress = true;
if (!execOpLockTaken) {
wait(logData->execOpLock.take());
execOpLockTaken = true;
} else {
ASSERT(logData->execOpLock.available() == 0);
}
ASSERT(execOpLockTaken);
}
}
//TraceEvent("TLogCommit", logData->logId).detail("Version", req.version);
commitMessages(self, logData, req.version, req.arena, req.messages);
logData->knownCommittedVersion = std::max(logData->knownCommittedVersion, req.knownCommittedVersion);
// Log the changes to the persistent queue, to be committed by commitQueue()
TLogQueueEntryRef qe;
qe.version = req.version;
qe.knownCommittedVersion = logData->knownCommittedVersion;
qe.messages = req.messages;
@ -1281,6 +1578,7 @@ ACTOR Future<Void> tLogCommit(
// Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors
logData->version.set( req.version );
wait(waitForAll(playIgnoredPops));
if(req.debugID.present())
g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.AfterTLogCommit");
@ -1289,6 +1587,19 @@ ACTOR Future<Void> tLogCommit(
state Future<Void> stopped = logData->stopCommit.onTrigger();
wait( timeoutWarning( logData->queueCommittedVersion.whenAtLeast( req.version ) || stopped, 0.1, warningCollectorInput ) );
if ((execVersion != invalidVersion) && execVersion <= logData->queueCommittedVersion.get()) {
wait(tLogSnapHelper(self, logData, &execArg, qe.version, execVersion, execCmd, execTags));
}
if (execVersion != invalidVersion && logData->execOpCommitInProgress) {
ASSERT(execOpLockTaken);
logData->execOpCommitInProgress = false;
}
if (execOpLockTaken) {
logData->execOpLock.release();
execOpLockTaken = false;
}
execVersion = invalidVersion;
if(stopped.isReady()) {
ASSERT(logData->stopped);
req.reply.sendError( tlog_stopped() );
@ -1299,6 +1610,13 @@ ACTOR Future<Void> tLogCommit(
g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.After");
req.reply.send( logData->durableKnownCommittedVersion );
if (g_network->isSimulated()) {
if (snapFailKeySetters.size() > 0) {
TraceEvent(SevDebug, "SettingSnapFailKey");
wait(waitForAll(snapFailKeySetters));
TraceEvent(SevDebug, "SettingSnapFailKeyDone");
}
}
return Void();
}
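The execOpLock/execOpCommitInProgress pairing above keeps new commits from being admitted while a snapshot-carrying commit is in flight. A hedged synchronous analog (FlowLock is asynchronous and single-threaded; std::mutex is used here purely as a stand-in, and ExecCommitGate is a hypothetical name):

#include <mutex>

struct ExecCommitGate {
	std::mutex permit;            // stands in for a single-permit FlowLock
	bool execInProgress = false;

	// A commit carrying an exec op takes the permit and sets the flag; later
	// commits block on the permit until the exec commit releases it.
	void beginExecCommit() { permit.lock(); execInProgress = true; }
	void endExecCommit()   { execInProgress = false; permit.unlock(); }
};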
@ -1473,7 +1791,7 @@ ACTOR Future<Void> serveTLogInterface( TLogData* self, TLogInterface tli, Refere
logData->addActor.send( tLogPeekMessages( self, req, logData ) );
}
when( TLogPopRequest req = waitNext( tli.popMessages.getFuture() ) ) {
logData->addActor.send( tLogPop( self, req, logData ) );
logData->addActor.send(tLogPop(self, req, logData));
}
when( TLogCommitRequest req = waitNext( tli.commit.getFuture() ) ) {
//TraceEvent("TLogCommitReq", logData->logId).detail("Ver", req.version).detail("PrevVer", req.prevVersion).detail("LogVer", logData->version.get());
@ -1507,6 +1825,7 @@ ACTOR Future<Void> serveTLogInterface( TLogData* self, TLogInterface tli, Refere
void removeLog( TLogData* self, Reference<LogData> logData ) {
TraceEvent("TLogRemoved", logData->logId).detail("Input", logData->bytesInput.getValue()).detail("Durable", logData->bytesDurable.getValue());
logData->stopped = true;
unregisterTLog(logData->logId);
if(!logData->recoveryComplete.isSet()) {
logData->recoveryComplete.sendError(end_of_stream());
}
@ -1993,6 +2312,7 @@ ACTOR Future<Void> tLogStart( TLogData* self, InitializeTLogRequest req, Localit
self->queueOrder.push_back(recruited.id());
TraceEvent("TLogStart", logData->logId);
registerTLog(logData->logId);
state Future<Void> updater;
state bool pulledRecoveryVersions = false;
try {
@ -2098,8 +2418,8 @@ ACTOR Future<Void> tLogStart( TLogData* self, InitializeTLogRequest req, Localit
}
// New tLog (if !recoverFrom.size()) or restore from network
ACTOR Future<Void> tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQueue, Reference<AsyncVar<ServerDBInfo>> db, LocalityData locality, PromiseStream<InitializeTLogRequest> tlogRequests, UID tlogId, bool restoreFromDisk, Promise<Void> oldLog, Promise<Void> recovered, Reference<AsyncVar<bool>> degraded) {
state TLogData self( tlogId, persistentData, persistentQueue, db, degraded );
ACTOR Future<Void> tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQueue, Reference<AsyncVar<ServerDBInfo>> db, LocalityData locality, PromiseStream<InitializeTLogRequest> tlogRequests, UID tlogId, bool restoreFromDisk, Promise<Void> oldLog, Promise<Void> recovered, std::string folder, Reference<AsyncVar<bool>> degraded) {
state TLogData self( tlogId, persistentData, persistentQueue, db, degraded, folder );
state Future<Void> error = actorCollection( self.sharedActors.getFuture() );
TraceEvent("SharedTlog", tlogId);

View File

@ -196,7 +196,8 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<ClusterConnec
LocalityData localities, ProcessClass processClass,
std::string* dataFolder, std::string* coordFolder,
std::string baseFolder, ClusterConnectionString connStr,
bool useSeedFile, bool runBackupAgents) {
bool useSeedFile, bool runBackupAgents,
std::string whitelistBinPaths) {
state ISimulator::ProcessInfo *simProcess = g_simulator.getCurrentProcess();
state UID randomId = nondeterministicRandom()->randomUniqueID();
state int cycles = 0;
@ -250,7 +251,7 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<ClusterConnec
NetworkAddress n(ip, listenPort, true, sslEnabled && listenPort == port);
futures.push_back(FlowTransport::transport().bind( n, n ));
}
Future<Void> fd = fdbd( connFile, localities, processClass, *dataFolder, *coordFolder, 500e6, "", "", -1);
Future<Void> fd = fdbd( connFile, localities, processClass, *dataFolder, *coordFolder, 500e6, "", "", -1, whitelistBinPaths);
Future<Void> backup = runBackupAgents ? runBackup(connFile) : Future<Void>(Never());
Future<Void> dr = runBackupAgents ? runDr(connFile) : Future<Void>(Never());
@ -359,7 +360,7 @@ std::map< Optional<Standalone<StringRef>>, std::vector< std::vector< std::string
ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr, std::vector<IPAddress> ips, bool sslEnabled,
Reference<TLSOptions> tlsOptions, LocalityData localities,
ProcessClass processClass, std::string baseFolder, bool restarting,
bool useSeedFile, bool runBackupAgents, bool sslOnly) {
bool useSeedFile, bool runBackupAgents, bool sslOnly, std::string whitelistBinPaths) {
state int bootCount = 0;
state std::vector<std::string> myFolders;
state std::vector<std::string> coordFolders;
@ -401,7 +402,7 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr, std::vector
std::string path = joinPath(myFolders[i], "fdb.cluster");
Reference<ClusterConnectionFile> clusterFile(useSeedFile ? new ClusterConnectionFile(path, connStr.toString()) : new ClusterConnectionFile(path));
const int listenPort = i*listenPerProcess + 1;
processes.push_back(simulatedFDBDRebooter(clusterFile, ips[i], sslEnabled, tlsOptions, listenPort, listenPerProcess, localities, processClass, &myFolders[i], &coordFolders[i], baseFolder, connStr, useSeedFile, runBackupAgents));
processes.push_back(simulatedFDBDRebooter(clusterFile, ips[i], sslEnabled, tlsOptions, listenPort, listenPerProcess, localities, processClass, &myFolders[i], &coordFolders[i], baseFolder, connStr, useSeedFile, runBackupAgents, whitelistBinPaths));
TraceEvent("SimulatedMachineProcess", randomId).detail("Address", NetworkAddress(ips[i], listenPort, true, false)).detail("ZoneId", localities.zoneId()).detail("DataHall", localities.dataHallId()).detail("Folder", myFolders[i]);
}
@ -606,7 +607,7 @@ IPAddress makeIPAddressForSim(bool isIPv6, std::array<int, 4> parts) {
ACTOR Future<Void> restartSimulatedSystem(vector<Future<Void>>* systemActors, std::string baseFolder, int* pTesterCount,
Optional<ClusterConnectionString>* pConnString,
Standalone<StringRef>* pStartingConfiguration,
Reference<TLSOptions> tlsOptions, int extraDB) {
Reference<TLSOptions> tlsOptions, int extraDB, std::string whitelistBinPaths) {
CSimpleIni ini;
ini.SetUnicode();
ini.LoadFile(joinPath(baseFolder, "restartInfo.ini").c_str());
@ -704,7 +705,7 @@ ACTOR Future<Void> restartSimulatedSystem(vector<Future<Void>>* systemActors, st
systemActors->push_back(reportErrors(
simulatedMachine(conn, ipAddrs, usingSSL, tlsOptions, localities, processClass, baseFolder, true,
i == useSeedForMachine, enableExtraDB,
usingSSL && (listenersPerProcess == 1 || processClass == ProcessClass::TesterClass)),
usingSSL && (listenersPerProcess == 1 || processClass == ProcessClass::TesterClass), whitelistBinPaths),
processClass == ProcessClass::TesterClass ? "SimulatedTesterMachine" : "SimulatedMachine"));
}
@ -1086,7 +1087,8 @@ void SimulationConfig::generateNormalConfig(int minimumReplication, int minimumR
void setupSimulatedSystem(vector<Future<Void>>* systemActors, std::string baseFolder, int* pTesterCount,
Optional<ClusterConnectionString>* pConnString, Standalone<StringRef>* pStartingConfiguration,
int extraDB, int minimumReplication, int minimumRegions, Reference<TLSOptions> tlsOptions) {
int extraDB, int minimumReplication, int minimumRegions, Reference<TLSOptions> tlsOptions,
std::string whitelistBinPaths) {
// SOMEDAY: this does not test multi-interface configurations
SimulationConfig simconfig(extraDB, minimumReplication, minimumRegions);
StatusObject startingConfigJSON = simconfig.db.toJSON(true);
@ -1282,7 +1284,7 @@ void setupSimulatedSystem(vector<Future<Void>>* systemActors, std::string baseFo
LocalityData localities(Optional<Standalone<StringRef>>(), zoneId, machineId, dcUID);
localities.set(LiteralStringRef("data_hall"), dcUID);
systemActors->push_back(reportErrors(simulatedMachine(conn, ips, sslEnabled, tlsOptions,
localities, processClass, baseFolder, false, machine == useSeedForMachine, true, sslOnly), "SimulatedMachine"));
localities, processClass, baseFolder, false, machine == useSeedForMachine, true, sslOnly, whitelistBinPaths ), "SimulatedMachine"));
if (extraDB && g_simulator.extraDB->toString() != conn.toString()) {
std::vector<IPAddress> extraIps;
@ -1296,7 +1298,7 @@ void setupSimulatedSystem(vector<Future<Void>>* systemActors, std::string baseFo
localities.set(LiteralStringRef("data_hall"), dcUID);
systemActors->push_back(reportErrors(simulatedMachine(*g_simulator.extraDB, extraIps, sslEnabled, tlsOptions,
localities,
processClass, baseFolder, false, machine == useSeedForMachine, false, sslOnly), "SimulatedMachine"));
processClass, baseFolder, false, machine == useSeedForMachine, false, sslOnly, whitelistBinPaths ), "SimulatedMachine"));
}
assignedMachines++;
@ -1324,7 +1326,7 @@ void setupSimulatedSystem(vector<Future<Void>>* systemActors, std::string baseFo
systemActors->push_back( reportErrors( simulatedMachine(
conn, ips, sslEnabled, tlsOptions,
localities, ProcessClass(ProcessClass::TesterClass, ProcessClass::CommandLineSource),
baseFolder, false, i == useSeedForMachine, false, sslEnabled),
baseFolder, false, i == useSeedForMachine, false, sslEnabled, whitelistBinPaths ),
"SimulatedTesterMachine") );
}
*pStartingConfiguration = startingConfigString;
@ -1380,7 +1382,7 @@ void checkExtraDB(const char *testFile, int &extraDB, int &minimumReplication, i
ifs.close();
}
ACTOR void setupAndRun(std::string dataFolder, const char *testFile, bool rebooting, Reference<TLSOptions> tlsOptions) {
ACTOR void setupAndRun(std::string dataFolder, const char *testFile, bool rebooting, bool restoring, std::string whitelistBinPaths, Reference<TLSOptions> tlsOptions) {
state vector<Future<Void>> systemActors;
state Optional<ClusterConnectionString> connFile;
state Standalone<StringRef> startingConfiguration;
@ -1410,13 +1412,16 @@ ACTOR void setupAndRun(std::string dataFolder, const char *testFile, bool reboot
try {
//systemActors.push_back( startSystemMonitor(dataFolder) );
if (rebooting) {
wait(timeoutError(restartSimulatedSystem(&systemActors, dataFolder, &testerCount, &connFile,
&startingConfiguration, tlsOptions, extraDB),
100.0));
} else {
wait( timeoutError( restartSimulatedSystem( &systemActors, dataFolder, &testerCount, &connFile, &startingConfiguration, tlsOptions, extraDB, whitelistBinPaths), 100.0 ) );
// FIXME: snapshot restore does not support multi-region restore, hence restore it as single region always
if (restoring) {
startingConfiguration = LiteralStringRef("usable_regions=1");
}
}
else {
g_expect_full_pointermap = 1;
setupSimulatedSystem(&systemActors, dataFolder, &testerCount, &connFile, &startingConfiguration, extraDB,
minimumReplication, minimumRegions, tlsOptions);
minimumReplication, minimumRegions, tlsOptions, whitelistBinPaths);
wait( delay(1.0) ); // FIXME: WHY!!! //wait for machines to boot
}
std::string clusterFileDir = joinPath( dataFolder, deterministicRandom()->randomUniqueID().toString() );

View File

@ -24,6 +24,6 @@
#define FDBSERVER_SIMULATEDCLUSTER_H
#pragma once
void setupAndRun(std::string const& dataFolder, const char* const& testFile, bool const& rebooting, Reference<TLSOptions> const& useSSL);
void setupAndRun(std::string const& dataFolder, const char* const& testFile, bool const& rebooting, bool const& restoring, std::string const& whitelistBinPath, Reference<TLSOptions> const& useSSL);
#endif

View File

@ -218,13 +218,14 @@ struct TLogCommitRequest {
ReplyPromise<Version> reply;
Optional<UID> debugID;
bool hasExecOp;
TLogCommitRequest() {}
TLogCommitRequest( const Arena& a, Version prevVersion, Version version, Version knownCommittedVersion, Version minKnownCommittedVersion, StringRef messages, Optional<UID> debugID )
: arena(a), prevVersion(prevVersion), version(version), knownCommittedVersion(knownCommittedVersion), minKnownCommittedVersion(minKnownCommittedVersion), messages(messages), debugID(debugID) {}
TLogCommitRequest( const Arena& a, Version prevVersion, Version version, Version knownCommittedVersion, Version minKnownCommittedVersion, StringRef messages, bool hasExecOp, Optional<UID> debugID )
: arena(a), prevVersion(prevVersion), version(version), knownCommittedVersion(knownCommittedVersion), minKnownCommittedVersion(minKnownCommittedVersion), messages(messages), debugID(debugID), hasExecOp(hasExecOp){}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, prevVersion, version, knownCommittedVersion, minKnownCommittedVersion, messages, reply, arena, debugID);
serializer(ar, prevVersion, version, knownCommittedVersion, minKnownCommittedVersion, messages, reply, arena, debugID, hasExecOp);
}
};
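Appending hasExecOp at the end of the serializer call keeps the existing field order intact, which is the usual way to extend a flat serialized struct without disturbing readers of the old prefix. A hedged illustration of the pattern with a toy byte-appending serializer (ToyWriter and ToyCommitRequest are invented for this sketch):

#include <cstdint>
#include <vector>

// Toy serializer: fields are written in call order, so a newly added field
// must go last to keep the layout of the pre-existing prefix stable.
struct ToyWriter {
	std::vector<uint8_t> bytes;
	template <class T> void raw(const T& v) {
		const uint8_t* p = reinterpret_cast<const uint8_t*>(&v);
		bytes.insert(bytes.end(), p, p + sizeof(T));
	}
};

struct ToyCommitRequest {
	int64_t prevVersion = 0, version = 0;
	bool hasExecOp = false;  // new field, appended last

	void serialize(ToyWriter& ar) {
		ar.raw(prevVersion);
		ar.raw(version);
		ar.raw(hasExecOp);  // appended after all pre-existing fields
	}
};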

View File

@ -24,6 +24,7 @@
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/Notified.h"
#include "fdbclient/KeyRangeMap.h"
#include "fdbclient/RunTransaction.actor.h"
#include "fdbclient/SystemData.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/TLogInterface.h"
@ -38,6 +39,7 @@
#include "fdbserver/LogSystem.h"
#include "fdbserver/WaitFailure.h"
#include "fdbserver/RecoveryState.h"
#include "fdbserver/FDBExecHelper.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
using std::pair;
@ -301,6 +303,7 @@ struct TLogData : NonCopyable {
AsyncVar<bool> largeDiskQueueCommitBytes; //becomes true when diskQueueCommitBytes is greater than MAX_QUEUE_COMMIT_BYTES
Reference<AsyncVar<ServerDBInfo>> dbInfo;
Database cx;
NotifiedVersion queueCommitEnd;
Version queueCommitBegin;
@ -325,16 +328,28 @@ struct TLogData : NonCopyable {
FlowLock concurrentLogRouterReads;
FlowLock persistentDataCommitLock;
bool ignorePopRequest; // ignore pop request from storage servers
double ignorePopDeadline; // time until which the ignorePopRequest will be honored
std::string ignorePopUid; // set by the caller that sets ignorePopRequest; used to
                          // validate ownership so that only the matching caller
                          // can unset it
std::string dataFolder; // folder where data is stored
std::map<Tag, Version> toBePopped; // map of Tag->Version for all the pops
// that came when ignorePopRequest was set
Reference<AsyncVar<bool>> degraded;
TLogData(UID dbgid, IKeyValueStore* persistentData, IDiskQueue * persistentQueue, Reference<AsyncVar<ServerDBInfo>> dbInfo, Reference<AsyncVar<bool>> degraded)
TLogData(UID dbgid, IKeyValueStore* persistentData, IDiskQueue * persistentQueue, Reference<AsyncVar<ServerDBInfo>> dbInfo, Reference<AsyncVar<bool>> degraded, std::string folder)
: dbgid(dbgid), instanceID(deterministicRandom()->randomUniqueID().first()),
persistentData(persistentData), rawPersistentQueue(persistentQueue), persistentQueue(new TLogQueue(persistentQueue, dbgid)),
dbInfo(dbInfo), degraded(degraded), queueCommitBegin(0), queueCommitEnd(0),
diskQueueCommitBytes(0), largeDiskQueueCommitBytes(false), bytesInput(0), bytesDurable(0), overheadBytesInput(0), overheadBytesDurable(0),
peekMemoryLimiter(SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_PEEK_MEMORY_BYTES),
concurrentLogRouterReads(SERVER_KNOBS->CONCURRENT_LOG_ROUTER_READS)
concurrentLogRouterReads(SERVER_KNOBS->CONCURRENT_LOG_ROUTER_READS),
ignorePopRequest(false), ignorePopDeadline(), ignorePopUid(), dataFolder(folder), toBePopped()
{
cx = openDBOnServer(dbInfo, TaskDefaultEndpoint, true, true);
}
};
@ -474,13 +489,15 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
UID recruitmentID;
std::set<Tag> allTags;
Future<Void> terminated;
FlowLock execOpLock;
bool execOpCommitInProgress;
explicit LogData(TLogData* tLogData, TLogInterface interf, Tag remoteTag, bool isPrimary, int logRouterTags, UID recruitmentID, uint64_t protocolVersion, std::vector<Tag> tags) : tLogData(tLogData), knownCommittedVersion(0), logId(interf.id()),
cc("TLog", interf.id().toString()), bytesInput("BytesInput", cc), bytesDurable("BytesDurable", cc), remoteTag(remoteTag), isPrimary(isPrimary), logRouterTags(logRouterTags), recruitmentID(recruitmentID), protocolVersion(protocolVersion),
logSystem(new AsyncVar<Reference<ILogSystem>>()), logRouterPoppedVersion(0), durableKnownCommittedVersion(0), minKnownCommittedVersion(0), queuePoppedVersion(0), allTags(tags.begin(), tags.end()), terminated(tLogData->terminated.getFuture()),
// These are initialized differently on init() or recovery
recoveryCount(), stopped(false), initialized(false), queueCommittingVersion(0), newPersistentDataVersion(invalidVersion), unrecoveredBefore(1), recoveredAt(1), unpoppedRecoveredTags(0),
logRouterPopToVersion(0), locality(tagLocalityInvalid)
logRouterPopToVersion(0), locality(tagLocalityInvalid), execOpCommitInProgress(false)
{
startRole(Role::TRANSACTION_LOG, interf.id(), UID());
@ -591,6 +608,7 @@ ACTOR Future<Void> tLogLock( TLogData* self, ReplyPromise< TLogLockResult > repl
TEST( !logData->stopped );
TraceEvent("TLogStop", logData->logId).detail("Ver", stopVersion).detail("IsStopped", logData->stopped).detail("QueueCommitted", logData->queueCommittedVersion.get());
unregisterTLog(logData->logId);
logData->stopped = true;
if(!logData->recoveryComplete.isSet()) {
@ -1136,14 +1154,28 @@ std::deque<std::pair<Version, LengthPrefixedStringRef>> & getVersionMessages( Re
return tagData->versionMessages;
};
ACTOR Future<Void> tLogPop( TLogData* self, TLogPopRequest req, Reference<LogData> logData ) {
state Version upTo = req.to;
int8_t tagLocality = req.tag.locality;
ACTOR Future<Void> tLogPopCore( TLogData* self, Tag inputTag, Version to, Reference<LogData> logData ) {
if (self->ignorePopRequest && inputTag != txsTag) {
TraceEvent("IgnoringPopRequest").detail("IgnorePopDeadline", self->ignorePopDeadline);
if (self->toBePopped.find(inputTag) == self->toBePopped.end()
|| to > self->toBePopped[inputTag]) {
self->toBePopped[inputTag] = to;
}
// add the pop to the toBePopped map
TraceEvent(SevDebug, "IgnoringPopRequest")
.detail("IgnorePopDeadline", self->ignorePopDeadline)
.detail("Tag", inputTag.toString())
.detail("Version", to);
return Void();
}
state Version upTo = to;
int8_t tagLocality = inputTag.locality;
if (logData->logSystem->get().isValid() && logData->logSystem->get()->isPseudoLocality(tagLocality)) {
upTo = logData->logSystem->get()->popPseudoLocalityTag(tagLocality, req.to);
upTo = logData->logSystem->get()->popPseudoLocalityTag(tagLocality, to);
tagLocality = tagLocalityLogRouter;
}
state Tag tag(tagLocality, req.tag.id);
state Tag tag(tagLocality, inputTag.id);
auto tagData = logData->getTagData(tag);
if (!tagData) {
tagData = logData->createTagData(tag, upTo, true, true, false);
@ -1165,7 +1197,34 @@ ACTOR Future<Void> tLogPop( TLogData* self, TLogPopRequest req, Reference<LogDat
wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskTLogPop));
//TraceEvent("TLogPop", self->dbgid).detail("Tag", tag.toString()).detail("To", upTo);
}
return Void();
}
ACTOR Future<Void> tLogPop( TLogData* self, TLogPopRequest req, Reference<LogData> logData ) {
// timeout check for ignorePopRequest
if (self->ignorePopRequest && (g_network->now() > self->ignorePopDeadline)) {
TraceEvent("EnableTLogPlayAllIgnoredPops");
// use toBePopped and issue all the pops
state std::map<Tag, Version>::iterator it;
state vector<Future<Void>> ignoredPops;
self->ignorePopRequest = false;
self->ignorePopUid = "";
self->ignorePopDeadline = 0.0;
for (it = self->toBePopped.begin(); it != self->toBePopped.end(); it++) {
TraceEvent("PlayIgnoredPop")
.detail("Tag", it->first.toString())
.detail("Version", it->second);
ignoredPops.push_back(tLogPopCore(self, it->first, it->second, logData));
}
self->toBePopped.clear();
wait(waitForAll(ignoredPops));
TraceEvent("ResetIgnorePopRequest")
.detail("Now", g_network->now())
.detail("IgnorePopRequest", self->ignorePopRequest)
.detail("IgnorePopDeadline", self->ignorePopDeadline);
}
wait(tLogPopCore(self, req.tag, req.to, logData));
req.reply.send(Void());
return Void();
}
@ -1585,6 +1644,207 @@ ACTOR Future<Void> commitQueue( TLogData* self ) {
}
}
void execProcessingHelper(TLogData* self,
Reference<LogData> logData,
TLogCommitRequest* req,
Standalone<VectorRef<Tag>>* execTags,
ExecCmdValueString* execArg,
StringRef* execCmd,
Version* execVersion,
vector<Future<Void>>* snapFailKeySetters,
vector<Future<Void>>* ignoredPops)
{
// Inspect the messages to find whether there is an Exec type and process it.
// Messages are prefixed by their length, and each field is likewise
// length-prefixed.
uint8_t type = MutationRef::MAX_ATOMIC_OP;
StringRef param2;
ArenaReader rd(req->arena, req->messages, Unversioned());
int32_t messageLength, rawLength;
uint16_t tagCount;
uint32_t sub;
while (!rd.empty()) {
Tag tmpTag;
bool hasTxsTag = false;
rd.checkpoint();
rd >> messageLength >> sub >> tagCount;
for (int i = 0; i < tagCount; i++) {
rd >> tmpTag;
if (tmpTag == txsTag) {
hasTxsTag = true;
}
execTags->push_back(execTags->arena(), tmpTag);
}
if (!hasTxsTag) {
rd >> type;
if (type == MutationRef::Exec) {
break;
}
}
rawLength = messageLength + sizeof(messageLength);
rd.rewind();
rd.readBytes(rawLength);
}
int32_t len = 0;
if (type == MutationRef::Exec) {
// get param1
rd >> len;
*execCmd = StringRef((uint8_t const*)rd.readBytes(len), len);
// get param2
rd >> len;
param2 = StringRef((uint8_t const*)rd.readBytes(len), len);
TraceEvent(SevDebug, "TLogExecCommandType", self->dbgid)
.detail("Value", execCmd->toString())
.detail("Version", req->version);
execArg->setCmdValueString(param2);
execArg->dbgPrint();
StringRef uidStr = execArg->getBinaryArgValue(LiteralStringRef("uid"));
if (!execCmd->startsWith(LiteralStringRef("\xff"))) {
*execVersion = req->version;
}
if (*execCmd == execSnap) {
// validation check specific to snap request
std::string reason;
if (!self->ignorePopRequest) {
*execVersion = invalidVersion;
reason = "SnapFailIgnorePopNotSet";
} else if (uidStr.toString() != self->ignorePopUid) {
*execVersion = invalidVersion;
reason = "SnapFailedDisableTLogUidMismatch";
}
if (*execVersion == invalidVersion) {
TraceEvent(SevWarn, "TLogSnapFailed")
.detail("IgnorePopUid", self->ignorePopUid)
.detail("IgnorePopRequest", self->ignorePopRequest)
.detail("Reason", reason)
.detail("Version", req->version);
TraceEvent("ExecCmdSnapCreate")
.detail("Uid", uidStr.toString())
.detail("Status", -1)
.detail("Tag", logData->allTags.begin()->toString())
.detail("Role", "TLog")
.detail("Version", req->version);
if (g_network->isSimulated()) {
// write SnapFailedTLog.$UID
Standalone<StringRef> keyStr = snapTestFailStatus.withSuffix(uidStr);
StringRef valStr = LiteralStringRef("Success");
TraceEvent(SevDebug, "TLogKeyStr").detail("Value", keyStr);
snapFailKeySetters->push_back(runRYWTransaction(self->cx, [=](Reference<ReadYourWritesTransaction> tr) -> Future<Void>
{ tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->set(keyStr, valStr); return Void(); }));
}
}
}
if (*execCmd == execDisableTLogPop) {
self->ignorePopRequest = true;
if (self->ignorePopUid != "") {
TraceEvent(SevWarn, "TLogPopDisableonDisable")
.detail("IgnorePopUid", self->ignorePopUid)
.detail("UidStr", uidStr.toString())
.detail("Version", req->version);
}
self->ignorePopUid = uidStr.toString();
self->ignorePopDeadline = g_network->now() + SERVER_KNOBS->TLOG_IGNORE_POP_AUTO_ENABLE_DELAY;
TraceEvent("TLogExecCmdPopDisable")
.detail("ExecCmd", execCmd->toString())
.detail("UidStr", uidStr.toString())
.detail("IgnorePopUid", self->ignorePopUid)
.detail("IgnporePopRequest", self->ignorePopRequest)
.detail("IgnporePopDeadline", self->ignorePopDeadline)
.detail("Version", req->version);
}
if (*execCmd == execEnableTLogPop) {
if (self->ignorePopUid != uidStr.toString()) {
TraceEvent(SevWarn, "TLogPopDisableEnableUidMismatch")
.detail("IgnorePopUid", self->ignorePopUid)
.detail("UidStr", uidStr.toString())
.detail("Version", req->version);
}
TraceEvent("EnableTLogPlayAllIgnoredPops2");
// use toBePopped and issue all the pops
std::map<Tag, Version>::iterator it;
self->ignorePopRequest = false;
self->ignorePopDeadline = 0.0;
self->ignorePopUid = "";
for (it = self->toBePopped.begin(); it != self->toBePopped.end(); it++) {
TraceEvent("PlayIgnoredPop")
.detail("Tag", it->first.toString())
.detail("Version", it->second);
ignoredPops->push_back(tLogPopCore(self, it->first, it->second, logData));
}
self->toBePopped.clear();
TraceEvent("TLogExecCmdPopEnable")
.detail("ExecCmd", execCmd->toString())
.detail("UidStr", uidStr.toString())
.detail("IgnorePopUid", self->ignorePopUid)
.detail("IgnporePopRequest", self->ignorePopRequest)
.detail("IgnporePopDeadline", self->ignorePopDeadline)
.detail("Version", req->version);
}
}
}
ACTOR Future<Void> tLogSnapHelper(TLogData* self,
Reference<LogData> logData,
ExecCmdValueString* execArg,
Version version,
Version execVersion,
StringRef execCmd,
Standalone<VectorRef<Tag>> execTags)
{
state int err = 0;
state StringRef uidStr = execArg->getBinaryArgValue(LiteralStringRef("uid"));
state UID execUID = UID::fromString(uidStr.toString());
state bool otherRoleExeced = false;
// TLog is special: we need to snap at the execVersion.
// Storage servers on the same node should not initiate a snap before the
// TLog does, since that would make the snap version at the TLog unpredictable.
ASSERT(!isExecOpInProgress(execUID));
if (!otherRoleExeced) {
setExecOpInProgress(execUID);
int tmpErr = wait(execHelper(execArg, self->dataFolder, "role=tlog"));
err = tmpErr;
clearExecOpInProgress(execUID);
}
TraceEvent("TLogCommitExecTraceTLog")
.detail("UidStr", uidStr.toString())
.detail("Status", err)
.detail("Tag", logData->allTags.begin()->toString())
.detail("OldTagSize", logData->allTags.size())
.detail("Role", "TLog");
// print the detailed status message
for (int i = 0; i < execTags.size(); i++) {
Version poppedTagVersion = -1;
auto tagv = logData->getTagData(execTags[i]);
if (!tagv) {
continue;
}
poppedTagVersion = tagv->popped;
TraceEvent te = TraceEvent(SevDebug, "TLogExecTraceDetailed");
te.detail("Uid", uidStr.toString());
te.detail("Status", err);
te.detail("Role", "TLog");
te.detail("ExecCmd", execCmd.toString());
te.detail("Param2", execArg->getCmdValueString().toString());
te.detail("Tag", tagv->tag.toString());
te.detail("Version", version);
te.detail("PoppedTagVersion", poppedTagVersion);
te.detail("PersistentDataVersion", logData->persistentDataVersion);
te.detail("PersistentDatadurableVersion", logData->persistentDataDurableVersion);
te.detail("QueueCommittedVersion", logData->queueCommittedVersion.get());
te.detail("IgnorePopUid", self->ignorePopUid);
}
return Void();
}
ACTOR Future<Void> tLogCommit(
TLogData* self,
TLogCommitRequest req,
@ -1619,22 +1879,60 @@ ACTOR Future<Void> tLogCommit(
wait( delayJittered(.005, TaskTLogCommit) );
}
// While an exec op is being committed, no new transactions will be admitted.
// This property is useful for snapshot-style operations that want to take a
// snapshot of the disk image at a particular version (no data from future
// versions should be included).
// NOTE: execOpCommitInProgress will not be set for exec commands which
// start with \xff
state bool execOpLockTaken = false;
if (logData->execOpCommitInProgress) {
wait(logData->execOpLock.take());
execOpLockTaken = true;
}
if(logData->stopped) {
req.reply.sendError( tlog_stopped() );
return Void();
}
if (logData->version.get() == req.prevVersion) { // Not a duplicate (check relies on no waiting between here and self->version.set() below!)
state Version execVersion = invalidVersion;
state ExecCmdValueString execArg;
state TLogQueueEntryRef qe;
state StringRef execCmd;
state Standalone<VectorRef<Tag>> execTags;
state vector<Future<Void>> playIgnoredPops;
state vector<Future<Void>> snapFailKeySetters;
if (logData->version.get() == req.prevVersion) { // Not a duplicate (check relies on the critical section between here and self->version.set() below!)
if(req.debugID.present())
g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.Before");
if (req.hasExecOp) {
execProcessingHelper(self, logData, &req, &execTags, &execArg, &execCmd, &execVersion, &snapFailKeySetters, &playIgnoredPops);
if (execVersion != invalidVersion) {
TraceEvent(SevDebug, "SettingExecOpCommit")
.detail("LogId", logData->logId)
.detail("ExecVersion", execVersion)
.detail("Version", req.version);
logData->execOpCommitInProgress = true;
if (!execOpLockTaken) {
wait(logData->execOpLock.take());
execOpLockTaken = true;
} else {
ASSERT(logData->execOpLock.available() == 0);
}
ASSERT(execOpLockTaken);
}
}
//TraceEvent("TLogCommit", logData->logId).detail("Version", req.version);
commitMessages(self, logData, req.version, req.arena, req.messages);
logData->knownCommittedVersion = std::max(logData->knownCommittedVersion, req.knownCommittedVersion);
// Log the changes to the persistent queue, to be committed by commitQueue()
TLogQueueEntryRef qe;
qe.version = req.version;
qe.knownCommittedVersion = logData->knownCommittedVersion;
qe.messages = req.messages;
@ -1648,6 +1946,7 @@ ACTOR Future<Void> tLogCommit(
// Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors
logData->version.set( req.version );
wait(waitForAll(playIgnoredPops));
if(req.debugID.present())
g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.AfterTLogCommit");
@ -1656,6 +1955,20 @@ ACTOR Future<Void> tLogCommit(
state Future<Void> stopped = logData->stopCommit.onTrigger();
wait( timeoutWarning( logData->queueCommittedVersion.whenAtLeast( req.version ) || stopped, 0.1, warningCollectorInput ) );
if ((execVersion != invalidVersion) &&
execVersion <= logData->queueCommittedVersion.get()) {
wait(tLogSnapHelper(self, logData, &execArg, qe.version, execVersion, execCmd, execTags));
}
if (execVersion != invalidVersion && logData->execOpCommitInProgress) {
ASSERT(execOpLockTaken);
logData->execOpCommitInProgress = false;
}
if (execOpLockTaken) {
logData->execOpLock.release();
execOpLockTaken = false;
}
execVersion = invalidVersion;
if(stopped.isReady()) {
ASSERT(logData->stopped);
req.reply.sendError( tlog_stopped() );
@ -1666,6 +1979,13 @@ ACTOR Future<Void> tLogCommit(
g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.After");
req.reply.send( logData->durableKnownCommittedVersion );
if (g_network->isSimulated()) {
if (snapFailKeySetters.size() > 0) {
TraceEvent(SevDebug, "SettingSnapFailKey");
wait(waitForAll(snapFailKeySetters));
TraceEvent(SevDebug, "SettingSnapFailKeyDone");
}
}
return Void();
}
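The trigger for tLogSnapHelper above fires only once the exec op's version is durable in the disk queue. A hedged one-function restatement of that condition; shouldSnapNow is a hypothetical name:

// Snap only when an exec version was recorded and the disk queue has
// committed at least that far (invalidVersion marks "no exec op pending").
bool shouldSnapNow(long execVersion, long queueCommittedVersion, long invalidVersion) {
	return execVersion != invalidVersion && execVersion <= queueCommittedVersion;
}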
@ -1841,7 +2161,7 @@ ACTOR Future<Void> serveTLogInterface( TLogData* self, TLogInterface tli, Refere
logData->addActor.send( tLogPeekMessages( self, req, logData ) );
}
when( TLogPopRequest req = waitNext( tli.popMessages.getFuture() ) ) {
logData->addActor.send( tLogPop( self, req, logData ) );
logData->addActor.send(tLogPop(self, req, logData));
}
when( TLogCommitRequest req = waitNext( tli.commit.getFuture() ) ) {
//TraceEvent("TLogCommitReq", logData->logId).detail("Ver", req.version).detail("PrevVer", req.prevVersion).detail("LogVer", logData->version.get());
@ -1875,6 +2195,7 @@ ACTOR Future<Void> serveTLogInterface( TLogData* self, TLogInterface tli, Refere
void removeLog( TLogData* self, Reference<LogData> logData ) {
TraceEvent("TLogRemoved", self->dbgid).detail("LogId", logData->logId).detail("Input", logData->bytesInput.getValue()).detail("Durable", logData->bytesDurable.getValue());
logData->stopped = true;
unregisterTLog(logData->logId);
if(!logData->recoveryComplete.isSet()) {
logData->recoveryComplete.sendError(end_of_stream());
}
@ -2388,6 +2709,8 @@ ACTOR Future<Void> tLogStart( TLogData* self, InitializeTLogRequest req, Localit
self->spillOrder.push_back(recruited.id());
TraceEvent("TLogStart", logData->logId);
registerTLog(logData->logId);
state Future<Void> updater;
state bool pulledRecoveryVersions = false;
try {
@ -2493,8 +2816,8 @@ ACTOR Future<Void> tLogStart( TLogData* self, InitializeTLogRequest req, Localit
}
// New tLog (if !recoverFrom.size()) or restore from network
ACTOR Future<Void> tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQueue, Reference<AsyncVar<ServerDBInfo>> db, LocalityData locality, PromiseStream<InitializeTLogRequest> tlogRequests, UID tlogId, bool restoreFromDisk, Promise<Void> oldLog, Promise<Void> recovered, Reference<AsyncVar<bool>> degraded ) {
state TLogData self( tlogId, persistentData, persistentQueue, db, degraded );
ACTOR Future<Void> tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQueue, Reference<AsyncVar<ServerDBInfo>> db, LocalityData locality, PromiseStream<InitializeTLogRequest> tlogRequests, UID tlogId, bool restoreFromDisk, Promise<Void> oldLog, Promise<Void> recovered, std::string folder, Reference<AsyncVar<bool>> degraded ) {
state TLogData self( tlogId, persistentData, persistentQueue, db, degraded, folder );
state Future<Void> error = actorCollection( self.sharedActors.getFuture() );
TraceEvent("SharedTlog", tlogId);

View File

@ -431,7 +431,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
vector<Future<Void>> tLogCommitResults;
for(int loc=0; loc< it->logServers.size(); loc++) {
Standalone<StringRef> msg = data.getMessages(location);
allReplies.push_back( it->logServers[loc]->get().interf().commit.getReply( TLogCommitRequest( msg.arena(), prevVersion, version, knownCommittedVersion, minKnownCommittedVersion, msg, debugID ), TaskTLogCommitReply ) );
allReplies.push_back( it->logServers[loc]->get().interf().commit.getReply( TLogCommitRequest( msg.arena(), prevVersion, version, knownCommittedVersion, minKnownCommittedVersion, msg, data.getHasExecOp(), debugID ), TaskTLogCommitReply ) );
Future<Void> commitSuccess = success(allReplies.back());
addActor.get().send(commitSuccess);
tLogCommitResults.push_back(commitSuccess);
@ -1108,11 +1108,11 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
return std::numeric_limits<Version>::max();
}
virtual void getPushLocations( std::vector<Tag> const& tags, std::vector<int>& locations ) {
virtual void getPushLocations(std::vector<Tag> const& tags, std::vector<int>& locations, bool allLocations) {
int locationOffset = 0;
for(auto& log : tLogs) {
if(log->isLocal && log->logServers.size()) {
log->getPushLocations(tags, locations, locationOffset);
log->getPushLocations(tags, locations, locationOffset, allLocations);
locationOffset += log->logServers.size();
}
}

View File

@ -60,6 +60,7 @@ struct WorkerInterface {
RequestStream< struct EventLogRequest > eventLogRequest;
RequestStream< struct TraceBatchDumpRequest > traceBatchDumpRequest;
RequestStream< struct DiskStoreRequest > diskStoreRequest;
RequestStream<struct ExecuteRequest> execReq;
TesterInterface testerInterface;
@ -71,7 +72,7 @@ struct WorkerInterface {
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, clientInterface, locality, tLog, master, masterProxy, dataDistributor, ratekeeper, resolver, storage, logRouter, debugPing, coordinationPing, waitFailure, setMetricsRate, eventLogRequest, traceBatchDumpRequest, testerInterface, diskStoreRequest);
serializer(ar, clientInterface, locality, tLog, master, masterProxy, dataDistributor, ratekeeper, resolver, storage, logRouter, debugPing, coordinationPing, waitFailure, setMetricsRate, eventLogRequest, traceBatchDumpRequest, testerInterface, diskStoreRequest, execReq);
}
};
@ -239,6 +240,23 @@ struct TraceBatchDumpRequest {
}
};
struct ExecuteRequest {
constexpr static FileIdentifier file_identifier = 8184128;
ReplyPromise<Void> reply;
Arena arena;
StringRef execPayload;
ExecuteRequest(StringRef execPayload) : execPayload(execPayload) {}
ExecuteRequest() : execPayload() {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, reply, execPayload, arena);
}
};
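A hedged sketch of how this request might be driven from the sending side; "worker" is an assumed WorkerInterface, and the payload format "<binary_path>:uid=<UID>" follows the snap workload further down:

// Illustration only: ask one worker to run an exec payload and wait for the ack.
ACTOR Future<Void> execOnWorker(WorkerInterface worker, Standalone<StringRef> payload) {
    ExecuteRequest req(payload);
    wait(worker.execReq.getReply(req)); // reply is Void(); failures arrive as errors
    return Void();
}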
struct LoadedReply {
constexpr static FileIdentifier file_identifier = 9956350;
Standalone<StringRef> payload;
@ -380,7 +398,9 @@ ACTOR Future<Void> extractClusterInterface(Reference<AsyncVar<Optional<struct Cl
ACTOR Future<Void> fdbd(Reference<ClusterConnectionFile> ccf, LocalityData localities, ProcessClass processClass,
std::string dataFolder, std::string coordFolder, int64_t memoryLimit,
std::string metricsConnFile, std::string metricsPrefix, int64_t memoryProfilingThreshold);
std::string metricsConnFile, std::string metricsPrefix, int64_t memoryProfilingThreshold,
std::string whitelistBinPaths);
ACTOR Future<Void> clusterController(Reference<ClusterConnectionFile> ccf,
Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> currentCC,
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo,
@ -399,11 +419,11 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData, StorageServerIn
ACTOR Future<Void> masterServer(MasterInterface mi, Reference<AsyncVar<ServerDBInfo>> db,
ServerCoordinators serverCoordinators, LifetimeToken lifetime, bool forceRecovery);
ACTOR Future<Void> masterProxyServer(MasterProxyInterface proxy, InitializeMasterProxyRequest req,
Reference<AsyncVar<ServerDBInfo>> db);
Reference<AsyncVar<ServerDBInfo>> db, std::string whitelistBinPaths);
ACTOR Future<Void> tLog(IKeyValueStore* persistentData, IDiskQueue* persistentQueue,
Reference<AsyncVar<ServerDBInfo>> db, LocalityData locality,
PromiseStream<InitializeTLogRequest> tlogRequests, UID tlogId, bool restoreFromDisk,
Promise<Void> oldLog, Promise<Void> recovered, Reference<AsyncVar<bool>> degraded); // changes tli->id() to be the recovered ID
Promise<Void> oldLog, Promise<Void> recovered, std::string folder, Reference<AsyncVar<bool>> degraded); // changes tli->id() to be the recovered ID
ACTOR Future<Void> monitorServerDBInfo(Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> ccInterface,
Reference<ClusterConnectionFile> ccf, LocalityData locality,
Reference<AsyncVar<ServerDBInfo>> dbInfo);
@ -425,7 +445,7 @@ namespace oldTLog_6_0 {
ACTOR Future<Void> tLog(IKeyValueStore* persistentData, IDiskQueue* persistentQueue,
Reference<AsyncVar<ServerDBInfo>> db, LocalityData locality,
PromiseStream<InitializeTLogRequest> tlogRequests, UID tlogId, bool restoreFromDisk,
Promise<Void> oldLog, Promise<Void> recovered, Reference<AsyncVar<bool>> degraded);
Promise<Void> oldLog, Promise<Void> recovered, std::string folder, Reference<AsyncVar<bool>> degraded);
}
typedef decltype(&tLog) TLogFn;

View File

@ -61,6 +61,8 @@
#include "versions.h"
#endif
#include "fdbmonitor/SimpleIni.h"
#ifdef __linux__
#include <execinfo.h>
#include <signal.h>
@ -79,8 +81,8 @@
#include "flow/actorcompiler.h" // This must be the last #include.
enum {
OPT_CONNFILE, OPT_SEEDCONNFILE, OPT_SEEDCONNSTRING, OPT_ROLE, OPT_LISTEN, OPT_PUBLICADDR, OPT_DATAFOLDER, OPT_LOGFOLDER, OPT_PARENTPID, OPT_NEWCONSOLE, OPT_NOBOX, OPT_TESTFILE, OPT_RESTARTING, OPT_RANDOMSEED, OPT_KEY, OPT_MEMLIMIT, OPT_STORAGEMEMLIMIT, OPT_MACHINEID, OPT_DCID, OPT_MACHINE_CLASS, OPT_BUGGIFY, OPT_VERSION, OPT_CRASHONERROR, OPT_HELP, OPT_NETWORKIMPL, OPT_NOBUFSTDOUT, OPT_BUFSTDOUTERR, OPT_TRACECLOCK, OPT_NUMTESTERS, OPT_DEVHELP, OPT_ROLLSIZE, OPT_MAXLOGS, OPT_MAXLOGSSIZE, OPT_KNOB, OPT_TESTSERVERS, OPT_TEST_ON_SERVERS, OPT_METRICSCONNFILE, OPT_METRICSPREFIX,
OPT_LOGGROUP, OPT_LOCALITY, OPT_IO_TRUST_SECONDS, OPT_IO_TRUST_WARN_ONLY, OPT_FILESYSTEM, OPT_PROFILER_RSS_SIZE, OPT_KVFILE, OPT_TRACE_FORMAT, OPT_USE_OBJECT_SERIALIZER };
OPT_CONNFILE, OPT_SEEDCONNFILE, OPT_SEEDCONNSTRING, OPT_ROLE, OPT_LISTEN, OPT_PUBLICADDR, OPT_DATAFOLDER, OPT_LOGFOLDER, OPT_PARENTPID, OPT_NEWCONSOLE, OPT_NOBOX, OPT_TESTFILE, OPT_RESTARTING, OPT_RESTORING, OPT_RANDOMSEED, OPT_KEY, OPT_MEMLIMIT, OPT_STORAGEMEMLIMIT, OPT_MACHINEID, OPT_DCID, OPT_MACHINE_CLASS, OPT_BUGGIFY, OPT_VERSION, OPT_CRASHONERROR, OPT_HELP, OPT_NETWORKIMPL, OPT_NOBUFSTDOUT, OPT_BUFSTDOUTERR, OPT_TRACECLOCK, OPT_NUMTESTERS, OPT_DEVHELP, OPT_ROLLSIZE, OPT_MAXLOGS, OPT_MAXLOGSSIZE, OPT_KNOB, OPT_TESTSERVERS, OPT_TEST_ON_SERVERS, OPT_METRICSCONNFILE, OPT_METRICSPREFIX,
OPT_LOGGROUP, OPT_LOCALITY, OPT_IO_TRUST_SECONDS, OPT_IO_TRUST_WARN_ONLY, OPT_FILESYSTEM, OPT_PROFILER_RSS_SIZE, OPT_KVFILE, OPT_TRACE_FORMAT, OPT_USE_OBJECT_SERIALIZER, OPT_WHITELIST_BINPATH };
CSimpleOpt::SOption g_rgOptions[] = {
{ OPT_CONNFILE, "-C", SO_REQ_SEP },
@ -158,6 +160,7 @@ CSimpleOpt::SOption g_rgOptions[] = {
{ OPT_TRACE_FORMAT , "--trace_format", SO_REQ_SEP },
{ OPT_USE_OBJECT_SERIALIZER, "-S", SO_REQ_SEP },
{ OPT_USE_OBJECT_SERIALIZER, "--object-serializer", SO_REQ_SEP },
{ OPT_WHITELIST_BINPATH, "--whitelist_binpath", SO_REQ_SEP },
#ifndef TLS_DISABLED
TLS_OPTION_FLAGS
@ -913,6 +916,7 @@ int main(int argc, char* argv[]) {
const char *testFile = "tests/default.txt";
std::string kvFile;
std::string testServersStr;
std::string whitelistBinPaths;
std::vector<std::string> publicAddressStrs, listenAddressStrs;
const char *targetKey = NULL;
uint64_t memLimit = 8LL << 30; // Nice to maintain the same default value for memLimit and SERVER_KNOBS->SERVER_MEM_LIMIT and SERVER_KNOBS->COMMIT_BATCHES_MEM_BYTES_HARD_LIMIT
@ -1193,7 +1197,7 @@ int main(int argc, char* argv[]) {
case OPT_RESTARTING:
restarting = true;
break;
case OPT_RANDOMSEED: {
char* end;
randomSeed = (uint32_t)strtoul( args.OptionArg(), &end, 0 );
if( *end ) {
@ -1299,6 +1303,9 @@ int main(int argc, char* argv[]) {
}
break;
}
case OPT_WHITELIST_BINPATH:
whitelistBinPaths = args.OptionArg();
break;
#ifndef TLS_DISABLED
case TLSOptions::OPT_TLS_PLUGIN:
args.OptionArg();
@ -1641,7 +1648,8 @@ int main(int argc, char* argv[]) {
std::vector<std::string> directories = platform::listDirectories( dataFolder );
for(int i = 0; i < directories.size(); i++)
if( directories[i].size() != 32 && directories[i] != "." && directories[i] != ".." && directories[i] != "backups") {
if (directories[i].size() != 32 && directories[i] != "." && directories[i] != ".." &&
directories[i] != "backups" && directories[i].find("snap") == std::string::npos) {
TraceEvent(SevError, "IncompatibleDirectoryFound").detail("DataFolder", dataFolder).detail("SuspiciousFile", directories[i]);
fprintf(stderr, "ERROR: Data folder `%s' had non fdb file `%s'; please use clean, fdb-only folder\n", dataFolder.c_str(), directories[i].c_str());
flushAndExit(FDB_EXIT_ERROR);
@ -1658,12 +1666,85 @@ int main(int argc, char* argv[]) {
flushAndExit(FDB_EXIT_ERROR);
}
int isRestoring = 0;
if (!restarting) {
platform::eraseDirectoryRecursive( dataFolder );
platform::createDirectory( dataFolder );
} else {
CSimpleIni ini;
ini.SetUnicode();
std::string absDataFolder = abspath(dataFolder);
ini.LoadFile(joinPath(absDataFolder, "restartInfo.ini").c_str());
int backupFailed = true;
const char* isRestoringStr = ini.GetValue("RESTORE", "isRestoring", NULL);
if (isRestoringStr) {
isRestoring = atoi(isRestoringStr);
const char* backupFailedStr = ini.GetValue("RESTORE", "BackupFailed", NULL);
if (isRestoring && backupFailedStr) {
backupFailed = atoi(backupFailedStr);
}
}
if (isRestoring && !backupFailed) {
std::vector<std::string> returnList;
std::string ext = "";
returnList = platform::listDirectories(absDataFolder);
std::string snapStr = ini.GetValue("RESTORE", "RestoreSnapUID");
TraceEvent("RestoringDataFolder").detail("DataFolder", absDataFolder);
TraceEvent("RestoreSnapUID").detail("UID", snapStr);
// delete all files (except fdb.cluster and fitness) in non-snap directories
for (int i = 0; i < returnList.size(); i++) {
if (returnList[i] == "." || returnList[i] == "..") {
continue;
}
if (returnList[i].find(snapStr) != std::string::npos) {
continue;
}
std::string childf = absDataFolder + "/" + returnList[i];
std::vector<std::string> returnFiles = platform::listFiles(childf, ext);
for (int j = 0; j < returnFiles.size(); j++) {
if (returnFiles[j] != "fdb.cluster" && returnFiles[j] != "fitness") {
TraceEvent("DeletingNonSnapfiles")
.detail("FileBeingDeleted", childf + "/" + returnFiles[j]);
deleteFile(childf + "/" + returnFiles[j]);
}
}
}
// move the contents from snap folder to the original folder,
// delete snap folders
for (int i = 0; i < returnList.size(); i++) {
if (returnList[i] == "." || returnList[i] == "..") {
continue;
}
std::string dirSrc = absDataFolder + "/" + returnList[i];
// delete snap directories which are not part of restoreSnapUID
if (returnList[i].find(snapStr) == std::string::npos) {
if (returnList[i].find("snap") != std::string::npos) {
platform::eraseDirectoryRecursive(dirSrc);
}
continue;
}
// remove empty/partial snap directories
std::vector<std::string> childrenList = platform::listFiles(dirSrc);
if (childrenList.size() == 0) {
TraceEvent("RemovingEmptySnapDirectory").detail("DirBeingDeleted", dirSrc);
platform::eraseDirectoryRecursive(dirSrc);
continue;
}
std::string origDir = returnList[i].substr(0, 32);
std::string dirToRemove = absDataFolder + "/" + origDir;
TraceEvent("DeletingOriginalNonSnapDirectory").detail("FileBeingDeleted", dirToRemove);
platform::eraseDirectoryRecursive(dirToRemove);
renameFile(dirSrc, dirToRemove);
TraceEvent("RenamingSnapToOriginalDirectory")
.detail("Oldname", dirSrc)
.detail("Newname", dirToRemove);
}
}
}
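For orientation, the restartInfo.ini consumed above is produced by the SaveAndKill and SnapTest workloads further down; an illustrative file (values made up, keys as written by those workloads) might look like:

[META]
processesPerMachine=1
listenersPerProcess=1
desiredCoordinators=1

[RESTORE]
isRestoring=1
BackupFailed=0
RestoreSnapUID=ba61e9612a561d60bd83ad83e1b63568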
setupAndRun( dataFolder, testFile, restarting, (isRestoring >= 1), whitelistBinPaths, tlsOptions);
g_simulator.run();
} else if (role == FDBD) {
ASSERT( connectionFile );
@ -1674,7 +1755,7 @@ int main(int argc, char* argv[]) {
dataFolder = format("fdb/%d/", publicAddresses.address.port); // SOMEDAY: Better default
vector<Future<Void>> actors(listenErrors.begin(), listenErrors.end());
actors.push_back( fdbd(connectionFile, localities, processClass, dataFolder, dataFolder, storageMemLimit, metricsConnFile, metricsPrefix, rsssize) );
actors.push_back( fdbd(connectionFile, localities, processClass, dataFolder, dataFolder, storageMemLimit, metricsConnFile, metricsPrefix, rsssize, whitelistBinPaths) );
//actors.push_back( recurring( []{}, .001 ) ); // for ASIO latency measurement
f = stopAfter( waitForAll(actors) );

View File

@ -47,6 +47,7 @@
<ActorCompiler Include="KeyValueStoreCompressTestData.actor.cpp" />
<ActorCompiler Include="IndirectShadowPager.actor.cpp" />
<ClCompile Include="Knobs.cpp" />
<ActorCompiler Include="FDBExecHelper.actor.cpp" />
<ActorCompiler Include="QuietDatabase.actor.cpp" />
<ActorCompiler Include="networktest.actor.cpp" />
<ActorCompiler Include="workloads\Unreadable.actor.cpp" />
@ -152,6 +153,7 @@
<ActorCompiler Include="workloads\VersionStamp.actor.cpp" />
<ActorCompiler Include="workloads\Serializability.actor.cpp" />
<ActorCompiler Include="workloads\DiskDurability.actor.cpp" />
<ActorCompiler Include="workloads\SnapTest.actor.cpp" />
<ActorCompiler Include="workloads\Mako.actor.cpp" />
</ItemGroup>
<ItemGroup>
@ -166,6 +168,9 @@
</ActorCompiler>
<ClInclude Include="DataDistributorInterface.h" />
<ClInclude Include="DBCoreState.h" />
<ActorCompiler Include="FDBExecHelper.actor.h">
<EnableCompile>false</EnableCompile>
</ActorCompiler>
<ClInclude Include="IDiskQueue.h" />
<ClInclude Include="IKeyValueStore.h" />
<ClInclude Include="IndirectShadowPager.h" />

View File

@ -49,6 +49,7 @@
#include "fdbserver/RecoveryState.h"
#include "fdbserver/LogProtocolMessage.h"
#include "fdbserver/LatencyBandConfig.h"
#include "fdbserver/FDBExecHelper.actor.h"
#include "flow/TDMetric.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
@ -1834,14 +1835,17 @@ void addMutation( Reference<T>& target, Version version, MutationRef const& muta
}
template <class T>
void splitMutations( KeyRangeMap<T>& map, VerUpdateRef const& update ) {
for(auto& m : update.mutations) {
splitMutation(map, m, update.version);
void splitMutations(StorageServer* data, KeyRangeMap<T>& map, VerUpdateRef const& update, vector<int>& execIndex) {
for(int i = 0; i < update.mutations.size(); i++) {
splitMutation(data, map, update.mutations[i], update.version);
if (update.mutations[i].type == MutationRef::Exec) {
execIndex.push_back(i);
}
}
}
template <class T>
void splitMutation( KeyRangeMap<T>& map, MutationRef const& m, Version ver ) {
void splitMutation(StorageServer* data, KeyRangeMap<T>& map, MutationRef const& m, Version ver) {
if(isSingleKeyMutation((MutationRef::Type) m.type)) {
if ( !SHORT_CIRCUT_ACTUAL_STORAGE || !normalKeys.contains(m.param1) )
addMutation( map.rangeContaining(m.param1)->value(), ver, m );
@ -1855,11 +1859,53 @@ void splitMutation( KeyRangeMap<T>& map, MutationRef const& m, Version ver ) {
addMutation( i->value(), ver, MutationRef((MutationRef::Type)m.type, k.begin, k.end) );
}
}
}
else
} else if (m.type == MutationRef::Exec) {
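// Exec mutations are deliberately not split or applied here; splitMutations records
// their indices in execIndex so the caller (fetchKeys below) can run snapHelper on them.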
} else
ASSERT(false); // Unknown mutation type in splitMutations
}
ACTOR Future<Void>
snapHelper(StorageServer* data, MutationRef m, Version ver)
{
state std::string cmd = m.param1.toString();
if ((cmd == execDisableTLogPop) || (cmd == execEnableTLogPop)) {
TraceEvent("IgnoreNonSnapCommands").detail("ExecCommand", cmd);
return Void();
}
state ExecCmdValueString execArg(m.param2);
state StringRef uidStr = execArg.getBinaryArgValue(LiteralStringRef("uid"));
state int err = 0;
state Future<int> cmdErr;
state UID execUID = UID::fromString(uidStr.toString());
state bool skip = false;
if (cmd == execSnap && isTLogInSameNode()) {
skip = true;
}
// some other role on this process has already initiated the exec, so we can skip
if (!skip && isExecOpInProgress(execUID)) {
skip = true;
}
if (!skip) {
setExecOpInProgress(execUID);
err = wait(execHelper(&execArg, data->folder, "role=storage"));
clearExecOpInProgress(execUID);
}
TraceEvent te = TraceEvent("ExecTraceStorage");
te.detail("Uid", uidStr.toString());
te.detail("Status", err);
te.detail("Role", "storage");
te.detail("Version", ver);
te.detail("Mutation", m.toString());
te.detail("Mid", data->thisServerID.toString());
te.detail("DurableVersion", data->durableVersion.get());
te.detail("DataVersion", data->version.get());
te.detail("Tag", data->tag.toString());
te.detail("SnapCreateSkipped", skip);
return Void();
}
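The isExecOpInProgress/setExecOpInProgress/clearExecOpInProgress calls come from FDBExecHelper.actor.h and deduplicate exec work per snapshot UID within a process. Flow runs single-threaded per process, so a plain set suffices; a minimal sketch of what such a guard could look like (an assumption about its shape, not the helper's actual code):

#include <set>
#include <string>

// Hypothetical process-wide registry of in-flight exec UIDs (illustrative).
static std::set<std::string> g_execOpsInProgress;

bool isExecOpInProgress(const std::string& uid) { return g_execOpsInProgress.count(uid) > 0; }
void setExecOpInProgress(const std::string& uid) { g_execOpsInProgress.insert(uid); }
void clearExecOpInProgress(const std::string& uid) { g_execOpsInProgress.erase(uid); }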
ACTOR Future<Void> fetchKeys( StorageServer *data, AddingShard* shard ) {
state TraceInterval interval("FetchKeys");
state KeyRange keys = shard->keys;
@ -1967,21 +2013,29 @@ ACTOR Future<Void> fetchKeys( StorageServer *data, AddingShard* shard ) {
if (this_block.more) {
Key nfk = this_block.readThrough.present() ? this_block.readThrough.get() : keyAfter( this_block.end()[-1].key );
if (nfk != keys.end) {
std::deque< Standalone<VerUpdateRef> > updatesToSplit = std::move( shard->updates );
state std::deque< Standalone<VerUpdateRef> > updatesToSplit = std::move( shard->updates );
// This actor finishes committing the keys [keys.begin,nfk) that we already fetched.
// The remaining unfetched keys [nfk,keys.end) will become a separate AddingShard with its own fetchKeys.
shard->server->addShard( ShardInfo::addingSplitLeft( KeyRangeRef(keys.begin, nfk), shard ) );
shard->server->addShard( ShardInfo::newAdding( data, KeyRangeRef(nfk, keys.end) ) );
shard = data->shards.rangeContaining( keys.begin ).value()->adding;
auto otherShard = data->shards.rangeContaining( nfk ).value()->adding;
state AddingShard* otherShard = data->shards.rangeContaining( nfk ).value()->adding;
keys = shard->keys;
// Split our prior updates. The ones that apply to our new, restricted key range will go back into shard->updates,
// and the ones delivered to the new shard will be discarded because it is in WaitPrevious phase (hasn't chosen a fetchVersion yet).
// What we are doing here is expensive and could get more expensive if we started having many more blocks per shard. May need optimization in the future.
for(auto u = updatesToSplit.begin(); u != updatesToSplit.end(); ++u)
splitMutations( data->shards, *u );
state vector<int> execIdxVec;
state std::deque< Standalone<VerUpdateRef> >::iterator u = updatesToSplit.begin();
for(; u != updatesToSplit.end(); ++u) {
ASSERT(execIdxVec.size() == 0);
splitMutations(data, data->shards, *u, execIdxVec);
for (auto execIdx : execIdxVec) {
wait(snapHelper(data, u->mutations[execIdx], u->version));
}
execIdxVec.clear();
}
TEST( true );
TEST( shard->updates.size() );
@ -2173,7 +2227,8 @@ void ShardInfo::addMutation(Version version, MutationRef const& mutation) {
adding->addMutation(version, mutation);
else if (readWrite)
readWrite->addMutation(version, mutation, this->keys, readWrite->updateEagerReads);
else if (mutation.type != MutationRef::ClearRange) {
else if ((mutation.type != MutationRef::ClearRange)
&& (mutation.type != MutationRef::Exec)) {
TraceEvent(SevError, "DeliveredToNotAssigned").detail("Version", version).detail("Mutation", mutation.toString());
ASSERT(false); // Mutation delivered to notAssigned shard!
}
@ -2382,7 +2437,7 @@ public:
// debugMutation("SSUpdateMutation", changes[c].version, *m);
//}
splitMutation( data->shards, m, ver );
splitMutation(data, data->shards, m, ver);
}
if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get();
@ -2588,6 +2643,9 @@ ACTOR Future<Void> update( StorageServer* data, bool* pReceivedUpdate )
state VerUpdateRef* pUpdate = &fii.changes[changeNum];
for(; mutationNum < pUpdate->mutations.size(); mutationNum++) {
updater.applyMutation(data, pUpdate->mutations[mutationNum], pUpdate->version);
if (pUpdate->mutations[mutationNum].type == MutationRef::Exec) {
wait(snapHelper(data, pUpdate->mutations[mutationNum], pUpdate->version));
}
mutationBytes += pUpdate->mutations[mutationNum].totalSize();
injectedChanges = true;
if(mutationBytes > SERVER_KNOBS->DESIRED_UPDATE_BYTES) {
@ -2660,6 +2718,9 @@ ACTOR Future<Void> update( StorageServer* data, bool* pReceivedUpdate )
++data->counters.atomicMutations;
break;
}
if (msg.type == MutationRef::Exec) {
wait(snapHelper(data, msg, ver));
}
}
else
TraceEvent(SevError, "DiscardingPeekedData", data->thisServerID).detail("Mutation", msg.toString()).detail("Version", cloneCursor2->version().toString());

View File

@ -35,6 +35,7 @@
#include "fdbserver/ClusterRecruitmentInterface.h"
#include "fdbserver/DataDistributorInterface.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/FDBExecHelper.actor.h"
#include "fdbserver/CoordinationInterface.h"
#include "fdbclient/FailureMonitorClient.h"
#include "fdbclient/MonitorLeader.h"
@ -66,6 +67,7 @@ extern IKeyValueStore* keyValueStoreCompressTestData(IKeyValueStore* store);
# define KV_STORE(filename,uid) keyValueStoreMemory(filename,uid)
#endif
ACTOR static Future<Void> extractClientInfo( Reference<AsyncVar<ServerDBInfo>> db, Reference<AsyncVar<ClientDBInfo>> info ) {
loop {
info->set( db->get().client );
@ -229,6 +231,7 @@ std::string filenameFromId( KeyValueStoreType storeType, std::string folder, std
UNREACHABLE();
}
struct TLogOptions {
TLogOptions() = default;
TLogOptions( TLogVersion v, TLogSpillType s ) : version(v), spillType(s) {}
@ -696,7 +699,8 @@ ACTOR Future<Void> workerServer(
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo,
ProcessClass initialClass, std::string folder, int64_t memoryLimit,
std::string metricsConnFile, std::string metricsPrefix,
Promise<Void> recoveredDiskFiles, int64_t memoryProfileThreshold) {
Promise<Void> recoveredDiskFiles, int64_t memoryProfileThreshold,
std::string _coordFolder, std::string whitelistBinPaths) {
state PromiseStream< ErrorInfo > errors;
state Reference<AsyncVar<Optional<DataDistributorInterface>>> ddInterf( new AsyncVar<Optional<DataDistributorInterface>>() );
state Reference<AsyncVar<Optional<RatekeeperInterface>>> rkInterf( new AsyncVar<Optional<RatekeeperInterface>>() );
@ -717,6 +721,7 @@ ACTOR Future<Void> workerServer(
// here is no, so that when running with log_version==3, all files should say V=3.
state std::map<std::tuple<TLogVersion, KeyValueStoreType::StoreType, TLogSpillType>,
std::pair<Future<Void>, PromiseStream<InitializeTLogRequest>>> sharedLogs;
state std::string coordFolder = abspath(_coordFolder);
state WorkerInterface interf( locality );
@ -832,7 +837,7 @@ ACTOR Future<Void> workerServer(
auto& logData = sharedLogs[std::make_tuple(s.tLogOptions.version, s.storeType, s.tLogOptions.spillType)];
// FIXME: If logData.first isValid && !isReady, shouldn't we
// be sending a fake InitializeTLogRequest rather than calling tLog()?
Future<Void> tl = tLogFn( kv, queue, dbInfo, locality, !logData.first.isValid() || logData.first.isReady() ? logData.second : PromiseStream<InitializeTLogRequest>(), s.storeID, true, oldLog, recovery, degraded );
Future<Void> tl = tLogFn( kv, queue, dbInfo, locality, !logData.first.isValid() || logData.first.isReady() ? logData.second : PromiseStream<InitializeTLogRequest>(), s.storeID, true, oldLog, recovery, folder, degraded );
recoveries.push_back(recovery.getFuture());
tl = handleIOErrors( tl, kv, s.storeID );
@ -989,7 +994,7 @@ ACTOR Future<Void> workerServer(
filesClosed.add( data->onClosed() );
filesClosed.add( queue->onClosed() );
logData.first = tLogFn( data, queue, dbInfo, locality, logData.second, logId, false, Promise<Void>(), Promise<Void>(), degraded );
logData.first = tLogFn( data, queue, dbInfo, locality, logData.second, logId, false, Promise<Void>(), Promise<Void>(), folder, degraded );
logData.first = handleIOErrors( logData.first, data, logId );
logData.first = handleIOErrors( logData.first, queue, logId );
errorForwarders.add( forwardError( errors, Role::SHARED_TRANSACTION_LOG, logId, logData.first ) );
@ -1053,7 +1058,7 @@ ACTOR Future<Void> workerServer(
//printf("Recruited as masterProxyServer\n");
errorForwarders.add( zombie(recruited, forwardError( errors, Role::MASTER_PROXY, recruited.id(),
masterProxyServer( recruited, req, dbInfo ) ) ) );
masterProxyServer( recruited, req, dbInfo, whitelistBinPaths ) ) ) );
req.reply.send(recruited);
}
when( InitializeResolverRequest req = waitNext(interf.resolver.getFuture()) ) {
@ -1166,6 +1171,25 @@ ACTOR Future<Void> workerServer(
systemMonitor();
loggingTrigger = delay( loggingDelay, TaskFlushTrace );
}
when(state ExecuteRequest req = waitNext(interf.execReq.getFuture())) {
state ExecCmdValueString execArg(req.execPayload);
try {
int err = wait(execHelper(&execArg, coordFolder, "role=coordinator"));
StringRef uidStr = execArg.getBinaryArgValue(LiteralStringRef("uid"));
auto tokenStr = "ExecTrace/Coordinators/" + uidStr.toString();
auto te = TraceEvent("ExecTraceCoordinators");
te.detail("Uid", uidStr.toString());
te.detail("Status", err);
te.detail("Role", "coordinator");
te.detail("Value", coordFolder);
te.detail("ExecPayload", execArg.getCmdValueString().toString());
te.trackLatest(tokenStr.c_str());
req.reply.send(Void());
} catch (Error& e) {
TraceEvent("ExecHelperError").error(e);
req.reply.sendError(broken_promise());
}
}
when( wait( errorForwarders.getResult() ) ) {}
when( wait( handleErrors ) ) {}
}
@ -1317,12 +1341,16 @@ ACTOR Future<Void> fdbd(
int64_t memoryLimit,
std::string metricsConnFile,
std::string metricsPrefix,
int64_t memoryProfileThreshold)
int64_t memoryProfileThreshold,
std::string whitelistBinPaths)
{
try {
ServerCoordinators coordinators( connFile );
TraceEvent("StartingFDBD").detail("ZoneID", localities.zoneId()).detail("MachineId", localities.machineId()).detail("DiskPath", dataFolder).detail("CoordPath", coordFolder);
if (g_network->isSimulated()) {
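// Comma-separated whitelist of binaries allowed for exec; the empty and bogus
// entries presumably exercise whitelist parsing in simulation.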
whitelistBinPaths = ",, random_path, /bin/snap_create.sh,,";
}
TraceEvent("StartingFDBD").detail("ZoneID", localities.zoneId()).detail("MachineId", localities.machineId()).detail("DiskPath", dataFolder).detail("CoordPath", coordFolder).detail("WhiteListBinPath", whitelistBinPaths);
// SOMEDAY: start the services on the machine in a staggered fashion in simulation?
state vector<Future<Void>> v;
@ -1344,7 +1372,7 @@ ACTOR Future<Void> fdbd(
v.push_back( reportErrors( processClass == ProcessClass::TesterClass ? monitorLeader( connFile, cc ) : clusterController( connFile, cc , asyncPriorityInfo, recoveredDiskFiles.getFuture(), localities ), "ClusterController") );
v.push_back( reportErrors(extractClusterInterface( cc, ci ), "ExtractClusterInterface") );
v.push_back( reportErrors(failureMonitorClient( ci, true ), "FailureMonitorClient") );
v.push_back( reportErrorsExcept(workerServer(connFile, cc, localities, asyncPriorityInfo, processClass, dataFolder, memoryLimit, metricsConnFile, metricsPrefix, recoveredDiskFiles, memoryProfileThreshold), "WorkerServer", UID(), &normalWorkerErrors()) );
v.push_back( reportErrorsExcept(workerServer(connFile, cc, localities, asyncPriorityInfo, processClass, dataFolder, memoryLimit, metricsConnFile, metricsPrefix, recoveredDiskFiles, memoryProfileThreshold, coordFolder, whitelistBinPaths), "WorkerServer", UID(), &normalWorkerErrors()) );
state Future<Void> firstConnect = reportErrors( printOnFirstConnected(ci), "ClusterFirstConnectedError" );
wait( quorum(v,1) );

View File

@ -102,13 +102,13 @@ struct CycleWorkload : TestWorkload {
try {
// Reverse next and next^2 node
Optional<Value> v = wait( tr.get( self->key(r) ) );
if (!v.present()) self->badRead("r", r, tr);
if (!v.present()) self->badRead("KeyR", r, tr);
state int r2 = self->fromValue(v.get());
Optional<Value> v2 = wait( tr.get( self->key(r2) ) );
if (!v2.present()) self->badRead("r2", r2, tr);
if (!v2.present()) self->badRead("KeyR2", r2, tr);
state int r3 = self->fromValue(v2.get());
Optional<Value> v3 = wait( tr.get( self->key(r3) ) );
if (!v3.present()) self->badRead("r3", r3, tr);
if (!v3.present()) self->badRead("KeyR3", r3, tr);
int r4 = self->fromValue(v3.get());
tr.clear( self->key(r) ); //< Shouldn't have an effect, but will break with wrong ordering

View File

@ -34,12 +34,14 @@ struct SaveAndKillWorkload : TestWorkload {
std::string restartInfo;
double testDuration;
int isRestoring;
SaveAndKillWorkload(WorkloadContext const& wcx)
: TestWorkload(wcx)
{
restartInfo = getOption( options, LiteralStringRef("restartInfoLocation"), LiteralStringRef("simfdb/restartInfo.ini") ).toString();
testDuration = getOption( options, LiteralStringRef("testDuration"), 10.0 );
isRestoring = getOption( options, LiteralStringRef("isRestoring"), 0 );
}
virtual std::string description() { return "SaveAndKillWorkload"; }
@ -59,6 +61,7 @@ struct SaveAndKillWorkload : TestWorkload {
ini.SetUnicode();
ini.LoadFile(self->restartInfo.c_str());
ini.SetValue("RESTORE", "isRestoring", format("%d", self->isRestoring).c_str());
ini.SetValue("META", "processesPerMachine", format("%d", g_simulator.processesPerMachine).c_str());
ini.SetValue("META", "listenersPerProcess", format("%d", g_simulator.listenersPerProcess).c_str());
ini.SetValue("META", "desiredCoordinators", format("%d", g_simulator.desiredCoordinators).c_str());

View File

@ -0,0 +1,367 @@
#include <boost/lexical_cast.hpp>
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbrpc/ContinuousSample.h"
#include "fdbmonitor/SimpleIni.h"
#include "fdbserver/ClusterRecruitmentInterface.h"
#include "fdbserver/Status.h"
#include "fdbserver/TesterInterface.actor.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/workloads/BulkSetup.actor.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "flow/actorcompiler.h"
void getVersionAndnumTags(TraceEventFields md, Version& version, int& numTags) {
version = -1;
numTags = -1;
version = boost::lexical_cast<int64_t>(md.getValue("Version"));
numTags = boost::lexical_cast<int>(md.getValue("NumTags"));
}
void getTagAndDurableVersion(TraceEventFields md, Version version, Tag& tag, Version& durableVersion) {
Version verifyVersion;
durableVersion = -1;
verifyVersion = boost::lexical_cast<int64_t>(md.getValue("Version"));
std::string tagString = md.getValue("Tag");
int colon = tagString.find_first_of(':');
std::string localityString = tagString.substr(0, colon);
std::string idString = tagString.substr(colon + 1);
tag.locality = boost::lexical_cast<int>(localityString);
tag.id = boost::lexical_cast<int>(idString);
durableVersion = boost::lexical_cast<int64_t>(md.getValue("DurableVersion"));
}
void getMinAndMaxTLogVersions(TraceEventFields md, Version version, Tag tag, Version& minTLogVersion,
Version& maxTLogVersion) {
Version verifyVersion;
Tag verifyTag;
minTLogVersion = maxTLogVersion = -1;
verifyVersion = boost::lexical_cast<int64_t>(md.getValue("Version"));
std::string tagString = md.getValue("Tag");
int colon = tagString.find_first_of(':');
std::string localityString = tagString.substr(0, colon);
std::string idString = tagString.substr(colon + 1);
verifyTag.locality = boost::lexical_cast<int>(localityString);
verifyTag.id = boost::lexical_cast<int>(idString);
if (tag != verifyTag) {
return;
}
minTLogVersion = boost::lexical_cast<int64_t>(md.getValue("PoppedTagVersion"));
maxTLogVersion = boost::lexical_cast<int64_t>(md.getValue("QueueCommittedVersion"));
}
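These parsers consume trace fields published via trackLatest; the Tag field is serialized as "<locality>:<id>" (e.g. "0:3"), which the code splits on the first ':'. A usage sketch, assuming md holds a fetched event carrying the named fields:

Tag tag;
Version durableVersion = -1, minTLogV = -1, maxTLogV = -1;
getTagAndDurableVersion(md, version, tag, durableVersion);      // fills tag and DurableVersion
getMinAndMaxTLogVersions(md, version, tag, minTLogV, maxTLogV); // popped/queue-committed range for that tag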
void filterEmptyMessages(std::vector<Future<TraceEventFields>>& messages) {
messages.erase(std::remove_if(messages.begin(), messages.end(),
[](Future<TraceEventFields>const & msgFuture)
{
return !msgFuture.isReady() || msgFuture.get().size() == 0;
}
), messages.end());
return;
}
void printMessages(std::vector<Future<TraceEventFields>>& messages) {
for (int i = 0; i < messages.size(); i++) {
TraceEvent("SnapTestMessages").detail("I", i).detail("Value", messages[i].get().toString());
}
return;
}
struct SnapTestWorkload : TestWorkload {
public: // variables
int numSnaps; // num of snapshots to be taken
// FIXME: currently validation only works with numSnaps = 1
double maxSnapDelay; // max delay before which a snapshot will be taken
int testID; // test id
UID snapUID; // UID used for snap name
std::string restartInfoLocation; // file location to store the snap restore info
int maxRetryCntToRetrieveMessage; // number of retries to retrieve the trackLatest message
bool skipCheck; // disable check if the exec fails
public: // ctor & dtor
SnapTestWorkload(WorkloadContext const& wcx)
: TestWorkload(wcx), numSnaps(0), maxSnapDelay(0.0), testID(0), snapUID() {
TraceEvent("SnapTestWorkload Constructor");
std::string workloadName = "SnapTest";
maxRetryCntToRetrieveMessage = 10;
numSnaps = getOption(options, LiteralStringRef("numSnaps"), 0);
maxSnapDelay = getOption(options, LiteralStringRef("maxSnapDelay"), 25.0);
testID = getOption(options, LiteralStringRef("testID"), 0);
restartInfoLocation =
getOption(options, LiteralStringRef("restartInfoLocation"), LiteralStringRef("simfdb/restartInfo.ini"))
.toString();
skipCheck = false;
}
public: // workload functions
std::string description() override { return "SnapTest"; }
Future<Void> setup(Database const& cx) override {
TraceEvent("SnapTestWorkloadSetup");
return Void();
}
Future<Void> start(Database const& cx) override {
TraceEvent("SnapTestWorkloadStart");
if (clientId == 0) {
return _start(cx, this);
}
return Void();
}
ACTOR Future<bool> _check(Database cx, SnapTestWorkload* self) {
if (self->skipCheck) {
TraceEvent(SevWarnAlways, "SnapCheckIgnored");
return true;
}
state Transaction tr(cx);
// read the key SnapFailedTLog.$UID
loop {
try {
Standalone<StringRef> keyStr = snapTestFailStatus.withSuffix(StringRef(self->snapUID.toString()));
TraceEvent("TestKeyStr").detail("Value", keyStr);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
Optional<Value> val = wait(tr.get(keyStr));
if (val.present()) {
break;
}
// wait for the key to be written out by TLogs
wait(delay(0.1));
} catch (Error &e) {
wait(tr.onError(e));
}
}
return true;
}
Future<bool> check(Database const& cx) override {
TraceEvent("SnapTestWorkloadCheck").detail("ClientID", clientId);
if (clientId != 0) {
return true;
}
if (this->testID != 5 && this->testID != 6) {
return true;
}
return _check(cx, this);
}
void getMetrics(vector<PerfMetric>& m) override { TraceEvent("SnapTestWorkloadGetMetrics"); }
ACTOR Future<Void> snapExecHelper(SnapTestWorkload* self, Database cx, StringRef keyRef, StringRef valueRef) {
state Transaction tr(cx);
loop {
try {
tr.execute(keyRef, valueRef);
wait(tr.commit());
break;
} catch (Error& e) {
try {
wait(tr.onError(e));
} catch (Error& e) {
if (e.code() == error_code_cluster_not_fully_recovered
|| e.code() == error_code_txn_exec_log_anti_quorum) {
TraceEvent(SevWarnAlways, "ClusterNotFullyRecovered");
self->skipCheck = true;
break;
}
throw;
}
}
}
return Void();
}
ACTOR Future<Void> _create_keys(Database cx, std::string prefix, bool even = true) {
state Transaction tr(cx);
state vector<int64_t> keys;
for (int i = 0; i < 1000; i++) {
keys.push_back(deterministicRandom()->randomInt64(0, INT64_MAX - 2));
}
state int retry = 0;
tr.reset();
loop {
try {
for (auto id : keys) {
if (even) {
if (id % 2 != 0) {
id++;
}
} else {
if (id % 2 == 0) {
id++;
}
}
std::string Key1 = prefix + std::to_string(id);
Key key1Ref(Key1);
std::string Val1 = std::to_string(id);
Value val1Ref(Val1);
tr.set(key1Ref, val1Ref, false);
}
wait(tr.commit());
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
return Void();
}
ACTOR Future<Void> _start(Database cx, SnapTestWorkload* self) {
state Transaction tr(cx);
if (self->testID == 0) {
// create even keys before the snapshot
wait(self->_create_keys(cx, "snapKey"));
} else if (self->testID == 1) {
// create a snapshot
state double toDelay = fmod(deterministicRandom()->randomUInt32(), self->maxSnapDelay);
TraceEvent("ToDelay").detail("Value", toDelay);
ASSERT(toDelay < self->maxSnapDelay);
wait(delay(toDelay));
state int retry = 0;
state bool snapFailed = false;
loop {
self->snapUID = deterministicRandom()->randomUniqueID();
try {
StringRef snapCmdRef = LiteralStringRef("/bin/snap_create.sh");
Future<Void> status = snapCreate(cx, snapCmdRef, self->snapUID);
wait(status);
break;
} catch (Error& e) {
if (e.code() == error_code_cluster_not_fully_recovered ||
e.code() == error_code_txn_exec_log_anti_quorum) {
++retry;
if (retry > 3) {
snapFailed = true;
break;
}
}
}
}
CSimpleIni ini;
ini.SetUnicode();
ini.LoadFile(self->restartInfoLocation.c_str());
std::string uidStr = self->snapUID.toString();
ini.SetValue("RESTORE", "RestoreSnapUID", uidStr.c_str());
ini.SetValue("RESTORE", "BackupFailed", format("%d", snapFailed).c_str());
ini.SaveFile(self->restartInfoLocation.c_str());
// write the snapUID to a file
TraceEvent("SnapshotCreateStatus").detail("Status", !snapFailed ? "Success" : "Failure");
} else if (self->testID == 2) {
// create odd keys after the snapshot
wait(self->_create_keys(cx, "snapKey", false /*even*/));
} else if (self->testID == 3) {
CSimpleIni ini;
ini.SetUnicode();
ini.LoadFile(self->restartInfoLocation.c_str());
bool backupFailed = atoi(ini.GetValue("RESTORE", "BackupFailed"));
if (backupFailed) {
// since backup failed, skip the restore checking
TraceEvent(SevWarnAlways, "BackupFailedSkippingRestoreCheck");
return Void();
}
state KeySelector begin = firstGreaterOrEqual(normalKeys.begin);
state KeySelector end = firstGreaterOrEqual(normalKeys.end);
state int cnt = 0;
// Read the entire normalKeys range and look at keys prefixed with
// "snapKey": 1) validate that all key ids are even, i.e. created
// before the snapshot, 2) that values equal their key id, and 3) that
// the number of keys adds up to the total created before the snapshot.
tr.reset();
loop {
try {
Standalone<RangeResultRef> kvRange = wait(tr.getRange(begin, end, 1000));
if (!kvRange.more && kvRange.size() == 0) {
TraceEvent("SnapTestNoMoreEntries");
break;
}
for (int i = 0; i < kvRange.size(); i++) {
if (kvRange[i].key.startsWith(LiteralStringRef("snapKey"))) {
std::string tmp1 = kvRange[i].key.substr(7).toString();
int64_t id = strtol(tmp1.c_str(), nullptr, 0);
if (id % 2 != 0) {
throw operation_failed();
}
++cnt;
std::string tmp2 = kvRange[i].value.toString();
int64_t value = strtol(tmp2.c_str(), nullptr, 0);
if (id != value) {
throw operation_failed();
}
}
}
begin = firstGreaterThan(kvRange.end()[-1].key);
} catch (Error& e) {
wait(tr.onError(e));
}
}
if (cnt != 1000) {
TraceEvent(SevError, "SnapTestVerifyCntValue").detail("Value", cnt);
throw operation_failed();
}
} else if (self->testID == 4) {
// description: if a disable of TLog popping is not followed by a
// corresponding enable, the TLog automatically re-enables popping.
// This test case validates that auto re-enable behavior.
state Standalone<StringRef> payLoadRef = LiteralStringRef("empty-binary:uid=a36b2ca0e8dab0452ac3e12b6b926f4b");
wait(self->snapExecHelper(self, cx, execDisableTLogPop, payLoadRef));
} else if (self->testID == 5) {
// snapshot create without disabling pop of the TLog
StringRef uidStr = LiteralStringRef("d78b08d47f341158e9a54d4baaf4a4dd");
self->snapUID = UID::fromString(uidStr.toString());
state Standalone<StringRef> snapPayload = LiteralStringRef("/bin/"
"snap_create.sh:uid=").withSuffix(uidStr);
wait(self->snapExecHelper(self, cx, execSnap, snapPayload));
} else if (self->testID == 6) {
// disable popping of the TLog, then snapshot create with a mis-matched UID
payLoadRef = LiteralStringRef("empty-binary:uid=f49d27ddf7a28b6549d930743e0ebdbe");
wait(self->snapExecHelper(self, cx, execDisableTLogPop, payLoadRef));
if (self->skipCheck) {
return Void();
}
StringRef uidStr = LiteralStringRef("ba61e9612a561d60bd83ad83e1b63568");
self->snapUID = UID::fromString(uidStr.toString());
snapPayload = LiteralStringRef("/bin/snap_create.sh:uid=").withSuffix(uidStr);
wait(self->snapExecHelper(self, cx, execSnap, snapPayload));
} else if (self->testID == 7) {
// create a snapshot with a non-whitelisted binary path; the operation
// should fail
state bool testedFailure = false;
snapFailed = false;
loop {
self->snapUID = deterministicRandom()->randomUniqueID();
try {
StringRef snapCmdRef = LiteralStringRef("/bin/snap_create1.sh");
Future<Void> status = snapCreate(cx, snapCmdRef, self->snapUID);
wait(status);
break;
} catch (Error& e) {
if (e.code() == error_code_cluster_not_fully_recovered ||
e.code() == error_code_txn_exec_log_anti_quorum) {
snapFailed = true;
break;
}
if (e.code() == error_code_transaction_not_permitted) {
testedFailure = true;
break;
}
}
}
ASSERT(testedFailure || snapFailed);
}
wait(delay(0.0));
return Void();
}
};
WorkloadFactory<SnapTestWorkload> SnapTestWorkloadFactory("SnapTest");

View File

@ -65,6 +65,9 @@ ERROR( lookup_failed, 1041, "DNS lookup failed" )
ERROR( proxy_memory_limit_exceeded, 1042, "Proxy commit memory limit exceeded" )
ERROR( shutdown_in_progress, 1043, "Operation no longer supported due to shutdown" )
ERROR( serialization_failed, 1044, "Failed to deserialize an object" )
ERROR( transaction_not_permitted, 1045, "Operation not permitted")
ERROR( cluster_not_fully_recovered, 1046, "Cluster not fully recovered")
ERROR( txn_exec_log_anti_quorum, 1047, "Execute Transaction not supported when log anti quorum is configured")
ERROR( broken_promise, 1100, "Broken promise" )
ERROR( operation_cancelled, 1101, "Asynchronous operation cancelled" )

View File

@ -20,7 +20,7 @@
# -*- mode: makefile; -*-
flow_CFLAGS := -I$(BOOSTDIR) -I. -DUSE_UCONTEXT
flow_CFLAGS := -isystem$(BOOSTDIR) -I. -DUSE_UCONTEXT
flow_LDFLAGS :=
ifeq ($(PLATFORM),osx)

View File

@ -108,6 +108,7 @@ add_fdb_test(TEST_FILES fast/RandomUnitTests.txt)
add_fdb_test(TEST_FILES fast/SelectorCorrectness.txt)
add_fdb_test(TEST_FILES fast/Sideband.txt)
add_fdb_test(TEST_FILES fast/SidebandWithStatus.txt)
add_fdb_test(TEST_FILES fast/SnapTestFailAndDisablePop.txt)
add_fdb_test(TEST_FILES fast/SwizzledRollbackSideband.txt)
add_fdb_test(TEST_FILES fast/SystemRebootTestCycle.txt)
add_fdb_test(TEST_FILES fast/TaskBucketCorrectness.txt)
@ -141,6 +142,18 @@ add_fdb_test(
add_fdb_test(
TEST_FILES restarting/StorefrontTestRestart-1.txt
restarting/StorefrontTestRestart-2.txt)
add_fdb_test(
TEST_FILES restarting/from_6.2.0/SnapTestSimpleRestart-1.txt
restarting/from_6.2.0/SnapTestSimpleRestart-2.txt)
add_fdb_test(
TEST_FILES restarting/from_6.2.0/SnapTestRestart-1.txt
restarting/from_6.2.0/SnapTestRestart-2.txt)
add_fdb_test(
TEST_FILES restarting/from_6.2.0/SnapCycleRestart-1.txt
restarting/from_6.2.0/SnapCycleRestart-2.txt)
add_fdb_test(
TEST_FILES restarting/from_6.2.0/SnapTestAttrition-1.txt
restarting/from_6.2.0/SnapTestAttrition-2.txt)
add_fdb_test(
TEST_FILES restarting/from_5.1.7/DrUpgradeRestart-1.txt
restarting/from_5.1.7/DrUpgradeRestart-2.txt IGNORE)

View File

@ -0,0 +1,28 @@
; verify that a TLog pop disable times out and popping is re-enabled
; automatically if no explicit enable follows
testTitle=SnapTLogPopDisableTimeout
testName=SnapTest
numSnaps=1
maxSnapDelay=3.0
testID=4
; snapCreate without TLogPopDisable
testTitle=SnapCreateWithNoDisablePop
testName=SnapTest
numSnaps=1
maxSnapDelay=3.0
testID=5
; snapCreate and tlogPopDisable with mis-matched UID
testTitle=SnapCreateDisableTLogPopMismatch
testName=SnapTest
numSnaps=1
maxSnapDelay=3.0
testID=6
; snapCreate with binary path that is not whitelisted
testTitle=SnapCreateNotWhitelistedBinaryPath
testName=SnapTest
numSnaps=1
maxSnapDelay=3.0
testID=7

View File

@ -0,0 +1,21 @@
testTitle=SnapCyclePre
;Take snap and do cycle test
clearAfterTest=false
testName=Cycle
transactionsPerSecond=2500.0
nodeCount=2500
testDuration=10.0
expectedRate=0
testName=SnapTest
numSnaps=1
maxSnapDelay=10.0
testID=1
clearAfterTest=false
testTitle=SnapCycleShutdown
;save and shutdown
testName=SaveAndKill
restartInfoLocation=simfdb/restartInfo.ini
testDuration=10.0
isRestoring=1

View File

@ -0,0 +1,8 @@
testTitle=SnapCycleRestore
;Post snap restore test
runSetup=false
testName=Cycle
transactionsPerSecond=2500.0
nodeCount=2500
testDuration=10.0
expectedRate=0

View File

@ -0,0 +1,45 @@
testTitle=SnapTestPre
;write 1000 Keys ending with even numbers
testName=SnapTest
numSnaps=1
maxSnapDelay=3.0
testID=0
clearAfterTest=false
testTitle=SnapTestTakeSnap
;Take snap and do read/write
testName=ReadWrite
testDuration=10.0
transactionsPerSecond=10000
writesPerTransactionA=0
readsPerTransactionA=10
writesPerTransactionB=10
readsPerTransactionB=1
alpha=0.5
nodeCount=100000
valueBytes=16
discardEdgeMeasurements=false
testName=SnapTest
numSnaps=1
maxSnapDelay=10.0
testID=1
clearAfterTest=false
testName=Attrition
testDuration=10.0
testTitle=SnapTestPost
;write 1000 Keys ending with odd numbers
testName=SnapTest
numSnaps=1
maxSnapDelay=25.0
testID=2
clearAfterTest=false
; save and shutdown
testTitle=SnapSimpleShutdown
testName=SaveAndKill
restartInfoLocation=simfdb/restartInfo.ini
testDuration=10.0
isRestoring=1

View File

@ -0,0 +1,7 @@
; verify all keys are even numbered
testTitle=SnapTestVerify
testName=SnapTest
numSnaps=1
maxSnapDelay=3.0
testID=3
restartInfoLocation=simfdb/restartInfo.ini

View File

@ -0,0 +1,42 @@
testTitle=SnapTestPre
;write 1000 Keys ending with even numbers
testName=SnapTest
numSnaps=1
maxSnapDelay=3.0
testID=0
clearAfterTest=false
testTitle=SnapTestTakeSnap
;Take snap and do read/write
testName=ReadWrite
testDuration=10.0
transactionsPerSecond=10000
writesPerTransactionA=0
readsPerTransactionA=10
writesPerTransactionB=10
readsPerTransactionB=1
alpha=0.5
nodeCount=100000
valueBytes=16
discardEdgeMeasurements=false
testName=SnapTest
numSnaps=1
maxSnapDelay=10.0
testID=1
clearAfterTest=false
testTitle=SnapTestPost
;write 1000 Keys ending with odd numbers
testName=SnapTest
numSnaps=1
maxSnapDelay=25.0
testID=2
clearAfterTest=false
testTitle=SnapTestShutdown
;save and shutdown
testName=SaveAndKill
restartInfoLocation=simfdb/restartInfo.ini
testDuration=10.0
isRestoring=1

View File

@ -0,0 +1,6 @@
; verify all keys are even numbered
testTitle=SnapTestVerify
testName=SnapTest
numSnaps=1
maxSnapDelay=3.0
testID=3

View File

@ -0,0 +1,30 @@
;write 1000 Keys ending with even number
testTitle=SnapSimplePre
testName=SnapTest
numSnaps=1
maxSnapDelay=30.0
testID=0
clearAfterTest=false
;take snap
testTitle=SnapSimpleTakeSnap
testName=SnapTest
numSnaps=1
maxSnapDelay=5.0
testID=1
clearAfterTest=false
;write 1000 Keys ending with odd number
testTitle=SnapSimplePost
testName=SnapTest
numSnaps=1
maxSnapDelay=3.0
testID=2
clearAfterTest=false
; save and shutdown
testTitle=SnapSimpleShutdown
testName=SaveAndKill
restartInfoLocation=simfdb/restartInfo.ini
testDuration=10.0
isRestoring=1

View File

@ -0,0 +1,6 @@
; verify all keys are even numbered
testTitle=SnapSimpleVerify
testName=SnapTest
numSnaps=1
maxSnapDelay=3.0
testID=3