Merge remote-tracking branch 'upstream/master' into peek-cursor-timeout-bug

This commit is contained in:
Alex Miller 2019-07-15 17:05:39 -07:00
commit 4cc60dc9b8
159 changed files with 3040 additions and 1674 deletions

View File

@@ -68,6 +68,10 @@ class ResultSet(object):
self.tester_results[name] = results
@staticmethod
def _min_tuple(t1, t2):
return t1 if fdb.tuple.compare(t1, t2) < 0 else t2
def check_for_errors(self):
if len(self.tester_results) == 1:
return (0, False)
@@ -97,7 +101,7 @@ class ResultSet(object):
# If these results aren't using sequence numbers, then we match two results based on whether they share the same key
else:
min_key = min([r.key(self.specification) for r in results.values()])
min_key = reduce(ResultSet._min_tuple, [r.key(self.specification) for r in results.values()])
results = {i: r for i, r in results.items() if Result.tuples_match(r.key(self.specification), min_key)}
# Increment the indices for those testers which produced a result in this iteration
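The change above swaps Python's builtin min() for reduce over _min_tuple; this matters on Python 3, where min() raises TypeError when the candidate keys are tuples with mixed element types. A minimal sketch of the pattern, using only the fdb.tuple.compare helper already shown above:

```python
from functools import reduce  # reduce is not a builtin on Python 3

import fdb
fdb.api_version(610)
import fdb.tuple

def _min_tuple(t1, t2):
    # fdb.tuple.compare orders tuples by their packed representation,
    # which is well-defined even across element types.
    return t1 if fdb.tuple.compare(t1, t2) < 0 else t2

keys = [(1,), (b'bytes',), (u'text',)]
min_key = reduce(_min_tuple, keys)  # works on both Python 2 and 3
# min(keys) would raise TypeError on Python 3 for these mixed-type tuples
```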

View File

@@ -85,7 +85,7 @@ void fdb_flow_test() {
openTraceFile(NetworkAddress(), 1000000, 1000000, ".");
systemMonitor();
uncancellable(recurring(&systemMonitor, 5.0, TaskFlushTrace));
uncancellable(recurring(&systemMonitor, 5.0, TaskPriority::FlushTrace));
Future<Void> t = _test();
@@ -179,7 +179,7 @@ namespace FDB {
}
void backToFutureCallback( FDBFuture* f, void* data ) {
g_network->onMainThread( Promise<Void>((SAV<Void>*)data), TaskDefaultOnMainThread ); // SOMEDAY: think about this priority
g_network->onMainThread( Promise<Void>((SAV<Void>*)data), TaskPriority::DefaultOnMainThread ); // SOMEDAY: think about this priority
}
// backToFuture<Type>( FDBFuture*, (FDBFuture* -> Type) ) -> Future<Type>

View File

@@ -1551,19 +1551,21 @@ struct UnitTestsFunc : InstructionFunc {
const uint64_t noRetryLimit = -1;
const uint64_t maxRetryDelay = 100;
const uint64_t sizeLimit = 100000;
const uint64_t maxFieldLength = 1000;
data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_LOCATION_CACHE_SIZE, Optional<StringRef>(StringRef((const uint8_t*)&locationCacheSize, 8)));
data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_MAX_WATCHES, Optional<StringRef>(StringRef((const uint8_t*)&maxWatches, 8)));
data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_DATACENTER_ID, Optional<StringRef>(LiteralStringRef("dc_id")));
data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_MACHINE_ID, Optional<StringRef>(LiteralStringRef("machine_id")));
data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_SNAPSHOT_RYW_ENABLE);
data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_SNAPSHOT_RYW_DISABLE);
data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_LOGGING_MAX_FIELD_LENGTH, Optional<StringRef>(StringRef((const uint8_t*)&maxFieldLength, 8)));
data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_TIMEOUT, Optional<StringRef>(StringRef((const uint8_t*)&timeout, 8)));
data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_TIMEOUT, Optional<StringRef>(StringRef((const uint8_t*)&noTimeout, 8)));
data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_MAX_RETRY_DELAY, Optional<StringRef>(StringRef((const uint8_t*)&maxRetryDelay, 8)));
data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_SIZE_LIMIT, Optional<StringRef>(StringRef((const uint8_t*)&sizeLimit, 8)));
data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_RETRY_LIMIT, Optional<StringRef>(StringRef((const uint8_t*)&retryLimit, 8)));
data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_RETRY_LIMIT, Optional<StringRef>(StringRef((const uint8_t*)&noRetryLimit, 8)));
data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_SNAPSHOT_RYW_ENABLE);
data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_SNAPSHOT_RYW_DISABLE);
state Reference<Transaction> tr = data->db->createTransaction();
tr->setOption(FDBTransactionOption::FDB_TR_OPTION_PRIORITY_SYSTEM_IMMEDIATE);
@@ -1574,6 +1576,7 @@ struct UnitTestsFunc : InstructionFunc {
tr->setOption(FDBTransactionOption::FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE);
tr->setOption(FDBTransactionOption::FDB_TR_OPTION_READ_SYSTEM_KEYS);
tr->setOption(FDBTransactionOption::FDB_TR_OPTION_ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOption::FDB_TR_OPTION_TRANSACTION_LOGGING_MAX_FIELD_LENGTH, Optional<StringRef>(StringRef((const uint8_t*)&maxFieldLength, 8)));
tr->setOption(FDBTransactionOption::FDB_TR_OPTION_TIMEOUT, Optional<StringRef>(StringRef((const uint8_t*)&timeout, 8)));
tr->setOption(FDBTransactionOption::FDB_TR_OPTION_RETRY_LIMIT, Optional<StringRef>(StringRef((const uint8_t*)&retryLimit, 8)));
tr->setOption(FDBTransactionOption::FDB_TR_OPTION_MAX_RETRY_DELAY, Optional<StringRef>(StringRef((const uint8_t*)&maxRetryDelay, 8)));

View File

@@ -793,13 +793,14 @@ func (sm *StackMachine) processInst(idx int, inst tuple.Tuple) {
db.Options().SetMaxWatches(10001)
db.Options().SetDatacenterId("dc_id")
db.Options().SetMachineId("machine_id")
db.Options().SetSnapshotRywEnable()
db.Options().SetSnapshotRywDisable()
db.Options().SetTransactionLoggingMaxFieldLength(1000)
db.Options().SetTransactionTimeout(100000)
db.Options().SetTransactionTimeout(0)
db.Options().SetTransactionMaxRetryDelay(100)
db.Options().SetTransactionRetryLimit(10)
db.Options().SetTransactionRetryLimit(-1)
db.Options().SetSnapshotRywEnable()
db.Options().SetSnapshotRywDisable()
if !fdb.IsAPIVersionSelected() {
log.Fatal("API version should be selected")
@@ -836,6 +837,7 @@ func (sm *StackMachine) processInst(idx int, inst tuple.Tuple) {
tr.Options().SetReadYourWritesDisable()
tr.Options().SetReadSystemKeys()
tr.Options().SetAccessSystemKeys()
tr.Options().SetTransactionLoggingMaxFieldLength(1000)
tr.Options().SetTimeout(60 * 1000)
tr.Options().SetRetryLimit(50)
tr.Options().SetMaxRetryDelay(100)

View File

@@ -228,6 +228,30 @@ func (o NetworkOptions) SetEnableSlowTaskProfiling() error {
return o.setOpt(71, nil)
}
// Enable client buggify - will make requests randomly fail (intended for client testing)
func (o NetworkOptions) SetClientBuggifyEnable() error {
return o.setOpt(80, nil)
}
// Disable client buggify
func (o NetworkOptions) SetClientBuggifyDisable() error {
return o.setOpt(81, nil)
}
// Set the probability of a CLIENT_BUGGIFY section being active for the current execution.
//
// Parameter: probability expressed as a percentage between 0 and 100
func (o NetworkOptions) SetClientBuggifySectionActivatedProbability(param int64) error {
return o.setOpt(82, int64ToBytes(param))
}
// Set the probability of an active CLIENT_BUGGIFY section being fired. A section will only fire if it was activated.
//
// Parameter: probability expressed as a percentage between 0 and 100
func (o NetworkOptions) SetClientBuggifySectionFiredProbability(param int64) error {
return o.setOpt(83, int64ToBytes(param))
}
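
For illustration, a hedged sketch of exercising the same client buggify knobs from the Python binding; the setter names below are assumed to be the generated Python equivalents of network options 80-83 and are not shown in this diff:

```python
import fdb
fdb.api_version(610)

# Network options must be set before the client network starts (fdb.open starts it).
fdb.options.set_client_buggify_enable()                           # assumed setter (option 80)
fdb.options.set_client_buggify_section_activated_probability(25)  # percent (option 82)
fdb.options.set_client_buggify_section_fired_probability(25)      # percent (option 83)

db = fdb.open()  # client requests may now fail at random; for testing only
```
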
// Set the size of the client location cache. Raising this value can boost performance in very large databases where clients access data in a near-random pattern. Defaults to 100000.
//
// Parameter: Max location cache entries
@@ -256,6 +280,23 @@ func (o DatabaseOptions) SetDatacenterId(param string) error {
return o.setOpt(22, []byte(param))
}
// Snapshot read operations will see the results of writes done in the same transaction. This is the default behavior.
func (o DatabaseOptions) SetSnapshotRywEnable() error {
return o.setOpt(26, nil)
}
// Snapshot read operations will not see the results of writes done in the same transaction. This was the default behavior prior to API version 300.
func (o DatabaseOptions) SetSnapshotRywDisable() error {
return o.setOpt(27, nil)
}
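
What these two options mean in practice, as a minimal Python sketch (snapshot RYW is enabled by default at API version 300 and above):

```python
import fdb
fdb.api_version(610)
db = fdb.open()

@fdb.transactional
def snapshot_ryw_demo(tr):
    tr[b'k'] = b'v'
    # With snapshot RYW enabled (the default), a snapshot read observes the
    # transaction's own uncommitted write; with it disabled, it would not.
    assert tr.snapshot[b'k'] == b'v'

snapshot_ryw_demo(db)
```
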
// Sets the maximum escaped length of key and value fields to be logged to the trace file via the LOG_TRANSACTION option. This sets the ``transaction_logging_max_field_length`` option of each transaction created by this database. See the transaction option description for more information.
//
// Parameter: Maximum length of escaped key and value fields.
func (o DatabaseOptions) SetTransactionLoggingMaxFieldLength(param int64) error {
return o.setOpt(405, int64ToBytes(param))
}
// Set a timeout in milliseconds which, when elapsed, will cause each transaction automatically to be cancelled. This sets the ``timeout`` option of each transaction created by this database. See the transaction option description for more information. Using this option requires that the API version is 610 or higher.
//
// Parameter: value in milliseconds of timeout
@@ -277,23 +318,13 @@ func (o DatabaseOptions) SetTransactionMaxRetryDelay(param int64) error {
return o.setOpt(502, int64ToBytes(param))
}
// Set the maximum transaction size which, if exceeded, will cause the transaction to be cancelled. Default to 10,000,000 bytes.
// Set the maximum transaction size in bytes. This sets the ``size_limit`` option on each transaction created by this database. See the transaction option description for more information.
//
// Parameter: value in bytes
func (o DatabaseOptions) SetTransactionSizeLimit(param int64) error {
return o.setOpt(503, int64ToBytes(param))
}
// Snapshot read operations will see the results of writes done in the same transaction. This is the default behavior.
func (o DatabaseOptions) SetSnapshotRywEnable() error {
return o.setOpt(26, nil)
}
// Snapshot read operations will not see the results of writes done in the same transaction. This was the default behavior prior to API version 300.
func (o DatabaseOptions) SetSnapshotRywDisable() error {
return o.setOpt(27, nil)
}
// The transaction, if not self-conflicting, may be committed a second time after commit succeeds, in the event of a fault
func (o TransactionOptions) SetCausalWriteRisky() error {
return o.setOpt(10, nil)
@@ -388,6 +419,13 @@ func (o TransactionOptions) SetLogTransaction() error {
return o.setOpt(404, nil)
}
// Sets the maximum escaped length of key and value fields to be logged to the trace file via the LOG_TRANSACTION option, after which the field will be truncated. A negative value disables truncation.
//
// Parameter: Maximum length of escaped key and value fields.
func (o TransactionOptions) SetTransactionLoggingMaxFieldLength(param int64) error {
return o.setOpt(405, int64ToBytes(param))
}
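
A hedged sketch of how this option combines with transaction debug logging, written against the Python binding; set_debug_transaction_identifier and set_log_transaction are assumed to be the generated setters for transaction options 403 and 404:

```python
import fdb
fdb.api_version(610)
db = fdb.open()

tr = db.create_transaction()
tr.options.set_debug_transaction_identifier(b'example-txn')  # assumed setter (option 403)
tr.options.set_log_transaction()                             # assumed setter (option 404)
tr.options.set_transaction_logging_max_field_length(-1)      # negative disables truncation
tr[b'some-key'] = b'some-value'
tr.commit().wait()
# Keys and values in the TransactionTrace_* trace events are now logged untruncated.
```
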
// Set a timeout in milliseconds which, when elapsed, will cause the transaction automatically to be cancelled. Valid parameter values are ``[0, INT_MAX]``. If set to 0, will disable all timeouts. All pending and any future uses of the transaction will throw an exception. The transaction can be used again after it is reset. Prior to API version 610, like all other transaction options, the timeout must be reset after a call to ``onError``. If the API version is 610 or greater, the timeout is not reset after an ``onError`` call. This allows the user to specify a longer timeout on specific transactions than the default timeout specified through the ``transaction_timeout`` database option without the shorter database timeout cancelling transactions that encounter a retryable error. Note that at all API versions, it is safe and legal to set the timeout each time the transaction begins, so most code written assuming the older behavior can be upgraded to the newer behavior without requiring any modification, and the caller is not required to implement special logic in retry loops to only conditionally set this option.
//
// Parameter: value in milliseconds of timeout
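
The upgrade note above reduces to a simple pattern: it is safe at every API version to (re)set the timeout at the top of each retry attempt. A minimal Python sketch:

```python
import fdb
fdb.api_version(610)
db = fdb.open()

tr = db.create_transaction()
while True:
    try:
        tr.options.set_timeout(5000)  # milliseconds; re-armed on every attempt
        tr[b'key'] = b'value'
        tr.commit().wait()
        break
    except fdb.FDBError as e:
        tr.on_error(e).wait()  # resets the transaction; before API 610 this also reset the timeout
```
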
@@ -409,7 +447,7 @@ func (o TransactionOptions) SetMaxRetryDelay(param int64) error {
return o.setOpt(502, int64ToBytes(param))
}
// Set the maximum transaction size which, if exceeded, will cause the transaction to be cancelled. Valid parameter values are ``[32, 10,000,000]``.
// Set the transaction size limit in bytes. The size is calculated by combining the sizes of all keys and values written or mutated, all key ranges cleared, and all read and write conflict ranges. (In other words, it includes the total size of all data included in the request to the cluster to commit the transaction.) Large transactions can cause performance problems on FoundationDB clusters, so setting this limit to a smaller value than the default can help prevent the client from accidentally degrading the cluster's performance. This value must be at least 32 and cannot be set to higher than 10,000,000, the default transaction size limit.
//
// Parameter: value in bytes
func (o TransactionOptions) SetSizeLimit(param int64) error {

View File

@@ -481,13 +481,14 @@ public class AsyncStackTester {
db.options().setMaxWatches(10001);
db.options().setDatacenterId("dc_id");
db.options().setMachineId("machine_id");
db.options().setSnapshotRywEnable();
db.options().setSnapshotRywDisable();
db.options().setTransactionLoggingMaxFieldLength(1000);
db.options().setTransactionTimeout(100000);
db.options().setTransactionTimeout(0);
db.options().setTransactionMaxRetryDelay(100);
db.options().setTransactionRetryLimit(10);
db.options().setTransactionRetryLimit(-1);
db.options().setSnapshotRywEnable();
db.options().setSnapshotRywDisable();
tr.options().setPrioritySystemImmediate();
tr.options().setPriorityBatch();
@@ -496,6 +497,7 @@ public class AsyncStackTester {
tr.options().setReadYourWritesDisable();
tr.options().setReadSystemKeys();
tr.options().setAccessSystemKeys();
tr.options().setTransactionLoggingMaxFieldLength(1000);
tr.options().setTimeout(60*1000);
tr.options().setRetryLimit(50);
tr.options().setMaxRetryDelay(100);

View File

@@ -434,13 +434,14 @@ public class StackTester {
db.options().setMaxWatches(10001);
db.options().setDatacenterId("dc_id");
db.options().setMachineId("machine_id");
db.options().setSnapshotRywEnable();
db.options().setSnapshotRywDisable();
db.options().setTransactionLoggingMaxFieldLength(1000);
db.options().setTransactionTimeout(100000);
db.options().setTransactionTimeout(0);
db.options().setTransactionMaxRetryDelay(100);
db.options().setTransactionRetryLimit(10);
db.options().setTransactionRetryLimit(-1);
db.options().setSnapshotRywEnable();
db.options().setSnapshotRywDisable();
tr.options().setPrioritySystemImmediate();
tr.options().setPriorityBatch();
@@ -449,6 +450,7 @@ public class StackTester {
tr.options().setReadYourWritesDisable();
tr.options().setReadSystemKeys();
tr.options().setAccessSystemKeys();
tr.options().setTransactionLoggingMaxFieldLength(1000);
tr.options().setTimeout(60*1000);
tr.options().setRetryLimit(50);
tr.options().setMaxRetryDelay(100);

View File

@@ -1,64 +0,0 @@
#!/usr/bin/python
#
# size_limit.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2019 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import fdb
import sys
fdb.api_version(610)
@fdb.transactional
def setValue(tr, key, value):
tr[key] = value
@fdb.transactional
def setValueWithLimit(tr, key, value, limit):
tr.options.set_size_limit(limit)
tr[key] = value
def run(clusterFile):
db = fdb.open(clusterFile)
db.options.set_transaction_timeout(2000) # 2 seconds
db.options.set_transaction_retry_limit(3)
value = 'a' * 1024
setValue(db, 't1', value)
assert(value == db['t1'])
try:
db.options.set_transaction_size_limit(1000)
setValue(db, 't2', value)
assert(False) # not reached
except fdb.impl.FDBError as e:
assert(e.code == 2101) # Transaction exceeds byte limit (2101)
# Per transaction option overrides database option
db.options.set_transaction_size_limit(1000000)
try:
setValueWithLimit(db, 't3', value, 1000)
assert(False) # not reached
except fdb.impl.FDBError as e:
assert(e.code == 2101) # Transaction exceeds byte limit (2101)
# Expect a cluster file as input. This test will write to the FDB cluster, so
# be aware of potential side effects.
if __name__ == '__main__':
clusterFile = sys.argv[1]
run(clusterFile)

View File

@@ -0,0 +1,76 @@
#!/usr/bin/python
#
# size_limit_tests.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2019 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import fdb
import sys
if __name__ == '__main__':
fdb.api_version(610)
@fdb.transactional
def setValue(tr, key, value):
tr[key] = value
@fdb.transactional
def setValueWithLimit(tr, key, value, limit):
tr.options.set_size_limit(limit)
tr[key] = value
def test_size_limit_option(db):
db.options.set_transaction_timeout(2000) # 2 seconds
db.options.set_transaction_retry_limit(3)
value = b'a' * 1024
setValue(db, b't1', value)
assert(value == db[b't1'])
try:
db.options.set_transaction_size_limit(1000)
setValue(db, b't2', value)
assert(False) # not reached
except fdb.FDBError as e:
assert(e.code == 2101) # Transaction exceeds byte limit (2101)
# Per transaction option overrides database option
db.options.set_transaction_size_limit(1000000)
try:
setValueWithLimit(db, b't3', value, 1000)
assert(False) # not reached
except fdb.FDBError as e:
assert(e.code == 2101) # Transaction exceeds byte limit (2101)
# DB default survives on_error reset
db.options.set_transaction_size_limit(1000)
tr = db.create_transaction()
try:
tr[b't4'] = b'bar'
tr.on_error(fdb.FDBError(1007)).wait()
setValue(tr, b't4', value)
tr.commit().wait()
assert(False) # not reached
except fdb.FDBError as e:
assert(e.code == 2101) # Transaction exceeds byte limit (2101)
# Expect a cluster file as input. This test will write to the FDB cluster, so
# be aware of potential side effects.
if __name__ == '__main__':
clusterFile = sys.argv[1]
db = fdb.open(clusterFile)
test_size_limit_option(db)

View File

@@ -48,6 +48,8 @@ from cancellation_timeout_tests import test_retry_limits
from cancellation_timeout_tests import test_db_retry_limits
from cancellation_timeout_tests import test_combinations
from size_limit_tests import test_size_limit_option
random.seed(0)
if len(sys.argv) == 4:
@@ -126,9 +128,13 @@ class Instruction:
def test_db_options(db):
db.options.set_location_cache_size(100001)
db.options.set_max_watches(100001)
db.options.set_datacenter_id("dc_id")
db.options.set_machine_id("machine_id")
db.options.set_snapshot_ryw_enable()
db.options.set_snapshot_ryw_disable()
db.options.set_transaction_logging_max_field_length(1000)
db.options.set_transaction_timeout(100000)
db.options.set_transaction_timeout(0)
db.options.set_transaction_timeout(0)
@@ -136,8 +142,6 @@ def test_db_options(db):
db.options.set_transaction_size_limit(100000)
db.options.set_transaction_retry_limit(10)
db.options.set_transaction_retry_limit(-1)
db.options.set_snapshot_ryw_enable()
db.options.set_snapshot_ryw_disable()
@fdb.transactional
@@ -149,6 +153,7 @@ def test_options(tr):
tr.options.set_read_your_writes_disable()
tr.options.set_read_system_keys()
tr.options.set_access_system_keys()
tr.options.set_transaction_logging_max_field_length(1000)
tr.options.set_timeout(60 * 1000)
tr.options.set_retry_limit(50)
tr.options.set_max_retry_delay(100)
@@ -543,8 +548,6 @@ class Tester:
inst.push(b"WAITED_FOR_EMPTY")
elif inst.op == six.u("UNIT_TESTS"):
try:
db.options.set_location_cache_size(100001)
test_db_options(db)
test_options(db)
test_watches(db)
@@ -557,6 +560,8 @@ class Tester:
test_locality(db)
test_predicates()
test_size_limit_option(db)
except fdb.FDBError as e:
print("Unit tests failed: %s" % e.description)
traceback.print_exc()

View File

@@ -456,14 +456,15 @@ class Tester
@db.options.set_max_watches(10001)
@db.options.set_datacenter_id("dc_id")
@db.options.set_machine_id("machine_id")
@db.options.set_snapshot_ryw_enable()
@db.options.set_snapshot_ryw_disable()
@db.options.set_transaction_logging_max_field_length(1000)
@db.options.set_transaction_timeout(100000)
@db.options.set_transaction_timeout(0)
@db.options.set_transaction_max_retry_delay(100)
@db.options.set_transaction_size_limit(100000)
@db.options.set_transaction_retry_limit(10)
@db.options.set_transaction_retry_limit(-1)
@db.options.set_snapshot_ryw_enable()
@db.options.set_snapshot_ryw_disable()
@db.transact do |tr|
tr.options.set_priority_system_immediate
@@ -473,6 +474,7 @@ class Tester
tr.options.set_read_your_writes_disable
tr.options.set_read_system_keys
tr.options.set_access_system_keys
tr.options.set_transaction_logging_max_field_length(1000)
tr.options.set_timeout(60*1000)
tr.options.set_retry_limit(50)
tr.options.set_max_retry_delay(100)

View File

@@ -8,6 +8,12 @@ set(FDB_RELEASE OFF CACHE BOOL "This is a building of a final release")
set(USE_LD "LD" CACHE STRING "The linker to use for building: can be LD (system default, default choice), GOLD, or LLD")
set(USE_LIBCXX OFF CACHE BOOL "Use libc++")
set(USE_CCACHE OFF CACHE BOOL "Use ccache for compilation if available")
set(RELATIVE_DEBUG_PATHS OFF CACHE BOOL "Use relative file paths in debug info")
set(rel_debug_paths OFF)
if(RELATIVE_DEBUG_PATHS)
set(rel_debug_paths ON)
endif()
if(USE_GPERFTOOLS)
find_package(Gperftools REQUIRED)
@@ -103,6 +109,10 @@ else()
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=lld -Wl,--disable-new-dtags")
endif()
if(rel_debug_paths)
add_compile_options("-fdebug-prefix-map=${CMAKE_SOURCE_DIR}=." "-fdebug-prefix-map=${CMAKE_BINARY_DIR}=.")
endif()
# we always compile with debug symbols. CPack will strip them out
# and create a debuginfo rpm
add_compile_options(-ggdb -fno-omit-frame-pointer)

View File

@@ -49,6 +49,8 @@
.. |max-retry-delay-database-option| replace:: FIXME
.. |transaction-size-limit-database-option| replace:: FIXME
.. |timeout-database-option| replace:: FIXME
.. |transaction-logging-max-field-length-database-option| replace:: FIXME
.. |transaction-logging-max-field-length-transaction-option| replace:: FIXME
.. include:: api-common.rst.inc

View File

@@ -326,6 +326,10 @@
If this option has been set more times with this database than the disable option, snapshot reads will *not* see the effects of prior writes in the same transaction. Disabling this option is equivalent to calling |snapshot-ryw-disable-transaction-option| on each transaction created by this database.
.. |option-db-tr-transaction-logging-max-field-length-blurb| replace::
Sets the maximum escaped length of key and value fields to be logged to the trace file via the LOG_TRANSACTION option. This is equivalent to calling |transaction-logging-max-field-length-transaction-option| on each transaction created by this database.
.. |transaction-options-blurb| replace::
Transaction options alter the behavior of FoundationDB transactions. FoundationDB defaults to extremely safe transaction behavior, and we have worked hard to make the performance excellent with the default setting, so you should not often need to use transaction options.
@@ -399,7 +403,7 @@
.. |option-set-size-limit-blurb| replace::
Set the maximum transaction size limit in bytes. The size is calculated by combining the sizes of all keys and values written or mutated, all key ranges cleared, and all read and write conflict ranges. (In other words, it includes the total size of all data included in the request to the cluster to commit the transaction.) Large transactions can cause performance problems on FoundationDB clusters, so setting this limit to a smaller value than the default can help prevent the client from accidentally degrading the cluster's performance. This value must be at least 32 and cannot be set to higher than 10,000,000, the default transaction size limit. The value set by this limit will persist across transaction resets.
Set the transaction size limit in bytes. The size is calculated by combining the sizes of all keys and values written or mutated, all key ranges cleared, and all read and write conflict ranges. (In other words, it includes the total size of all data included in the request to the cluster to commit the transaction.) Large transactions can cause performance problems on FoundationDB clusters, so setting this limit to a smaller value than the default can help prevent the client from accidentally degrading the cluster's performance. This value must be at least 32 and cannot be set to higher than 10,000,000, the default transaction size limit.
.. |option-set-timeout-blurb1| replace::
@@ -411,7 +415,7 @@
.. |option-set-timeout-blurb3| replace::
Prior to API version 610, like all other transaction options, a timeout must be reset after a call to |on-error-func|. Note that resetting this option resets only the timeout *duration*, not the starting point from which the time is measured. If the API version is 610 or newer, then the timeout is not reset. This allows the user to specify a timeout for specific transactions that is longer than the timeout specified by |timeout-database-option|. Note that at all API versions, it is safe and legal to call this option after each call to |on-error-func|, so most code written assuming the older behavior can be upgraded without requiring any modification. This also means that there is no need to introduce logic to conditionally set this option within retry loops. One can set the default timeout for all transactions by calling |timeout-database-option|.
Prior to API version 610, like all other transaction options, a timeout must be reset after a call to |on-error-func|. Note that resetting this option resets only the timeout *duration*, not the starting point from which the time is measured. If the API version is 610 or newer, then the timeout is not reset. This allows the user to specify a timeout for specific transactions that is longer than the timeout specified by |timeout-database-option|. Note that at all API versions, it is safe and legal to call this option after each call to |on-error-func|, so most code written assuming the older behavior can be upgraded without requiring any modification. This also means that there is no need to introduce logic to conditionally set this option within retry loops. One can set the default timeout for all transactions by calling |timeout-database-option|.
.. |option-next-write-no-write-conflict-range-blurb| replace::
@@ -421,6 +425,10 @@
Care needs to be taken when using this option on a transaction that is shared between multiple threads. When setting this option, write conflict ranges will be disabled on the next write operation, regardless of what thread it is on.
.. |option-set-transaction-logging-max-field-length-blurb| replace::
Sets the maximum escaped length of key and value fields to be logged to the trace file via the LOG_TRANSACTION option, after which the field will be truncated. A negative value disables truncation. One can set the default max field length for all transactions by calling |transaction-logging-max-field-length-database-option|.
.. |future-blurb1| replace::
Many FoundationDB API functions return "future" objects. A brief overview of futures is included in the :doc:`class scheduling tutorial <class-scheduling>`. Most future objects behave just like a normal object, but block when you use them for the first time if the asynchronous function which returned the future has not yet completed its action. A future object is considered ready when either a value is available, or when an error has occurred.

View File

@@ -25,6 +25,7 @@
.. |timeout-database-option| replace:: :func:`Database.options.set_transaction_timeout`
.. |max-retry-delay-database-option| replace:: :func:`Database.options.set_transaction_max_retry_delay`
.. |transaction-size-limit-database-option| replace:: :func:`Database.options.set_transaction_size_limit`
.. |transaction-logging-max-field-length-database-option| replace:: :func:`Database.options.set_transaction_logging_max_field_length`
.. |snapshot-ryw-enable-database-option| replace:: :func:`Database.options.set_snapshot_ryw_enable`
.. |snapshot-ryw-disable-database-option| replace:: :func:`Database.options.set_snapshot_ryw_disable`
.. |future-type-string| replace:: a :ref:`future <api-python-future>`
@@ -35,6 +36,7 @@
.. |size-limit-transaction-option| replace:: :func:`Transaction.options.set_size_limit`
.. |snapshot-ryw-enable-transaction-option| replace:: :func:`Transaction.options.set_snapshot_ryw_enable`
.. |snapshot-ryw-disable-transaction-option| replace:: :func:`Transaction.options.set_snapshot_ryw_disable`
.. |transaction-logging-max-field-length-transaction-option| replace:: :func:`Transaction.options.set_transaction_logging_max_field_length`
.. |lazy-iterator-object| replace:: generator
.. |key-meth| replace:: :meth:`Subspace.key`
.. |directory-subspace| replace:: :ref:`DirectorySubspace <api-python-directory-subspace>`
@@ -384,6 +386,10 @@ Database options
|option-db-tr-size-limit-blurb|
.. method:: Database.options.set_transaction_logging_max_field_length(size_limit)
|option-db-tr-transaction-logging-max-field-length-blurb|
.. method:: Database.options.set_snapshot_ryw_enable()
|option-db-snapshot-ryw-enable-blurb|
@@ -855,6 +861,10 @@ Transaction options
|option-set-timeout-blurb3|
.. method:: Transaction.options.set_transaction_logging_max_field_length(size_limit)
|option-set-transaction-logging-max-field-length-blurb|
.. _api-python-future:
Future objects

View File

@@ -25,6 +25,7 @@
.. |transaction-size-limit-database-option| replace:: :func:`Database.options.set_transaction_size_limit`
.. |snapshot-ryw-enable-database-option| replace:: :meth:`Database.options.set_snapshot_ryw_enable`
.. |snapshot-ryw-disable-database-option| replace:: :meth:`Database.options.set_snapshot_ryw_disable`
.. |transaction-logging-max-field-length-database-option| replace:: :meth:`Database.options.set_transaction_logging_max_field_length`
.. |future-type-string| replace:: a :class:`Future`
.. |read-your-writes-disable-option| replace:: :meth:`Transaction.options.set_read_your_writes_disable`
.. |retry-limit-transaction-option| replace:: :meth:`Transaction.options.set_retry_limit`
@@ -33,6 +34,7 @@
.. |size-limit-transaction-option| replace:: :meth:`Transaction.options.set_size_limit`
.. |snapshot-ryw-enable-transaction-option| replace:: :meth:`Transaction.options.set_snapshot_ryw_enable`
.. |snapshot-ryw-disable-transaction-option| replace:: :meth:`Transaction.options.set_snapshot_ryw_disable`
.. |transaction-logging-max-field-length-transaction-option| replace:: :meth:`Transaction.options.set_transaction_logging_max_field_length`
.. |lazy-iterator-object| replace:: :class:`Enumerator`
.. |key-meth| replace:: :meth:`Subspace.key`
.. |directory-subspace| replace:: :class:`DirectorySubspace`
@@ -380,6 +382,10 @@ Database options
|option-db-tr-size-limit-blurb|
.. method:: Database.options.set_transaction_logging_max_field_length(size_limit) -> nil
|option-db-tr-transaction-logging-max-field-length-blurb|
.. method:: Database.options.set_snapshot_ryw_enable() -> nil
|option-db-snapshot-ryw-enable-blurb|
@@ -797,6 +803,10 @@ Transaction options
|option-set-timeout-blurb3|
.. method:: Transaction.options.set_transaction_logging_max_field_length(size_limit) -> nil
|option-set-transaction-logging-max-field-length-blurb|
.. _transact:
The transact method

View File

@@ -316,19 +316,19 @@ Single datacenter modes
+==============================+==+=================+=================+================+
| Best for | | 1-2 machines | 3-4 machines | 5+ machines |
+------------------------------+--+-----------------+-----------------+----------------+
| Replication | | 1 copy | 2 copy | 3 copy |
| Total Replicas | | 1 copy | 2 copies | 3 copies |
+------------------------------+--+-----------------+-----------------+----------------+
| # live machines | | | | |
| Live machines required | | | | |
| to make progress | | 1 | 2 | 3 |
+------------------------------+--+-----------------+-----------------+----------------+
| Minimum # of machines | | | | |
| Required machines | | | | |
| for fault tolerance | | impossible | 3 | 4 |
+------------------------------+--+-----------------+-----------------+----------------+
| Ideal # of | | | | |
| Ideal number of | | | | |
| coordination servers | | 1 | 3 | 5 |
+------------------------------+--+-----------------+-----------------+----------------+
| # simultaneous failures | | | | |
| after which data may be lost | | any machine | 2+ machines | 3+ machines |
| Simultaneous failures | | | | |
| after which data may be lost | | any process | 2+ machines | 3+ machines |
+------------------------------+--+-----------------+-----------------+----------------+
In the three single datacenter redundancy modes, FoundationDB replicates data across the required number of machines in the cluster, but without aiming for datacenter redundancy. Although machines may be placed in more than one datacenter, the cluster will not be tolerant of datacenter-correlated failures.

View File

@@ -51,6 +51,8 @@
.. |max-retry-delay-database-option| replace:: FIXME
.. |transaction-size-limit-database-option| replace:: FIXME
.. |timeout-database-option| replace:: FIXME
.. |transaction-logging-max-field-length-transaction-option| replace:: FIXME
.. |transaction-logging-max-field-length-database-option| replace:: FIXME
.. include:: api-common.rst.inc

View File

@@ -51,6 +51,8 @@
.. |max-retry-delay-database-option| replace:: FIXME
.. |transaction-size-limit-database-option| replace:: FIXME
.. |timeout-database-option| replace:: FIXME
.. |transaction-logging-max-field-length-transaction-option| replace:: FIXME
.. |transaction-logging-max-field-length-database-option| replace:: FIXME
.. include:: api-common.rst.inc

View File

@@ -10,38 +10,38 @@ macOS
The macOS installation package is supported on macOS 10.7+. It includes the client and (optionally) the server.
* `FoundationDB-6.1.10.pkg <https://www.foundationdb.org/downloads/6.1.10/macOS/installers/FoundationDB-6.1.10.pkg>`_
* `FoundationDB-6.1.11.pkg <https://www.foundationdb.org/downloads/6.1.11/macOS/installers/FoundationDB-6.1.11.pkg>`_
Ubuntu
------
The Ubuntu packages are supported on 64-bit Ubuntu 12.04+, but beware of the Linux kernel bug in Ubuntu 12.x.
* `foundationdb-clients-6.1.10-1_amd64.deb <https://www.foundationdb.org/downloads/6.1.10/ubuntu/installers/foundationdb-clients_6.1.10-1_amd64.deb>`_
* `foundationdb-server-6.1.10-1_amd64.deb <https://www.foundationdb.org/downloads/6.1.10/ubuntu/installers/foundationdb-server_6.1.10-1_amd64.deb>`_ (depends on the clients package)
* `foundationdb-clients-6.1.11-1_amd64.deb <https://www.foundationdb.org/downloads/6.1.11/ubuntu/installers/foundationdb-clients_6.1.11-1_amd64.deb>`_
* `foundationdb-server-6.1.11-1_amd64.deb <https://www.foundationdb.org/downloads/6.1.11/ubuntu/installers/foundationdb-server_6.1.11-1_amd64.deb>`_ (depends on the clients package)
RHEL/CentOS EL6
---------------
The RHEL/CentOS EL6 packages are supported on 64-bit RHEL/CentOS 6.x.
* `foundationdb-clients-6.1.10-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.10/rhel6/installers/foundationdb-clients-6.1.10-1.el6.x86_64.rpm>`_
* `foundationdb-server-6.1.10-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.10/rhel6/installers/foundationdb-server-6.1.10-1.el6.x86_64.rpm>`_ (depends on the clients package)
* `foundationdb-clients-6.1.11-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.11/rhel6/installers/foundationdb-clients-6.1.11-1.el6.x86_64.rpm>`_
* `foundationdb-server-6.1.11-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.11/rhel6/installers/foundationdb-server-6.1.11-1.el6.x86_64.rpm>`_ (depends on the clients package)
RHEL/CentOS EL7
---------------
The RHEL/CentOS EL7 packages are supported on 64-bit RHEL/CentOS 7.x.
* `foundationdb-clients-6.1.10-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.10/rhel7/installers/foundationdb-clients-6.1.10-1.el7.x86_64.rpm>`_
* `foundationdb-server-6.1.10-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.10/rhel7/installers/foundationdb-server-6.1.10-1.el7.x86_64.rpm>`_ (depends on the clients package)
* `foundationdb-clients-6.1.11-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.11/rhel7/installers/foundationdb-clients-6.1.11-1.el7.x86_64.rpm>`_
* `foundationdb-server-6.1.11-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.1.11/rhel7/installers/foundationdb-server-6.1.11-1.el7.x86_64.rpm>`_ (depends on the clients package)
Windows
-------
The Windows installer is supported on 64-bit Windows XP and later. It includes the client and (optionally) the server.
* `foundationdb-6.1.10-x64.msi <https://www.foundationdb.org/downloads/6.1.10/windows/installers/foundationdb-6.1.10-x64.msi>`_
* `foundationdb-6.1.11-x64.msi <https://www.foundationdb.org/downloads/6.1.11/windows/installers/foundationdb-6.1.11-x64.msi>`_
API Language Bindings
=====================
@@ -58,18 +58,18 @@ On macOS and Windows, the FoundationDB Python API bindings are installed as part
If you need to use the FoundationDB Python API from other Python installations or paths, download the Python package:
* `foundationdb-6.1.10.tar.gz <https://www.foundationdb.org/downloads/6.1.10/bindings/python/foundationdb-6.1.10.tar.gz>`_
* `foundationdb-6.1.11.tar.gz <https://www.foundationdb.org/downloads/6.1.11/bindings/python/foundationdb-6.1.11.tar.gz>`_
Ruby 1.9.3/2.0.0+
-----------------
* `fdb-6.1.10.gem <https://www.foundationdb.org/downloads/6.1.10/bindings/ruby/fdb-6.1.10.gem>`_
* `fdb-6.1.11.gem <https://www.foundationdb.org/downloads/6.1.11/bindings/ruby/fdb-6.1.11.gem>`_
Java 8+
-------
* `fdb-java-6.1.10.jar <https://www.foundationdb.org/downloads/6.1.10/bindings/java/fdb-java-6.1.10.jar>`_
* `fdb-java-6.1.10-javadoc.jar <https://www.foundationdb.org/downloads/6.1.10/bindings/java/fdb-java-6.1.10-javadoc.jar>`_
* `fdb-java-6.1.11.jar <https://www.foundationdb.org/downloads/6.1.11/bindings/java/fdb-java-6.1.11.jar>`_
* `fdb-java-6.1.11-javadoc.jar <https://www.foundationdb.org/downloads/6.1.11/bindings/java/fdb-java-6.1.11-javadoc.jar>`_
Go 1.11+
--------

View File

@@ -36,6 +36,7 @@
"roles":[
{
"query_queue_max":0,
"local_rate":0,
"input_bytes":{
"hz":0.0,
"counter":0,
@@ -187,7 +188,8 @@
"megabits_received":{
"hz":0.0
}
}
},
"run_loop_busy":0.2 // fraction of time the run loop was busy
}
},
"old_logs":[
@@ -229,7 +231,8 @@
"storage_server_min_free_space",
"storage_server_min_free_space_ratio",
"log_server_min_free_space",
"log_server_min_free_space_ratio"
"log_server_min_free_space_ratio",
"storage_server_durability_lag"
]
},
"description":"The database is not being saturated by the workload."
@@ -248,7 +251,8 @@
"storage_server_min_free_space",
"storage_server_min_free_space_ratio",
"log_server_min_free_space",
"log_server_min_free_space_ratio"
"log_server_min_free_space_ratio",
"storage_server_durability_lag"
]
},
"description":"The database is not being saturated by the workload."
@@ -264,7 +268,10 @@
},
"incompatible_connections":[
],
"datacenter_version_difference":0,
"datacenter_lag":{
"seconds":1.0,
"versions":1000000
},
"degraded_processes":0,
"database_available":true,
"database_locked":false,
@@ -293,6 +300,10 @@
}
]
},
"page_cache":{
"log_hit_rate":0.5,
"storage_hit_rate":0.5
},
"messages":[
{
"reasons":[
@@ -407,6 +418,21 @@
"counter":0,
"roughness":0.0
},
"started_immediate_priority":{
"hz":0.0,
"counter":0,
"roughness":0.0
},
"started_default_priority":{
"hz":0.0,
"counter":0,
"roughness":0.0
},
"started_batch_priority":{
"hz":0.0,
"counter":0,
"roughness":0.0
},
"conflicted":{
"hz":0.0,
"counter":0,

View File

@@ -128,4 +128,5 @@ min_free_space Running out of space (approaching 100MB limi
min_free_space_ratio Running out of space (approaching 5% limit).
log_server_min_free_space Log server running out of space (approaching 100MB limit).
log_server_min_free_space_ratio Log server running out of space (approaching 5% limit).
storage_server_durability_lag Storage server durable version falling behind.
=================================== ====================================================

View File

@@ -2,6 +2,14 @@
Release Notes
#############
6.1.11
======
Fixes
-----
* Machines which were added to a cluster immediately after the cluster was upgraded to 6.1 would not be given data. `(PR #1764) <https://github.com/apple/foundationdb/pull/1764>`_
6.1.10
======
@@ -174,4 +182,4 @@ Earlier release notes
* :doc:`Beta 2 (API Version 22) </old-release-notes/release-notes-022>`
* :doc:`Beta 1 (API Version 21) </old-release-notes/release-notes-021>`
* :doc:`Alpha 6 (API Version 16) </old-release-notes/release-notes-016>`
* :doc:`Alpha 5 (API Version 14) </old-release-notes/release-notes-014>`
* :doc:`Alpha 5 (API Version 14) </old-release-notes/release-notes-014>`

View File

@@ -14,16 +14,23 @@ Performance
Fixes
-----
* During an upgrade, the multi-version client now persists database default options and transaction options that aren't reset on retry (e.g. transaction timeout). In order for these options to function correctly during an upgrade, a 6.2 or later client should be used as the primary client. `(PR #1767) <https://github.com/apple/foundationdb/pull/1767>`_.
* If a cluster is upgraded during an ``onError`` call, the cluster could return a ``cluster_version_changed`` error. `(PR #1734) <https://github.com/apple/foundationdb/pull/1734>`_.
Status
------
* Added ``run_loop_busy`` to the ``processes`` section to record the fraction of time the run loop is busy. `(PR #1760) <https://github.com/apple/foundationdb/pull/1760>`_.
* Added ``cluster.page_cache`` section to status. In this section, added two new statistics ``storage_hit_rate`` and ``log_hit_rate`` that indicate the fraction of recent page reads that were served by cache. `(PR #1823) <https://github.com/apple/foundationdb/pull/1823>`_.
* Added transaction start counts by priority to ``cluster.workload.transactions``. The new counters are named ``started_immediate_priority``, ``started_default_priority``, and ``started_batch_priority``. `(PR #1836) <https://github.com/apple/foundationdb/pull/1836>`_.
* Removed ``cluster.datacenter_version_difference`` and replaced it with ``cluster.datacenter_lag``, which has subfields ``versions`` and ``seconds``. `(PR #1800) <https://github.com/apple/foundationdb/pull/1800>`_.
Bindings
--------
* Go: The Go bindings now require Go version 1.11 or later.
* Go: Fix issue with finalizers running too early that could lead to undefined behavior. `(PR #1451) <https://github.com/apple/foundationdb/pull/1451>`_.
* Added transaction option to control the field length of keys and values in debug transaction logging in order to avoid truncation. `(PR #1844) <https://github.com/apple/foundationdb/pull/1844>`_.
Other Changes
-------------

View File

@@ -3194,11 +3194,14 @@ int main(int argc, char* argv[]) {
}
TraceEvent("ProgramStart")
.setMaxEventLength(12000)
.detail("SourceVersion", getHGVersion())
.detail("Version", FDB_VT_VERSION )
.detail("PackageName", FDB_VT_PACKAGE_NAME)
.detailf("ActualTime", "%lld", DEBUG_DETERMINISM ? 0 : time(NULL))
.setMaxFieldLength(10000)
.detail("CommandLine", commandLine)
.setMaxFieldLength(0)
.detail("MemoryLimit", memLimit)
.trackLatest("ProgramStart");
@@ -3244,7 +3247,7 @@ int main(int argc, char* argv[]) {
}
try {
db = Database::createDatabase(ccf, -1, localities);
db = Database::createDatabase(ccf, -1, true, localities);
}
catch (Error& e) {
fprintf(stderr, "ERROR: %s\n", e.what());
@@ -3266,7 +3269,7 @@ int main(int argc, char* argv[]) {
}
try {
sourceDb = Database::createDatabase(sourceCcf, -1, localities);
sourceDb = Database::createDatabase(sourceCcf, -1, true, localities);
}
catch (Error& e) {
fprintf(stderr, "ERROR: %s\n", e.what());

View File

@@ -59,31 +59,44 @@ extern const char* getHGVersion();
std::vector<std::string> validOptions;
enum { OPT_CONNFILE, OPT_DATABASE, OPT_HELP, OPT_TRACE, OPT_TRACE_DIR, OPT_TIMEOUT, OPT_EXEC, OPT_NO_STATUS, OPT_STATUS_FROM_JSON, OPT_VERSION, OPT_TRACE_FORMAT };
enum {
OPT_CONNFILE,
OPT_DATABASE,
OPT_HELP,
OPT_TRACE,
OPT_TRACE_DIR,
OPT_TIMEOUT,
OPT_EXEC,
OPT_NO_STATUS,
OPT_STATUS_FROM_JSON,
OPT_VERSION,
OPT_TRACE_FORMAT,
OPT_USE_OBJECT_SERIALIZER
};
CSimpleOpt::SOption g_rgOptions[] = {
{ OPT_CONNFILE, "-C", SO_REQ_SEP },
{ OPT_CONNFILE, "--cluster_file", SO_REQ_SEP },
{ OPT_DATABASE, "-d", SO_REQ_SEP },
{ OPT_TRACE, "--log", SO_NONE },
{ OPT_TRACE_DIR, "--log-dir", SO_REQ_SEP },
{ OPT_TIMEOUT, "--timeout", SO_REQ_SEP },
{ OPT_EXEC, "--exec", SO_REQ_SEP },
{ OPT_NO_STATUS, "--no-status", SO_NONE },
{ OPT_HELP, "-?", SO_NONE },
{ OPT_HELP, "-h", SO_NONE },
{ OPT_HELP, "--help", SO_NONE },
{ OPT_STATUS_FROM_JSON, "--status-from-json", SO_REQ_SEP },
{ OPT_VERSION, "--version", SO_NONE },
{ OPT_VERSION, "-v", SO_NONE },
{ OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP },
CSimpleOpt::SOption g_rgOptions[] = { { OPT_CONNFILE, "-C", SO_REQ_SEP },
{ OPT_CONNFILE, "--cluster_file", SO_REQ_SEP },
{ OPT_DATABASE, "-d", SO_REQ_SEP },
{ OPT_TRACE, "--log", SO_NONE },
{ OPT_TRACE_DIR, "--log-dir", SO_REQ_SEP },
{ OPT_TIMEOUT, "--timeout", SO_REQ_SEP },
{ OPT_EXEC, "--exec", SO_REQ_SEP },
{ OPT_NO_STATUS, "--no-status", SO_NONE },
{ OPT_HELP, "-?", SO_NONE },
{ OPT_HELP, "-h", SO_NONE },
{ OPT_HELP, "--help", SO_NONE },
{ OPT_STATUS_FROM_JSON, "--status-from-json", SO_REQ_SEP },
{ OPT_VERSION, "--version", SO_NONE },
{ OPT_VERSION, "-v", SO_NONE },
{ OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP },
{ OPT_USE_OBJECT_SERIALIZER, "-S", SO_REQ_SEP },
{ OPT_USE_OBJECT_SERIALIZER, "--object-serializer", SO_REQ_SEP },
#ifndef TLS_DISABLED
TLS_OPTION_FLAGS
TLS_OPTION_FLAGS
#endif
SO_END_OF_OPTIONS
};
SO_END_OF_OPTIONS };
void printAtCol(const char* text, int col) {
const char* iter = text;
@@ -168,7 +181,7 @@ public:
private:
//Sets a transaction option. If intrans == true, then this option is also applied to the passed in transaction.
void setTransactionOption(Reference<ReadYourWritesTransaction> tr, FDBTransactionOptions::Option option, bool enabled, Optional<StringRef> arg, bool intrans) {
if(enabled && arg.present() != FDBTransactionOptions::optionInfo[option].hasParameter) {
if(enabled && arg.present() != FDBTransactionOptions::optionInfo.getMustExist(option).hasParameter) {
printf("ERROR: option %s a parameter\n", arg.present() ? "did not expect" : "expected");
throw invalid_option_value();
}
@@ -224,7 +237,7 @@ private:
//Returns true if the specified option is documented
bool isDocumented(typename T::Option option) {
FDBOptionInfo info = T::optionInfo[option];
FDBOptionInfo info = T::optionInfo.getMustExist(option);
std::string deprecatedStr = "Deprecated";
return !info.comment.empty() && info.comment.substr(0, deprecatedStr.size()) != deprecatedStr;
@@ -246,7 +259,7 @@ private:
void printHelpString() {
for(auto itr = legalOptions.begin(); itr != legalOptions.end(); ++itr) {
if(isDocumented(itr->second)) {
FDBOptionInfo info = T::optionInfo[itr->second];
FDBOptionInfo info = T::optionInfo.getMustExist(itr->second);
std::string helpStr = info.name + " - " + info.comment;
if(info.hasParameter)
helpStr += " " + info.parameterComment;
@@ -401,21 +414,25 @@ static void printProgramUsage(const char* name) {
" FDB_CLUSTER_FILE environment variable, then `./fdb.cluster',\n"
" then `%s'.\n", platform::getDefaultClusterFilePath().c_str());
printf(" --log Enables trace file logging for the CLI session.\n"
" --log-dir PATH Specifes the output directory for trace files. If\n"
" unspecified, defaults to the current directory. Has\n"
" no effect unless --log is specified.\n"
" --trace_format FORMAT\n"
" Select the format of the log files. xml (the default) and json\n"
" are supported. Has no effect unless --log is specified.\n"
" --exec CMDS Immediately executes the semicolon separated CLI commands\n"
" and then exits.\n"
" --no-status Disables the initial status check done when starting\n"
" the CLI.\n"
" --log-dir PATH Specifes the output directory for trace files. If\n"
" unspecified, defaults to the current directory. Has\n"
" no effect unless --log is specified.\n"
" --trace_format FORMAT\n"
" Select the format of the log files. xml (the default) and json\n"
" are supported. Has no effect unless --log is specified.\n"
" -S ON|OFF, --object-serializer ON|OFF\n"
" Use object serializer for sending messages. The object serializer\n"
" is currently a beta feature and it allows fdb processes to talk to\n"
" each other even if they don't have the same version\n"
" --exec CMDS Immediately executes the semicolon separated CLI commands\n"
" and then exits.\n"
" --no-status Disables the initial status check done when starting\n"
" the CLI.\n"
#ifndef TLS_DISABLED
TLS_HELP
TLS_HELP
#endif
" -v, --version Print FoundationDB CLI version information and exit.\n"
" -h, --help Display this help and exit.\n");
" -v, --version Print FoundationDB CLI version information and exit.\n"
" -h, --help Display this help and exit.\n");
}
@@ -2332,6 +2349,7 @@ struct CLIOptions {
bool trace;
std::string traceDir;
std::string traceFormat;
bool useObjectSerializer = false;
int exit_timeout;
Optional<std::string> exec;
bool initialStatusCheck;
@@ -2403,41 +2421,55 @@ struct CLIOptions {
#ifndef TLS_DISABLED
// TLS Options
case TLSOptions::OPT_TLS_PLUGIN:
args.OptionArg();
break;
case TLSOptions::OPT_TLS_CERTIFICATES:
tlsCertPath = args.OptionArg();
break;
case TLSOptions::OPT_TLS_CA_FILE:
tlsCAPath = args.OptionArg();
break;
case TLSOptions::OPT_TLS_KEY:
tlsKeyPath = args.OptionArg();
break;
case TLSOptions::OPT_TLS_PASSWORD:
tlsPassword = args.OptionArg();
break;
case TLSOptions::OPT_TLS_VERIFY_PEERS:
tlsVerifyPeers = args.OptionArg();
break;
case TLSOptions::OPT_TLS_PLUGIN:
args.OptionArg();
break;
case TLSOptions::OPT_TLS_CERTIFICATES:
tlsCertPath = args.OptionArg();
break;
case TLSOptions::OPT_TLS_CA_FILE:
tlsCAPath = args.OptionArg();
break;
case TLSOptions::OPT_TLS_KEY:
tlsKeyPath = args.OptionArg();
break;
case TLSOptions::OPT_TLS_PASSWORD:
tlsPassword = args.OptionArg();
break;
case TLSOptions::OPT_TLS_VERIFY_PEERS:
tlsVerifyPeers = args.OptionArg();
break;
#endif
case OPT_HELP:
printProgramUsage(program_name.c_str());
return 0;
case OPT_STATUS_FROM_JSON:
return printStatusFromJSON(args.OptionArg());
case OPT_TRACE_FORMAT:
if (!validateTraceFormat(args.OptionArg())) {
fprintf(stderr, "WARNING: Unrecognized trace format `%s'\n", args.OptionArg());
}
traceFormat = args.OptionArg();
break;
case OPT_VERSION:
printVersion();
return FDB_EXIT_SUCCESS;
}
return -1;
case OPT_HELP:
printProgramUsage(program_name.c_str());
return 0;
case OPT_STATUS_FROM_JSON:
return printStatusFromJSON(args.OptionArg());
case OPT_TRACE_FORMAT:
if (!validateTraceFormat(args.OptionArg())) {
fprintf(stderr, "WARNING: Unrecognized trace format `%s'\n", args.OptionArg());
}
traceFormat = args.OptionArg();
break;
case OPT_USE_OBJECT_SERIALIZER: {
std::string s = args.OptionArg();
std::transform(s.begin(), s.end(), s.begin(), ::tolower);
if (s == "on" || s == "true" || s == "1") {
useObjectSerializer = true;
} else if (s == "off" || s == "false" || s == "0") {
useObjectSerializer = false;
} else {
fprintf(stderr, "ERROR: Could not parse object serializer option: `%s'\n", s.c_str());
printProgramUsage(program_name.c_str());
flushAndExit(FDB_EXIT_ERROR);
}
break;
}
case OPT_VERSION:
printVersion();
return FDB_EXIT_SUCCESS;
}
return -1;
}
};
@@ -2484,7 +2516,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
TraceEvent::setNetworkThread();
try {
db = Database::createDatabase(ccf, -1);
db = Database::createDatabase(ccf, -1, false);
if (!opt.exec.present()) {
printf("Using cluster file `%s'.\n", ccf->getFilename().c_str());
}
@@ -2497,12 +2529,14 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
if (opt.trace) {
TraceEvent("CLIProgramStart")
.setMaxEventLength(12000)
.detail("SourceVersion", getHGVersion())
.detail("Version", FDB_VT_VERSION)
.detail("PackageName", FDB_VT_PACKAGE_NAME)
.detailf("ActualTime", "%lld", DEBUG_DETERMINISM ? 0 : time(NULL))
.detail("ClusterFile", ccf->getFilename().c_str())
.detail("ConnectionString", ccf->getConnectionString().toString())
.setMaxFieldLength(10000)
.detail("CommandLine", opt.commandLine)
.trackLatest("ProgramStart");
}
@@ -3490,6 +3524,11 @@ int main(int argc, char **argv) {
}
setNetworkOption(FDBNetworkOptions::ENABLE_SLOW_TASK_PROFILING);
}
// The USE_OBJECT_SERIALIZER network option expects an 8 byte little endian integer which is interpreted as zero =
// false, non-zero = true.
setNetworkOption(FDBNetworkOptions::USE_OBJECT_SERIALIZER,
opt.useObjectSerializer ? LiteralStringRef("\x01\x00\x00\x00\x00\x00\x00\x00")
: LiteralStringRef("\x00\x00\x00\x00\x00\x00\x00\x00"));
initHelp();
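The encoding spelled out in the comment above is the same one used by every StringRef((const uint8_t*)&value, 8) option call earlier in this diff: integer option parameters cross the C API as 8 little-endian bytes. An illustrative Python sketch of producing that encoding:

```python
import struct

def int64_option_param(value):
    # FDB option parameters of integer type are passed as 8-byte little-endian values.
    return struct.pack('<q', value)

assert int64_option_param(1) == b'\x01\x00\x00\x00\x00\x00\x00\x00'  # "on"
assert int64_option_param(0) == b'\x00\x00\x00\x00\x00\x00\x00\x00'  # "off"
```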

View File

@@ -419,7 +419,7 @@ ACTOR Future<Void> readCommitted(Database cx, PromiseStream<RangeResultWithVersi
//add lock
releaser.release();
wait(lock->take(TaskDefaultYield, limits.bytes + CLIENT_KNOBS->VALUE_SIZE_LIMIT + CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT));
wait(lock->take(TaskPriority::DefaultYield, limits.bytes + CLIENT_KNOBS->VALUE_SIZE_LIMIT + CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT));
releaser = FlowLock::Releaser(*lock, limits.bytes + CLIENT_KNOBS->VALUE_SIZE_LIMIT + CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT);
state Standalone<RangeResultRef> values = wait(tr.getRange(begin, end, limits));
@@ -495,7 +495,7 @@ ACTOR Future<Void> readCommitted(Database cx, PromiseStream<RCGroup> results, Fu
//add lock
wait(active);
releaser.release();
wait(lock->take(TaskDefaultYield, rangevalue.expectedSize() + rcGroup.items.expectedSize()));
wait(lock->take(TaskPriority::DefaultYield, rangevalue.expectedSize() + rcGroup.items.expectedSize()));
releaser = FlowLock::Releaser(*lock, rangevalue.expectedSize() + rcGroup.items.expectedSize());
for (auto & s : rangevalue){
@@ -613,7 +613,7 @@ ACTOR Future<int> dumpData(Database cx, PromiseStream<RCGroup> results, Referenc
req.flags = req.flags | CommitTransactionRequest::FLAG_IS_LOCK_AWARE;
totalBytes += mutationSize;
wait( commitLock->take(TaskDefaultYield, mutationSize) );
wait( commitLock->take(TaskPriority::DefaultYield, mutationSize) );
addActor.send( commitLock->releaseWhen( success(commit.getReply(req)), mutationSize ) );
if(endOfStream) {
@@ -653,7 +653,7 @@ ACTOR Future<Void> coalesceKeyVersionCache(Key uid, Version endVersion, Referenc
req.transaction.read_snapshot = committedVersion->get();
req.flags = req.flags | CommitTransactionRequest::FLAG_IS_LOCK_AWARE;
wait( commitLock->take(TaskDefaultYield, mutationSize) );
wait( commitLock->take(TaskPriority::DefaultYield, mutationSize) );
addActor.send( commitLock->releaseWhen( success(commit.getReply(req)), mutationSize ) );
}
@@ -671,7 +671,7 @@ ACTOR Future<Void> applyMutations(Database cx, Key uid, Key addPrefix, Key remov
try {
loop {
if(beginVersion >= *endVersion) {
wait( commitLock.take(TaskDefaultYield, CLIENT_KNOBS->BACKUP_LOCK_BYTES) );
wait( commitLock.take(TaskPriority::DefaultYield, CLIENT_KNOBS->BACKUP_LOCK_BYTES) );
commitLock.release(CLIENT_KNOBS->BACKUP_LOCK_BYTES);
if(beginVersion >= *endVersion) {
return Void();

View File

@@ -8,7 +8,6 @@ set(FDBCLIENT_SRCS
BackupContainer.actor.cpp
BackupContainer.h
BlobStore.actor.cpp
ClientDBInfo.h
ClientLogEvents.h
ClientWorkerInterface.h
ClusterInterface.h

View File

@@ -1,49 +0,0 @@
/*
* ClientDBInfo.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBCLIENT_CLIENTDBINFO_H
#define FDBCLIENT_CLIENTDBINFO_H
#pragma once
#include "fdbclient/MasterProxyInterface.h"
// ClientDBInfo is all the information needed by a database client to access the database
// It is returned (and kept up to date) by the OpenDatabaseRequest interface of ClusterInterface
struct ClientDBInfo {
constexpr static FileIdentifier file_identifier = 5355080;
UID id; // Changes each time anything else changes
vector< MasterProxyInterface > proxies;
double clientTxnInfoSampleRate;
int64_t clientTxnInfoSizeLimit;
ClientDBInfo() : clientTxnInfoSampleRate(std::numeric_limits<double>::infinity()), clientTxnInfoSizeLimit(-1) {}
bool operator == (ClientDBInfo const& r) const { return id == r.id; }
bool operator != (ClientDBInfo const& r) const { return id != r.id; }
template <class Archive>
void serialize(Archive& ar) {
if constexpr (!is_fb_function<Archive>) {
ASSERT(ar.protocolVersion().isValid());
}
serializer(ar, proxies, id, clientTxnInfoSampleRate, clientTxnInfoSizeLimit);
}
};
#endif

View File

@ -44,7 +44,7 @@ namespace FdbClientLogEvents {
EventType type{ EVENTTYPEEND };
double startTs{ 0 };
void logEvent(std::string id) const {}
void logEvent(std::string id, int maxFieldLength) const {}
};
struct EventGetVersion : public Event {
@ -60,8 +60,10 @@ namespace FdbClientLogEvents {
double latency;
void logEvent(std::string id) const {
TraceEvent("TransactionTrace_GetVersion").detail("TransactionID", id).detail("Latency", latency);
void logEvent(std::string id, int maxFieldLength) const {
TraceEvent("TransactionTrace_GetVersion")
.detail("TransactionID", id)
.detail("Latency", latency);
}
};
@ -80,8 +82,14 @@ namespace FdbClientLogEvents {
int valueSize;
Key key;
void logEvent(std::string id) const {
TraceEvent("TransactionTrace_Get").detail("TransactionID", id).detail("Latency", latency).detail("ValueSizeBytes", valueSize).detail("Key", key);
void logEvent(std::string id, int maxFieldLength) const {
TraceEvent("TransactionTrace_Get")
.setMaxEventLength(-1)
.detail("TransactionID", id)
.detail("Latency", latency)
.detail("ValueSizeBytes", valueSize)
.setMaxFieldLength(maxFieldLength)
.detail("Key", key);
}
};
@ -101,8 +109,15 @@ namespace FdbClientLogEvents {
Key startKey;
Key endKey;
void logEvent(std::string id) const {
TraceEvent("TransactionTrace_GetRange").detail("TransactionID", id).detail("Latency", latency).detail("RangeSizeBytes", rangeSize).detail("StartKey", startKey).detail("EndKey", endKey);
void logEvent(std::string id, int maxFieldLength) const {
TraceEvent("TransactionTrace_GetRange")
.setMaxEventLength(-1)
.detail("TransactionID", id)
.detail("Latency", latency)
.detail("RangeSizeBytes", rangeSize)
.setMaxFieldLength(maxFieldLength)
.detail("StartKey", startKey)
.detail("EndKey", endKey);
}
};
@ -122,20 +137,38 @@ namespace FdbClientLogEvents {
int commitBytes;
CommitTransactionRequest req; // Only the CommitTransactionRef and the Arena object within CommitTransactionRequest are serialized
void logEvent(std::string id) const {
void logEvent(std::string id, int maxFieldLength) const {
for (auto &read_range : req.transaction.read_conflict_ranges) {
TraceEvent("TransactionTrace_Commit_ReadConflictRange").detail("TransactionID", id).detail("Begin", read_range.begin).detail("End", read_range.end);
TraceEvent("TransactionTrace_Commit_ReadConflictRange")
.setMaxEventLength(-1)
.detail("TransactionID", id)
.setMaxFieldLength(maxFieldLength)
.detail("Begin", read_range.begin)
.detail("End", read_range.end);
}
for (auto &write_range : req.transaction.write_conflict_ranges) {
TraceEvent("TransactionTrace_Commit_WriteConflictRange").detail("TransactionID", id).detail("Begin", write_range.begin).detail("End", write_range.end);
TraceEvent("TransactionTrace_Commit_WriteConflictRange")
.setMaxEventLength(-1)
.detail("TransactionID", id)
.setMaxFieldLength(maxFieldLength)
.detail("Begin", write_range.begin)
.detail("End", write_range.end);
}
for (auto &mutation : req.transaction.mutations) {
TraceEvent("TransactionTrace_Commit_Mutation").detail("TransactionID", id).detail("Mutation", mutation.toString());
TraceEvent("TransactionTrace_Commit_Mutation")
.setMaxEventLength(-1)
.detail("TransactionID", id)
.setMaxFieldLength(maxFieldLength)
.detail("Mutation", mutation.toString());
}
TraceEvent("TransactionTrace_Commit").detail("TransactionID", id).detail("Latency", latency).detail("NumMutations", numMutations).detail("CommitSizeBytes", commitBytes);
TraceEvent("TransactionTrace_Commit")
.detail("TransactionID", id)
.detail("Latency", latency)
.detail("NumMutations", numMutations)
.detail("CommitSizeBytes", commitBytes);
}
};
@ -153,8 +186,13 @@ namespace FdbClientLogEvents {
int errCode;
Key key;
void logEvent(std::string id) const {
TraceEvent("TransactionTrace_GetError").detail("TransactionID", id).detail("ErrCode", errCode).detail("Key", key);
void logEvent(std::string id, int maxFieldLength) const {
TraceEvent("TransactionTrace_GetError")
.setMaxEventLength(-1)
.detail("TransactionID", id)
.detail("ErrCode", errCode)
.setMaxFieldLength(maxFieldLength)
.detail("Key", key);
}
};
@ -173,8 +211,14 @@ namespace FdbClientLogEvents {
Key startKey;
Key endKey;
void logEvent(std::string id) const {
TraceEvent("TransactionTrace_GetRangeError").detail("TransactionID", id).detail("ErrCode", errCode).detail("StartKey", startKey).detail("EndKey", endKey);
void logEvent(std::string id, int maxFieldLength) const {
TraceEvent("TransactionTrace_GetRangeError")
.setMaxEventLength(-1)
.detail("TransactionID", id)
.detail("ErrCode", errCode)
.setMaxFieldLength(maxFieldLength)
.detail("StartKey", startKey)
.detail("EndKey", endKey);
}
};
@ -192,20 +236,36 @@ namespace FdbClientLogEvents {
int errCode;
CommitTransactionRequest req; // Only the CommitTransactionRef and the Arena object within CommitTransactionRequest are serialized
void logEvent(std::string id) const {
void logEvent(std::string id, int maxFieldLength) const {
for (auto &read_range : req.transaction.read_conflict_ranges) {
TraceEvent("TransactionTrace_CommitError_ReadConflictRange").detail("TransactionID", id).detail("Begin", read_range.begin).detail("End", read_range.end);
TraceEvent("TransactionTrace_CommitError_ReadConflictRange")
.setMaxEventLength(-1)
.detail("TransactionID", id)
.setMaxFieldLength(maxFieldLength)
.detail("Begin", read_range.begin)
.detail("End", read_range.end);
}
for (auto &write_range : req.transaction.write_conflict_ranges) {
TraceEvent("TransactionTrace_CommitError_WriteConflictRange").detail("TransactionID", id).detail("Begin", write_range.begin).detail("End", write_range.end);
TraceEvent("TransactionTrace_CommitError_WriteConflictRange")
.setMaxEventLength(-1)
.detail("TransactionID", id)
.setMaxFieldLength(maxFieldLength)
.detail("Begin", write_range.begin)
.detail("End", write_range.end);
}
for (auto &mutation : req.transaction.mutations) {
TraceEvent("TransactionTrace_CommitError_Mutation").detail("TransactionID", id).detail("Mutation", mutation.toString());
TraceEvent("TransactionTrace_CommitError_Mutation")
.setMaxEventLength(-1)
.detail("TransactionID", id)
.setMaxFieldLength(maxFieldLength)
.detail("Mutation", mutation.toString());
}
TraceEvent("TransactionTrace_CommitError").detail("TransactionID", id).detail("ErrCode", errCode);
TraceEvent("TransactionTrace_CommitError")
.detail("TransactionID", id)
.detail("ErrCode", errCode);
}
};
}
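// A condensed illustration of the truncation pattern introduced above (the event
// name and arguments are hypothetical; this assumes, as the placement in the real
// events suggests, that setMaxEventLength(-1) lifts the overall event-size cap and
// that setMaxFieldLength applies only to details added after the call):
void logExample(std::string id, Key key, int maxFieldLength) {
	TraceEvent("TransactionTrace_Example")
		.setMaxEventLength(-1)             // assumed: no overall event-size cap
		.detail("TransactionID", id)       // logged in full
		.setMaxFieldLength(maxFieldLength) // assumed: truncates details added below
		.detail("Key", key);               // potentially large; truncated
}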

View File

@ -25,7 +25,7 @@
#include "fdbclient/FDBTypes.h"
#include "fdbrpc/FailureMonitor.h"
#include "fdbclient/Status.h"
#include "fdbclient/ClientDBInfo.h"
#include "fdbclient/MasterProxyInterface.h"
// Streams from WorkerInterface that are safe and useful to call from a client.
// A ClientWorkerInterface is embedded as the first element of a WorkerInterface.

View File

@ -25,7 +25,7 @@
#include "fdbclient/FDBTypes.h"
#include "fdbrpc/FailureMonitor.h"
#include "fdbclient/Status.h"
#include "fdbclient/ClientDBInfo.h"
#include "fdbclient/MasterProxyInterface.h"
#include "fdbclient/ClientWorkerInterface.h"
struct ClusterInterface {
@ -52,12 +52,12 @@ struct ClusterInterface {
}
void initEndpoints() {
openDatabase.getEndpoint( TaskClusterController );
failureMonitoring.getEndpoint( TaskFailureMonitor );
databaseStatus.getEndpoint( TaskClusterController );
ping.getEndpoint( TaskClusterController );
getClientWorkers.getEndpoint( TaskClusterController );
forceRecovery.getEndpoint( TaskClusterController );
openDatabase.getEndpoint( TaskPriority::ClusterController );
failureMonitoring.getEndpoint( TaskPriority::FailureMonitor );
databaseStatus.getEndpoint( TaskPriority::ClusterController );
ping.getEndpoint( TaskPriority::ClusterController );
getClientWorkers.getEndpoint( TaskPriority::ClusterController );
forceRecovery.getEndpoint( TaskPriority::ClusterController );
}
template <class Ar>

View File

@ -25,7 +25,6 @@
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/KeyRangeMap.h"
#include "fdbclient/MasterProxyInterface.h"
#include "fdbclient/ClientDBInfo.h"
#include "fdbrpc/QueueModel.h"
#include "fdbrpc/MultiInterface.h"
#include "flow/TDMetric.actor.h"
@ -54,11 +53,11 @@ public:
// For internal (fdbserver) use only
static Database create( Reference<AsyncVar<Optional<ClusterInterface>>> clusterInterface, Reference<ClusterConnectionFile> connFile, LocalityData const& clientLocality );
static Database create( Reference<AsyncVar<ClientDBInfo>> clientInfo, Future<Void> clientInfoMonitor, LocalityData clientLocality, bool enableLocalityLoadBalance, int taskID=TaskDefaultEndpoint, bool lockAware=false, int apiVersion=Database::API_VERSION_LATEST );
static Database create( Reference<AsyncVar<ClientDBInfo>> clientInfo, Future<Void> clientInfoMonitor, LocalityData clientLocality, bool enableLocalityLoadBalance, TaskPriority taskID=TaskPriority::DefaultEndpoint, bool lockAware=false, int apiVersion=Database::API_VERSION_LATEST );
~DatabaseContext();
Database clone() const { return Database(new DatabaseContext( cluster, clientInfo, clientInfoMonitor, dbId, taskID, clientLocality, enableLocalityLoadBalance, lockAware, apiVersion )); }
Database clone() const { return Database(new DatabaseContext( cluster, clientInfo, clientInfoMonitor, taskID, clientLocality, enableLocalityLoadBalance, lockAware, internal, apiVersion )); }
std::pair<KeyRange,Reference<LocationInfo>> getCachedLocation( const KeyRef&, bool isBackward = false );
bool getCachedLocations( const KeyRangeRef&, vector<std::pair<KeyRange,Reference<LocationInfo>>>&, int limit, bool reverse );
@ -97,8 +96,8 @@ public:
//private:
explicit DatabaseContext( Reference<Cluster> cluster, Reference<AsyncVar<ClientDBInfo>> clientDBInfo,
Future<Void> clientInfoMonitor, Standalone<StringRef> dbId, int taskID, LocalityData const& clientLocality,
bool enableLocalityLoadBalance, bool lockAware, int apiVersion = Database::API_VERSION_LATEST );
Future<Void> clientInfoMonitor, TaskPriority taskID, LocalityData const& clientLocality,
bool enableLocalityLoadBalance, bool lockAware, bool internal = true, int apiVersion = Database::API_VERSION_LATEST );
explicit DatabaseContext( const Error &err );
@ -133,36 +132,36 @@ public:
std::map< UID, StorageServerInfo* > server_interf;
Standalone<StringRef> dbId;
UID dbId;
bool internal; // Only contexts created through the C client and fdbcli are non-internal
CounterCollection cc;
Counter transactionReadVersions;
Counter transactionLogicalReads;
Counter transactionPhysicalReads;
Counter transactionCommittedMutations;
Counter transactionCommittedMutationBytes;
Counter transactionsCommitStarted;
Counter transactionsCommitCompleted;
Counter transactionsTooOld;
Counter transactionsFutureVersions;
Counter transactionsNotCommitted;
Counter transactionsMaybeCommitted;
Counter transactionsResourceConstrained;
Counter transactionsProcessBehind;
Counter transactionWaitsForFullRecovery;
int64_t transactionReadVersions;
int64_t transactionLogicalReads;
int64_t transactionPhysicalReads;
int64_t transactionCommittedMutations;
int64_t transactionCommittedMutationBytes;
int64_t transactionsCommitStarted;
int64_t transactionsCommitCompleted;
int64_t transactionsTooOld;
int64_t transactionsFutureVersions;
int64_t transactionsNotCommitted;
int64_t transactionsMaybeCommitted;
int64_t transactionsResourceConstrained;
int64_t transactionsProcessBehind;
int64_t transactionWaitsForFullRecovery;
ContinuousSample<double> latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit, bytesPerCommit;
int outstandingWatches;
int maxOutstandingWatches;
double transactionTimeout;
int transactionMaxRetries;
double transactionMaxBackoff;
int transactionMaxSize; // Max size in bytes.
int snapshotRywEnabled;
Future<Void> logger;
int taskID;
TaskPriority taskID;
Int64MetricHandle getValueSubmitted;
EventMetricHandle<GetValueComplete> getValueCompleted;
@ -180,6 +179,8 @@ public:
HealthMetrics healthMetrics;
double healthMetricsLastUpdated;
double detailedHealthMetricsLastUpdated;
UniqueOrderedOptionList<FDBTransactionOptions> transactionDefaults;
};
#endif

View File

@ -23,8 +23,11 @@
#define FDBCLIENT_FDBOPTIONS_H
#include <string>
#include <list>
#include <map>
#include "flow/Arena.h"
struct FDBOptionInfo {
std::string name;
std::string comment;
@ -32,9 +35,15 @@ struct FDBOptionInfo {
bool hasParameter;
bool hidden;
bool persistent;
FDBOptionInfo(std::string name, std::string comment, std::string parameterComment, bool hasParameter, bool hidden)
: name(name), comment(comment), parameterComment(parameterComment), hasParameter(hasParameter), hidden(hidden) { }
// If non-negative, this specifies the code of the transaction option for which this option supplies the default value.
// Options that have a defaultFor only retain the most recently set value (i.e. setting them multiple times has no cumulative effect).
int defaultFor;
FDBOptionInfo(std::string name, std::string comment, std::string parameterComment, bool hasParameter, bool hidden, bool persistent, int defaultFor)
: name(name), comment(comment), parameterComment(parameterComment), hasParameter(hasParameter), hidden(hidden), persistent(persistent),
defaultFor(defaultFor) { }
FDBOptionInfo() { }
};
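// A hedged sketch of how defaultFor is consumed (applyDefaultForOption is a
// hypothetical helper that mirrors the DatabaseContext::setOption logic later in
// this change; 'db', 'option', and 'value' are assumed context):
void applyDefaultForOption(DatabaseContext* db, FDBDatabaseOptions::Option option,
                           Optional<StringRef> value) {
	FDBOptionInfo const& info = FDBDatabaseOptions::optionInfo.getMustExist(option);
	if (info.defaultFor >= 0) {
		// The value becomes the default for the matching transaction option rather
		// than configuring the database directly.
		db->transactionDefaults.addOption((FDBTransactionOptions::Option)info.defaultFor,
		                                  value.castTo<Standalone<StringRef>>());
	}
}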
@ -45,15 +54,48 @@ private:
std::map<typename T::Option, FDBOptionInfo> optionInfo;
public:
typename std::map<typename T::Option, FDBOptionInfo>::iterator begin() { return optionInfo.begin(); }
typename std::map<typename T::Option, FDBOptionInfo>::iterator end() { return optionInfo.end(); }
typename std::map<typename T::Option, FDBOptionInfo>::iterator find(const typename T::Option& key) { return optionInfo.find(key); }
typename std::map<typename T::Option, FDBOptionInfo>::const_iterator begin() const { return optionInfo.begin(); }
typename std::map<typename T::Option, FDBOptionInfo>::const_iterator end() const { return optionInfo.end(); }
typename std::map<typename T::Option, FDBOptionInfo>::const_iterator find(const typename T::Option& key) const { return optionInfo.find(key); }
FDBOptionInfo& operator[] (const typename T::Option& key) { return optionInfo[key]; }
void insert(const typename T::Option& key, FDBOptionInfo info) {
optionInfo[key] = info;
}
FDBOptionInfo const& getMustExist(const typename T::Option& key) const {
auto itr = optionInfo.find(key);
ASSERT(itr != optionInfo.end());
return itr->second;
}
FDBOptionInfoMap() { T::init(); }
};
#define ADD_OPTION_INFO( type, var, name, comment, parameterComment, hasParameter, hidden ) type::optionInfo[var] = FDBOptionInfo(name, comment, parameterComment, hasParameter, hidden);
// An ordered list of options where each option is represented only once. Subsequent insertions will remove the option from its
// original location and add it to the end with the new value.
template<class T>
class UniqueOrderedOptionList {
public:
typedef std::list<std::pair<typename T::Option, Optional<Standalone<StringRef>>>> OptionList;
private:
OptionList options;
std::map<typename T::Option, typename OptionList::iterator> optionsIndexMap;
public:
void addOption(typename T::Option option, Optional<Standalone<StringRef>> value) {
auto itr = optionsIndexMap.find(option);
if(itr != optionsIndexMap.end()) {
options.erase(itr->second);
}
options.push_back(std::make_pair(option, value));
optionsIndexMap[option] = --options.end();
}
typename OptionList::const_iterator begin() const { return options.cbegin(); }
typename OptionList::const_iterator end() const { return options.cend(); }
};
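// Usage sketch for the class above; the values are hypothetical placeholders.
// Re-adding an option moves it to the end, so iteration reflects the most recent
// setting and earlier values are dropped.
UniqueOrderedOptionList<FDBTransactionOptions> defaults;
Optional<Standalone<StringRef>> fiveSeconds, oneSecond, limitBytes; // hypothetical encodings
void populateDefaults() {
	defaults.addOption(FDBTransactionOptions::TIMEOUT, fiveSeconds);
	defaults.addOption(FDBTransactionOptions::SIZE_LIMIT, limitBytes);
	defaults.addOption(FDBTransactionOptions::TIMEOUT, oneSecond); // moves TIMEOUT to the end
	// Iteration order is now: (SIZE_LIMIT, limitBytes), (TIMEOUT, oneSecond).
}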
#define ADD_OPTION_INFO( type, var, name, comment, parameterComment, hasParameter, hidden, persistent, defaultFor ) type::optionInfo.insert(var, FDBOptionInfo(name, comment, parameterComment, hasParameter, hidden, persistent, defaultFor));
#endif

View File

@ -43,6 +43,7 @@ enum {
tagLocalityUpgraded = -4,
tagLocalitySatellite = -5,
tagLocalityLogRouterMapped = -6,
tagLocalityTxs = -7,
tagLocalityInvalid = -99
}; //The TLog and LogRouter require these numbers to be as compact as possible

View File

@ -41,7 +41,7 @@ ACTOR Future<Void> failureMonitorClientLoop(
{
state Version version = 0;
state Future<FailureMonitoringReply> request = Never();
state Future<Void> nextRequest = delay(0, TaskFailureMonitor);
state Future<Void> nextRequest = delay(0, TaskPriority::FailureMonitor);
state Future<Void> requestTimeout = Never();
state double before = now();
state double waitfor = 0;
@ -61,7 +61,7 @@ ACTOR Future<Void> failureMonitorClientLoop(
loop {
choose {
when( FailureMonitoringReply reply = wait( request ) ) {
g_network->setCurrentTask(TaskDefaultDelay);
g_network->setCurrentTask(TaskPriority::DefaultDelay);
request = Never();
requestTimeout = Never();
if (reply.allOthersFailed) {
@ -122,10 +122,10 @@ ACTOR Future<Void> failureMonitorClientLoop(
}
before = now();
waitfor = reply.clientRequestIntervalMS * .001;
nextRequest = delayJittered( waitfor, TaskFailureMonitor );
nextRequest = delayJittered( waitfor, TaskPriority::FailureMonitor );
}
when( wait( requestTimeout ) ) {
g_network->setCurrentTask(TaskDefaultDelay);
g_network->setCurrentTask(TaskPriority::DefaultDelay);
requestTimeout = Never();
TraceEvent(SevWarn, "FailureMonitoringServerDown").detail("OldServerID",controller.id());
monitor->setStatus(controlAddr.address, FailureStatus(true));
@ -136,7 +136,7 @@ ACTOR Future<Void> failureMonitorClientLoop(
}
}
when( wait( nextRequest ) ) {
g_network->setCurrentTask(TaskDefaultDelay);
g_network->setCurrentTask(TaskPriority::DefaultDelay);
nextRequest = Never();
double elapsed = now() - before;
@ -152,9 +152,9 @@ ACTOR Future<Void> failureMonitorClientLoop(
req.addresses = g_network->getLocalAddresses();
if (trackMyStatus)
req.senderStatus = FailureStatus(false);
request = controller.failureMonitoring.getReply( req, TaskFailureMonitor );
request = controller.failureMonitoring.getReply( req, TaskPriority::FailureMonitor );
if(!controller.failureMonitoring.getEndpoint().isLocal())
requestTimeout = delay( fmState->serverFailedTimeout, TaskFailureMonitor );
requestTimeout = delay( fmState->serverFailedTimeout, TaskPriority::FailureMonitor );
}
}
}

View File

@ -93,7 +93,7 @@ namespace HTTP {
loop {
// Wait for connection to have something to read
wait(conn->onReadable());
wait( delay( 0, TaskReadSocket ) );
wait( delay( 0, TaskPriority::ReadSocket ) );
// Read into buffer
int originalSize = buf->size();
@ -353,7 +353,7 @@ namespace HTTP {
loop {
wait(conn->onWritable());
wait( delay( 0, TaskWriteSocket ) );
wait( delay( 0, TaskPriority::WriteSocket ) );
// If we already got a response, before finishing sending the request, then close the connection,
// set the Connection header to "close" as a hint to the caller that this connection can't be used

View File

@ -967,7 +967,7 @@ ACTOR Future<CoordinatorsResult::Type> changeQuorum( Database cx, Reference<IQuo
vector<Future<Optional<LeaderInfo>>> leaderServers;
ClientCoordinators coord( Reference<ClusterConnectionFile>( new ClusterConnectionFile( conn ) ) );
for( int i = 0; i < coord.clientLeaderServers.size(); i++ )
leaderServers.push_back( retryBrokenPromise( coord.clientLeaderServers[i].getLeader, GetLeaderRequest( coord.clusterKey, UID() ), TaskCoordinationReply ) );
leaderServers.push_back( retryBrokenPromise( coord.clientLeaderServers[i].getLeader, GetLeaderRequest( coord.clusterKey, UID() ), TaskPriority::CoordinationReply ) );
choose {
when( wait( waitForAll( leaderServers ) ) ) {}
@ -1047,7 +1047,7 @@ struct AutoQuorumChange : IQuorumChange {
ClientCoordinators coord(ccf);
vector<Future<Optional<LeaderInfo>>> leaderServers;
for( int i = 0; i < coord.clientLeaderServers.size(); i++ )
leaderServers.push_back( retryBrokenPromise( coord.clientLeaderServers[i].getLeader, GetLeaderRequest( coord.clusterKey, UID() ), TaskCoordinationReply ) );
leaderServers.push_back( retryBrokenPromise( coord.clientLeaderServers[i].getLeader, GetLeaderRequest( coord.clusterKey, UID() ), TaskPriority::CoordinationReply ) );
Optional<vector<Optional<LeaderInfo>>> results = wait( timeout( getAll(leaderServers), CLIENT_KNOBS->IS_ACCEPTABLE_DELAY ) );
if (!results.present()) return false; // Not all responded
for(auto& r : results.get())

View File

@ -67,15 +67,47 @@ struct MasterProxyInterface {
}
void initEndpoints() {
getConsistentReadVersion.getEndpoint(TaskProxyGetConsistentReadVersion);
getRawCommittedVersion.getEndpoint(TaskProxyGetRawCommittedVersion);
commit.getEndpoint(TaskProxyCommitDispatcher);
getStorageServerRejoinInfo.getEndpoint(TaskProxyStorageRejoin);
getConsistentReadVersion.getEndpoint(TaskPriority::ProxyGetConsistentReadVersion);
getRawCommittedVersion.getEndpoint(TaskPriority::ProxyGetRawCommittedVersion);
commit.getEndpoint(TaskPriority::ProxyCommitDispatcher);
getStorageServerRejoinInfo.getEndpoint(TaskPriority::ProxyStorageRejoin);
//getKeyServersLocations.getEndpoint(TaskPriority::ProxyGetKeyServersLocations); //do not increase the priority of these requests, because clients can bring down the cluster with too many of these messages.
}
};
struct CommitID {
// ClientDBInfo is all the information needed by a database client to access the database
// It is returned (and kept up to date) by the OpenDatabaseRequest interface of ClusterInterface
struct ClientDBInfo {
constexpr static FileIdentifier file_identifier = 5355080;
UID id; // Changes each time anything else changes
vector< MasterProxyInterface > proxies;
double clientTxnInfoSampleRate;
int64_t clientTxnInfoSizeLimit;
ClientDBInfo() : clientTxnInfoSampleRate(std::numeric_limits<double>::infinity()), clientTxnInfoSizeLimit(-1) {}
bool operator == (ClientDBInfo const& r) const { return id == r.id; }
bool operator != (ClientDBInfo const& r) const { return id != r.id; }
template <class Archive>
void serialize(Archive& ar) {
if constexpr (!is_fb_function<Archive>) {
ASSERT(ar.protocolVersion().isValid());
}
serializer(ar, proxies, id, clientTxnInfoSampleRate, clientTxnInfoSizeLimit);
}
};
struct ProxyForwardReply {
Optional<ClientDBInfo> newClientInfo;
ProxyForwardReply() {}
template <class Ar>
void serialize(Ar &ar) {
serializer(ar, newClientInfo);
}
};
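// The client-side contract for replies derived from ProxyForwardReply: a reply may
// carry a fresher ClientDBInfo, which the client installs before retrying. A hedged
// sketch (installForwardedClientInfo is a hypothetical helper; the same check appears
// inline at each request site in NativeAPI later in this change):
template <class Reply>
bool installForwardedClientInfo(DatabaseContext* cx, Reply const& reply) {
	// True means the reply carried forwarded proxy information and the caller
	// should retry its request against the newly installed proxies.
	if (reply.newClientInfo.present()) {
		cx->clientInfo->set(reply.newClientInfo.get());
		return true;
	}
	return false;
}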
struct CommitID : public ProxyForwardReply {
constexpr static FileIdentifier file_identifier = 14254927;
Version version; // returns invalidVersion if transaction conflicts
uint16_t txnBatchId;
@ -83,7 +115,7 @@ struct CommitID {
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, version, txnBatchId, metadataVersion);
serializer(ar, *(ProxyForwardReply*)this, version, txnBatchId, metadataVersion);
}
CommitID() : version(invalidVersion), txnBatchId(0) {}
@ -127,7 +159,7 @@ static inline int getBytes( CommitTransactionRequest const& r ) {
return total;
}
struct GetReadVersionReply {
struct GetReadVersionReply : public ProxyForwardReply {
constexpr static FileIdentifier file_identifier = 15709388;
Version version;
bool locked;
@ -135,7 +167,7 @@ struct GetReadVersionReply {
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, version, locked, metadataVersion);
serializer(ar, *(ProxyForwardReply*)this, version, locked, metadataVersion);
}
};
@ -169,14 +201,14 @@ struct GetReadVersionRequest : TimedRequest {
}
};
struct GetKeyServerLocationsReply {
struct GetKeyServerLocationsReply : public ProxyForwardReply {
constexpr static FileIdentifier file_identifier = 10636023;
Arena arena;
std::vector<std::pair<KeyRangeRef, vector<StorageServerInterface>>> results;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, results, arena);
serializer(ar, *(ProxyForwardReply*)this, results, arena);
}
};

View File

@ -371,7 +371,7 @@ ClientLeaderRegInterface::ClientLeaderRegInterface( NetworkAddress remote )
}
ClientLeaderRegInterface::ClientLeaderRegInterface( INetwork* local ) {
getLeader.makeWellKnownEndpoint( WLTOKEN_CLIENTLEADERREG_GETLEADER, TaskCoordination );
getLeader.makeWellKnownEndpoint( WLTOKEN_CLIENTLEADERREG_GETLEADER, TaskPriority::Coordination );
}
// Nominee is the worker, among all workers, that a given coordinator currently considers to be the leader
@ -380,7 +380,7 @@ ClientLeaderRegInterface::ClientLeaderRegInterface( INetwork* local ) {
ACTOR Future<Void> monitorNominee( Key key, ClientLeaderRegInterface coord, AsyncTrigger* nomineeChange, Optional<LeaderInfo> *info, int generation, Reference<AsyncVar<int>> connectedCoordinatorsNum ) {
state bool hasCounted = false;
loop {
state Optional<LeaderInfo> li = wait( retryBrokenPromise( coord.getLeader, GetLeaderRequest( key, info->present() ? info->get().changeID : UID() ), TaskCoordinationReply ) );
state Optional<LeaderInfo> li = wait( retryBrokenPromise( coord.getLeader, GetLeaderRequest( key, info->present() ? info->get().changeID : UID() ), TaskPriority::CoordinationReply ) );
if (li.present() && !hasCounted && connectedCoordinatorsNum.isValid()) {
connectedCoordinatorsNum->set(connectedCoordinatorsNum->get() + 1);
hasCounted = true;

View File

@ -408,17 +408,39 @@ void DLApi::addNetworkThreadCompletionHook(void (*hook)(void*), void *hookParame
}
// MultiVersionTransaction
MultiVersionTransaction::MultiVersionTransaction(Reference<MultiVersionDatabase> db) : db(db) {
MultiVersionTransaction::MultiVersionTransaction(Reference<MultiVersionDatabase> db, UniqueOrderedOptionList<FDBTransactionOptions> defaultOptions) : db(db) {
setDefaultOptions(defaultOptions);
updateTransaction();
}
// SOMEDAY: This function is unsafe if it's possible to set Database options that affect subsequently created transactions. There are currently no such options.
void MultiVersionTransaction::setDefaultOptions(UniqueOrderedOptionList<FDBTransactionOptions> options) {
MutexHolder holder(db->dbState->optionLock);
std::copy(options.begin(), options.end(), std::back_inserter(persistentOptions));
}
void MultiVersionTransaction::updateTransaction() {
auto currentDb = db->dbState->dbVar->get();
TransactionInfo newTr;
if(currentDb.value) {
newTr.transaction = currentDb.value->createTransaction();
Optional<StringRef> timeout;
for (auto option : persistentOptions) {
if(option.first == FDBTransactionOptions::TIMEOUT) {
timeout = option.second.castTo<StringRef>();
}
else {
newTr.transaction->setOption(option.first, option.second.castTo<StringRef>());
}
}
// Setting a timeout can immediately cause a transaction to fail. The only timeout
// that matters is the one most recently set, so we ignore any timeouts set earlier
// that might inadvertently fail the transaction.
if(timeout.present()) {
newTr.transaction->setOption(FDBTransactionOptions::TIMEOUT, timeout);
}
}
newTr.onChange = currentDb.onChange;
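// Consequence of the ordering above (option values are hypothetical): if a caller
// set TIMEOUT more than once before the underlying transaction was recreated, only
// the most recent value is replayed, e.g.
//   tr->setOption(FDBTransactionOptions::TIMEOUT, fiveSecondsValue); // not replayed
//   tr->setOption(FDBTransactionOptions::TIMEOUT, oneSecondValue);   // replayed last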
@ -574,6 +596,15 @@ Version MultiVersionTransaction::getCommittedVersion() {
}
void MultiVersionTransaction::setOption(FDBTransactionOptions::Option option, Optional<StringRef> value) {
auto itr = FDBTransactionOptions::optionInfo.find(option);
if(itr == FDBTransactionOptions::optionInfo.end()) {
TraceEvent("UnknownTransactionOption").detail("Option", option);
throw invalid_option();
}
if(MultiVersionApi::apiVersionAtLeast(610) && itr->second.persistent) {
persistentOptions.emplace_back(option, value.castTo<Standalone<StringRef>>());
}
auto tr = getTransaction();
if(tr.transaction) {
tr.transaction->setOption(option, value);
@ -606,6 +637,8 @@ ThreadFuture<Void> MultiVersionTransaction::onError(Error const& e) {
}
void MultiVersionTransaction::reset() {
persistentOptions.clear();
setDefaultOptions(db->dbState->transactionDefaultOptions);
updateTransaction();
}
@ -643,27 +676,30 @@ Reference<IDatabase> MultiVersionDatabase::debugCreateFromExistingDatabase(Refer
}
Reference<ITransaction> MultiVersionDatabase::createTransaction() {
return Reference<ITransaction>(new MultiVersionTransaction(Reference<MultiVersionDatabase>::addRef(this)));
return Reference<ITransaction>(new MultiVersionTransaction(Reference<MultiVersionDatabase>::addRef(this), dbState->transactionDefaultOptions));
}
void MultiVersionDatabase::setOption(FDBDatabaseOptions::Option option, Optional<StringRef> value) {
MutexHolder holder(dbState->optionLock);
auto itr = FDBDatabaseOptions::optionInfo.find(option);
if(itr != FDBDatabaseOptions::optionInfo.end()) {
TraceEvent("SetDatabaseOption").detail("Option", itr->second.name);
}
else {
if(itr == FDBDatabaseOptions::optionInfo.end()) {
TraceEvent("UnknownDatabaseOption").detail("Option", option);
throw invalid_option();
}
if(dbState->db) {
dbState->db->setOption(option, value);
int defaultFor = itr->second.defaultFor;
if (defaultFor >= 0) {
ASSERT(FDBTransactionOptions::optionInfo.find((FDBTransactionOptions::Option)defaultFor) !=
FDBTransactionOptions::optionInfo.end());
dbState->transactionDefaultOptions.addOption((FDBTransactionOptions::Option)defaultFor, value.castTo<Standalone<StringRef>>());
}
dbState->options.push_back(std::make_pair(option, value.castTo<Standalone<StringRef>>()));
if(dbState->db) {
dbState->db->setOption(option, value);
}
}
void MultiVersionDatabase::Connector::connect() {
@ -824,6 +860,11 @@ void MultiVersionDatabase::DatabaseState::cancelConnections() {
// MultiVersionApi
bool MultiVersionApi::apiVersionAtLeast(int minVersion) {
ASSERT(MultiVersionApi::api->apiVersion != 0);
return MultiVersionApi::api->apiVersion >= minVersion || MultiVersionApi::api->apiVersion < 0;
}
// runOnFailedClients should be used cautiously. Some failed clients may not have successfully loaded all symbols.
void MultiVersionApi::runOnExternalClients(std::function<void(Reference<ClientInfo>)> func, bool runOnFailedClients) {
bool newFailure = false;

View File

@ -210,7 +210,7 @@ class MultiVersionDatabase;
class MultiVersionTransaction : public ITransaction, ThreadSafeReferenceCounted<MultiVersionTransaction> {
public:
MultiVersionTransaction(Reference<MultiVersionDatabase> db);
MultiVersionTransaction(Reference<MultiVersionDatabase> db, UniqueOrderedOptionList<FDBTransactionOptions> defaultOptions);
void cancel();
void setVersion(Version v);
@ -261,6 +261,9 @@ private:
TransactionInfo getTransaction();
void updateTransaction();
void setDefaultOptions(UniqueOrderedOptionList<FDBTransactionOptions> options);
std::vector<std::pair<FDBTransactionOptions::Option, Optional<Standalone<StringRef>>>> persistentOptions;
};
struct ClientInfo : ThreadSafeReferenceCounted<ClientInfo> {
@ -341,6 +344,7 @@ private:
std::vector<Reference<Connector>> connectionAttempts;
std::vector<std::pair<FDBDatabaseOptions::Option, Optional<Standalone<StringRef>>>> options;
UniqueOrderedOptionList<FDBTransactionOptions> transactionDefaultOptions;
Mutex optionLock;
};
@ -370,6 +374,8 @@ public:
bool callbackOnMainThread;
bool localClientDisabled;
static bool apiVersionAtLeast(int minVersion);
private:
MultiVersionApi();

View File

@ -208,24 +208,18 @@ template <> void addref( DatabaseContext* ptr ) { ptr->addref(); }
template <> void delref( DatabaseContext* ptr ) { ptr->delref(); }
ACTOR Future<Void> databaseLogger( DatabaseContext *cx ) {
state double lastLogged = 0;
loop {
wait( delay( CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, cx->taskID ) );
TraceEvent("TransactionMetrics")
wait(delay(CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, cx->taskID));
TraceEvent ev("TransactionMetrics", cx->dbId);
ev.detail("Elapsed", (lastLogged == 0) ? 0 : now() - lastLogged)
.detail("Cluster", cx->cluster && cx->getConnectionFile() ? cx->getConnectionFile()->getConnectionString().clusterKeyName().toString() : "")
.detail("ReadVersions", cx->transactionReadVersions)
.detail("LogicalUncachedReads", cx->transactionLogicalReads)
.detail("PhysicalReadRequests", cx->transactionPhysicalReads)
.detail("CommittedMutations", cx->transactionCommittedMutations)
.detail("CommittedMutationBytes", cx->transactionCommittedMutationBytes)
.detail("CommitStarted", cx->transactionsCommitStarted)
.detail("CommitCompleted", cx->transactionsCommitCompleted)
.detail("TooOld", cx->transactionsTooOld)
.detail("FutureVersions", cx->transactionsFutureVersions)
.detail("NotCommitted", cx->transactionsNotCommitted)
.detail("MaybeCommitted", cx->transactionsMaybeCommitted)
.detail("ResourceConstrained", cx->transactionsResourceConstrained)
.detail("ProcessBehind", cx->transactionsProcessBehind)
.detail("MeanLatency", cx->latencies.mean())
.detail("Internal", cx->internal);
cx->cc.logToTraceEvent(ev);
ev.detail("MeanLatency", cx->latencies.mean())
.detail("MedianLatency", cx->latencies.median())
.detail("Latency90", cx->latencies.percentile(0.90))
.detail("Latency98", cx->latencies.percentile(0.98))
@ -245,12 +239,15 @@ ACTOR Future<Void> databaseLogger( DatabaseContext *cx ) {
.detail("MeanBytesPerCommit", cx->bytesPerCommit.mean())
.detail("MedianBytesPerCommit", cx->bytesPerCommit.median())
.detail("MaxBytesPerCommit", cx->bytesPerCommit.max());
cx->latencies.clear();
cx->readLatencies.clear();
cx->GRVLatencies.clear();
cx->commitLatencies.clear();
cx->mutationsPerCommit.clear();
cx->bytesPerCommit.clear();
lastLogged = now();
}
}
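// A minimal sketch of the Counter/CounterCollection pattern adopted here (the names
// below are hypothetical; this assumes the fdbrpc Stats interfaces this change
// relies on, in particular the logToTraceEvent call used above):
CounterCollection exampleCC("TransactionMetrics");
Counter exampleReadVersions("ReadVersions", exampleCC); // registers with exampleCC
void logExampleMetrics() {
	++exampleReadVersions; // cheap hot-path increment
	TraceEvent ev("TransactionMetrics");
	exampleCC.logToTraceEvent(ev); // emits one detail per registered counter
}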
@ -508,23 +505,24 @@ ACTOR static Future<HealthMetrics> getHealthMetricsActor(DatabaseContext *cx, bo
Future<HealthMetrics> DatabaseContext::getHealthMetrics(bool detailed = false) {
return getHealthMetricsActor(this, detailed);
}
DatabaseContext::DatabaseContext(
Reference<Cluster> cluster, Reference<AsyncVar<ClientDBInfo>> clientInfo, Future<Void> clientInfoMonitor, Standalone<StringRef> dbId,
int taskID, LocalityData const& clientLocality, bool enableLocalityLoadBalance, bool lockAware, int apiVersion )
: cluster(cluster), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor), dbId(dbId), taskID(taskID), clientLocality(clientLocality), enableLocalityLoadBalance(enableLocalityLoadBalance),
lockAware(lockAware), apiVersion(apiVersion), provisional(false),
transactionReadVersions(0), transactionLogicalReads(0), transactionPhysicalReads(0), transactionCommittedMutations(0), transactionCommittedMutationBytes(0),
transactionsCommitStarted(0), transactionsCommitCompleted(0), transactionsTooOld(0), transactionsFutureVersions(0), transactionsNotCommitted(0),
transactionsMaybeCommitted(0), transactionsResourceConstrained(0), transactionsProcessBehind(0), outstandingWatches(0), transactionTimeout(0.0), transactionMaxRetries(-1),
Reference<Cluster> cluster, Reference<AsyncVar<ClientDBInfo>> clientInfo, Future<Void> clientInfoMonitor,
TaskPriority taskID, LocalityData const& clientLocality, bool enableLocalityLoadBalance, bool lockAware, bool internal, int apiVersion )
: cluster(cluster), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor), taskID(taskID), clientLocality(clientLocality), enableLocalityLoadBalance(enableLocalityLoadBalance),
lockAware(lockAware), apiVersion(apiVersion), provisional(false), cc("TransactionMetrics"),
transactionReadVersions("ReadVersions", cc), transactionLogicalReads("LogicalUncachedReads", cc), transactionPhysicalReads("PhysicalReadRequests", cc),
transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionsCommitStarted("CommitStarted", cc),
transactionsCommitCompleted("CommitCompleted", cc), transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc),
transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc),
transactionsProcessBehind("ProcessBehind", cc), transactionWaitsForFullRecovery("WaitsForFullRecovery", cc), outstandingWatches(0),
latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvCacheInsertLocation(0),
healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0)
healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0), internal(internal)
{
dbId = deterministicRandom()->randomUniqueID();
metadataVersionCache.resize(CLIENT_KNOBS->METADATA_VERSION_CACHE_SIZE);
maxOutstandingWatches = CLIENT_KNOBS->DEFAULT_MAX_OUTSTANDING_WATCHES;
transactionMaxBackoff = CLIENT_KNOBS->FAILURE_MAX_DELAY;
transactionMaxSize = CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT;
snapshotRywEnabled = apiVersionAtLeast(300) ? 1 : 0;
logger = databaseLogger( this );
@ -539,7 +537,14 @@ DatabaseContext::DatabaseContext(
clientStatusUpdater.actor = clientStatusUpdateActor(this);
}
DatabaseContext::DatabaseContext( const Error &err ) : deferredError(err), latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000) {}
DatabaseContext::DatabaseContext( const Error &err ) : deferredError(err), cc("TransactionMetrics"),
transactionReadVersions("ReadVersions", cc), transactionLogicalReads("LogicalUncachedReads", cc), transactionPhysicalReads("PhysicalReadRequests", cc),
transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionsCommitStarted("CommitStarted", cc),
transactionsCommitCompleted("CommitCompleted", cc), transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc),
transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc),
transactionsProcessBehind("ProcessBehind", cc), transactionWaitsForFullRecovery("WaitsForFullRecovery", cc), latencies(1000), readLatencies(1000), commitLatencies(1000),
GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000),
internal(false) {}
ACTOR static Future<Void> monitorClientInfo( Reference<AsyncVar<Optional<ClusterInterface>>> clusterInterface, Reference<ClusterConnectionFile> ccf, Reference<AsyncVar<ClientDBInfo>> outInfo, Reference<AsyncVar<int>> connectedCoordinatorsNumDelayed ) {
try {
@ -573,38 +578,44 @@ ACTOR static Future<Void> monitorClientInfo( Reference<AsyncVar<Optional<Cluster
choose {
when( ClientDBInfo ni = wait( clusterInterface->get().present() ? brokenPromiseToNever( clusterInterface->get().get().openDatabase.getReply( req ) ) : Never() ) ) {
TraceEvent("ClientInfoChange").detail("ChangeID", ni.id);
outInfo->set(ni);
if (ni.proxies.empty()) {
TraceEvent("ClientInfo_NoProxiesReturned").detail("ChangeID", ni.id);
continue;
} else if (!FlowTransport::transport().isClient()) {
continue;
}
vector<Future<Void>> onProxyFailureVec;
bool skipWaitForProxyFail = false;
for (const auto& proxy : ni.proxies) {
if (proxy.provisional) {
skipWaitForProxyFail = true;
loop {
TraceEvent("ClientInfoChange").detail("ChangeID", outInfo->get().id);
if (outInfo->get().proxies.empty()) {
TraceEvent("ClientInfo_NoProxiesReturned").detail("ChangeID", outInfo->get().id);
break;
} else if (!FlowTransport::transport().isClient()) {
break;
}
onProxyFailureVec.push_back(
IFailureMonitor::failureMonitor().onDisconnectOrFailure(
proxy.getConsistentReadVersion.getEndpoint()) ||
IFailureMonitor::failureMonitor().onDisconnectOrFailure(proxy.commit.getEndpoint()) ||
IFailureMonitor::failureMonitor().onDisconnectOrFailure(
proxy.getKeyServersLocations.getEndpoint()) ||
IFailureMonitor::failureMonitor().onDisconnectOrFailure(
proxy.getStorageServerRejoinInfo.getEndpoint()));
}
if (skipWaitForProxyFail) continue;
state vector<Future<Void>> onProxyFailureVec;
bool skipWaitForProxyFail = false;
for (const auto& proxy : outInfo->get().proxies) {
if (proxy.provisional) {
skipWaitForProxyFail = true;
break;
}
leaderMon = Void();
wait(waitForAny(onProxyFailureVec));
leaderMon = ccf ? monitorLeader(ccf, clusterInterface) : Void();
onProxyFailureVec.push_back(
IFailureMonitor::failureMonitor().onStateEqual(
proxy.getConsistentReadVersion.getEndpoint(), FailureStatus()) ||
IFailureMonitor::failureMonitor().onStateEqual(proxy.commit.getEndpoint(), FailureStatus()) ||
IFailureMonitor::failureMonitor().onStateEqual(
proxy.getKeyServersLocations.getEndpoint(), FailureStatus()) ||
IFailureMonitor::failureMonitor().onStateEqual(
proxy.getStorageServerRejoinInfo.getEndpoint(), FailureStatus()));
}
if (skipWaitForProxyFail) break;
leaderMon = Void();
state Future<Void> anyFailures = waitForAny(onProxyFailureVec);
wait(anyFailures || outInfo->onChange());
if(anyFailures.isReady()) {
leaderMon = ccf ? monitorLeader(ccf, clusterInterface) : Void();
break;
}
}
}
when( wait( clusterInterface->onChange() ) ) {
if(clusterInterface->get().present())
@ -632,11 +643,11 @@ Database DatabaseContext::create(Reference<AsyncVar<Optional<ClusterInterface>>>
Reference<AsyncVar<ClientDBInfo>> clientInfo(new AsyncVar<ClientDBInfo>());
Future<Void> clientInfoMonitor = delayedAsyncVar(connectedCoordinatorsNum, connectedCoordinatorsNumDelayed, CLIENT_KNOBS->CHECK_CONNECTED_COORDINATOR_NUM_DELAY) || monitorClientInfo(clusterInterface, connFile, clientInfo, connectedCoordinatorsNumDelayed);
return Database(new DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskDefaultEndpoint, clientLocality, true, false));
return Database(new DatabaseContext(cluster, clientInfo, clientInfoMonitor, TaskPriority::DefaultEndpoint, clientLocality, true, false, true));
}
Database DatabaseContext::create(Reference<AsyncVar<ClientDBInfo>> clientInfo, Future<Void> clientInfoMonitor, LocalityData clientLocality, bool enableLocalityLoadBalance, int taskID, bool lockAware, int apiVersion) {
return Database( new DatabaseContext( Reference<Cluster>(nullptr), clientInfo, clientInfoMonitor, LiteralStringRef(""), taskID, clientLocality, enableLocalityLoadBalance, lockAware, apiVersion ) );
Database DatabaseContext::create(Reference<AsyncVar<ClientDBInfo>> clientInfo, Future<Void> clientInfoMonitor, LocalityData clientLocality, bool enableLocalityLoadBalance, TaskPriority taskID, bool lockAware, int apiVersion) {
return Database( new DatabaseContext( Reference<Cluster>(nullptr), clientInfo, clientInfoMonitor, taskID, clientLocality, enableLocalityLoadBalance, lockAware, true, apiVersion ) );
}
DatabaseContext::~DatabaseContext() {
@ -745,52 +756,45 @@ uint64_t extractHexOption( StringRef value ) {
}
void DatabaseContext::setOption( FDBDatabaseOptions::Option option, Optional<StringRef> value) {
switch(option) {
case FDBDatabaseOptions::LOCATION_CACHE_SIZE:
locationCacheSize = (int)extractIntOption(value, 0, std::numeric_limits<int>::max());
break;
case FDBDatabaseOptions::MACHINE_ID:
clientLocality = LocalityData( clientLocality.processId(), value.present() ? Standalone<StringRef>(value.get()) : Optional<Standalone<StringRef>>(), clientLocality.machineId(), clientLocality.dcId() );
if( clientInfo->get().proxies.size() )
masterProxies = Reference<ProxyInfo>( new ProxyInfo( clientInfo->get().proxies, clientLocality ) );
server_interf.clear();
locationCache.insert( allKeys, Reference<LocationInfo>() );
break;
case FDBDatabaseOptions::MAX_WATCHES:
maxOutstandingWatches = (int)extractIntOption(value, 0, CLIENT_KNOBS->ABSOLUTE_MAX_WATCHES);
break;
case FDBDatabaseOptions::DATACENTER_ID:
clientLocality = LocalityData(clientLocality.processId(), clientLocality.zoneId(), clientLocality.machineId(), value.present() ? Standalone<StringRef>(value.get()) : Optional<Standalone<StringRef>>());
if( clientInfo->get().proxies.size() )
masterProxies = Reference<ProxyInfo>( new ProxyInfo( clientInfo->get().proxies, clientLocality ));
server_interf.clear();
locationCache.insert( allKeys, Reference<LocationInfo>() );
break;
case FDBDatabaseOptions::TRANSACTION_TIMEOUT:
if( !apiVersionAtLeast(610) ) {
throw invalid_option();
}
transactionTimeout = extractIntOption(value, 0, std::numeric_limits<int>::max())/1000.0;
break;
case FDBDatabaseOptions::TRANSACTION_RETRY_LIMIT:
transactionMaxRetries = (int)extractIntOption(value, -1, std::numeric_limits<int>::max());
break;
case FDBDatabaseOptions::TRANSACTION_MAX_RETRY_DELAY:
validateOptionValue(value, true);
transactionMaxBackoff = extractIntOption(value, 0, std::numeric_limits<int32_t>::max()) / 1000.0;
break;
case FDBDatabaseOptions::TRANSACTION_SIZE_LIMIT:
validateOptionValue(value, true);
transactionMaxSize = extractIntOption(value, 32, CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT);
break;
case FDBDatabaseOptions::SNAPSHOT_RYW_ENABLE:
validateOptionValue(value, false);
snapshotRywEnabled++;
break;
case FDBDatabaseOptions::SNAPSHOT_RYW_DISABLE:
validateOptionValue(value, false);
snapshotRywEnabled--;
break;
int defaultFor = FDBDatabaseOptions::optionInfo.getMustExist(option).defaultFor;
if (defaultFor >= 0) {
ASSERT(FDBTransactionOptions::optionInfo.find((FDBTransactionOptions::Option)defaultFor) !=
FDBTransactionOptions::optionInfo.end());
transactionDefaults.addOption((FDBTransactionOptions::Option)defaultFor, value.castTo<Standalone<StringRef>>());
}
else {
switch(option) {
case FDBDatabaseOptions::LOCATION_CACHE_SIZE:
locationCacheSize = (int)extractIntOption(value, 0, std::numeric_limits<int>::max());
break;
case FDBDatabaseOptions::MACHINE_ID:
clientLocality = LocalityData( clientLocality.processId(), value.present() ? Standalone<StringRef>(value.get()) : Optional<Standalone<StringRef>>(), clientLocality.machineId(), clientLocality.dcId() );
if( clientInfo->get().proxies.size() )
masterProxies = Reference<ProxyInfo>( new ProxyInfo( clientInfo->get().proxies, clientLocality ) );
server_interf.clear();
locationCache.insert( allKeys, Reference<LocationInfo>() );
break;
case FDBDatabaseOptions::MAX_WATCHES:
maxOutstandingWatches = (int)extractIntOption(value, 0, CLIENT_KNOBS->ABSOLUTE_MAX_WATCHES);
break;
case FDBDatabaseOptions::DATACENTER_ID:
clientLocality = LocalityData(clientLocality.processId(), clientLocality.zoneId(), clientLocality.machineId(), value.present() ? Standalone<StringRef>(value.get()) : Optional<Standalone<StringRef>>());
if( clientInfo->get().proxies.size() )
masterProxies = Reference<ProxyInfo>( new ProxyInfo( clientInfo->get().proxies, clientLocality ));
server_interf.clear();
locationCache.insert( allKeys, Reference<LocationInfo>() );
break;
case FDBDatabaseOptions::SNAPSHOT_RYW_ENABLE:
validateOptionValue(value, false);
snapshotRywEnabled++;
break;
case FDBDatabaseOptions::SNAPSHOT_RYW_DISABLE:
validateOptionValue(value, false);
snapshotRywEnabled--;
break;
default:
break;
}
}
}
@ -816,7 +820,7 @@ Reference<ClusterConnectionFile> DatabaseContext::getConnectionFile() {
return cluster->getConnectionFile();
}
Database Database::createDatabase( Reference<ClusterConnectionFile> connFile, int apiVersion, LocalityData const& clientLocality, DatabaseContext *preallocatedDb ) {
Database Database::createDatabase( Reference<ClusterConnectionFile> connFile, int apiVersion, bool internal, LocalityData const& clientLocality, DatabaseContext *preallocatedDb ) {
Reference<AsyncVar<int>> connectedCoordinatorsNum(new AsyncVar<int>(0)); // Number of connected coordinators for the client
Reference<AsyncVar<int>> connectedCoordinatorsNumDelayed(new AsyncVar<int>(0));
Reference<Cluster> cluster(new Cluster(connFile, connectedCoordinatorsNum, apiVersion));
@ -825,18 +829,23 @@ Database Database::createDatabase( Reference<ClusterConnectionFile> connFile, in
DatabaseContext *db;
if(preallocatedDb) {
db = new (preallocatedDb) DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskDefaultEndpoint, clientLocality, true, false, apiVersion);
db = new (preallocatedDb) DatabaseContext(cluster, clientInfo, clientInfoMonitor, TaskPriority::DefaultEndpoint, clientLocality, true, false, internal, apiVersion);
}
else {
db = new DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskDefaultEndpoint, clientLocality, true, false, apiVersion);
db = new DatabaseContext(cluster, clientInfo, clientInfoMonitor, TaskPriority::DefaultEndpoint, clientLocality, true, false, internal, apiVersion);
}
return Database(db);
}
Database Database::createDatabase( std::string connFileName, int apiVersion, LocalityData const& clientLocality ) {
Database Database::createDatabase( std::string connFileName, int apiVersion, bool internal, LocalityData const& clientLocality ) {
Reference<ClusterConnectionFile> rccf = Reference<ClusterConnectionFile>(new ClusterConnectionFile(ClusterConnectionFile::lookupClusterFileName(connFileName).first));
return Database::createDatabase(rccf, apiVersion, clientLocality);
return Database::createDatabase(rccf, apiVersion, internal, clientLocality);
}
const UniqueOrderedOptionList<FDBTransactionOptions>& Database::getTransactionDefaults() const {
ASSERT(db);
return db->transactionDefaults;
}
extern IPAddress determinePublicIPAutomatically(ClusterConnectionString const& ccs);
@ -884,7 +893,7 @@ void Cluster::init( Reference<ClusterConnectionFile> connFile, bool startClientI
initializeSystemMonitorMachineState(SystemMonitorMachineState(IPAddress(publicIP)));
systemMonitor();
uncancellable( recurring( &systemMonitor, CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, TaskFlushTrace ) );
uncancellable( recurring( &systemMonitor, CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, TaskPriority::FlushTrace ) );
}
failMon = failureMonitorClient( clusterInterface, false );
@ -1240,7 +1249,11 @@ ACTOR Future< pair<KeyRange,Reference<LocationInfo>> > getKeyLocation_internal(
loop {
choose {
when ( wait( cx->onMasterProxiesChanged() ) ) {}
when ( GetKeyServerLocationsReply rep = wait( loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(key, Optional<KeyRef>(), 100, isBackward, key.arena()), TaskDefaultPromiseEndpoint ) ) ) {
when ( GetKeyServerLocationsReply rep = wait( loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(key, Optional<KeyRef>(), 100, isBackward, key.arena()), TaskPriority::DefaultPromiseEndpoint ) ) ) {
if(rep.newClientInfo.present()) {
cx->clientInfo->set(rep.newClientInfo.get());
continue;
}
if( info.debugID.present() )
g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocation.After");
ASSERT( rep.results.size() == 1 );
@ -1277,7 +1290,12 @@ ACTOR Future< vector< pair<KeyRange,Reference<LocationInfo>> > > getKeyRangeLoca
loop {
choose {
when ( wait( cx->onMasterProxiesChanged() ) ) {}
when ( GetKeyServerLocationsReply _rep = wait( loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(keys.begin, keys.end, limit, reverse, keys.arena()), TaskDefaultPromiseEndpoint ) ) ) {
when ( GetKeyServerLocationsReply _rep = wait( loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(keys.begin, keys.end, limit, reverse, keys.arena()), TaskPriority::DefaultPromiseEndpoint ) ) ) {
if(_rep.newClientInfo.present()) {
cx->clientInfo->set(_rep.newClientInfo.get());
continue;
}
state GetKeyServerLocationsReply rep = _rep;
if( info.debugID.present() )
g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocations.After");
@ -1398,7 +1416,7 @@ ACTOR Future<Optional<Value>> getValue( Future<Version> version, Key key, Databa
}
state GetValueReply reply = wait(
loadBalance(ssi.second, &StorageServerInterface::getValue, GetValueRequest(key, ver, getValueID),
TaskDefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL));
TaskPriority::DefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL));
double latency = now() - startTimeD;
cx->readLatencies.addSample(latency);
if (trLogInfo) {
@ -1461,7 +1479,7 @@ ACTOR Future<Key> getKey( Database cx, KeySelector k, Future<Version> version, T
if( info.debugID.present() )
g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKey.Before"); //.detail("StartKey", k.getKey()).detail("Offset",k.offset).detail("OrEqual",k.orEqual);
++cx->transactionPhysicalReads;
GetKeyReply reply = wait( loadBalance( ssi.second, &StorageServerInterface::getKey, GetKeyRequest(k, version.get()), TaskDefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL ) );
GetKeyReply reply = wait( loadBalance( ssi.second, &StorageServerInterface::getKey, GetKeyRequest(k, version.get()), TaskPriority::DefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL ) );
if( info.debugID.present() )
g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKey.After"); //.detail("NextKey",reply.sel.key).detail("Offset", reply.sel.offset).detail("OrEqual", k.orEqual);
k = reply.sel;
@ -1490,6 +1508,11 @@ ACTOR Future<Version> waitForCommittedVersion( Database cx, Version version ) {
choose {
when ( wait( cx->onMasterProxiesChanged() ) ) {}
when ( GetReadVersionReply v = wait( loadBalance( cx->getMasterProxies(false), &MasterProxyInterface::getConsistentReadVersion, GetReadVersionRequest( 0, GetReadVersionRequest::PRIORITY_SYSTEM_IMMEDIATE ), cx->taskID ) ) ) {
if(v.newClientInfo.present()) {
cx->clientInfo->set(v.newClientInfo.get());
continue;
}
if (v.version >= version)
return v.version;
// SOMEDAY: Do the wait on the server side, possibly use less expensive source of committed version (causal consistency is not needed for this purpose)
@ -1524,7 +1547,7 @@ ACTOR Future< Void > watchValue( Future<Version> version, Key key, Optional<Valu
g_traceBatch.addAttach("WatchValueAttachID", info.debugID.get().first(), watchValueID.get().first());
g_traceBatch.addEvent("WatchValueDebug", watchValueID.get().first(), "NativeAPI.watchValue.Before"); //.detail("TaskID", g_network->getCurrentTask());
}
state Version resp = wait( loadBalance( ssi.second, &StorageServerInterface::watchValue, WatchValueRequest(key, value, ver, watchValueID), TaskDefaultPromiseEndpoint ) );
state Version resp = wait( loadBalance( ssi.second, &StorageServerInterface::watchValue, WatchValueRequest(key, value, ver, watchValueID), TaskPriority::DefaultPromiseEndpoint ) );
if( info.debugID.present() ) {
g_traceBatch.addEvent("WatchValueDebug", watchValueID.get().first(), "NativeAPI.watchValue.After"); //.detail("TaskID", g_network->getCurrentTask());
}
@ -1616,7 +1639,7 @@ ACTOR Future<Standalone<RangeResultRef>> getExactRange( Database cx, Version ver
.detail("Servers", locations[shard].second->description());*/
}
++cx->transactionPhysicalReads;
GetKeyValuesReply rep = wait( loadBalance( locations[shard].second, &StorageServerInterface::getKeyValues, req, TaskDefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL ) );
GetKeyValuesReply rep = wait( loadBalance( locations[shard].second, &StorageServerInterface::getKeyValues, req, TaskPriority::DefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL ) );
if( info.debugID.present() )
g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getExactRange.After");
output.arena().dependsOn( rep.arena );
@ -1893,7 +1916,7 @@ ACTOR Future<Standalone<RangeResultRef>> getRange( Database cx, Reference<Transa
transaction_too_old(), future_version()
});
}
GetKeyValuesReply rep = wait( loadBalance(beginServer.second, &StorageServerInterface::getKeyValues, req, TaskDefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL ) );
GetKeyValuesReply rep = wait( loadBalance(beginServer.second, &StorageServerInterface::getKeyValues, req, TaskPriority::DefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL ) );
if( info.debugID.present() ) {
g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getRange.After");//.detail("SizeOf", rep.data.size());
@ -2457,8 +2480,6 @@ double Transaction::getBackoff(int errCode) {
}
TransactionOptions::TransactionOptions(Database const& cx) {
maxBackoff = cx->transactionMaxBackoff;
sizeLimit = cx->transactionMaxSize;
reset(cx);
if (BUGGIFY) {
commitOnFirstProxy = true;
@ -2472,11 +2493,9 @@ TransactionOptions::TransactionOptions() {
}
void TransactionOptions::reset(Database const& cx) {
double oldMaxBackoff = maxBackoff;
uint32_t oldSizeLimit = sizeLimit;
memset(this, 0, sizeof(*this));
maxBackoff = cx->apiVersionAtLeast(610) ? oldMaxBackoff : cx->transactionMaxBackoff;
sizeLimit = oldSizeLimit;
maxBackoff = CLIENT_KNOBS->DEFAULT_MAX_BACKOFF;
sizeLimit = CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT;
lockAware = cx->lockAware;
}
@ -2503,7 +2522,6 @@ void Transaction::reset() {
void Transaction::fullReset() {
reset();
backoff = CLIENT_KNOBS->DEFAULT_BACKOFF;
options.maxBackoff = getDatabase()->transactionMaxBackoff;
}
int Transaction::apiVersionAtLeast(int minVersion) const {
@ -2694,7 +2712,7 @@ ACTOR static Future<Void> tryCommit( Database cx, Reference<TransactionLogInfo>
const std::vector<MasterProxyInterface>& proxies = cx->clientInfo->get().proxies;
reply = proxies.size() ? throwErrorOr ( brokenPromiseToMaybeDelivered ( proxies[0].commit.tryGetReply(req) ) ) : Never();
} else {
reply = loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::commit, req, TaskDefaultPromiseEndpoint, true );
reply = loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::commit, req, TaskPriority::DefaultPromiseEndpoint, true );
}
choose {
@ -2703,6 +2721,11 @@ ACTOR static Future<Void> tryCommit( Database cx, Reference<TransactionLogInfo>
throw request_maybe_delivered();
}
when (CommitID ci = wait( reply )) {
if(ci.newClientInfo.present()) {
cx->clientInfo->set(ci.newClientInfo.get());
throw not_committed();
}
Version v = ci.version;
if (v != invalidVersion) {
if (info.debugID.present())
@ -2718,7 +2741,7 @@ ACTOR static Future<Void> tryCommit( Database cx, Reference<TransactionLogInfo>
tr->versionstampPromise.send(ret);
tr->numErrors = 0;
cx->transactionsCommitCompleted++;
++cx->transactionsCommitCompleted;
cx->transactionCommittedMutations += req.transaction.mutations.size();
cx->transactionCommittedMutationBytes += req.transaction.mutations.expectedSize();
@ -2793,7 +2816,7 @@ Future<Void> Transaction::commitMutations() {
return Void();
}
cx->transactionsCommitStarted++;
++cx->transactionsCommitStarted;
if(options.readOnly)
return transaction_read_only();
@ -2971,6 +2994,7 @@ void Transaction::setOption( FDBTransactionOptions::Option option, Optional<Stri
}
else {
trLogInfo = Reference<TransactionLogInfo>(new TransactionLogInfo(value.get().printable(), TransactionLogInfo::DONT_LOG));
trLogInfo->maxFieldLength = options.maxTransactionLoggingFieldLength;
}
break;
@ -2985,6 +3009,20 @@ void Transaction::setOption( FDBTransactionOptions::Option option, Optional<Stri
}
break;
case FDBTransactionOptions::TRANSACTION_LOGGING_MAX_FIELD_LENGTH:
validateOptionValue(value, true);
{
int maxFieldLength = extractIntOption(value, -1, std::numeric_limits<int32_t>::max());
if(maxFieldLength == 0) {
throw invalid_option_value();
}
options.maxTransactionLoggingFieldLength = maxFieldLength;
}
if(trLogInfo) {
trLogInfo->maxFieldLength = options.maxTransactionLoggingFieldLength;
}
break;
case FDBTransactionOptions::MAX_RETRY_DELAY:
validateOptionValue(value, true);
options.maxBackoff = extractIntOption(value, 0, std::numeric_limits<int32_t>::max()) / 1000.0;
@ -3034,6 +3072,10 @@ ACTOR Future<GetReadVersionReply> getConsistentReadVersion( DatabaseContext *cx,
choose {
when ( wait( cx->onMasterProxiesChanged() ) ) {}
when ( GetReadVersionReply v = wait( loadBalance( cx->getMasterProxies(flags & GetReadVersionRequest::FLAG_USE_PROVISIONAL_PROXIES), &MasterProxyInterface::getConsistentReadVersion, req, cx->taskID ) ) ) {
if(v.newClientInfo.present()) {
cx->clientInfo->set(v.newClientInfo.get());
continue;
}
if( debugID.present() )
g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getConsistentReadVersion.After");
ASSERT( v.version > 0 );
@ -3074,7 +3116,7 @@ ACTOR Future<Void> readVersionBatcher( DatabaseContext *cx, FutureStream< std::p
if (requests.size() == CLIENT_KNOBS->MAX_BATCH_SIZE)
send_batch = true;
else if (!timeout.isValid())
timeout = delay(batchTime, TaskProxyGetConsistentReadVersion);
timeout = delay(batchTime, TaskPriority::ProxyGetConsistentReadVersion);
}
when(wait(timeout.isValid() ? timeout : Never())) {
send_batch = true;
@ -3126,7 +3168,7 @@ ACTOR Future<Version> extractReadVersion(DatabaseContext* cx, Reference<Transact
}
Future<Version> Transaction::getReadVersion(uint32_t flags) {
cx->transactionReadVersions++;
++cx->transactionReadVersions;
flags |= options.getReadVersionFlags;
auto& batcher = cx->versionBatcher[ flags ];
@ -3150,8 +3192,7 @@ Future<Standalone<StringRef>> Transaction::getVersionstamp() {
}
Future<Void> Transaction::onError( Error const& e ) {
if (e.code() == error_code_success)
{
if (e.code() == error_code_success) {
return client_invalid_operation();
}
if (e.code() == error_code_not_committed ||
@ -3162,32 +3203,32 @@ Future<Void> Transaction::onError( Error const& e ) {
e.code() == error_code_cluster_not_fully_recovered)
{
if(e.code() == error_code_not_committed)
cx->transactionsNotCommitted++;
++cx->transactionsNotCommitted;
if(e.code() == error_code_commit_unknown_result)
cx->transactionsMaybeCommitted++;
++cx->transactionsMaybeCommitted;
if (e.code() == error_code_proxy_memory_limit_exceeded)
cx->transactionsResourceConstrained++;
++cx->transactionsResourceConstrained;
if (e.code() == error_code_process_behind)
cx->transactionsProcessBehind++;
++cx->transactionsProcessBehind;
if (e.code() == error_code_cluster_not_fully_recovered) {
cx->transactionWaitsForFullRecovery++;
++cx->transactionWaitsForFullRecovery;
}
double backoff = getBackoff(e.code());
reset();
return delay( backoff, info.taskID );
return delay(backoff, info.taskID);
}
if (e.code() == error_code_transaction_too_old ||
e.code() == error_code_future_version)
{
if( e.code() == error_code_transaction_too_old )
cx->transactionsTooOld++;
++cx->transactionsTooOld;
else if( e.code() == error_code_future_version )
cx->transactionsFutureVersions++;
++cx->transactionsFutureVersions;
double maxBackoff = options.maxBackoff;
reset();
return delay( std::min(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, maxBackoff), info.taskID );
return delay(std::min(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, maxBackoff), info.taskID);
}
if(g_network->isSimulated() && ++numErrors % 10 == 0)
@ -3235,7 +3276,7 @@ ACTOR Future< StorageMetrics > waitStorageMetricsMultipleLocations(
WaitMetricsRequest req(locations[i].first, StorageMetrics(), StorageMetrics());
req.min.bytes = 0;
req.max.bytes = -1;
fx[i] = loadBalance( locations[i].second, &StorageServerInterface::waitMetrics, req, TaskDataDistribution );
fx[i] = loadBalance( locations[i].second, &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution );
}
wait( waitForAll(fx) );
@ -3266,7 +3307,7 @@ ACTOR Future< StorageMetrics > waitStorageMetrics(
int shardLimit )
{
loop {
vector< pair<KeyRange, Reference<LocationInfo>> > locations = wait( getKeyRangeLocations( cx, keys, shardLimit, false, &StorageServerInterface::waitMetrics, TransactionInfo(TaskDataDistribution) ) );
vector< pair<KeyRange, Reference<LocationInfo>> > locations = wait( getKeyRangeLocations( cx, keys, shardLimit, false, &StorageServerInterface::waitMetrics, TransactionInfo(TaskPriority::DataDistribution) ) );
//SOMEDAY: Right now, if there are too many shards we delay and check again later. There may be a better solution to this.
if(locations.size() < shardLimit) {
@ -3276,7 +3317,7 @@ ACTOR Future< StorageMetrics > waitStorageMetrics(
fx = waitStorageMetricsMultipleLocations( locations, min, max, permittedError );
} else {
WaitMetricsRequest req( keys, min, max );
fx = loadBalance( locations[0].second, &StorageServerInterface::waitMetrics, req, TaskDataDistribution );
fx = loadBalance( locations[0].second, &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution );
}
StorageMetrics x = wait(fx);
return x;
@ -3286,14 +3327,14 @@ ACTOR Future< StorageMetrics > waitStorageMetrics(
throw;
}
cx->invalidateCache(keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskDataDistribution));
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
}
} else {
TraceEvent(SevWarn, "WaitStorageMetricsPenalty")
.detail("Keys", keys)
.detail("Limit", CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT)
.detail("JitteredSecondsOfPenitence", CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY);
wait(delayJittered(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskDataDistribution));
wait(delayJittered(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution));
// make sure that the next getKeyRangeLocations() call will actually re-fetch the range
cx->invalidateCache( keys );
}
@ -3319,13 +3360,13 @@ Future< StorageMetrics > Transaction::getStorageMetrics( KeyRange const& keys, i
ACTOR Future< Standalone<VectorRef<KeyRef>> > splitStorageMetrics( Database cx, KeyRange keys, StorageMetrics limit, StorageMetrics estimated )
{
loop {
state vector< pair<KeyRange, Reference<LocationInfo>> > locations = wait( getKeyRangeLocations( cx, keys, CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT, false, &StorageServerInterface::splitMetrics, TransactionInfo(TaskDataDistribution) ) );
state vector< pair<KeyRange, Reference<LocationInfo>> > locations = wait( getKeyRangeLocations( cx, keys, CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT, false, &StorageServerInterface::splitMetrics, TransactionInfo(TaskPriority::DataDistribution) ) );
state StorageMetrics used;
state Standalone<VectorRef<KeyRef>> results;
//SOMEDAY: Right now, if there are too many shards we delay and check again later. There may be a better solution to this.
if(locations.size() == CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT) {
wait(delay(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskDataDistribution));
wait(delay(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution));
cx->invalidateCache(keys);
}
else {
@ -3336,7 +3377,7 @@ ACTOR Future< Standalone<VectorRef<KeyRef>> > splitStorageMetrics( Database cx,
state int i = 0;
for(; i<locations.size(); i++) {
SplitMetricsRequest req( locations[i].first, limit, used, estimated, i == locations.size() - 1 );
SplitMetricsReply res = wait( loadBalance( locations[i].second, &StorageServerInterface::splitMetrics, req, TaskDataDistribution ) );
SplitMetricsReply res = wait( loadBalance( locations[i].second, &StorageServerInterface::splitMetrics, req, TaskPriority::DataDistribution ) );
if( res.splits.size() && res.splits[0] <= results.back() ) { // split points are out of order, possibly because of moving data, throw error to retry
ASSERT_WE_THINK(false); // FIXME: This seems impossible and doesn't seem to be covered by testing
throw all_alternatives_failed();
@ -3362,7 +3403,7 @@ ACTOR Future< Standalone<VectorRef<KeyRef>> > splitStorageMetrics( Database cx,
throw;
}
cx->invalidateCache( keys );
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskDataDistribution));
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
}
}
}
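The hunks above are part of a tree-wide migration from unscoped integer task constants (TaskDataDistribution, TaskDefaultPromiseEndpoint, ...) to a scoped enum class TaskPriority. A minimal sketch of the pattern, with illustrative enumerator values rather than the real ones from flow/network.h:

enum class TaskPriority : int64_t {
	DefaultOnMainThread    = 7500,  // illustrative value
	DefaultPromiseEndpoint = 8550,  // illustrative value
	DataDistribution       = 3500,  // illustrative value
};

// Scoped enumerators no longer convert implicitly to int, so every call site
// must spell the type (TaskPriority::DataDistribution), and code that packs a
// priority into an integer field needs an explicit cast:
uint32_t packed = static_cast<uint32_t>(TaskPriority::DataDistribution);

This is why the mechanical TaskFoo -> TaskPriority::Foo edits are accompanied by static_casts wherever a priority crosses an integer boundary, as in the EndpointMap changes later in this diff.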


@ -74,8 +74,8 @@ class Database {
public:
enum { API_VERSION_LATEST = -1 };
static Database createDatabase( Reference<ClusterConnectionFile> connFile, int apiVersion, LocalityData const& clientLocality=LocalityData(), DatabaseContext *preallocatedDb=nullptr );
static Database createDatabase( std::string connFileName, int apiVersion, LocalityData const& clientLocality=LocalityData() );
static Database createDatabase( Reference<ClusterConnectionFile> connFile, int apiVersion, bool internal=true, LocalityData const& clientLocality=LocalityData(), DatabaseContext *preallocatedDb=nullptr );
static Database createDatabase( std::string connFileName, int apiVersion, bool internal=true, LocalityData const& clientLocality=LocalityData() );
Database() {} // an uninitialized database can be destructed or reassigned safely; that's it
void operator= ( Database const& rhs ) { db = rhs.db; }
@ -90,6 +90,8 @@ public:
inline DatabaseContext* extractPtr() { return db.extractPtr(); }
DatabaseContext* operator->() const { return db.getPtr(); }
const UniqueOrderedOptionList<FDBTransactionOptions>& getTransactionDefaults() const;
private:
Reference<DatabaseContext> db;
};
@ -147,6 +149,7 @@ struct TransactionOptions {
double maxBackoff;
uint32_t getReadVersionFlags;
uint32_t sizeLimit;
int maxTransactionLoggingFieldLength;
bool checkWritesEnabled : 1;
bool causalWriteRisky : 1;
bool commitOnFirstProxy : 1;
@ -163,26 +166,27 @@ struct TransactionOptions {
struct TransactionInfo {
Optional<UID> debugID;
int taskID;
TaskPriority taskID;
bool useProvisionalProxies;
explicit TransactionInfo( int taskID ) : taskID(taskID), useProvisionalProxies(false) {}
explicit TransactionInfo( TaskPriority taskID ) : taskID(taskID), useProvisionalProxies(false) {}
};
struct TransactionLogInfo : public ReferenceCounted<TransactionLogInfo>, NonCopyable {
enum LoggingLocation { DONT_LOG = 0, TRACE_LOG = 1, DATABASE = 2 };
TransactionLogInfo() : logLocation(DONT_LOG) {}
TransactionLogInfo(LoggingLocation location) : logLocation(location) {}
TransactionLogInfo(std::string id, LoggingLocation location) : logLocation(location), identifier(id) {}
TransactionLogInfo() : logLocation(DONT_LOG), maxFieldLength(0) {}
TransactionLogInfo(LoggingLocation location) : logLocation(location), maxFieldLength(0) {}
TransactionLogInfo(std::string id, LoggingLocation location) : logLocation(location), identifier(id), maxFieldLength(0) {}
void setIdentifier(std::string id) { identifier = id; }
void logTo(LoggingLocation loc) { logLocation = logLocation | loc; }
template <typename T>
void addLog(const T& event) {
if(logLocation & TRACE_LOG) {
ASSERT(!identifier.empty());
event.logEvent(identifier);
event.logEvent(identifier, maxFieldLength);
}
if (flushed) {
@ -200,6 +204,7 @@ struct TransactionLogInfo : public ReferenceCounted<TransactionLogInfo>, NonCopy
bool logsAdded{ false };
bool flushed{ false };
int logLocation;
int maxFieldLength;
std::string identifier;
};
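TransactionLogInfo now threads maxFieldLength through to each event's logEvent(identifier, maxFieldLength). A hedged sketch of the truncation this enables; truncateField is a hypothetical helper, and the real per-event logic lives in ClientLogEvents.h:

// Trim an escaped field to at most maxFieldLength bytes. A non-positive limit
// means "do not truncate", matching the documented contract that a negative
// option value disables truncation.
std::string truncateField(std::string field, int maxFieldLength) {
	if (maxFieldLength > 0 && field.size() > static_cast<size_t>(maxFieldLength))
		return field.substr(0, maxFieldLength) + "...";
	return field;
}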
@ -286,7 +291,7 @@ public:
void flushTrLogsIfEnabled();
// These are to permit use as state variables in actors:
Transaction() : info( TaskDefaultEndpoint ) {}
Transaction() : info( TaskPriority::DefaultEndpoint ) {}
void operator=(Transaction&& r) BOOST_NOEXCEPT;
void reset();


@ -1124,7 +1124,8 @@ public:
};
ReadYourWritesTransaction::ReadYourWritesTransaction( Database const& cx ) : cache(&arena), writes(&arena), tr(cx), retries(0), creationTime(now()), commitStarted(false), options(tr), deferredError(cx->deferredError) {
resetTimeout();
std::copy(cx.getTransactionDefaults().begin(), cx.getTransactionDefaults().end(), std::back_inserter(persistentOptions));
applyPersistentOptions();
}
ACTOR Future<Void> timebomb(double endTime, Promise<Void> resetPromise) {
@ -1473,36 +1474,16 @@ void ReadYourWritesTransaction::writeRangeToNativeTransaction( KeyRangeRef const
}
ReadYourWritesTransactionOptions::ReadYourWritesTransactionOptions(Transaction const& tr) {
Database cx = tr.getDatabase();
timeoutInSeconds = cx->transactionTimeout;
maxRetries = cx->transactionMaxRetries;
reset(tr);
}
void ReadYourWritesTransactionOptions::reset(Transaction const& tr) {
double oldTimeout = timeoutInSeconds;
int oldMaxRetries = maxRetries;
memset(this, 0, sizeof(*this));
if( tr.apiVersionAtLeast(610) ) {
// Starting in API version 610, these options are not cleared after reset.
timeoutInSeconds = oldTimeout;
maxRetries = oldMaxRetries;
}
else {
Database cx = tr.getDatabase();
maxRetries = cx->transactionMaxRetries;
timeoutInSeconds = cx->transactionTimeout;
}
timeoutInSeconds = 0.0;
maxRetries = -1;
snapshotRywEnabled = tr.getDatabase()->snapshotRywEnabled;
}
void ReadYourWritesTransactionOptions::fullReset(Transaction const& tr) {
reset(tr);
Database cx = tr.getDatabase();
maxRetries = cx->transactionMaxRetries;
timeoutInSeconds = cx->transactionTimeout;
}
bool ReadYourWritesTransactionOptions::getAndResetWriteConflictDisabled() {
bool disabled = nextWriteDisableConflictRange;
nextWriteDisableConflictRange = false;
@ -1777,7 +1758,15 @@ Future<Standalone<StringRef>> ReadYourWritesTransaction::getVersionstamp() {
return waitOrError(tr.getVersionstamp(), resetPromise.getFuture());
}
void ReadYourWritesTransaction::setOption( FDBTransactionOptions::Option option, Optional<StringRef> value ) {
void ReadYourWritesTransaction::setOption( FDBTransactionOptions::Option option, Optional<StringRef> value ) {
setOptionImpl(option, value);
if (FDBTransactionOptions::optionInfo.getMustExist(option).persistent) {
persistentOptions.emplace_back(option, value.castTo<Standalone<StringRef>>());
}
}
void ReadYourWritesTransaction::setOptionImpl( FDBTransactionOptions::Option option, Optional<StringRef> value ) {
switch(option) {
case FDBTransactionOptions::READ_YOUR_WRITES_DISABLE:
validateOptionValue(value, false);
@ -1872,6 +1861,7 @@ void ReadYourWritesTransaction::operator=(ReadYourWritesTransaction&& r) BOOST_N
transactionDebugInfo = r.transactionDebugInfo;
cache.arena = &arena;
writes.arena = &arena;
persistentOptions = std::move(r.persistentOptions);
}
ReadYourWritesTransaction::ReadYourWritesTransaction(ReadYourWritesTransaction&& r) BOOST_NOEXCEPT :
@ -1894,12 +1884,32 @@ ReadYourWritesTransaction::ReadYourWritesTransaction(ReadYourWritesTransaction&&
readConflicts = std::move(r.readConflicts);
watchMap = std::move( r.watchMap );
r.resetPromise = Promise<Void>();
persistentOptions = std::move(r.persistentOptions);
}
Future<Void> ReadYourWritesTransaction::onError(Error const& e) {
return RYWImpl::onError( this, e );
}
void ReadYourWritesTransaction::applyPersistentOptions() {
Optional<StringRef> timeout;
for (auto option : persistentOptions) {
if(option.first == FDBTransactionOptions::TIMEOUT) {
timeout = option.second.castTo<StringRef>();
}
else {
setOptionImpl(option.first, option.second.castTo<StringRef>());
}
}
// Setting a timeout can immediately cause a transaction to fail. The only timeout
// that matters is the one most recently set, so we ignore any earlier set timeouts
// that might inadvertently fail the transaction.
if(timeout.present()) {
setOptionImpl(FDBTransactionOptions::TIMEOUT, timeout);
}
}
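The effect, sketched against a standard retry loop: options marked persistent (TIMEOUT here) are replayed by applyPersistentOptions() each time resetRyow() runs, so a retry keeps its timeout without the caller re-setting it. The values are illustrative, and as elsewhere in this change the option payload is an 8-byte little-endian integer:

// Inside an ACTOR function:
state ReadYourWritesTransaction tr(db);
int64_t timeoutMs = 5000;
tr.setOption(FDBTransactionOptions::TIMEOUT, StringRef((const uint8_t*)&timeoutMs, 8));
loop {
	try {
		// ... reads and writes ...
		wait(tr.commit());
		break;
	} catch (Error& e) {
		wait(tr.onError(e));  // resets the transaction, then replays persistent options
	}
}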
void ReadYourWritesTransaction::resetRyow() {
Promise<Void> oldReset = resetPromise;
resetPromise = Promise<Void>();
@ -1917,7 +1927,7 @@ void ReadYourWritesTransaction::resetRyow() {
if(tr.apiVersionAtLeast(16)) {
options.reset(tr);
resetTimeout();
applyPersistentOptions();
}
if ( !oldReset.isSet() )
@ -1933,9 +1943,11 @@ void ReadYourWritesTransaction::reset() {
retries = 0;
creationTime = now();
timeoutActor.cancel();
options.fullReset(tr);
persistentOptions.clear();
options.reset(tr);
transactionDebugInfo.clear();
tr.fullReset();
std::copy(tr.getDatabase().getTransactionDefaults().begin(), tr.getDatabase().getTransactionDefaults().end(), std::back_inserter(persistentOptions));
resetRyow();
}


@ -44,7 +44,6 @@ struct ReadYourWritesTransactionOptions {
ReadYourWritesTransactionOptions() {}
explicit ReadYourWritesTransactionOptions(Transaction const& tr);
void reset(Transaction const& tr);
void fullReset(Transaction const& tr);
bool getAndResetWriteConflictDisabled();
};
@ -160,6 +159,10 @@ private:
void debugLogRetries(Optional<Error> error = Optional<Error>());
void setOptionImpl( FDBTransactionOptions::Option option, Optional<StringRef> value = Optional<StringRef>() );
void applyPersistentOptions();
std::vector<std::pair<FDBTransactionOptions::Option, Optional<Standalone<StringRef>>>> persistentOptions;
ReadYourWritesTransactionOptions options;
};


@ -56,6 +56,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"roles":[
{
"query_queue_max":0,
"local_rate":0,
"input_bytes":{
"hz":0.0,
"counter":0,
@ -207,7 +208,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"megabits_received":{
"hz":0.0
}
}
},
"run_loop_busy":0.2
}
},
"old_logs":[
@ -249,7 +251,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"storage_server_min_free_space",
"storage_server_min_free_space_ratio",
"log_server_min_free_space",
"log_server_min_free_space_ratio"
"log_server_min_free_space_ratio",
"storage_server_durability_lag"
]
},
"description":"The database is not being saturated by the workload."
@ -268,7 +271,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"storage_server_min_free_space",
"storage_server_min_free_space_ratio",
"log_server_min_free_space",
"log_server_min_free_space_ratio"
"log_server_min_free_space_ratio",
"storage_server_durability_lag"
]
},
"description":"The database is not being saturated by the workload."
@ -285,7 +289,10 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"incompatible_connections":[
],
"datacenter_version_difference":0,
"datacenter_lag": {
"seconds" : 1.0,
"versions" : 1000000
},
"degraded_processes":0,
"database_available":true,
"database_locked":false,
@ -315,6 +322,10 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
}
]
},
"page_cache":{
"log_hit_rate":0.5,
"storage_hit_rate":0.5
},
"messages":[
{
"reasons":[
@ -430,6 +441,21 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"counter":0,
"roughness":0.0
},
"started_immediate_priority":{
"hz":0.0,
"counter":0,
"roughness":0.0
},
"started_default_priority":{
"hz":0.0,
"counter":0,
"roughness":0.0
},
"started_batch_priority":{
"hz":0.0,
"counter":0,
"roughness":0.0
},
"conflicted":{
"hz":0.0,
"counter":0,


@ -291,7 +291,7 @@ ACTOR Future<Optional<StatusObject>> clientCoordinatorsStatusFetcher(Reference<C
state vector<Future<Optional<LeaderInfo>>> leaderServers;
for (int i = 0; i < coord.clientLeaderServers.size(); i++)
leaderServers.push_back(retryBrokenPromise(coord.clientLeaderServers[i].getLeader, GetLeaderRequest(coord.clusterKey, UID()), TaskCoordinationReply));
leaderServers.push_back(retryBrokenPromise(coord.clientLeaderServers[i].getLeader, GetLeaderRequest(coord.clusterKey, UID()), TaskPriority::CoordinationReply));
wait( smartQuorum(leaderServers, leaderServers.size() / 2 + 1, 1.5) || delay(2.0) );


@ -80,9 +80,9 @@ struct StorageServerInterface {
bool operator == (StorageServerInterface const& s) const { return uniqueID == s.uniqueID; }
bool operator < (StorageServerInterface const& s) const { return uniqueID < s.uniqueID; }
void initEndpoints() {
getValue.getEndpoint( TaskLoadBalancedEndpoint );
getKey.getEndpoint( TaskLoadBalancedEndpoint );
getKeyValues.getEndpoint( TaskLoadBalancedEndpoint );
getValue.getEndpoint( TaskPriority::LoadBalancedEndpoint );
getKey.getEndpoint( TaskPriority::LoadBalancedEndpoint );
getKeyValues.getEndpoint( TaskPriority::LoadBalancedEndpoint );
}
};


@ -51,8 +51,19 @@ Reference<ITransaction> ThreadSafeDatabase::createTransaction() {
}
void ThreadSafeDatabase::setOption( FDBDatabaseOptions::Option option, Optional<StringRef> value) {
auto itr = FDBDatabaseOptions::optionInfo.find(option);
if(itr != FDBDatabaseOptions::optionInfo.end()) {
TraceEvent("SetDatabaseOption").detail("Option", itr->second.name);
}
else {
TraceEvent("UnknownDatabaseOption").detail("Option", option);
throw invalid_option();
}
DatabaseContext *db = this->db;
Standalone<Optional<StringRef>> passValue = value;
// ThreadSafeDatabase is not allowed to do anything with options except pass them through to the DatabaseContext.
onMainThreadVoid( [db, option, passValue](){
db->checkDeferredError();
db->setOption(option, passValue.contents());
@ -68,7 +79,7 @@ ThreadSafeDatabase::ThreadSafeDatabase(std::string connFilename, int apiVersion)
onMainThreadVoid([db, connFile, apiVersion](){
try {
Database::createDatabase(connFile, apiVersion, LocalityData(), db).extractPtr();
Database::createDatabase(connFile, apiVersion, false, LocalityData(), db).extractPtr();
}
catch(Error &e) {
new (db) DatabaseContext(e);
@ -272,8 +283,16 @@ ThreadFuture<Standalone<StringRef>> ThreadSafeTransaction::getVersionstamp() {
}
void ThreadSafeTransaction::setOption( FDBTransactionOptions::Option option, Optional<StringRef> value ) {
auto itr = FDBTransactionOptions::optionInfo.find(option);
if(itr == FDBTransactionOptions::optionInfo.end()) {
TraceEvent("UnknownTransactionOption").detail("Option", option);
throw invalid_option();
}
ReadYourWritesTransaction *tr = this->tr;
Standalone<Optional<StringRef>> passValue = value;
// ThreadSafeTransaction is not allowed to do anything with options except pass them through to RYW.
onMainThreadVoid( [tr, option, passValue](){ tr->setOption(option, passValue.contents()); }, &tr->deferredError );
}
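Both thread-safe wrappers now validate the option code on the calling thread before hopping to the main thread, so an unknown code fails fast with invalid_option instead of surfacing later as a deferred error. A sketch, assuming 99999 is not a defined option code:

try {
	tr.setOption(static_cast<FDBTransactionOptions::Option>(99999));
} catch (Error& e) {
	ASSERT(e.code() == error_code_invalid_option);  // thrown synchronously
}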


@ -31,7 +31,7 @@
#include "flow/actorcompiler.h" // This must be the last #include.
ACTOR template <class Tree>
Future<Void> deferredCleanupActor( std::vector<Tree> toFree, int taskID = 7000 ) {
Future<Void> deferredCleanupActor( std::vector<Tree> toFree, TaskPriority taskID = TaskPriority::DefaultYield ) {
state int freeCount = 0;
while (!toFree.empty()) {
Tree a = std::move( toFree.back() );


@ -511,7 +511,7 @@ public:
oldestVersion = newOldestVersion;
}
Future<Void> forgetVersionsBeforeAsync( Version newOldestVersion, int taskID = 7000 ) {
Future<Void> forgetVersionsBeforeAsync( Version newOldestVersion, TaskPriority taskID = TaskPriority::DefaultYield ) {
ASSERT( newOldestVersion <= latestVersion );
roots[newOldestVersion] = getRoot(newOldestVersion);


@ -31,7 +31,6 @@
<EnableCompile Condition="'$(Configuration)|$(Platform)'=='Release|X64'">false</EnableCompile>
</ActorCompiler>
<ClInclude Include="BlobStore.h" />
<ClInclude Include="ClientDBInfo.h" />
<ClInclude Include="ClientLogEvents.h" />
<ClInclude Include="ClientWorkerInterface.h" />
<ClInclude Include="ClusterInterface.h" />


@ -47,8 +47,8 @@ namespace vexillographer
private static string getCInfoLine(Option o, string indent, string structName)
{
return String.Format("{0}ADD_OPTION_INFO({1}, {2}, \"{2}\", \"{3}\", \"{4}\", {5}, {6})",
indent, structName, o.name.ToUpper(), o.comment, o.getParameterComment(), (o.paramDesc != null).ToString().ToLower(), o.hidden.ToString().ToLower());
return String.Format("{0}ADD_OPTION_INFO({1}, {2}, \"{2}\", \"{3}\", \"{4}\", {5}, {6}, {7}, {8})",
indent, structName, o.name.ToUpper(), o.comment, o.getParameterComment(), (o.paramDesc != null).ToString().ToLower(), o.hidden.ToString().ToLower(), o.persistent.ToString().ToLower(), o.defaultFor);
}
private static void writeCppInfo(TextWriter outFile, Scope scope, IEnumerable<Option> options)


@ -146,22 +146,31 @@ description is not currently required but encouraged.
<Option name="datacenter_id" code="22"
paramType="String" paramDescription="Hexadecimal ID"
description="Specify the datacenter ID that was passed to fdbserver processes running in the same datacenter as this client, for better location-aware load balancing." />
<Option name="transaction_timeout" code="500"
paramType="Int" paramDescription="value in milliseconds of timeout"
description="Set a timeout in milliseconds which, when elapsed, will cause each transaction automatically to be cancelled. This sets the ``timeout`` option of each transaction created by this database. See the transaction option description for more information. Using this option requires that the API version is 610 or higher." />
<Option name="transaction_retry_limit" code="501"
paramType="Int" paramDescription="number of times to retry"
description="Set a timeout in milliseconds which, when elapsed, will cause a transaction automatically to be cancelled. This sets the ``retry_limit`` option of each transaction created by this database. See the transaction option description for more information." />
<Option name="transaction_max_retry_delay" code="502"
paramType="Int" paramDescription="value in milliseconds of maximum delay"
description="Set the maximum amount of backoff delay incurred in the call to ``onError`` if the error is retryable. This sets the ``max_retry_delay`` option of each transaction created by this database. See the transaction option description for more information." />
<Option name="transaction_size_limit" code="503"
paramType="Int" paramDescription="value in bytes"
description="Set the maximum transaction size which, if exceeded, will cause the transaction to be cancelled. Default to 10,000,000 bytes." />
<!-- The snapshot RYW options act like defaults for the equivalent transaction options, but database defaults cannot have cumulative effects from multiple calls.
Thus, we don't use the defaultFor annotation on these options. -->
<Option name="snapshot_ryw_enable" code="26"
description="Snapshot read operations will see the results of writes done in the same transaction. This is the default behavior." />
<Option name="snapshot_ryw_disable" code="27"
description="Snapshot read operations will not see the results of writes done in the same transaction. This was the default behavior prior to API version 300." />
<Option name="transaction_logging_max_field_length" code="405" paramType="Int" paramDescription="Maximum length of escaped key and value fields."
description="Sets the maximum escaped length of key and value fields to be logged to the trace file via the LOG_TRANSACTION option. This sets the ``transaction_logging_max_field_length`` option of each transaction created by this database. See the transaction option description for more information."
defaultFor="405"/>
<Option name="transaction_timeout" code="500"
paramType="Int" paramDescription="value in milliseconds of timeout"
description="Set a timeout in milliseconds which, when elapsed, will cause each transaction automatically to be cancelled. This sets the ``timeout`` option of each transaction created by this database. See the transaction option description for more information. Using this option requires that the API version is 610 or higher."
defaultFor="500"/>
<Option name="transaction_retry_limit" code="501"
paramType="Int" paramDescription="number of times to retry"
description="Set a timeout in milliseconds which, when elapsed, will cause a transaction automatically to be cancelled. This sets the ``retry_limit`` option of each transaction created by this database. See the transaction option description for more information."
defaultFor="501"/>
<Option name="transaction_max_retry_delay" code="502"
paramType="Int" paramDescription="value in milliseconds of maximum delay"
description="Set the maximum amount of backoff delay incurred in the call to ``onError`` if the error is retryable. This sets the ``max_retry_delay`` option of each transaction created by this database. See the transaction option description for more information."
defaultFor="502"/>
<Option name="transaction_size_limit" code="503"
paramType="Int" paramDescription="value in bytes"
description="Set the maximum transaction size in bytes. This sets the ``size_limit`` option on each transaction created by this database. See the transaction option description for more information."
defaultFor="503"/>
</Scope>
<Scope name="TransactionOption">
@ -204,19 +213,24 @@ description is not currently required but encouraged.
description="Sets a client provided identifier for the transaction that will be used in scenarios like tracing or profiling. Client trace logging or transaction profiling must be separately enabled." />
<Option name="log_transaction" code="404"
description="Enables tracing for this transaction and logs results to the client trace logs. The DEBUG_TRANSACTION_IDENTIFIER option must be set before using this option, and client trace logging must be enabled and to get log output." />
<Option name="transaction_logging_max_field_length" code="405" paramType="Int" paramDescription="Maximum length of escaped key and value fields."
description="Sets the maximum escaped length of key and value fields to be logged to the trace file via the LOG_TRANSACTION option, after which the field will be truncated. A negative value disables truncation." />
<Option name="timeout" code="500"
paramType="Int" paramDescription="value in milliseconds of timeout"
description="Set a timeout in milliseconds which, when elapsed, will cause the transaction automatically to be cancelled. Valid parameter values are ``[0, INT_MAX]``. If set to 0, will disable all timeouts. All pending and any future uses of the transaction will throw an exception. The transaction can be used again after it is reset. Prior to API version 610, like all other transaction options, the timeout must be reset after a call to ``onError``. If the API version is 610 or greater, the timeout is not reset after an ``onError`` call. This allows the user to specify a longer timeout on specific transactions than the default timeout specified through the ``transaction_timeout`` database option without the shorter database timeout cancelling transactions that encounter a retryable error. Note that at all API versions, it is safe and legal to set the timeout each time the transaction begins, so most code written assuming the older behavior can be upgraded to the newer behavior without requiring any modification, and the caller is not required to implement special logic in retry loops to only conditionally set this option." />
description="Set a timeout in milliseconds which, when elapsed, will cause the transaction automatically to be cancelled. Valid parameter values are ``[0, INT_MAX]``. If set to 0, will disable all timeouts. All pending and any future uses of the transaction will throw an exception. The transaction can be used again after it is reset. Prior to API version 610, like all other transaction options, the timeout must be reset after a call to ``onError``. If the API version is 610 or greater, the timeout is not reset after an ``onError`` call. This allows the user to specify a longer timeout on specific transactions than the default timeout specified through the ``transaction_timeout`` database option without the shorter database timeout cancelling transactions that encounter a retryable error. Note that at all API versions, it is safe and legal to set the timeout each time the transaction begins, so most code written assuming the older behavior can be upgraded to the newer behavior without requiring any modification, and the caller is not required to implement special logic in retry loops to only conditionally set this option."
persistent="true" />
<Option name="retry_limit" code="501"
paramType="Int" paramDescription="number of times to retry"
description="Set a maximum number of retries after which additional calls to ``onError`` will throw the most recently seen error code. Valid parameter values are ``[-1, INT_MAX]``. If set to -1, will disable the retry limit. Prior to API version 610, like all other transaction options, the retry limit must be reset after a call to ``onError``. If the API version is 610 or greater, the retry limit is not reset after an ``onError`` call. Note that at all API versions, it is safe and legal to set the retry limit each time the transaction begins, so most code written assuming the older behavior can be upgraded to the newer behavior without requiring any modification, and the caller is not required to implement special logic in retry loops to only conditionally set this option." />
description="Set a maximum number of retries after which additional calls to ``onError`` will throw the most recently seen error code. Valid parameter values are ``[-1, INT_MAX]``. If set to -1, will disable the retry limit. Prior to API version 610, like all other transaction options, the retry limit must be reset after a call to ``onError``. If the API version is 610 or greater, the retry limit is not reset after an ``onError`` call. Note that at all API versions, it is safe and legal to set the retry limit each time the transaction begins, so most code written assuming the older behavior can be upgraded to the newer behavior without requiring any modification, and the caller is not required to implement special logic in retry loops to only conditionally set this option."
persistent="true"/>
<Option name="max_retry_delay" code="502"
paramType="Int" paramDescription="value in milliseconds of maximum delay"
description="Set the maximum amount of backoff delay incurred in the call to ``onError`` if the error is retryable. Defaults to 1000 ms. Valid parameter values are ``[0, INT_MAX]``. If the maximum retry delay is less than the current retry delay of the transaction, then the current retry delay will be clamped to the maximum retry delay. Prior to API version 610, like all other transaction options, the maximum retry delay must be reset after a call to ``onError``. If the API version is 610 or greater, the retry limit is not reset after an ``onError`` call. Note that at all API versions, it is safe and legal to set the maximum retry delay each time the transaction begins, so most code written assuming the older behavior can be upgraded to the newer behavior without requiring any modification, and the caller is not required to implement special logic in retry loops to only conditionally set this option."/>
description="Set the maximum amount of backoff delay incurred in the call to ``onError`` if the error is retryable. Defaults to 1000 ms. Valid parameter values are ``[0, INT_MAX]``. If the maximum retry delay is less than the current retry delay of the transaction, then the current retry delay will be clamped to the maximum retry delay. Prior to API version 610, like all other transaction options, the maximum retry delay must be reset after a call to ``onError``. If the API version is 610 or greater, the retry limit is not reset after an ``onError`` call. Note that at all API versions, it is safe and legal to set the maximum retry delay each time the transaction begins, so most code written assuming the older behavior can be upgraded to the newer behavior without requiring any modification, and the caller is not required to implement special logic in retry loops to only conditionally set this option."
persistent="true"/>
<Option name="size_limit" code="503"
paramType="Int" paramDescription="value in bytes"
description="Set the maximum transaction size which, if exceeded, will cause the transaction to be cancelled. Valid parameter values are ``[32, 10,000,000]```." />
<Option name="snapshot_ryw_enable" code="600"
description="Set the transaction size limit in bytes. The size is calculated by combining the sizes of all keys and values written or mutated, all key ranges cleared, and all read and write conflict ranges. (In other words, it includes the total size of all data included in the request to the cluster to commit the transaction.) Large transactions can cause performance problems on FoundationDB clusters, so setting this limit to a smaller value than the default can help prevent the client from accidentally degrading the cluster's performance. This value must be at least 32 and cannot be set to higher than 10,000,000, the default transaction size limit." />
<Option name="snapshot_ryw_enable" code="600"
description="Snapshot read operations will see the results of writes done in the same transaction. This is the default behavior." />
<Option name="snapshot_ryw_disable" code="601"
description="Snapshot read operations will not see the results of writes done in the same transaction. This was the default behavior prior to API version 300." />


@ -54,6 +54,8 @@ namespace vexillographer
public string paramDesc { get; set; }
public int code { get; set; }
public bool hidden { get; set; }
public bool persistent { get; set; }
public int defaultFor { get; set; }
private string _comment;
public string comment {
get {
@ -132,6 +134,9 @@ namespace vexillographer
var paramTypeStr = oDoc.AttributeOrNull("paramType");
ParamType p = paramTypeStr == null ? ParamType.None : (ParamType)Enum.Parse(typeof(ParamType), paramTypeStr);
bool hidden = oDoc.AttributeOrNull("hidden") == "true";
bool persistent = oDoc.AttributeOrNull("persistent") == "true";
String defaultForString = oDoc.AttributeOrNull("defaultFor");
int defaultFor = defaultForString == null ? -1 : int.Parse(defaultForString);
string disableOn = oDoc.AttributeOrNull("disableOn");
bool disabled = false;
if(disableOn != null)
@ -150,7 +155,9 @@ namespace vexillographer
paramType = p,
paramDesc = oDoc.AttributeOrNull("paramDescription"),
comment = oDoc.AttributeOrNull("description"),
hidden = hidden
hidden = hidden,
persistent = persistent,
defaultFor = defaultFor
});
}
}
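With the two extra format arguments, every generated option-table entry now carries persistent and defaultFor. For the transaction-side timeout option defined earlier (persistent="true", no defaultFor attribute), the emitted line would look roughly like this, description text elided:

ADD_OPTION_INFO(FDBTransactionOptions, TIMEOUT, "TIMEOUT",
                "Set a timeout in milliseconds ...",
                "value in milliseconds of timeout",
                true,   // has a parameter
                false,  // hidden
                true,   // persistent
                -1)     // defaultFor: -1 when the attribute is absent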


@ -67,8 +67,6 @@ struct EvictablePageCache : ReferenceCounted<EvictablePageCache> {
EvictablePageCache() : pageSize(0), maxPages(0), cacheEvictionType(RANDOM) {}
explicit EvictablePageCache(int pageSize, int64_t maxSize) : pageSize(pageSize), maxPages(maxSize / pageSize), cacheEvictionType(evictionPolicyStringToEnum(FLOW_KNOBS->CACHE_EVICTION_POLICY)) {
cacheHits.init(LiteralStringRef("EvictablePageCache.CacheHits"));
cacheMisses.init(LiteralStringRef("EvictablePageCache.CacheMisses"));
cacheEvictions.init(LiteralStringRef("EvictablePageCache.CacheEvictions"));
}
@ -82,7 +80,6 @@ struct EvictablePageCache : ReferenceCounted<EvictablePageCache> {
} else {
lruPages.push_back(*page); // new page is considered the most recently used (placed at LRU tail)
}
++cacheMisses;
}
void updateHit(EvictablePage* page) {
@ -91,7 +88,6 @@ struct EvictablePageCache : ReferenceCounted<EvictablePageCache> {
lruPages.erase(List::s_iterator_to(*page));
lruPages.push_back(*page);
}
++cacheHits;
}
void try_evict() {
@ -126,8 +122,6 @@ struct EvictablePageCache : ReferenceCounted<EvictablePageCache> {
List lruPages;
int pageSize;
int64_t maxPages;
Int64MetricHandle cacheHits;
Int64MetricHandle cacheMisses;
Int64MetricHandle cacheEvictions;
const CacheEvictionType cacheEvictionType;
};
@ -278,6 +272,8 @@ private:
Int64MetricHandle countFileCacheWrites;
Int64MetricHandle countFileCacheReadsBlocked;
Int64MetricHandle countFileCacheWritesBlocked;
Int64MetricHandle countFileCachePageReadsHit;
Int64MetricHandle countFileCachePageReadsMissed;
Int64MetricHandle countFileCachePageReadsMerged;
Int64MetricHandle countFileCacheReadBytes;
@ -286,28 +282,33 @@ private:
Int64MetricHandle countCacheWrites;
Int64MetricHandle countCacheReadsBlocked;
Int64MetricHandle countCacheWritesBlocked;
Int64MetricHandle countCachePageReadsHit;
Int64MetricHandle countCachePageReadsMissed;
Int64MetricHandle countCachePageReadsMerged;
Int64MetricHandle countCacheReadBytes;
AsyncFileCached( Reference<IAsyncFile> uncached, const std::string& filename, int64_t length, Reference<EvictablePageCache> pageCache )
AsyncFileCached( Reference<IAsyncFile> uncached, const std::string& filename, int64_t length, Reference<EvictablePageCache> pageCache )
: uncached(uncached), filename(filename), length(length), prevLength(length), pageCache(pageCache), currentTruncate(Void()), currentTruncateSize(0) {
if( !g_network->isSimulated() ) {
countFileCacheWrites.init( LiteralStringRef("AsyncFile.CountFileCacheWrites"), filename);
countFileCacheReads.init( LiteralStringRef("AsyncFile.CountFileCacheReads"), filename);
countFileCacheWritesBlocked.init( LiteralStringRef("AsyncFile.CountFileCacheWritesBlocked"), filename);
countFileCacheReadsBlocked.init( LiteralStringRef("AsyncFile.CountFileCacheReadsBlocked"), filename);
countFileCacheWrites.init(LiteralStringRef("AsyncFile.CountFileCacheWrites"), filename);
countFileCacheReads.init(LiteralStringRef("AsyncFile.CountFileCacheReads"), filename);
countFileCacheWritesBlocked.init(LiteralStringRef("AsyncFile.CountFileCacheWritesBlocked"), filename);
countFileCacheReadsBlocked.init(LiteralStringRef("AsyncFile.CountFileCacheReadsBlocked"), filename);
countFileCachePageReadsHit.init(LiteralStringRef("AsyncFile.CountFileCachePageReadsHit"), filename);
countFileCachePageReadsMissed.init(LiteralStringRef("AsyncFile.CountFileCachePageReadsMissed"), filename);
countFileCachePageReadsMerged.init(LiteralStringRef("AsyncFile.CountFileCachePageReadsMerged"), filename);
countFileCacheFinds.init( LiteralStringRef("AsyncFile.CountFileCacheFinds"), filename);
countFileCacheReadBytes.init( LiteralStringRef("AsyncFile.CountFileCacheReadBytes"), filename);
countFileCacheFinds.init(LiteralStringRef("AsyncFile.CountFileCacheFinds"), filename);
countFileCacheReadBytes.init(LiteralStringRef("AsyncFile.CountFileCacheReadBytes"), filename);
countCacheWrites.init( LiteralStringRef("AsyncFile.CountCacheWrites"));
countCacheReads.init( LiteralStringRef("AsyncFile.CountCacheReads"));
countCacheWritesBlocked.init( LiteralStringRef("AsyncFile.CountCacheWritesBlocked"));
countCacheReadsBlocked.init( LiteralStringRef("AsyncFile.CountCacheReadsBlocked"));
countCacheWrites.init(LiteralStringRef("AsyncFile.CountCacheWrites"));
countCacheReads.init(LiteralStringRef("AsyncFile.CountCacheReads"));
countCacheWritesBlocked.init(LiteralStringRef("AsyncFile.CountCacheWritesBlocked"));
countCacheReadsBlocked.init(LiteralStringRef("AsyncFile.CountCacheReadsBlocked"));
countCachePageReadsHit.init(LiteralStringRef("AsyncFile.CountCachePageReadsHit"));
countCachePageReadsMissed.init(LiteralStringRef("AsyncFile.CountCachePageReadsMissed"));
countCachePageReadsMerged.init(LiteralStringRef("AsyncFile.CountCachePageReadsMerged"));
countCacheFinds.init( LiteralStringRef("AsyncFile.CountCacheFinds"));
countCacheReadBytes.init( LiteralStringRef("AsyncFile.CountCacheReadBytes"));
countCacheFinds.init(LiteralStringRef("AsyncFile.CountCacheFinds"));
countCacheReadBytes.init(LiteralStringRef("AsyncFile.CountCacheReadBytes"));
}
}
@ -387,11 +388,18 @@ struct AFCPage : public EvictablePage, public FastAllocated<AFCPage> {
// If there are no active readers then if data is valid or we're replacing all of it we can write directly
if (valid || fullPage) {
if(!fullPage) {
++owner->countFileCachePageReadsHit;
++owner->countCachePageReadsHit;
}
valid = true;
memcpy( static_cast<uint8_t*>(this->data) + offset, data, length );
return yield();
}
++owner->countFileCachePageReadsMissed;
++owner->countCachePageReadsMissed;
// If data is not valid but no read is in progress, start reading
if (notReading.isReady()) {
notReading = readThrough( this );
@ -410,7 +418,14 @@ struct AFCPage : public EvictablePage, public FastAllocated<AFCPage> {
Future<Void> readZeroCopy() {
++zeroCopyRefCount;
if (valid) return yield();
if (valid) {
++owner->countFileCachePageReadsHit;
++owner->countCachePageReadsHit;
return yield();
}
++owner->countFileCachePageReadsMissed;
++owner->countCachePageReadsMissed;
if (notReading.isReady()) {
notReading = readThrough( this );
@ -428,12 +443,17 @@ struct AFCPage : public EvictablePage, public FastAllocated<AFCPage> {
Future<Void> read( void* data, int length, int offset ) {
if (valid) {
++owner->countFileCachePageReadsHit;
++owner->countCachePageReadsHit;
owner->countFileCacheReadBytes += length;
owner->countCacheReadBytes += length;
memcpy( data, static_cast<uint8_t const*>(this->data) + offset, length );
return yield();
}
++owner->countFileCachePageReadsMissed;
++owner->countCachePageReadsMissed;
if (notReading.isReady()) {
notReading = readThrough( this );
} else {


@ -266,7 +266,7 @@ private:
}
ACTOR static Future<int> read_impl( int fd, void* data, int length, int64_t offset ) {
state int taskID = g_network->getCurrentTask();
state TaskPriority taskID = g_network->getCurrentTask();
state Promise<Void> p;
//fprintf(stderr, "eio_read (fd=%d length=%d offset=%lld)\n", fd, length, offset);
state eio_req* r = eio_read(fd, data, length, offset, 0, eio_callback, &p);
@ -289,7 +289,7 @@ private:
}
ACTOR static Future<Void> write_impl( int fd, Reference<ErrorInfo> err, StringRef data, int64_t offset ) {
state int taskID = g_network->getCurrentTask();
state TaskPriority taskID = g_network->getCurrentTask();
state Promise<Void> p;
state eio_req* r = eio_write(fd, (void*)data.begin(), data.size(), offset, 0, eio_callback, &p);
try { wait( p.getFuture() ); } catch (...) { g_network->setCurrentTask( taskID ); eio_cancel(r); throw; }
@ -299,7 +299,7 @@ private:
}
ACTOR static Future<Void> truncate_impl( int fd, Reference<ErrorInfo> err, int64_t size ) {
state int taskID = g_network->getCurrentTask();
state TaskPriority taskID = g_network->getCurrentTask();
state Promise<Void> p;
state eio_req* r = eio_ftruncate(fd, size, 0, eio_callback, &p);
try { wait( p.getFuture() ); } catch (...) { g_network->setCurrentTask( taskID ); eio_cancel(r); throw; }
@ -330,7 +330,7 @@ private:
}
ACTOR static Future<Void> sync_impl( int fd, Reference<ErrorInfo> err, bool sync_metadata=false ) {
state int taskID = g_network->getCurrentTask();
state TaskPriority taskID = g_network->getCurrentTask();
state Promise<Void> p;
state eio_req* r = start_fsync( fd, p, sync_metadata );
@ -350,7 +350,7 @@ private:
}
ACTOR static Future<int64_t> size_impl( int fd ) {
state int taskID = g_network->getCurrentTask();
state TaskPriority taskID = g_network->getCurrentTask();
state Promise<Void> p;
state eio_req* r = eio_fstat( fd, 0, eio_callback, &p );
try { wait( p.getFuture() ); } catch (...) { g_network->setCurrentTask( taskID ); eio_cancel(r); throw; }
@ -363,7 +363,7 @@ private:
}
ACTOR static Future<EIO_STRUCT_STAT> stat_impl( std::string filename ) {
state int taskID = g_network->getCurrentTask();
state TaskPriority taskID = g_network->getCurrentTask();
state Promise<Void> p;
state EIO_STRUCT_STAT statdata;
state eio_req* r = eio_stat( filename.c_str(), 0, eio_callback, &p );
@ -377,7 +377,7 @@ private:
ACTOR template <class R> static Future<R> dispatch_impl( std::function<R()> func) {
state Dispatch<R> data( func );
state int taskID = g_network->getCurrentTask();
state TaskPriority taskID = g_network->getCurrentTask();
state eio_req* r = eio_custom( [](eio_req* req) {
// Runs on the eio thread pool
@ -418,7 +418,7 @@ private:
static void eio_want_poll() {
want_poll = 1;
// SOMEDAY: NULL for deferred error, no analysis of correctness (itp)
onMainThreadVoid([](){ poll_eio(); }, NULL, TaskPollEIO);
onMainThreadVoid([](){ poll_eio(); }, NULL, TaskPriority::PollEIO);
}
static int eio_callback( eio_req* req ) {


@ -472,9 +472,9 @@ private:
#endif
}
int getTask() const { return (prio>>32)+1; }
TaskPriority getTask() const { return static_cast<TaskPriority>((prio>>32)+1); }
ACTOR static void deliver( Promise<int> result, bool failed, int r, int task ) {
ACTOR static void deliver( Promise<int> result, bool failed, int r, TaskPriority task ) {
wait( delay(0, task) );
if (failed) result.sendError(io_timeout());
else if (r < 0) result.sendError(io_error());
@ -649,7 +649,7 @@ private:
loop {
wait(success(ev->read()));
wait(delay(0, TaskDiskIOComplete));
wait(delay(0, TaskPriority::DiskIOComplete));
linux_ioresult ev[FLOW_KNOBS->MAX_OUTSTANDING];
timespec tm; tm.tv_sec = 0; tm.tv_nsec = 0;


@ -23,13 +23,13 @@
std::map<std::string, Future<Void>> AsyncFileNonDurable::filesBeingDeleted;
ACTOR Future<Void> sendOnProcess( ISimulator::ProcessInfo* process, Promise<Void> promise, int taskID ) {
ACTOR Future<Void> sendOnProcess( ISimulator::ProcessInfo* process, Promise<Void> promise, TaskPriority taskID ) {
wait( g_simulator.onProcess( process, taskID ) );
promise.send(Void());
return Void();
}
ACTOR Future<Void> sendErrorOnProcess( ISimulator::ProcessInfo* process, Promise<Void> promise, Error e, int taskID ) {
ACTOR Future<Void> sendErrorOnProcess( ISimulator::ProcessInfo* process, Promise<Void> promise, Error e, TaskPriority taskID ) {
wait( g_simulator.onProcess( process, taskID ) );
promise.sendError(e);
return Void();


@ -38,8 +38,8 @@
#undef max
#undef min
Future<Void> sendOnProcess( ISimulator::ProcessInfo* const& process, Promise<Void> const& promise, int const& taskID );
Future<Void> sendErrorOnProcess( ISimulator::ProcessInfo* const& process, Promise<Void> const& promise, Error const& e, int const& taskID );
ACTOR Future<Void> sendOnProcess( ISimulator::ProcessInfo* process, Promise<Void> promise, TaskPriority taskID );
ACTOR Future<Void> sendErrorOnProcess( ISimulator::ProcessInfo* process, Promise<Void> promise, Error e, TaskPriority taskID );
ACTOR template <class T>
Future<T> sendErrorOnShutdown( Future<T> in ) {
@ -198,7 +198,7 @@ public:
//Creates a new AsyncFileNonDurable which wraps the provided IAsyncFile
ACTOR static Future<Reference<IAsyncFile>> open(std::string filename, std::string actualFilename, Future<Reference<IAsyncFile>> wrappedFile, Reference<DiskParameters> diskParameters) {
state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess();
state int currentTaskID = g_network->getCurrentTask();
state TaskPriority currentTaskID = g_network->getCurrentTask();
state Future<Void> shutdown = success(currentProcess->shutdownSignal.getFuture());
//TraceEvent("AsyncFileNonDurableOpenBegin").detail("Filename", filename).detail("Addr", g_simulator.getCurrentProcess()->address);
@ -391,7 +391,7 @@ private:
ACTOR Future<int> read(AsyncFileNonDurable *self, void *data, int length, int64_t offset) {
state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess();
state int currentTaskID = g_network->getCurrentTask();
state TaskPriority currentTaskID = g_network->getCurrentTask();
wait( g_simulator.onMachine( currentProcess ) );
try {
@ -411,7 +411,7 @@ private:
//or none of the write. It may also corrupt parts of sectors which have not been written correctly
ACTOR Future<Void> write(AsyncFileNonDurable *self, Promise<Void> writeStarted, Future<Future<Void>> ownFuture, void const* data, int length, int64_t offset) {
state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess();
state int currentTaskID = g_network->getCurrentTask();
state TaskPriority currentTaskID = g_network->getCurrentTask();
wait( g_simulator.onMachine( currentProcess ) );
state double delayDuration = deterministicRandom()->random01() * self->maxWriteDelay;
@ -535,7 +535,7 @@ private:
//If a kill interrupts the delay, then the truncate may or may not be performed
ACTOR Future<Void> truncate(AsyncFileNonDurable *self, Promise<Void> truncateStarted, Future<Future<Void>> ownFuture, int64_t size) {
state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess();
state int currentTaskID = g_network->getCurrentTask();
state TaskPriority currentTaskID = g_network->getCurrentTask();
wait( g_simulator.onMachine( currentProcess ) );
state double delayDuration = deterministicRandom()->random01() * self->maxWriteDelay;
@ -573,8 +573,8 @@ private:
}
}
if(g_network->check_yield(TaskDefaultYield)) {
wait(delay(0, TaskDefaultYield));
if(g_network->check_yield(TaskPriority::DefaultYield)) {
wait(delay(0, TaskPriority::DefaultYield));
}
//If performing a durable truncate, then pass it through to the file. Otherwise, pass it through with a 1/2 chance
@ -663,7 +663,7 @@ private:
ACTOR Future<Void> sync(AsyncFileNonDurable *self, bool durable) {
state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess();
state int currentTaskID = g_network->getCurrentTask();
state TaskPriority currentTaskID = g_network->getCurrentTask();
wait( g_simulator.onMachine( currentProcess ) );
try {
@ -695,7 +695,7 @@ private:
ACTOR Future<int64_t> size(AsyncFileNonDurable *self) {
state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess();
state int currentTaskID = g_network->getCurrentTask();
state TaskPriority currentTaskID = g_network->getCurrentTask();
wait( g_simulator.onMachine( currentProcess ) );
@ -714,7 +714,7 @@ private:
//Finishes all outstanding actors on an AsyncFileNonDurable and then deletes it
ACTOR Future<Void> deleteFile(AsyncFileNonDurable *self) {
state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess();
state int currentTaskID = g_network->getCurrentTask();
state TaskPriority currentTaskID = g_network->getCurrentTask();
state std::string filename = self->filename;
wait( g_simulator.onMachine( currentProcess ) );


@ -172,28 +172,28 @@ struct YieldMockNetwork : INetwork, ReferenceCounted<YieldMockNetwork> {
t.send(Void());
}
virtual Future<class Void> delay(double seconds, int taskID) {
virtual Future<class Void> delay(double seconds, TaskPriority taskID) {
return nextTick.getFuture();
}
virtual Future<class Void> yield(int taskID) {
virtual Future<class Void> yield(TaskPriority taskID) {
if (check_yield(taskID))
return delay(0,taskID);
return Void();
}
virtual bool check_yield(int taskID) {
virtual bool check_yield(TaskPriority taskID) {
if (nextYield > 0) --nextYield;
return nextYield == 0;
}
// Delegate everything else. TODO: Make a base class NetworkWrapper for delegating everything in INetwork
virtual int getCurrentTask() { return baseNetwork->getCurrentTask(); }
virtual void setCurrentTask(int taskID) { baseNetwork->setCurrentTask(taskID); }
virtual TaskPriority getCurrentTask() { return baseNetwork->getCurrentTask(); }
virtual void setCurrentTask(TaskPriority taskID) { baseNetwork->setCurrentTask(taskID); }
virtual double now() { return baseNetwork->now(); }
virtual void stop() { return baseNetwork->stop(); }
virtual bool isSimulated() const { return baseNetwork->isSimulated(); }
virtual void onMainThread(Promise<Void>&& signal, int taskID) { return baseNetwork->onMainThread(std::move(signal), taskID); }
virtual void onMainThread(Promise<Void>&& signal, TaskPriority taskID) { return baseNetwork->onMainThread(std::move(signal), taskID); }
bool isOnMainThread() const override { return baseNetwork->isOnMainThread(); }
virtual THREAD_HANDLE startThread(THREAD_FUNC_RETURN(*func) (void *), void *arg) { return baseNetwork->startThread(func,arg); }
virtual Future< Reference<class IAsyncFile> > open(std::string filename, int64_t flags, int64_t mode) { return IAsyncFileSystem::filesystem()->open(filename,flags,mode); }

View File

@ -50,9 +50,9 @@ const uint64_t TOKEN_STREAM_FLAG = 1;
class EndpointMap : NonCopyable {
public:
EndpointMap();
void insert( NetworkMessageReceiver* r, Endpoint::Token& token, uint32_t priority );
void insert( NetworkMessageReceiver* r, Endpoint::Token& token, TaskPriority priority );
NetworkMessageReceiver* get( Endpoint::Token const& token );
uint32_t getPriority( Endpoint::Token const& token );
TaskPriority getPriority( Endpoint::Token const& token );
void remove( Endpoint::Token const& token, NetworkMessageReceiver* r );
private:
@ -86,12 +86,12 @@ void EndpointMap::realloc() {
firstFree = oldSize;
}
void EndpointMap::insert( NetworkMessageReceiver* r, Endpoint::Token& token, uint32_t priority ) {
void EndpointMap::insert( NetworkMessageReceiver* r, Endpoint::Token& token, TaskPriority priority ) {
if (firstFree == uint32_t(-1)) realloc();
int index = firstFree;
firstFree = data[index].nextFree;
token = Endpoint::Token( token.first(), (token.second()&0xffffffff00000000LL) | index );
data[index].token() = Endpoint::Token( token.first(), (token.second()&0xffffffff00000000LL) | priority );
data[index].token() = Endpoint::Token( token.first(), (token.second()&0xffffffff00000000LL) | static_cast<uint32_t>(priority) );
data[index].receiver = r;
}
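insert() publishes a token whose low 32 bits carry the slot index, while the copy kept in the map carries the priority in those same bits; getPriority() below cross-checks the high halves and recovers the priority. A standalone sketch of the packing, assuming 64-bit tokens:

#include <cstdint>
#include <cassert>

// Sketch only: pack a 32-bit value into the low half of a 64-bit token, as
// EndpointMap does with the slot index (published) and the priority (stored).
uint64_t packLow32(uint64_t token, uint32_t value) {
    return (token & 0xffffffff00000000ULL) | value;
}
uint32_t unpackLow32(uint64_t token) {
    return static_cast<uint32_t>(token & 0xffffffffULL);
}

int main() {
    uint64_t token = 0x0123456789abcdefULL;
    uint64_t published = packLow32(token, /*index=*/42);
    uint64_t stored    = packLow32(token, /*priority=*/9000);
    assert(unpackLow32(published) == 42);
    assert(unpackLow32(stored) == 9000);
    // The high halves still match, which is what get()/getPriority() verify.
    assert((published & 0xffffffff00000000ULL) == (stored & 0xffffffff00000000ULL));
}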
@ -102,11 +102,11 @@ NetworkMessageReceiver* EndpointMap::get( Endpoint::Token const& token ) {
return 0;
}
uint32_t EndpointMap::getPriority( Endpoint::Token const& token ) {
TaskPriority EndpointMap::getPriority( Endpoint::Token const& token ) {
uint32_t index = token.second();
if ( index < data.size() && data[index].token().first() == token.first() && ((data[index].token().second()&0xffffffff00000000LL)|index)==token.second() )
return data[index].token().second();
return TaskUnknownEndpoint;
return static_cast<TaskPriority>(data[index].token().second());
return TaskPriority::UnknownEndpoint;
}
void EndpointMap::remove( Endpoint::Token const& token, NetworkMessageReceiver* r ) {
@ -122,7 +122,7 @@ struct EndpointNotFoundReceiver : NetworkMessageReceiver {
EndpointNotFoundReceiver(EndpointMap& endpoints) {
//endpoints[WLTOKEN_ENDPOINT_NOT_FOUND] = this;
Endpoint::Token e = WLTOKEN_ENDPOINT_NOT_FOUND;
endpoints.insert(this, e, TaskDefaultEndpoint);
endpoints.insert(this, e, TaskPriority::DefaultEndpoint);
ASSERT( e == WLTOKEN_ENDPOINT_NOT_FOUND );
}
virtual void receive( ArenaReader& reader ) {
@ -141,7 +141,7 @@ struct EndpointNotFoundReceiver : NetworkMessageReceiver {
struct PingReceiver : NetworkMessageReceiver {
PingReceiver(EndpointMap& endpoints) {
Endpoint::Token e = WLTOKEN_PING_PACKET;
endpoints.insert(this, e, TaskReadSocket);
endpoints.insert(this, e, TaskPriority::ReadSocket);
ASSERT( e == WLTOKEN_PING_PACKET );
}
virtual void receive( ArenaReader& reader ) {
@ -305,11 +305,12 @@ struct Peer : NonCopyable {
int peerReferences;
bool incompatibleProtocolVersionNewer;
int64_t bytesReceived;
double lastDataPacketSentTime;
explicit Peer( TransportData* transport, NetworkAddress const& destination )
: transport(transport), destination(destination), outgoingConnectionIdle(false), lastConnectTime(0.0), reconnectionDelay(FLOW_KNOBS->INITIAL_RECONNECTION_TIME),
compatible(true), incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0)
{
explicit Peer(TransportData* transport, NetworkAddress const& destination)
: transport(transport), destination(destination), outgoingConnectionIdle(false), lastConnectTime(0.0),
reconnectionDelay(FLOW_KNOBS->INITIAL_RECONNECTION_TIME), compatible(true),
incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0), lastDataPacketSentTime(now()) {
connect = connectionKeeper(this);
}
@ -396,19 +397,44 @@ struct Peer : NonCopyable {
}
ACTOR static Future<Void> connectionMonitor( Peer *peer ) {
state RequestStream< ReplyPromise<Void> > remotePing( Endpoint( {peer->destination}, WLTOKEN_PING_PACKET ) );
state Endpoint remotePingEndpoint({ peer->destination }, WLTOKEN_PING_PACKET);
loop {
if(peer->peerReferences == 0 && peer->reliable.empty() && peer->unsent.empty()) {
throw connection_unreferenced();
if (!FlowTransport::transport().isClient() && !peer->destination.isPublic()) {
// Don't send ping messages to clients unless necessary. Instead monitor incoming client pings.
state double lastRefreshed = now();
state int64_t lastBytesReceived = peer->bytesReceived;
loop {
wait(delay(FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME));
if (lastBytesReceived < peer->bytesReceived) {
lastRefreshed = now();
lastBytesReceived = peer->bytesReceived;
} else if (lastRefreshed < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT *
FLOW_KNOBS->CONNECTION_MONITOR_INCOMING_IDLE_MULTIPLIER) {
// If we have not received anything in this period, the client must have closed
// the connection by now. Break the loop and check whether it is still alive by sending a ping.
break;
}
}
}
wait( delayJittered( FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME ) );
if (peer->reliable.empty() && peer->unsent.empty()) {
if (peer->peerReferences == 0 &&
(peer->lastDataPacketSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_UNREFERENCED_CLOSE_DELAY)) {
// TODO: What about when peerReference == -1?
throw connection_unreferenced();
} else if (FlowTransport::transport().isClient() && peer->destination.isPublic() &&
(peer->lastConnectTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT) &&
(peer->lastDataPacketSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT)) {
// The first condition is necessary because we may get here even if we are a server.
throw connection_idle();
}
}
// SOMEDAY: Stop monitoring and close the connection after a long period of inactivity with no reliable or onDisconnect requests outstanding
wait (delayJittered(FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME));
// TODO: Stop monitoring and close the connection with no onDisconnect requests outstanding
state ReplyPromise<Void> reply;
FlowTransport::transport().sendUnreliable( SerializeSource<ReplyPromise<Void>>(reply), remotePing.getEndpoint() );
FlowTransport::transport().sendUnreliable( SerializeSource<ReplyPromise<Void>>(reply), remotePingEndpoint );
state int64_t startingBytes = peer->bytesReceived;
state int timeouts = 0;
loop {
@ -419,7 +445,10 @@ struct Peer : NonCopyable {
throw connection_failed();
}
if(timeouts > 1) {
TraceEvent(SevWarnAlways, "ConnectionSlowPing").suppressFor(1.0).detail("WithAddr", peer->destination).detail("Timeouts", timeouts);
TraceEvent(SevWarnAlways, "ConnectionSlowPing")
.suppressFor(1.0)
.detail("WithAddr", peer->destination)
.detail("Timeouts", timeouts);
}
startingBytes = peer->bytesReceived;
timeouts++;
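The rewritten monitor combines two cheap liveness signals: a byte counter bumped by the connection reader, and an explicit ping once that counter stops moving. A simplified sketch of the first phase, with plain values standing in for flow futures and knobs (names here are illustrative):

#include <cstdint>

// Sketch only: the byte-counter idle check used above. bytesReceived is bumped
// elsewhere (by the connection reader); this logic just watches it.
struct PeerWatch {
    int64_t lastBytesSeen = 0;
    double lastRefreshed = 0;

    // Returns true when we should fall through and send a real ping.
    bool shouldPing(int64_t bytesReceived, double now, double idleTimeout) {
        if (bytesReceived > lastBytesSeen) {
            lastBytesSeen = bytesReceived;
            lastRefreshed = now;   // traffic observed; no ping needed
            return false;
        }
        return lastRefreshed < now - idleTimeout;  // quiet too long: probe
    }
};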
@ -438,10 +467,10 @@ struct Peer : NonCopyable {
ACTOR static Future<Void> connectionWriter( Peer* self, Reference<IConnection> conn ) {
state double lastWriteTime = now();
loop {
//wait( delay(0, TaskWriteSocket) );
wait( delayJittered(std::max<double>(FLOW_KNOBS->MIN_COALESCE_DELAY, FLOW_KNOBS->MAX_COALESCE_DELAY - (now() - lastWriteTime)), TaskWriteSocket) );
//wait( delay(500e-6, TaskWriteSocket) );
//wait( yield(TaskWriteSocket) );
//wait( delay(0, TaskPriority::WriteSocket) );
wait( delayJittered(std::max<double>(FLOW_KNOBS->MIN_COALESCE_DELAY, FLOW_KNOBS->MAX_COALESCE_DELAY - (now() - lastWriteTime)), TaskPriority::WriteSocket) );
//wait( delay(500e-6, TaskPriority::WriteSocket) );
//wait( yield(TaskPriority::WriteSocket) );
// Send until there is nothing left to send
loop {
@ -456,7 +485,7 @@ struct Peer : NonCopyable {
TEST(true); // We didn't write everything, so apparently the write buffer is full. Wait for it to be nonfull.
wait( conn->onWritable() );
wait( yield(TaskWriteSocket) );
wait( yield(TaskPriority::WriteSocket) );
}
// Wait until there is something to send
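The delayJittered call above is the write-coalescing heart of connectionWriter: the longer it has been since the last write, the shorter the wait before flushing. With illustrative knob values of MIN_COALESCE_DELAY = 100us and MAX_COALESCE_DELAY = 2ms, a write 1.5ms after the previous one waits about 0.5ms, while one 3ms later waits only the 100us floor. A sketch of the computation:

#include <algorithm>
#include <cassert>
#include <cmath>

// Sketch only: the coalescing-delay computation from connectionWriter, with
// made-up knob values (in seconds).
double coalesceDelay(double now, double lastWriteTime,
                     double minDelay = 100e-6, double maxDelay = 2e-3) {
    return std::max(minDelay, maxDelay - (now - lastWriteTime));
}

int main() {
    assert(std::abs(coalesceDelay(1.0015, 1.0) - 0.5e-3) < 1e-9);  // partial wait
    assert(coalesceDelay(1.003, 1.0) == 100e-6);                   // clamped to floor
}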
@ -550,14 +579,21 @@ struct Peer : NonCopyable {
self->discardUnreliablePackets();
reader = Future<Void>();
bool ok = e.code() == error_code_connection_failed || e.code() == error_code_actor_cancelled ||
e.code() == error_code_connection_unreferenced ||
e.code() == error_code_connection_unreferenced || e.code() == error_code_connection_idle ||
(g_network->isSimulated() && e.code() == error_code_checksum_failed);
if(self->compatible) {
TraceEvent(ok ? SevInfo : SevWarnAlways, "ConnectionClosed", conn ? conn->getDebugID() : UID()).error(e, true).suppressFor(1.0).detail("PeerAddr", self->destination);
TraceEvent(ok ? SevInfo : SevWarnAlways, "ConnectionClosed", conn ? conn->getDebugID() : UID())
.error(e, true)
.suppressFor(1.0)
.detail("PeerAddr", self->destination);
}
else {
TraceEvent(ok ? SevInfo : SevWarnAlways, "IncompatibleConnectionClosed", conn ? conn->getDebugID() : UID()).error(e, true).suppressFor(1.0).detail("PeerAddr", self->destination);
TraceEvent(ok ? SevInfo : SevWarnAlways, "IncompatibleConnectionClosed",
conn ? conn->getDebugID() : UID())
.error(e, true)
.suppressFor(1.0)
.detail("PeerAddr", self->destination);
}
if(self->destination.isPublic() && IFailureMonitor::failureMonitor().getState(self->destination).isAvailable()) {
@ -565,20 +601,25 @@ struct Peer : NonCopyable {
if(now() - it.second > FLOW_KNOBS->TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY) {
it.first = now();
} else if(now() - it.first > FLOW_KNOBS->TOO_MANY_CONNECTIONS_CLOSED_TIMEOUT) {
TraceEvent(SevWarnAlways, "TooManyConnectionsClosed", conn ? conn->getDebugID() : UID()).suppressFor(5.0).detail("PeerAddr", self->destination);
TraceEvent(SevWarnAlways, "TooManyConnectionsClosed", conn ? conn->getDebugID() : UID())
.suppressFor(5.0)
.detail("PeerAddr", self->destination);
self->transport->degraded->set(true);
}
it.second = now();
}
if (conn) {
if (FlowTransport::transport().isClient()) {
if (FlowTransport::transport().isClient() && e.code() != error_code_connection_idle) {
clientReconnectDelay = true;
}
conn->close();
conn = Reference<IConnection>();
}
IFailureMonitor::failureMonitor().notifyDisconnect( self->destination ); //< Clients might send more packets in response, which needs to go out on the next connection
// Clients might send more packets in response, which need to go out on the next connection
IFailureMonitor::failureMonitor().notifyDisconnect( self->destination );
if (e.code() == error_code_actor_cancelled) throw;
// Try to recover, even from serious errors, by retrying
@ -602,8 +643,8 @@ TransportData::~TransportData() {
}
ACTOR static void deliver(TransportData* self, Endpoint destination, ArenaReader reader, bool inReadSocket) {
int priority = self->endpoints.getPriority(destination.token);
if (priority < TaskReadSocket || !inReadSocket) {
TaskPriority priority = self->endpoints.getPriority(destination.token);
if (priority < TaskPriority::ReadSocket || !inReadSocket) {
wait( delay(0, priority) );
} else {
g_network->setCurrentTask( priority );
@ -637,7 +678,7 @@ ACTOR static void deliver(TransportData* self, Endpoint destination, ArenaReader
}
if( inReadSocket )
g_network->setCurrentTask( TaskReadSocket );
g_network->setCurrentTask( TaskPriority::ReadSocket );
}
static void scanPackets(TransportData* transport, uint8_t*& unprocessed_begin, const uint8_t* e, Arena& arena,
@ -797,7 +838,7 @@ ACTOR static Future<Void> connectionReader(
if (len == 0) break;
state int readBytes = conn->read(unprocessed_end, unprocessed_end + len);
if (readBytes == 0) break;
wait(yield(TaskReadSocket));
wait(yield(TaskPriority::ReadSocket));
totalReadBytes += readBytes;
unprocessed_end += readBytes;
}
@ -908,11 +949,11 @@ ACTOR static Future<Void> connectionReader(
if (readWillBlock)
break;
wait(yield(TaskReadSocket));
wait(yield(TaskPriority::ReadSocket));
}
wait( conn->onReadable() );
wait(delay(0, TaskReadSocket)); // We don't want to call conn->read directly from the reactor - we could get stuck in the reactor reading 1 packet at a time
wait(delay(0, TaskPriority::ReadSocket)); // We don't want to call conn->read directly from the reactor - we could get stuck in the reactor reading 1 packet at a time
}
}
catch (Error& e) {
@ -956,7 +997,7 @@ ACTOR static Future<Void> listen( TransportData* self, NetworkAddress listenAddr
.detail("FromAddress", conn->getPeerAddress())
.detail("ListenAddress", listenAddr.toString());
incoming.add( connectionIncoming(self, conn) );
wait(delay(0) || delay(FLOW_KNOBS->CONNECTION_ACCEPT_DELAY, TaskWriteSocket));
wait(delay(0) || delay(FLOW_KNOBS->CONNECTION_ACCEPT_DELAY, TaskPriority::WriteSocket));
}
} catch (Error& e) {
TraceEvent(SevError, "ListenError").error(e);
@ -1047,12 +1088,12 @@ Endpoint FlowTransport::loadedEndpoint( const UID& token ) {
return Endpoint(g_currentDeliveryPeerAddress, token);
}
void FlowTransport::addPeerReference( const Endpoint& endpoint, NetworkMessageReceiver* receiver ) {
if (FlowTransport::transport().isClient()) {
void FlowTransport::addPeerReference(const Endpoint& endpoint, bool isStream) {
if (!isStream || !endpoint.getPrimaryAddress().isValid())
return;
else if (FlowTransport::transport().isClient())
IFailureMonitor::failureMonitor().setStatus(endpoint.getPrimaryAddress(), FailureStatus(false));
}
if (!receiver->isStream() || !endpoint.getPrimaryAddress().isValid()) return;
Peer* peer = self->getPeer(endpoint.getPrimaryAddress());
if(peer->peerReferences == -1) {
peer->peerReferences = 1;
@ -1061,8 +1102,8 @@ void FlowTransport::addPeerReference( const Endpoint& endpoint, NetworkMessageRe
}
}
void FlowTransport::removePeerReference( const Endpoint& endpoint, NetworkMessageReceiver* receiver ) {
if (!receiver->isStream() || !endpoint.getPrimaryAddress().isValid()) return;
void FlowTransport::removePeerReference(const Endpoint& endpoint, bool isStream) {
if (!isStream || !endpoint.getPrimaryAddress().isValid()) return;
Peer* peer = self->getPeer(endpoint.getPrimaryAddress(), false);
if(peer) {
peer->peerReferences--;
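addPeerReference and removePeerReference now take a plain isStream flag, captured once when the FlowReceiver is constructed, rather than querying the receiver through a virtual call at release time. A minimal sketch of the reference-count lifecycle, under that reading of the change (peerReferences == -1 meaning "never explicitly referenced" follows the code above):

// Sketch only: the reference-count lifecycle behind add/removePeerReference.
struct PeerRefs {
    int peerReferences = -1;  // -1: no stream endpoint has referenced this peer yet

    void add()    { peerReferences = (peerReferences == -1) ? 1 : peerReferences + 1; }
    void remove() {
        if (peerReferences > 0) --peerReferences;
        // When this reaches 0 and no unsent data remains, connectionMonitor
        // may throw connection_unreferenced() to close the connection.
    }
};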
@ -1078,7 +1119,7 @@ void FlowTransport::removePeerReference( const Endpoint& endpoint, NetworkMessag
}
}
void FlowTransport::addEndpoint( Endpoint& endpoint, NetworkMessageReceiver* receiver, uint32_t taskID ) {
void FlowTransport::addEndpoint( Endpoint& endpoint, NetworkMessageReceiver* receiver, TaskPriority taskID ) {
endpoint.token = deterministicRandom()->randomUniqueID();
if (receiver->isStream()) {
endpoint.addresses = self->localAddresses;
@ -1094,7 +1135,7 @@ void FlowTransport::removeEndpoint( const Endpoint& endpoint, NetworkMessageRece
self->endpoints.remove(endpoint.token, receiver);
}
void FlowTransport::addWellKnownEndpoint( Endpoint& endpoint, NetworkMessageReceiver* receiver, uint32_t taskID ) {
void FlowTransport::addWellKnownEndpoint( Endpoint& endpoint, NetworkMessageReceiver* receiver, TaskPriority taskID ) {
endpoint.addresses = self->localAddresses;
ASSERT( ((endpoint.token.first() & TOKEN_STREAM_FLAG)!=0) == receiver->isStream() );
Endpoint::Token otoken = endpoint.token;
@ -1213,7 +1254,9 @@ static PacketID sendPacket( TransportData* self, ISerializeSource const& what, c
#endif
peer->send(pb, rp, firstUnsent);
if (destination.token != WLTOKEN_PING_PACKET) {
peer->lastDataPacketSentTime = now();
}
return (PacketID)rp;
}
}

View File

@ -132,19 +132,19 @@ public:
std::map<NetworkAddress, std::pair<uint64_t, double>>* getIncompatiblePeers();
// Returns the set of all peers that have attempted to connect but have incompatible protocol versions
void addPeerReference( const Endpoint&, NetworkMessageReceiver* );
void addPeerReference(const Endpoint&, bool isStream);
// Signal that a peer connection is being used, even if no messages are currently being sent to the peer
void removePeerReference( const Endpoint&, NetworkMessageReceiver* );
void removePeerReference(const Endpoint&, bool isStream);
// Signal that a peer connection is no longer being used
void addEndpoint( Endpoint& endpoint, NetworkMessageReceiver*, uint32_t taskID );
void addEndpoint( Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID );
// Sets endpoint to be a new local endpoint which delivers messages to the given receiver
void removeEndpoint( const Endpoint&, NetworkMessageReceiver* );
// The given local endpoint no longer delivers messages to the given receiver or uses resources
void addWellKnownEndpoint( Endpoint& endpoint, NetworkMessageReceiver*, uint32_t taskID );
void addWellKnownEndpoint( Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID );
// Sets endpoint to a new local endpoint (without changing its token) which delivers messages to the given receiver
// Implementations may have limitations on when this function is called and what endpoint.token may be!

View File

@ -179,7 +179,7 @@ Future< REPLY_TYPE(Request) > loadBalance(
Reference<MultiInterface<Multi>> alternatives,
RequestStream<Request> Interface::* channel,
Request request = Request(),
int taskID = TaskDefaultPromiseEndpoint,
TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint,
bool atMostOnce = false, // if true, throws request_maybe_delivered() instead of retrying automatically
QueueModel* model = NULL)
{

View File

@ -70,6 +70,13 @@ struct IReplicationPolicy : public ReferenceCounted<IReplicationPolicy> {
return keys;
}
virtual void attributeKeys(std::set<std::string>*) const = 0;
// For flatbuffers, IReplicationPolicy is just encoded as a string using
// |serializeReplicationPolicy|. |writer| is a member of IReplicationPolicy
// so that this string outlives all calls to
// dynamic_size_traits<Reference<IReplicationPolicy>>::save
mutable BinaryWriter writer{ IncludeVersion() };
mutable bool alreadyWritten = false;
};
template <class Archive>
@ -276,12 +283,28 @@ void serializeReplicationPolicy(Ar& ar, Reference<IReplicationPolicy>& policy) {
template <>
struct dynamic_size_traits<Reference<IReplicationPolicy>> : std::true_type {
static WriteRawMemory save(const Reference<IReplicationPolicy>& value) {
BinaryWriter writer(IncludeVersion());
serializeReplicationPolicy(writer, const_cast<Reference<IReplicationPolicy>&>(value));
std::unique_ptr<uint8_t[]> memory(new uint8_t[writer.getLength()]);
memcpy(memory.get(), writer.getData(), writer.getLength());
return std::make_pair<OwnershipErasedPtr<const uint8_t>, size_t>(ownedPtr(const_cast<const uint8_t*>(memory.release())), writer.getLength());
static Block save(const Reference<IReplicationPolicy>& value) {
if (value.getPtr() == nullptr) {
static BinaryWriter writer{ IncludeVersion() };
writer = BinaryWriter{ IncludeVersion() };
serializeReplicationPolicy(writer, const_cast<Reference<IReplicationPolicy>&>(value));
return unownedPtr(const_cast<const uint8_t*>(reinterpret_cast<uint8_t*>(writer.getData())),
writer.getLength());
}
if (!value->alreadyWritten) {
serializeReplicationPolicy(value->writer, const_cast<Reference<IReplicationPolicy>&>(value));
value->alreadyWritten = true;
}
return unownedPtr(const_cast<const uint8_t*>(reinterpret_cast<uint8_t*>(value->writer.getData())),
value->writer.getLength());
}
static void serialization_done(const Reference<IReplicationPolicy>& value) {
if (value.getPtr() == nullptr) {
return;
}
value->alreadyWritten = false;
value->writer = BinaryWriter{ IncludeVersion() };
}
// Context is an arbitrary type that is plumbed by reference throughout the
@ -294,5 +317,6 @@ struct dynamic_size_traits<Reference<IReplicationPolicy>> : std::true_type {
}
};
static_assert(detail::has_serialization_done<dynamic_size_traits<Reference<IReplicationPolicy>>>::value);
#endif
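The save()/serialization_done() pair exists because flatbuffers may call save() more than once for the same value in a single pass, and the returned pointer must stay valid until the pass completes; the policy therefore caches its serialized bytes in the mutable writer and resets only when told. A stripped-down sketch of that contract, with std::string standing in for BinaryWriter:

#include <string>
#include <utility>

// Sketch only: the cache-until-done contract of dynamic_size_traits above.
struct CachedBlob {
    mutable std::string bytes;           // outlives every save() call in a pass
    mutable bool alreadyWritten = false;

    std::pair<const char*, size_t> save(const std::string& payload) const {
        if (!alreadyWritten) {
            bytes = payload;             // serialize once per pass
            alreadyWritten = true;
        }
        return { bytes.data(), bytes.size() };  // stable across repeated calls
    }
    void serializationDone() const {
        alreadyWritten = false;          // the next pass re-serializes
        bytes.clear();
    }
};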

View File

@ -47,7 +47,7 @@ bool firstInBatch(CommitTransactionRequest x) {
}
ACTOR template <class X>
Future<Void> batcher(PromiseStream<std::pair<std::vector<X>, int> > out, FutureStream<X> in, double avgMinDelay, double* avgMaxDelay, double emptyBatchTimeout, int maxCount, int desiredBytes, int maxBytes, Optional<PromiseStream<Void>> batchStartedStream, int64_t *commitBatchesMemBytesCount, int64_t commitBatchesMemBytesLimit, int taskID = TaskDefaultDelay, Counter* counter = 0)
Future<Void> batcher(PromiseStream<std::pair<std::vector<X>, int> > out, FutureStream<X> in, double avgMinDelay, double* avgMaxDelay, double emptyBatchTimeout, int maxCount, int desiredBytes, int maxBytes, Optional<PromiseStream<Void>> batchStartedStream, int64_t *commitBatchesMemBytesCount, int64_t commitBatchesMemBytesLimit, TaskPriority taskID = TaskPriority::DefaultDelay, Counter* counter = 0)
{
wait( delayJittered(*avgMaxDelay, taskID) ); // smooth out
// This is set up to deliver even zero-size batches if emptyBatchTimeout elapses, because that's what master proxy wants. The source control history

View File

@ -31,15 +31,19 @@
struct FlowReceiver : private NetworkMessageReceiver {
// Common endpoint code for NetSAV<> and NetNotifiedQueue<>
FlowReceiver() : m_isLocalEndpoint(false) {}
FlowReceiver(Endpoint const& remoteEndpoint) : endpoint(remoteEndpoint), m_isLocalEndpoint(false) {
FlowTransport::transport().addPeerReference(endpoint, this);
FlowReceiver() : m_isLocalEndpoint(false), m_stream(false) {
}
FlowReceiver(Endpoint const& remoteEndpoint, bool stream)
: endpoint(remoteEndpoint), m_isLocalEndpoint(false), m_stream(stream) {
FlowTransport::transport().addPeerReference(endpoint, m_stream);
}
~FlowReceiver() {
if (m_isLocalEndpoint) {
FlowTransport::transport().removeEndpoint(endpoint, this);
} else {
FlowTransport::transport().removePeerReference(endpoint, this);
FlowTransport::transport().removePeerReference(endpoint, m_stream);
}
}
@ -48,7 +52,7 @@ struct FlowReceiver : private NetworkMessageReceiver {
// If already a remote endpoint, returns that. Otherwise makes this
// a local endpoint and returns that.
const Endpoint& getEndpoint(int taskID) {
const Endpoint& getEndpoint(TaskPriority taskID) {
if (!endpoint.isValid()) {
m_isLocalEndpoint = true;
FlowTransport::transport().addEndpoint(endpoint, this, taskID);
@ -56,16 +60,17 @@ struct FlowReceiver : private NetworkMessageReceiver {
return endpoint;
}
void makeWellKnownEndpoint(Endpoint::Token token, int taskID) {
void makeWellKnownEndpoint(Endpoint::Token token, TaskPriority taskID) {
ASSERT(!endpoint.isValid());
m_isLocalEndpoint = true;
endpoint.token = token;
FlowTransport::transport().addWellKnownEndpoint(endpoint, this, taskID);
}
protected:
private:
Endpoint endpoint;
bool m_isLocalEndpoint;
bool m_stream;
};
template <class T>
@ -74,7 +79,9 @@ struct NetSAV : SAV<T>, FlowReceiver, FastAllocated<NetSAV<T>> {
using FastAllocated<NetSAV<T>>::operator delete;
NetSAV(int futures, int promises) : SAV<T>(futures, promises) {}
NetSAV(int futures, int promises, const Endpoint& remoteEndpoint) : SAV<T>(futures, promises), FlowReceiver(remoteEndpoint) {}
NetSAV(int futures, int promises, const Endpoint& remoteEndpoint)
: SAV<T>(futures, promises), FlowReceiver(remoteEndpoint, false) {
}
virtual void destroy() { delete this; }
virtual void receive(ArenaReader& reader) {
@ -128,7 +135,7 @@ public:
~ReplyPromise() { if (sav) sav->delPromiseRef(); }
ReplyPromise(const Endpoint& endpoint) : sav(new NetSAV<T>(0, 1, endpoint)) {}
const Endpoint& getEndpoint(int taskID = TaskDefaultPromiseEndpoint) const { return sav->getEndpoint(taskID); }
const Endpoint& getEndpoint(TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint) const { return sav->getEndpoint(taskID); }
void operator=(const ReplyPromise& rhs) {
if (rhs.sav) rhs.sav->addPromiseRef();
@ -204,19 +211,19 @@ template <class Reply>
void resetReply(ReplyPromise<Reply> & p) { p.reset(); }
template <class Request>
void resetReply(Request& r, int taskID) { r.reply.reset(); r.reply.getEndpoint(taskID); }
void resetReply(Request& r, TaskPriority taskID) { r.reply.reset(); r.reply.getEndpoint(taskID); }
template <class Reply>
void resetReply(ReplyPromise<Reply> & p, int taskID) { p.reset(); p.getEndpoint(taskID); }
void resetReply(ReplyPromise<Reply> & p, TaskPriority taskID) { p.reset(); p.getEndpoint(taskID); }
template <class Request>
void setReplyPriority(Request& r, int taskID) { r.reply.getEndpoint(taskID); }
void setReplyPriority(Request& r, TaskPriority taskID) { r.reply.getEndpoint(taskID); }
template <class Reply>
void setReplyPriority(ReplyPromise<Reply> & p, int taskID) { p.getEndpoint(taskID); }
void setReplyPriority(ReplyPromise<Reply> & p, TaskPriority taskID) { p.getEndpoint(taskID); }
template <class Reply>
void setReplyPriority(const ReplyPromise<Reply> & p, int taskID) { p.getEndpoint(taskID); }
void setReplyPriority(const ReplyPromise<Reply> & p, TaskPriority taskID) { p.getEndpoint(taskID); }
@ -228,7 +235,8 @@ struct NetNotifiedQueue : NotifiedQueue<T>, FlowReceiver, FastAllocated<NetNotif
using FastAllocated<NetNotifiedQueue<T>>::operator delete;
NetNotifiedQueue(int futures, int promises) : NotifiedQueue<T>(futures, promises) {}
NetNotifiedQueue(int futures, int promises, const Endpoint& remoteEndpoint) : NotifiedQueue<T>(futures, promises), FlowReceiver(remoteEndpoint) {}
NetNotifiedQueue(int futures, int promises, const Endpoint& remoteEndpoint)
: NotifiedQueue<T>(futures, promises), FlowReceiver(remoteEndpoint, true) {}
virtual void destroy() { delete this; }
virtual void receive(ArenaReader& reader) {
@ -281,7 +289,7 @@ public:
return reportEndpointFailure(getReplyPromise(value).getFuture(), getEndpoint());
}
template <class X>
Future<REPLY_TYPE(X)> getReply(const X& value, int taskID) const {
Future<REPLY_TYPE(X)> getReply(const X& value, TaskPriority taskID) const {
setReplyPriority(value, taskID);
return getReply(value);
}
@ -290,7 +298,7 @@ public:
return getReply(ReplyPromise<X>());
}
template <class X>
Future<X> getReplyWithTaskID(int taskID) const {
Future<X> getReplyWithTaskID(TaskPriority taskID) const {
ReplyPromise<X> reply;
reply.getEndpoint(taskID);
return getReply(reply);
@ -302,7 +310,7 @@ public:
// If cancelled or returns failure, request was or will be delivered zero or one times.
// The caller must be capable of retrying if this request returns failure
template <class X>
Future<ErrorOr<REPLY_TYPE(X)>> tryGetReply(const X& value, int taskID) const {
Future<ErrorOr<REPLY_TYPE(X)>> tryGetReply(const X& value, TaskPriority taskID) const {
setReplyPriority(value, taskID);
if (queue->isRemoteEndpoint()) {
Future<Void> disc = makeDependent<T>(IFailureMonitor::failureMonitor()).onDisconnectOrFailure(getEndpoint(taskID));
@ -344,7 +352,7 @@ public:
// If it returns failure, the failure detector considers the endpoint failed permanently or for the given amount of time
// See IFailureMonitor::onFailedFor() for an explanation of the duration and slope parameters.
template <class X>
Future<ErrorOr<REPLY_TYPE(X)>> getReplyUnlessFailedFor(const X& value, double sustainedFailureDuration, double sustainedFailureSlope, int taskID) const {
Future<ErrorOr<REPLY_TYPE(X)>> getReplyUnlessFailedFor(const X& value, double sustainedFailureDuration, double sustainedFailureSlope, TaskPriority taskID) const {
// If it is local endpoint, no need for failure monitoring
return waitValueOrSignal(getReply(value, taskID),
makeDependent<T>(IFailureMonitor::failureMonitor()).onFailedFor(getEndpoint(taskID), sustainedFailureDuration, sustainedFailureSlope),
@ -388,8 +396,8 @@ public:
//queue = (NetNotifiedQueue<T>*)0xdeadbeef;
}
Endpoint getEndpoint(int taskID = TaskDefaultEndpoint) const { return queue->getEndpoint(taskID); }
void makeWellKnownEndpoint(Endpoint::Token token, int taskID) {
Endpoint getEndpoint(TaskPriority taskID = TaskPriority::DefaultEndpoint) const { return queue->getEndpoint(taskID); }
void makeWellKnownEndpoint(Endpoint::Token token, TaskPriority taskID) {
queue->makeWellKnownEndpoint(token, taskID);
}
@ -425,7 +433,10 @@ struct serializable_traits<RequestStream<T>> : std::true_type {
} else {
const auto& ep = stream.getEndpoint();
serializer(ar, ep);
UNSTOPPABLE_ASSERT(ep.getPrimaryAddress().isValid()); // No serializing PromiseStreams on a client with no public address
if constexpr (Archiver::isSerializing) { // Don't assert this when collecting vtable for flatbuffers
UNSTOPPABLE_ASSERT(ep.getPrimaryAddress()
.isValid()); // No serializing PromiseStreams on a client with no public address
}
}
}
};

View File

@ -50,7 +50,7 @@ Future<REPLY_TYPE(Req)> retryBrokenPromise( RequestStream<Req> to, Req request )
}
ACTOR template <class Req>
Future<REPLY_TYPE(Req)> retryBrokenPromise( RequestStream<Req> to, Req request, int taskID ) {
Future<REPLY_TYPE(Req)> retryBrokenPromise( RequestStream<Req> to, Req request, TaskPriority taskID ) {
// Like to.getReply(request), except that a broken_promise exception results in retrying request immediately.
// Suitable for use with well known endpoints, which are likely to return to existence after the other process restarts.
// Not normally useful for ordinary endpoints, which conventionally are permanently destroyed after replying with broken_promise.

View File

@ -380,9 +380,17 @@ private:
ACTOR static Future<Void> trackLeakedConnection( Sim2Conn* self ) {
wait( g_simulator.onProcess( self->process ) );
// SOMEDAY: Make this value variable? Dependent on buggification status?
wait( delay( 20.0 ) );
TraceEvent(SevError, "LeakedConnection", self->dbgid).error(connection_leaked()).detail("MyAddr", self->process->address).detail("PeerAddr", self->peerEndpoint).detail("PeerId", self->peerId).detail("Opened", self->opened);
if (self->process->address.isPublic()) {
wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * 1.5 ) );
} else {
wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * 1.5 ) );
}
TraceEvent(SevError, "LeakedConnection", self->dbgid)
.error(connection_leaked())
.detail("MyAddr", self->process->address)
.detail("PeerAddr", self->peerEndpoint)
.detail("PeerId", self->peerId)
.detail("Opened", self->opened);
return Void();
}
};
@ -423,7 +431,7 @@ public:
ACTOR static Future<Reference<IAsyncFile>> open( std::string filename, int flags, int mode,
Reference<DiskParameters> diskParameters = Reference<DiskParameters>(new DiskParameters(25000, 150000000)), bool delayOnWrite = true ) {
state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess();
state int currentTaskID = g_network->getCurrentTask();
state TaskPriority currentTaskID = g_network->getCurrentTask();
if(++openCount >= 3000) {
TraceEvent(SevError, "TooManyFiles");
@ -742,11 +750,11 @@ public:
// Everything actually network related is delegated to the Sim2Net class; Sim2 is only concerned with simulating machines and time
virtual double now() { return time; }
virtual Future<class Void> delay( double seconds, int taskID ) {
ASSERT(taskID >= TaskMinPriority && taskID <= TaskMaxPriority);
virtual Future<class Void> delay( double seconds, TaskPriority taskID ) {
ASSERT(taskID >= TaskPriority::Min && taskID <= TaskPriority::Max);
return delay( seconds, taskID, currentProcess );
}
Future<class Void> delay( double seconds, int taskID, ProcessInfo* machine ) {
Future<class Void> delay( double seconds, TaskPriority taskID, ProcessInfo* machine ) {
ASSERT( seconds >= -0.0001 );
seconds = std::max(0.0, seconds);
Future<Void> f;
@ -761,13 +769,13 @@ public:
return f;
}
ACTOR static Future<Void> checkShutdown(Sim2 *self, int taskID) {
ACTOR static Future<Void> checkShutdown(Sim2 *self, TaskPriority taskID) {
wait(success(self->getCurrentProcess()->shutdownSignal.getFuture()));
self->setCurrentTask(taskID);
return Void();
}
virtual Future<class Void> yield( int taskID ) {
if (taskID == TaskDefaultYield) taskID = currentTaskID;
virtual Future<class Void> yield( TaskPriority taskID ) {
if (taskID == TaskPriority::DefaultYield) taskID = currentTaskID;
if (check_yield(taskID)) {
// We want to check that yielders can handle actual time elapsing (it sometimes will outside simulation), but
// don't want to prevent instantaneous shutdown of "rebooted" machines.
@ -776,7 +784,7 @@ public:
setCurrentTask(taskID);
return Void();
}
virtual bool check_yield( int taskID ) {
virtual bool check_yield( TaskPriority taskID ) {
if (yielded) return true;
if (--yield_limit <= 0) {
yield_limit = deterministicRandom()->randomInt(1, 150); // If yield returns false *too* many times in a row, there could be a stack overflow, since we can't deterministically check stack size as the real network does
@ -784,10 +792,10 @@ public:
}
return yielded = BUGGIFY_WITH_PROB(0.01);
}
virtual int getCurrentTask() {
virtual TaskPriority getCurrentTask() {
return currentTaskID;
}
virtual void setCurrentTask(int taskID ) {
virtual void setCurrentTask(TaskPriority taskID ) {
currentTaskID = taskID;
}
// Sets the taskID/priority of the current task, without yielding
@ -924,7 +932,7 @@ public:
}
if ( mustBeDurable || deterministicRandom()->random01() < 0.5 ) {
state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess();
state int currentTaskID = g_network->getCurrentTask();
state TaskPriority currentTaskID = g_network->getCurrentTask();
wait( g_simulator.onMachine( currentProcess ) );
try {
wait( ::delay(0.05 * deterministicRandom()->random01()) );
@ -950,7 +958,7 @@ public:
ACTOR static Future<Void> runLoop(Sim2 *self) {
state ISimulator::ProcessInfo *callingMachine = self->currentProcess;
while ( !self->isStopped ) {
wait( self->net2->yield(TaskDefaultYield) );
wait( self->net2->yield(TaskPriority::DefaultYield) );
self->mutex.enter();
if( self->tasks.size() == 0 ) {
@ -1580,23 +1588,23 @@ public:
machines.erase(machineId);
}
Sim2(bool objSerializer) : time(0.0), taskCount(0), yielded(false), yield_limit(0), currentTaskID(-1) {
Sim2(bool objSerializer) : time(0.0), taskCount(0), yielded(false), yield_limit(0), currentTaskID(TaskPriority::Zero) {
// Not letting currentProcess be NULL eliminates some annoying special cases
currentProcess = new ProcessInfo("NoMachine", LocalityData(Optional<Standalone<StringRef>>(), StringRef(), StringRef(), StringRef()), ProcessClass(), {NetworkAddress()}, this, "", "");
g_network = net2 = newNet2(false, true, objSerializer);
Net2FileSystem::newFileSystem();
check_yield(0);
check_yield(TaskPriority::Zero);
}
// Implementation
struct Task {
int taskID;
TaskPriority taskID;
double time;
uint64_t stable;
ProcessInfo* machine;
Promise<Void> action;
Task( double time, int taskID, uint64_t stable, ProcessInfo* machine, Promise<Void>&& action ) : time(time), taskID(taskID), stable(stable), machine(machine), action(std::move(action)) {}
Task( double time, int taskID, uint64_t stable, ProcessInfo* machine, Future<Void>& future ) : time(time), taskID(taskID), stable(stable), machine(machine) { future = action.getFuture(); }
Task( double time, TaskPriority taskID, uint64_t stable, ProcessInfo* machine, Promise<Void>&& action ) : time(time), taskID(taskID), stable(stable), machine(machine), action(std::move(action)) {}
Task( double time, TaskPriority taskID, uint64_t stable, ProcessInfo* machine, Future<Void>& future ) : time(time), taskID(taskID), stable(stable), machine(machine) { future = action.getFuture(); }
Task(Task&& rhs) BOOST_NOEXCEPT : time(rhs.time), taskID(rhs.taskID), stable(rhs.stable), machine(rhs.machine), action(std::move(rhs.action)) {}
void operator= ( Task const& rhs ) { taskID = rhs.taskID; time = rhs.time; stable = rhs.stable; machine = rhs.machine; action = rhs.action; }
Task( Task const& rhs ) : taskID(rhs.taskID), time(rhs.time), stable(rhs.stable), machine(rhs.machine), action(rhs.action) {}
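The stable counter is what keeps simulated scheduling deterministic: tasks with equal times are ordered by their monotonically increasing sequence number rather than by anything run-dependent. A sketch of one such ordering for a std::priority_queue (an assumption about intent, not code from this diff):

#include <cstdint>
#include <queue>
#include <vector>

// Sketch only: deterministic ordering for a simulated task queue.
// priority_queue pops the "largest" element, so the comparison is reversed to
// pop the earliest time first, with the sequence number breaking ties.
struct SimTask {
    double time;
    uint64_t stable;  // monotonically increasing at scheduling time
};

struct LaterFirst {
    bool operator()(const SimTask& a, const SimTask& b) const {
        if (a.time != b.time) return a.time > b.time;
        return a.stable > b.stable;
    }
};

int main() {
    std::priority_queue<SimTask, std::vector<SimTask>, LaterFirst> tasks;
    tasks.push({ 2.0, 0 });
    tasks.push({ 1.0, 2 });
    tasks.push({ 1.0, 1 });
    // Pops (1.0, 1), then (1.0, 2), then (2.0, 0); the same order on every run.
}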
@ -1643,23 +1651,23 @@ public:
}
}
virtual void onMainThread( Promise<Void>&& signal, int taskID ) {
virtual void onMainThread( Promise<Void>&& signal, TaskPriority taskID ) {
// This is presumably coming either from a "fake" thread pool thread (i.e., it is actually on this thread)
// or from a thread created with g_network->startThread
ASSERT(getCurrentProcess());
mutex.enter();
ASSERT(taskID >= TaskMinPriority && taskID <= TaskMaxPriority);
ASSERT(taskID >= TaskPriority::Min && taskID <= TaskPriority::Max);
tasks.push( Task( time, taskID, taskCount++, getCurrentProcess(), std::move(signal) ) );
mutex.leave();
}
bool isOnMainThread() const override {
return net2->isOnMainThread();
}
virtual Future<Void> onProcess( ISimulator::ProcessInfo *process, int taskID ) {
virtual Future<Void> onProcess( ISimulator::ProcessInfo *process, TaskPriority taskID ) {
return delay( 0, taskID, process );
}
virtual Future<Void> onMachine( ISimulator::ProcessInfo *process, int taskID ) {
virtual Future<Void> onMachine( ISimulator::ProcessInfo *process, TaskPriority taskID ) {
if( process->machine == 0 )
return Void();
return delay( 0, taskID, process->machine->machineProcess );
@ -1668,7 +1676,7 @@ public:
//time is guarded by ISimulator::mutex. It is not necessary to guard reads on the main thread because
//time should only be modified from the main thread.
double time;
int currentTaskID;
TaskPriority currentTaskID;
//taskCount is guarded by ISimulator::mutex
uint64_t taskCount;
@ -1698,9 +1706,9 @@ void startNewSimulator(bool objSerializer) {
}
ACTOR void doReboot( ISimulator::ProcessInfo *p, ISimulator::KillType kt ) {
TraceEvent("RebootingProcessAttempt").detail("ZoneId", p->locality.zoneId()).detail("KillType", kt).detail("Process", p->toString()).detail("StartingClass", p->startingClass.toString()).detail("Failed", p->failed).detail("Excluded", p->excluded).detail("Cleared", p->cleared).detail("Rebooting", p->rebooting).detail("TaskDefaultDelay", TaskDefaultDelay);
TraceEvent("RebootingProcessAttempt").detail("ZoneId", p->locality.zoneId()).detail("KillType", kt).detail("Process", p->toString()).detail("StartingClass", p->startingClass.toString()).detail("Failed", p->failed).detail("Excluded", p->excluded).detail("Cleared", p->cleared).detail("Rebooting", p->rebooting).detail("TaskPriorityDefaultDelay", TaskPriority::DefaultDelay);
wait( g_sim2.delay( 0, TaskDefaultDelay, p ) ); // Switch to the machine in question
wait( g_sim2.delay( 0, TaskPriority::DefaultDelay, p ) ); // Switch to the machine in question
try {
ASSERT( kt == ISimulator::RebootProcess || kt == ISimulator::Reboot || kt == ISimulator::RebootAndDelete || kt == ISimulator::RebootProcessAndDelete );

View File

@ -137,8 +137,8 @@ public:
ProcessInfo* getProcess( Endpoint const& endpoint ) { return getProcessByAddress(endpoint.getPrimaryAddress()); }
ProcessInfo* getCurrentProcess() { return currentProcess; }
virtual Future<Void> onProcess( ISimulator::ProcessInfo *process, int taskID = -1 ) = 0;
virtual Future<Void> onMachine( ISimulator::ProcessInfo *process, int taskID = -1 ) = 0;
virtual Future<Void> onProcess( ISimulator::ProcessInfo *process, TaskPriority taskID = TaskPriority::Zero ) = 0;
virtual Future<Void> onMachine( ISimulator::ProcessInfo *process, TaskPriority taskID = TaskPriority::Zero ) = 0;
virtual ProcessInfo* newProcess(const char* name, IPAddress ip, uint16_t port, uint16_t listenPerProcess,
LocalityData locality, ProcessClass startingClass, const char* dataFolder,

View File

@ -107,7 +107,7 @@ public:
DBInfo() : masterRegistrationCount(0), recoveryStalled(false), forceRecovery(false), unfinishedRecoveries(0), logGenerations(0),
clientInfo( new AsyncVar<ClientDBInfo>( ClientDBInfo() ) ),
serverInfo( new AsyncVar<ServerDBInfo>( ServerDBInfo() ) ),
db( DatabaseContext::create( clientInfo, Future<Void>(), LocalityData(), true, TaskDefaultEndpoint, true ) ) // SOMEDAY: Locality!
db( DatabaseContext::create( clientInfo, Future<Void>(), LocalityData(), true, TaskPriority::DefaultEndpoint, true ) ) // SOMEDAY: Locality!
{
}
@ -1171,7 +1171,7 @@ public:
serverInfo.clusterInterface = ccInterface;
serverInfo.myLocality = locality;
db.serverInfo->set( serverInfo );
cx = openDBOnServer(db.serverInfo, TaskDefaultEndpoint, true, true);
cx = openDBOnServer(db.serverInfo, TaskPriority::DefaultEndpoint, true, true);
}
~ClusterControllerData() {
@ -1425,7 +1425,7 @@ void checkBetterDDOrRK(ClusterControllerData* self) {
rkFitness = ProcessClass::ExcludeFit;
}
if (self->isProxyOrResolver(rkWorker.details.interf.locality.processId()) || rkFitness > bestFitnessForRK) {
TraceEvent("CC_HaltRK", self->id).detail("RKID", db.ratekeeper.get().id())
TraceEvent("CCHaltRK", self->id).detail("RKID", db.ratekeeper.get().id())
.detail("Excluded", rkWorker.priorityInfo.isExcluded)
.detail("Fitness", rkFitness).detail("BestFitness", bestFitnessForRK);
self->recruitRatekeeper.set(true);
@ -1439,7 +1439,7 @@ void checkBetterDDOrRK(ClusterControllerData* self) {
ddFitness = ProcessClass::ExcludeFit;
}
if (self->isProxyOrResolver(ddWorker.details.interf.locality.processId()) || ddFitness > bestFitnessForDD) {
TraceEvent("CC_HaltDD", self->id).detail("DDID", db.distributor.get().id())
TraceEvent("CCHaltDD", self->id).detail("DDID", db.distributor.get().id())
.detail("Excluded", ddWorker.priorityInfo.isExcluded)
.detail("Fitness", ddFitness).detail("BestFitness", bestFitnessForDD);
ddWorker.haltDistributor = brokenPromiseToNever(db.distributor.get().haltDataDistributor.getReply(HaltDataDistributorRequest(self->id)));
@ -1920,13 +1920,13 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) {
self->clusterControllerDcId == req.distributorInterf.get().locality.dcId() &&
!self->recruitingDistributor) {
const DataDistributorInterface& di = req.distributorInterf.get();
TraceEvent("CC_RegisterDataDistributor", self->id).detail("DDID", di.id());
TraceEvent("CCRegisterDataDistributor", self->id).detail("DDID", di.id());
self->db.setDistributor(di);
}
if (req.ratekeeperInterf.present()) {
if((self->recruitingRatekeeperID.present() && self->recruitingRatekeeperID.get() != req.ratekeeperInterf.get().id()) ||
self->clusterControllerDcId != w.locality.dcId()) {
TraceEvent("CC_HaltRegisteringRatekeeper", self->id).detail("RKID", req.ratekeeperInterf.get().id())
TraceEvent("CCHaltRegisteringRatekeeper", self->id).detail("RKID", req.ratekeeperInterf.get().id())
.detail("DcID", printable(self->clusterControllerDcId))
.detail("ReqDcID", printable(w.locality.dcId()))
.detail("RecruitingRKID", self->recruitingRatekeeperID.present() ? self->recruitingRatekeeperID.get() : UID());
@ -1934,9 +1934,9 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) {
} else if(!self->recruitingRatekeeperID.present()) {
const RatekeeperInterface& rki = req.ratekeeperInterf.get();
const auto& ratekeeper = self->db.serverInfo->get().ratekeeper;
TraceEvent("CC_RegisterRatekeeper", self->id).detail("RKID", rki.id());
TraceEvent("CCRegisterRatekeeper", self->id).detail("RKID", rki.id());
if (ratekeeper.present() && ratekeeper.get().id() != rki.id() && self->id_worker.count(ratekeeper.get().locality.processId())) {
TraceEvent("CC_HaltPreviousRatekeeper", self->id).detail("RKID", ratekeeper.get().id())
TraceEvent("CCHaltPreviousRatekeeper", self->id).detail("RKID", ratekeeper.get().id())
.detail("DcID", printable(self->clusterControllerDcId))
.detail("ReqDcID", printable(w.locality.dcId()))
.detail("RecruitingRKID", self->recruitingRatekeeperID.present() ? self->recruitingRatekeeperID.get() : UID());
@ -2475,7 +2475,7 @@ ACTOR Future<Void> handleForcedRecoveries( ClusterControllerData *self, ClusterC
ACTOR Future<DataDistributorInterface> startDataDistributor( ClusterControllerData *self ) {
wait(delay(0.0)); // If master fails at the same time, give it a chance to clear master PID.
TraceEvent("CC_StartDataDistributor", self->id);
TraceEvent("CCStartDataDistributor", self->id);
loop {
try {
state bool no_distributor = !self->db.serverInfo->get().distributor.present();
@ -2494,16 +2494,16 @@ ACTOR Future<DataDistributorInterface> startDataDistributor( ClusterControllerDa
}
InitializeDataDistributorRequest req(deterministicRandom()->randomUniqueID());
TraceEvent("CC_DataDistributorRecruit", self->id).detail("Addr", worker.interf.address());
TraceEvent("CCDataDistributorRecruit", self->id).detail("Addr", worker.interf.address());
ErrorOr<DataDistributorInterface> distributor = wait( worker.interf.dataDistributor.getReplyUnlessFailedFor(req, SERVER_KNOBS->WAIT_FOR_DISTRIBUTOR_JOIN_DELAY, 0) );
if (distributor.present()) {
TraceEvent("CC_DataDistributorRecruited", self->id).detail("Addr", worker.interf.address());
TraceEvent("CCDataDistributorRecruited", self->id).detail("Addr", worker.interf.address());
return distributor.get();
}
}
catch (Error& e) {
TraceEvent("CC_DataDistributorRecruitError", self->id).error(e);
TraceEvent("CCDataDistributorRecruitError", self->id).error(e);
if ( e.code() != error_code_no_more_servers ) {
throw;
}
@ -2520,7 +2520,7 @@ ACTOR Future<Void> monitorDataDistributor(ClusterControllerData *self) {
loop {
if ( self->db.serverInfo->get().distributor.present() ) {
wait( waitFailureClient( self->db.serverInfo->get().distributor.get().waitFailure, SERVER_KNOBS->DD_FAILURE_TIME ) );
TraceEvent("CC_DataDistributorDied", self->id)
TraceEvent("CCDataDistributorDied", self->id)
.detail("DistributorId", self->db.serverInfo->get().distributor.get().id());
self->db.clearInterf(ProcessClass::DataDistributorClass);
} else {
@ -2535,7 +2535,7 @@ ACTOR Future<Void> monitorDataDistributor(ClusterControllerData *self) {
ACTOR Future<Void> startRatekeeper(ClusterControllerData *self) {
wait(delay(0.0)); // If master fails at the same time, give it a chance to clear master PID.
TraceEvent("CC_StartRatekeeper", self->id);
TraceEvent("CCStartRatekeeper", self->id);
loop {
try {
state bool no_ratekeeper = !self->db.serverInfo->get().ratekeeper.present();
@ -2556,16 +2556,16 @@ ACTOR Future<Void> startRatekeeper(ClusterControllerData *self) {
}
self->recruitingRatekeeperID = req.reqId;
TraceEvent("CC_RecruitRatekeeper", self->id).detail("Addr", worker.interf.address()).detail("RKID", req.reqId);
TraceEvent("CCRecruitRatekeeper", self->id).detail("Addr", worker.interf.address()).detail("RKID", req.reqId);
ErrorOr<RatekeeperInterface> interf = wait( worker.interf.ratekeeper.getReplyUnlessFailedFor(req, SERVER_KNOBS->WAIT_FOR_RATEKEEPER_JOIN_DELAY, 0) );
if (interf.present()) {
self->recruitRatekeeper.set(false);
self->recruitingRatekeeperID = interf.get().id();
const auto& ratekeeper = self->db.serverInfo->get().ratekeeper;
TraceEvent("CC_RatekeeperRecruited", self->id).detail("Addr", worker.interf.address()).detail("RKID", interf.get().id());
TraceEvent("CCRatekeeperRecruited", self->id).detail("Addr", worker.interf.address()).detail("RKID", interf.get().id());
if (ratekeeper.present() && ratekeeper.get().id() != interf.get().id() && self->id_worker.count(ratekeeper.get().locality.processId())) {
TraceEvent("CC_HaltRatekeeperAfterRecruit", self->id).detail("RKID", ratekeeper.get().id())
TraceEvent("CCHaltRatekeeperAfterRecruit", self->id).detail("RKID", ratekeeper.get().id())
.detail("DcID", printable(self->clusterControllerDcId));
self->id_worker[ratekeeper.get().locality.processId()].haltRatekeeper = brokenPromiseToNever(ratekeeper.get().haltRatekeeper.getReply(HaltRatekeeperRequest(self->id)));
}
@ -2577,7 +2577,7 @@ ACTOR Future<Void> startRatekeeper(ClusterControllerData *self) {
}
}
catch (Error& e) {
TraceEvent("CC_RatekeeperRecruitError", self->id).error(e);
TraceEvent("CCRatekeeperRecruitError", self->id).error(e);
if ( e.code() != error_code_no_more_servers ) {
throw;
}
@ -2595,7 +2595,7 @@ ACTOR Future<Void> monitorRatekeeper(ClusterControllerData *self) {
if ( self->db.serverInfo->get().ratekeeper.present() && !self->recruitRatekeeper.get() ) {
choose {
when(wait(waitFailureClient( self->db.serverInfo->get().ratekeeper.get().waitFailure, SERVER_KNOBS->RATEKEEPER_FAILURE_TIME ))) {
TraceEvent("CC_RatekeeperDied", self->id)
TraceEvent("CCRatekeeperDied", self->id)
.detail("RKID", self->db.serverInfo->get().ratekeeper.get().id());
self->db.clearInterf(ProcessClass::RatekeeperClass);
}

View File

@ -63,13 +63,13 @@ struct ClusterControllerFullInterface {
void initEndpoints() {
clientInterface.initEndpoints();
recruitFromConfiguration.getEndpoint( TaskClusterController );
recruitRemoteFromConfiguration.getEndpoint( TaskClusterController );
recruitStorage.getEndpoint( TaskClusterController );
registerWorker.getEndpoint( TaskClusterController );
getWorkers.getEndpoint( TaskClusterController );
registerMaster.getEndpoint( TaskClusterController );
getServerDBInfo.getEndpoint( TaskClusterController );
recruitFromConfiguration.getEndpoint( TaskPriority::ClusterController );
recruitRemoteFromConfiguration.getEndpoint( TaskPriority::ClusterController );
recruitStorage.getEndpoint( TaskPriority::ClusterController );
registerWorker.getEndpoint( TaskPriority::ClusterController );
getWorkers.getEndpoint( TaskPriority::ClusterController );
registerMaster.getEndpoint( TaskPriority::ClusterController );
getServerDBInfo.getEndpoint( TaskPriority::ClusterController );
}
template <class Ar>

View File

@ -53,8 +53,8 @@ GenerationRegInterface::GenerationRegInterface( NetworkAddress remote )
GenerationRegInterface::GenerationRegInterface( INetwork* local )
{
read.makeWellKnownEndpoint( WLTOKEN_GENERATIONREG_READ, TaskCoordination );
write.makeWellKnownEndpoint( WLTOKEN_GENERATIONREG_WRITE, TaskCoordination );
read.makeWellKnownEndpoint( WLTOKEN_GENERATIONREG_READ, TaskPriority::Coordination );
write.makeWellKnownEndpoint( WLTOKEN_GENERATIONREG_WRITE, TaskPriority::Coordination );
}
LeaderElectionRegInterface::LeaderElectionRegInterface(NetworkAddress remote)
@ -68,9 +68,9 @@ LeaderElectionRegInterface::LeaderElectionRegInterface(NetworkAddress remote)
LeaderElectionRegInterface::LeaderElectionRegInterface(INetwork* local)
: ClientLeaderRegInterface(local)
{
candidacy.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_CANDIDACY, TaskCoordination );
leaderHeartbeat.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_LEADERHEARTBEAT, TaskCoordination );
forward.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_FORWARD, TaskCoordination );
candidacy.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_CANDIDACY, TaskPriority::Coordination );
leaderHeartbeat.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_LEADERHEARTBEAT, TaskPriority::Coordination );
forward.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_FORWARD, TaskPriority::Coordination );
}
ServerCoordinators::ServerCoordinators( Reference<ClusterConnectionFile> cf )

View File

@ -263,7 +263,7 @@ typedef WorkPool<Coroutine, ThreadUnsafeSpinLock, true> CoroPool;
ACTOR void coroSwitcher( Future<Void> what, int taskID, Coro* coro ) {
ACTOR void coroSwitcher( Future<Void> what, TaskPriority taskID, Coro* coro ) {
try {
// state double t = now();
wait(what);

View File

@ -76,14 +76,15 @@ struct CoreTLogSet {
struct OldTLogCoreData {
std::vector<CoreTLogSet> tLogs;
int32_t logRouterTags;
int32_t txsTags;
Version epochEnd;
std::set<int8_t> pseudoLocalities;
OldTLogCoreData() : epochEnd(0), logRouterTags(0) {}
OldTLogCoreData() : epochEnd(0), logRouterTags(0), txsTags(0) {}
explicit OldTLogCoreData(const OldLogData&);
bool operator == (OldTLogCoreData const& rhs) const {
return tLogs == rhs.tLogs && logRouterTags == rhs.logRouterTags && epochEnd == rhs.epochEnd && pseudoLocalities == rhs.pseudoLocalities;
return tLogs == rhs.tLogs && logRouterTags == rhs.logRouterTags && txsTags == rhs.txsTags && epochEnd == rhs.epochEnd && pseudoLocalities == rhs.pseudoLocalities;
}
template <class Archive>
@ -99,18 +100,22 @@ struct OldTLogCoreData {
if (ar.protocolVersion().hasPseudoLocalities()) {
serializer(ar, pseudoLocalities);
}
if (ar.protocolVersion().hasShardedTxsTags()) {
serializer(ar, txsTags);
}
}
};
struct DBCoreState {
std::vector<CoreTLogSet> tLogs;
int32_t logRouterTags;
int32_t txsTags;
std::vector<OldTLogCoreData> oldTLogData;
DBRecoveryCount recoveryCount; // Increases with sequential successful recoveries.
LogSystemType logSystemType;
std::set<int8_t> pseudoLocalities;
DBCoreState() : logRouterTags(0), recoveryCount(0), logSystemType(LogSystemType::empty) {}
DBCoreState() : logRouterTags(0), txsTags(0), recoveryCount(0), logSystemType(LogSystemType::empty) {}
vector<UID> getPriorCommittedLogServers() {
vector<UID> priorCommittedLogServers;
@ -130,7 +135,7 @@ struct DBCoreState {
}
bool isEqual(DBCoreState const& r) const {
return logSystemType == r.logSystemType && recoveryCount == r.recoveryCount && tLogs == r.tLogs && oldTLogData == r.oldTLogData && logRouterTags == r.logRouterTags && pseudoLocalities == r.pseudoLocalities;
return logSystemType == r.logSystemType && recoveryCount == r.recoveryCount && tLogs == r.tLogs && oldTLogData == r.oldTLogData && logRouterTags == r.logRouterTags && txsTags == r.txsTags && pseudoLocalities == r.pseudoLocalities;
}
bool operator == ( const DBCoreState& rhs ) const { return isEqual(rhs); }
@ -148,6 +153,9 @@ struct DBCoreState {
if (ar.protocolVersion().hasPseudoLocalities()) {
serializer(ar, pseudoLocalities);
}
if (ar.protocolVersion().hasShardedTxsTags()) {
serializer(ar, txsTags);
}
} else if(ar.isDeserializing) {
tLogs.push_back(CoreTLogSet());
serializer(ar, tLogs[0].tLogs, tLogs[0].tLogWriteAntiQuorum, recoveryCount, tLogs[0].tLogReplicationFactor, logSystemType);
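txsTags joins the wire format behind a protocol-version check, the same pattern already used for pseudoLocalities: old encodings simply omit the field, and both sides agree on its presence from the version. A self-contained sketch of version-gated fields (the version constant and framing here are invented):

#include <cstdint>
#include <cassert>
#include <vector>

// Sketch only: append new fields behind a version check so old encodings stay
// readable. The version constant is made up.
constexpr uint64_t HAS_SHARDED_TXS_TAGS = 100;

struct State {
    int32_t logRouterTags = 0;
    int32_t txsTags = 0;
};

std::vector<int32_t> save(const State& s, uint64_t version) {
    std::vector<int32_t> out{ s.logRouterTags };
    if (version >= HAS_SHARDED_TXS_TAGS) out.push_back(s.txsTags);
    return out;
}

State load(const std::vector<int32_t>& in, uint64_t version) {
    State s;
    size_t i = 0;
    s.logRouterTags = in[i++];
    if (version >= HAS_SHARDED_TXS_TAGS) s.txsTags = in[i++];
    return s;
}

int main() {
    State s{ 3, 5 };
    auto oldWire = save(s, /*version=*/99);               // pre-txsTags peer
    assert(load(oldWire, 99).txsTags == 0);               // default survives
    auto newWire = save(s, HAS_SHARDED_TXS_TAGS);
    assert(load(newWire, HAS_SHARDED_TXS_TAGS).txsTags == 5);
}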

View File

@ -88,7 +88,7 @@ struct TCMachineInfo : public ReferenceCounted<TCMachineInfo> {
ACTOR Future<Void> updateServerMetrics( TCServerInfo *server ) {
state StorageServerInterface ssi = server->lastKnownInterface;
state Future<ErrorOr<GetPhysicalMetricsReply>> metricsRequest = ssi.getPhysicalMetrics.tryGetReply( GetPhysicalMetricsRequest(), TaskDataDistributionLaunch );
state Future<ErrorOr<GetPhysicalMetricsReply>> metricsRequest = ssi.getPhysicalMetrics.tryGetReply( GetPhysicalMetricsRequest(), TaskPriority::DataDistributionLaunch );
state Future<Void> resetRequest = Never();
state Future<std::pair<StorageServerInterface, ProcessClass>> interfaceChanged( server->onInterfaceChanged );
state Future<Void> serverRemoved( server->onRemoved );
@ -104,7 +104,7 @@ ACTOR Future<Void> updateServerMetrics( TCServerInfo *server ) {
return Void();
}
metricsRequest = Never();
resetRequest = delay( SERVER_KNOBS->METRIC_DELAY, TaskDataDistributionLaunch );
resetRequest = delay( SERVER_KNOBS->METRIC_DELAY, TaskPriority::DataDistributionLaunch );
}
when( std::pair<StorageServerInterface,ProcessClass> _ssi = wait( interfaceChanged ) ) {
ssi = _ssi.first;
@ -120,7 +120,7 @@ ACTOR Future<Void> updateServerMetrics( TCServerInfo *server ) {
}
else {
resetRequest = Never();
metricsRequest = ssi.getPhysicalMetrics.tryGetReply( GetPhysicalMetricsRequest(), TaskDataDistributionLaunch );
metricsRequest = ssi.getPhysicalMetrics.tryGetReply( GetPhysicalMetricsRequest(), TaskPriority::DataDistributionLaunch );
}
}
}
@ -635,9 +635,9 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), doBuildTeams(true), teamBuilder(Void()),
badTeamRemover(Void()), redundantTeamRemover(Void()), configuration(configuration),
readyToStart(readyToStart), clearHealthyZoneFuture(Void()),
checkTeamDelay(delay(SERVER_KNOBS->CHECK_TEAM_DELAY, TaskDataDistribution)),
checkTeamDelay(delay(SERVER_KNOBS->CHECK_TEAM_DELAY, TaskPriority::DataDistribution)),
initialFailureReactionDelay(
delayed(readyToStart, SERVER_KNOBS->INITIAL_FAILURE_REACTION_DELAY, TaskDataDistribution)),
delayed(readyToStart, SERVER_KNOBS->INITIAL_FAILURE_REACTION_DELAY, TaskPriority::DataDistribution)),
healthyTeamCount(0), storageServerSet(new LocalityMap<UID>()),
initializationDoneActor(logOnCompletion(readyToStart && initialFailureReactionDelay, this)),
optimalTeamCount(0), recruitingStream(0), restartRecruiting(SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY),
@ -671,7 +671,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
ACTOR static Future<Void> logOnCompletion( Future<Void> signal, DDTeamCollection* self ) {
wait(signal);
wait(delay(SERVER_KNOBS->LOG_ON_COMPLETION_DELAY, TaskDataDistribution));
wait(delay(SERVER_KNOBS->LOG_ON_COMPLETION_DELAY, TaskPriority::DataDistribution));
if(!self->primary || self->configuration.usableRegions == 1) {
TraceEvent("DDTrackerStarting", self->distributorId)
@ -1309,7 +1309,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
// Five steps to create each machine team, which are document in the function
// Reuse ReplicationPolicy selectReplicas func to select machine team
// return number of added machine teams
int addBestMachineTeams(int targetMachineTeamsToBuild) {
int addBestMachineTeams(int targetMachineTeamsToBuild, int remainingMachineTeamBudget) {
int addedMachineTeams = 0;
int machineTeamsToBuild = 0;
@ -1327,7 +1327,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
int loopCount = 0;
// Add a team in each iteration
while (addedMachineTeams < machineTeamsToBuild) {
while (addedMachineTeams < machineTeamsToBuild || addedMachineTeams < remainingMachineTeamBudget) {
// Step 2: Get least used machines from which we choose machines as a machine team
std::vector<Reference<TCMachineInfo>> leastUsedMachines; // A less used machine has less number of teams
int minTeamCount = std::numeric_limits<int>::max();
@ -1377,6 +1377,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
// that have the least-utilized server
team.clear();
auto success = machineLocalityMap.selectReplicas(configuration.storagePolicy, forcedAttributes, team);
// NOTE: selectReplicas() should always return success when storageTeamSize = 1
ASSERT_WE_THINK(configuration.storageTeamSize > 1 || (configuration.storageTeamSize == 1 && success));
if (!success) {
break;
}
@ -1430,6 +1432,9 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
addMachineTeam(machines);
addedMachineTeams++;
// Update the remaining machine team budget because the budget may decrease by
// any value between 1 and storageTeamSize
remainingMachineTeamBudget = getRemainingMachineTeamBudget();
} else {
TraceEvent(SevWarn, "DataDistributionBuildTeams", distributorId)
.detail("Primary", primary)
@ -1589,6 +1594,32 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
return totalHealthyMachineCount;
}
std::pair<int64_t, int64_t> calculateMinMaxServerTeamNumOnServer() {
int64_t minTeamNumber = std::numeric_limits<int64_t>::max();
int64_t maxTeamNumber = 0;
for (auto& server : server_info) {
if (server_status.get(server.first).isUnhealthy()) {
continue;
}
minTeamNumber = std::min((int64_t) server.second->teams.size(), minTeamNumber);
maxTeamNumber = std::max((int64_t) server.second->teams.size(), maxTeamNumber);
}
return std::make_pair(minTeamNumber, maxTeamNumber);
}
std::pair<int64_t, int64_t> calculateMinMaxMachineTeamNumOnMachine() {
int64_t minTeamNumber = std::numeric_limits<int64_t>::max();
int64_t maxTeamNumber = 0;
for (auto& machine : machine_info) {
if (!isMachineHealthy(machine.second)) {
continue;
}
minTeamNumber = std::min<int64_t>((int64_t) machine.second->machineTeams.size(), minTeamNumber);
maxTeamNumber = std::max<int64_t>((int64_t) machine.second->machineTeams.size(), maxTeamNumber);
}
return std::make_pair(minTeamNumber, maxTeamNumber);
}
// Sanity check
bool isServerTeamNumberCorrect(Reference<TCMachineTeamInfo>& mt) {
int num = 0;
@ -1639,12 +1670,41 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
return healthyTeamCount;
}
// Each machine is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER machine teams;
// remainingMachineTeamBudget is the number of machine teams needed to ensure every machine has
// SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER machine teams
int getRemainingMachineTeamBudget() {
int remainingMachineTeamBudget = 0;
for (auto& m : machine_info) {
int machineTeamCount = m.second->machineTeams.size();
remainingMachineTeamBudget += std::max(0, (int)(SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER - machineTeamCount));
}
// We over-provision the remainingMachineTeamBudget because we do not know, when a new machine team is built,
// how many units of the budget it will consume. For example, when a new machine is added,
// a new machine team may consume only 1 unit of the budget
return remainingMachineTeamBudget;
}
// Each server is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams.
int getRemainingServerTeamBudget() {
// remainingTeamBudget is the number of teams needed to ensure every server has
// SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams
int remainingTeamBudget = 0;
for (auto& s : server_info) {
int numValidTeams = s.second->teams.size();
remainingTeamBudget += std::max(0, (int)(SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER - numValidTeams));
}
return remainingTeamBudget;
}
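Both budget helpers reduce to the same per-entity deficit sum. A self-contained sketch, with hypothetical names and numbers chosen purely for illustration:

#include <algorithm>
#include <vector>

// Stand-alone illustration of getRemainingServerTeamBudget(): with
// desired == 5 and three servers holding {5, 2, 0} teams, the budget is
// max(0,5-5) + max(0,5-2) + max(0,5-0) = 0 + 3 + 5 = 8.
int remainingServerTeamBudget(const std::vector<int>& teamsPerServer, int desired) {
	int budget = 0;
	for (int n : teamsPerServer)
		budget += std::max(0, desired - n);
	return budget;
}

As the comment above getRemainingMachineTeamBudget() notes, this deliberately over-counts: a single new team can reduce the deficit of up to storageTeamSize servers at once, which is why the team-building loops recompute the budget after every added team.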
// Create server teams based on machine teams
// Before the number of machine teams reaches the threshold, build a machine team for each server team.
// When it reaches the threshold, first try to build a server team with existing machine teams; if that fails,
// build an extra machine team and record the event in the trace
int addTeamsBestOf(int teamsToBuild, int desiredTeamNumber, int maxTeamNumber) {
ASSERT(teamsToBuild > 0);
int addTeamsBestOf(int teamsToBuild, int desiredTeamNumber, int maxTeamNumber, int remainingTeamBudget) {
ASSERT(teamsToBuild >= 0);
ASSERT_WE_THINK(machine_info.size() > 0 || server_info.size() == 0);
int addedMachineTeams = 0;
@ -1655,27 +1715,28 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
// When we change configuration, we may have machine teams with storageTeamSize in the old configuration.
int healthyMachineTeamCount = getHealthyMachineTeamCount();
int totalMachineTeamCount = machineTeams.size();
int totalHealthyMachineCount = calculateHealthyMachineCount();
int remainingMachineTeamBudget = getRemainingMachineTeamBudget();
int desiredMachineTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyMachineCount;
int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount;
// machineTeamsToBuild mimics how teamsToBuild is calculated in buildTeams()
int machineTeamsToBuild =
std::min(desiredMachineTeams - healthyMachineTeamCount, maxMachineTeams - totalMachineTeamCount);
int machineTeamsToBuild = std::max(
0, std::min(desiredMachineTeams - healthyMachineTeamCount, maxMachineTeams - totalMachineTeamCount));
TraceEvent("BuildMachineTeams")
.detail("TotalHealthyMachine", totalHealthyMachineCount)
.detail("HealthyMachineTeamCount", healthyMachineTeamCount)
.detail("DesiredMachineTeams", desiredMachineTeams)
.detail("MaxMachineTeams", maxMachineTeams)
.detail("MachineTeamsToBuild", machineTeamsToBuild);
.detail("MachineTeamsToBuild", machineTeamsToBuild)
.detail("RemainingMachineTeamBudget", remainingMachineTeamBudget);
// Pre-build all machine teams until we have the desired number of machine teams
if (machineTeamsToBuild > 0) {
addedMachineTeams = addBestMachineTeams(machineTeamsToBuild);
if (machineTeamsToBuild > 0 || remainingMachineTeamBudget > 0) {
addedMachineTeams = addBestMachineTeams(machineTeamsToBuild, remainingMachineTeamBudget);
}
while (addedTeams < teamsToBuild) {
while (addedTeams < teamsToBuild || addedTeams < remainingTeamBudget) {
// Step 1: Create 1 best machine team
std::vector<UID> bestServerTeam;
int bestScore = std::numeric_limits<int>::max();
@ -1752,6 +1813,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
// Step 4: Add the server team
addTeam(bestServerTeam.begin(), bestServerTeam.end(), false);
addedTeams++;
remainingTeamBudget = getRemainingServerTeamBudget();
if (++loopCount > 2 * teamsToBuild * (configuration.storageTeamSize + 1)) {
break;
@ -1760,10 +1822,14 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
healthyMachineTeamCount = getHealthyMachineTeamCount();
std::pair<uint64_t, uint64_t> minMaxTeamNumberOnServer = calculateMinMaxServerTeamNumOnServer();
std::pair<uint64_t, uint64_t> minMaxMachineTeamNumberOnMachine = calculateMinMaxMachineTeamNumOnMachine();
TraceEvent("TeamCollectionInfo", distributorId)
.detail("Primary", primary)
.detail("AddedTeamNumber", addedTeams)
.detail("AimToBuildTeamNumber", teamsToBuild)
.detail("RemainingTeamBudget", remainingTeamBudget)
.detail("CurrentTeamNumber", teams.size())
.detail("DesiredTeamNumber", desiredTeamNumber)
.detail("MaxTeamNumber", maxTeamNumber)
@ -1773,6 +1839,11 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
.detail("DesiredMachineTeams", desiredMachineTeams)
.detail("MaxMachineTeams", maxMachineTeams)
.detail("TotalHealthyMachine", totalHealthyMachineCount)
.detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first)
.detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second)
.detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first)
.detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second)
.detail("DoBuildTeams", doBuildTeams)
.trackLatest("TeamCollectionInfo");
return addedTeams;
@ -1789,10 +1860,14 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount;
int healthyMachineTeamCount = getHealthyMachineTeamCount();
std::pair<uint64_t, uint64_t> minMaxTeamNumberOnServer = calculateMinMaxServerTeamNumOnServer();
std::pair<uint64_t, uint64_t> minMaxMachineTeamNumberOnMachine = calculateMinMaxMachineTeamNumOnMachine();
TraceEvent("TeamCollectionInfo", distributorId)
.detail("Primary", primary)
.detail("AddedTeamNumber", 0)
.detail("AimToBuildTeamNumber", 0)
.detail("RemainingTeamBudget", 0)
.detail("CurrentTeamNumber", teams.size())
.detail("DesiredTeamNumber", desiredServerTeams)
.detail("MaxTeamNumber", maxServerTeams)
@ -1802,14 +1877,22 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
.detail("DesiredMachineTeams", desiredMachineTeams)
.detail("MaxMachineTeams", maxMachineTeams)
.detail("TotalHealthyMachine", totalHealthyMachineCount)
.detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first)
.detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second)
.detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first)
.detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second)
.detail("DoBuildTeams", doBuildTeams)
.trackLatest("TeamCollectionInfo");
// Debug purpose
// if (healthyMachineTeamCount > desiredMachineTeams || machineTeams.size() > maxMachineTeams) {
// // When the number of machine teams is over the limit, print out the current team info.
// traceAllInfo(true);
// }
// Advance time so that we will not have multiple TeamCollectionInfo events at the same time; otherwise the
// simulation test will randomly pick one TeamCollectionInfo trace, which could be the one recorded before teams were built
// wait(delay(0.01));
// Debug purpose
// if (healthyMachineTeamCount > desiredMachineTeams || machineTeams.size() > maxMachineTeams) {
// // When the number of machine teams is over the limit, print out the current team info.
// traceAllInfo(true);
// }
}
// Use the current set of known processes (from server_info) to compute an optimized set of storage server teams.
@ -1856,10 +1939,14 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
totalTeamCount++;
}
}
// Each server is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams;
// remainingTeamBudget is the number of teams needed to ensure every server has
// SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams
int remainingTeamBudget = self->getRemainingServerTeamBudget();
// teamsToBuild is calculated such that we will not build too many teams in the situation
// when all (or most) teams become temporarily unhealthy and then healthy again
state int teamsToBuild = std::min(desiredTeams - teamCount, maxTeams - totalTeamCount);
state int teamsToBuild = std::max(0, std::min(desiredTeams - teamCount, maxTeams - totalTeamCount));
TraceEvent("BuildTeamsBegin", self->distributorId)
.detail("TeamsToBuild", teamsToBuild)
@ -1876,13 +1963,13 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
.detail("MachineCount", self->machine_info.size())
.detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER);
if (teamsToBuild > 0) {
if (teamsToBuild > 0 || remainingTeamBudget > 0) {
state vector<std::vector<UID>> builtTeams;
// addTeamsBestOf() will not add more teams than needed.
// If the number of teams exceeds the desired number, the extra teams are the ones added in the code path where
// a team is added as an initial team
int addedTeams = self->addTeamsBestOf(teamsToBuild, desiredTeams, maxTeams);
int addedTeams = self->addTeamsBestOf(teamsToBuild, desiredTeams, maxTeams, remainingTeamBudget);
if (addedTeams <= 0 && self->teams.size() == 0) {
TraceEvent(SevWarn, "NoTeamAfterBuildTeam")
@ -1898,10 +1985,14 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount;
int healthyMachineTeamCount = self->getHealthyMachineTeamCount();
std::pair<uint64_t, uint64_t> minMaxTeamNumberOnServer = self->calculateMinMaxServerTeamNumOnServer();
std::pair<uint64_t, uint64_t> minMaxMachineTeamNumberOnMachine = self->calculateMinMaxMachineTeamNumOnMachine();
TraceEvent("TeamCollectionInfo", self->distributorId)
.detail("Primary", self->primary)
.detail("AddedTeamNumber", 0)
.detail("AimToBuildTeamNumber", teamsToBuild)
.detail("RemainingTeamBudget", remainingTeamBudget)
.detail("CurrentTeamNumber", self->teams.size())
.detail("DesiredTeamNumber", desiredTeams)
.detail("MaxTeamNumber", maxTeams)
@ -1911,6 +2002,11 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
.detail("DesiredMachineTeams", desiredMachineTeams)
.detail("MaxMachineTeams", maxMachineTeams)
.detail("TotalHealthyMachine", totalHealthyMachineCount)
.detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first)
.detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second)
.detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first)
.detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second)
.detail("DoBuildTeams", self->doBuildTeams)
.trackLatest("TeamCollectionInfo");
}
}
@ -1919,7 +2015,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
//Building teams can cause servers to become undesired, which can make teams unhealthy.
//Let all of these changes get worked out before responding to the get team request
wait( delay(0, TaskDataDistributionLaunch) );
wait( delay(0, TaskPriority::DataDistributionLaunch) );
return Void();
}
@ -2232,7 +2328,7 @@ ACTOR Future<Void> waitUntilHealthy(DDTeamCollection* self) {
TraceEvent("WaitUntilHealthyStalled", self->distributorId).detail("Primary", self->primary).detail("ZeroHealthy", self->zeroHealthyTeams->get()).detail("ProcessingUnhealthy", self->processingUnhealthy->get());
wait(self->zeroHealthyTeams->onChange() || self->processingUnhealthy->onChange());
}
wait(delay(SERVER_KNOBS->DD_STALL_CHECK_DELAY, TaskLowPriority)); //After the team trackers wait on the initial failure reaction delay, they yield. We want to make sure every tracker has had the opportunity to send their relocations to the queue.
wait(delay(SERVER_KNOBS->DD_STALL_CHECK_DELAY, TaskPriority::Low)); //After the team trackers wait on the initial failure reaction delay, they yield. We want to make sure every tracker has had the opportunity to send their relocations to the queue.
if(!self->zeroHealthyTeams->get() && !self->processingUnhealthy->get()) {
return Void();
}
@ -2308,6 +2404,16 @@ ACTOR Future<Void> teamRemover(DDTeamCollection* self) {
team = mt->serverTeams[teamIndex];
ASSERT(team->machineTeam->machineIDs == mt->machineIDs); // Sanity check
// Check if a server will have 0 teams after the team is removed
for (auto& s : team->getServers()) {
if (s->teams.size() == 0) {
TraceEvent(SevError, "TeamRemoverTooAggressive")
.detail("Server", s->id)
.detail("Team", team->getServerIDsStr());
self->traceAllInfo(true);
}
}
// The team will be marked as a bad team
bool foundTeam = self->removeTeam(team);
ASSERT(foundTeam == true);
@ -2540,7 +2646,12 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
//If we cannot find the team, it could be a bad team so assume unhealthy priority
if(!found) {
maxPriority = std::max<int>( maxPriority, PRIORITY_TEAM_UNHEALTHY );
// If the input team (in the function parameters) is a redundant team, found will be
// false. We want to differentiate redundant_team from unhealthy_team in
// terms of relocate priority
maxPriority =
std::max<int>(maxPriority, redundantTeam ? PRIORITY_TEAM_REDUNDANT
: PRIORITY_TEAM_UNHEALTHY);
}
} else {
TEST(true); // A removed server is still associated with a team in SABTF
@ -2638,7 +2749,7 @@ ACTOR Future<Void> trackExcludedServers( DDTeamCollection* self ) {
if (nchid != lastChangeID)
break;
wait( delay( SERVER_KNOBS->SERVER_LIST_DELAY, TaskDataDistribution ) ); // FIXME: make this tr.watch( excludedServersVersionKey ) instead
wait( delay( SERVER_KNOBS->SERVER_LIST_DELAY, TaskPriority::DataDistribution ) ); // FIXME: make this tr.watch( excludedServersVersionKey ) instead
tr = Transaction(self->cx);
} catch (Error& e) {
wait( tr.onError(e) );
@ -2734,12 +2845,18 @@ ACTOR Future<Void> waitHealthyZoneChange( DDTeamCollection* self ) {
if(val.present()) {
auto p = decodeHealthyZoneValue(val.get());
if(p.second > tr.getReadVersion().get()) {
healthyZoneTimeout = delay((p.second - tr.getReadVersion().get())/(double)SERVER_KNOBS->VERSIONS_PER_SECOND);
self->healthyZone.set(p.first);
} else {
double timeoutSeconds = (p.second - tr.getReadVersion().get())/(double)SERVER_KNOBS->VERSIONS_PER_SECOND;
healthyZoneTimeout = delay(timeoutSeconds);
if(self->healthyZone.get() != p.first) {
TraceEvent("MaintenanceZoneStart", self->distributorId).detail("ZoneID", printable(p.first)).detail("EndVersion", p.second).detail("Duration", timeoutSeconds);
self->healthyZone.set(p.first);
}
} else if(self->healthyZone.get().present()) {
TraceEvent("MaintenanceZoneEnd", self->distributorId);
self->healthyZone.set(Optional<Key>());
}
} else {
} else if(self->healthyZone.get().present()) {
TraceEvent("MaintenanceZoneEnd", self->distributorId);
self->healthyZone.set(Optional<Key>());
}
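The timeout arithmetic converts a version horizon into seconds. A hedged worked example; the knob value here is an assumption, not read from this diff:

// Assuming SERVER_KNOBS->VERSIONS_PER_SECOND == 1e6:
// a healthy-zone entry whose end version p.second sits 30e6 versions past
// the current read version yields
//   timeoutSeconds = 30e6 / 1e6 = 30.0
// so MaintenanceZoneStart traces Duration == 30.0, and the zone is
// re-evaluated when that delay fires or when the healthy-zone key changes.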
@ -2757,14 +2874,14 @@ ACTOR Future<Void> serverMetricsPolling( TCServerInfo *server) {
state double lastUpdate = now();
loop {
wait( updateServerMetrics( server ) );
wait( delayUntil( lastUpdate + SERVER_KNOBS->STORAGE_METRICS_POLLING_DELAY + SERVER_KNOBS->STORAGE_METRICS_RANDOM_DELAY * deterministicRandom()->random01(), TaskDataDistributionLaunch ) );
wait( delayUntil( lastUpdate + SERVER_KNOBS->STORAGE_METRICS_POLLING_DELAY + SERVER_KNOBS->STORAGE_METRICS_RANDOM_DELAY * deterministicRandom()->random01(), TaskPriority::DataDistributionLaunch ) );
lastUpdate = now();
}
}
//Returns the KeyValueStoreType of server if it is different from self->storeType
ACTOR Future<KeyValueStoreType> keyValueStoreTypeTracker(DDTeamCollection* self, TCServerInfo *server) {
state KeyValueStoreType type = wait(brokenPromiseToNever(server->lastKnownInterface.getKeyValueStoreType.getReplyWithTaskID<KeyValueStoreType>(TaskDataDistribution)));
state KeyValueStoreType type = wait(brokenPromiseToNever(server->lastKnownInterface.getKeyValueStoreType.getReplyWithTaskID<KeyValueStoreType>(TaskPriority::DataDistribution)));
if(type == self->configuration.storageServerStoreType && (self->includedDCs.empty() || std::find(self->includedDCs.begin(), self->includedDCs.end(), server->lastKnownInterface.locality.dcId()) != self->includedDCs.end()) )
wait(Future<Void>(Never()));
@ -2787,7 +2904,7 @@ ACTOR Future<Void> waitForAllDataRemoved( Database cx, UID serverID, Version add
}
// Wait for any change to the serverKeys for this server
wait( delay(SERVER_KNOBS->ALL_DATA_REMOVED_DELAY, TaskDataDistribution) );
wait( delay(SERVER_KNOBS->ALL_DATA_REMOVED_DELAY, TaskPriority::DataDistribution) );
tr.reset();
} catch (Error& e) {
wait( tr.onError(e) );
@ -2830,7 +2947,7 @@ ACTOR Future<Void> storageServerFailureTracker(
ASSERT(!inHealthyZone);
healthChanged = IFailureMonitor::failureMonitor().onStateEqual( interf.waitFailure.getEndpoint(), FailureStatus(false));
} else if(!inHealthyZone) {
healthChanged = waitFailureClientStrict(interf.waitFailure, SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME, TaskDataDistribution);
healthChanged = waitFailureClientStrict(interf.waitFailure, SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME, TaskPriority::DataDistribution);
}
choose {
when ( wait(healthChanged) ) {
@ -2840,6 +2957,7 @@ ACTOR Future<Void> storageServerFailureTracker(
}
if(status->isFailed && self->healthyZone.get().present() && self->clearHealthyZoneFuture.isReady()) {
self->clearHealthyZoneFuture = clearHealthyZone(self->cx);
TraceEvent("MaintenanceZoneCleared", self->distributorId);
self->healthyZone.set(Optional<Key>());
}
@ -2953,11 +3071,14 @@ ACTOR Future<Void> storageServerTracker(
if(hasWrongStoreTypeOrDC)
self->restartRecruiting.trigger();
if ( lastIsUnhealthy && !status.isUnhealthy() && !server->teams.size() ) {
if (lastIsUnhealthy && !status.isUnhealthy() &&
server->teams.size() < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER) {
self->doBuildTeams = true;
self->restartTeamBuilder.trigger(); // This does not trigger building teams if there exist healthy teams
}
lastIsUnhealthy = status.isUnhealthy();
state bool recordTeamCollectionInfo = false;
choose {
when( wait( failureTracker ) ) {
// The server is failed AND all data has been removed from it, so permanently remove it.
@ -3061,7 +3182,8 @@ ACTOR Future<Void> storageServerTracker(
self->badTeamRemover = removeBadTeams(self);
self->addActor.send(self->badTeamRemover);
// The team number changes, so we need to update the team number info
self->traceTeamCollectionInfo();
// self->traceTeamCollectionInfo();
recordTeamCollectionInfo = true;
}
}
@ -3069,10 +3191,13 @@ ACTOR Future<Void> storageServerTracker(
// We rely on the old failureTracker being actorCancelled since the old actor now has a pointer to an invalid location
status = ServerStatus( status.isFailed, status.isUndesired, server->lastKnownInterface.locality );
// self->traceTeamCollectionInfo();
recordTeamCollectionInfo = true;
//Restart the storeTracker for the new interface
storeTracker = keyValueStoreTypeTracker(self, server);
hasWrongStoreTypeOrDC = false;
self->restartTeamBuilder.trigger();
if(restartRecruiting)
self->restartRecruiting.trigger();
}
@ -3093,6 +3218,10 @@ ACTOR Future<Void> storageServerTracker(
server->wakeUpTracker = Promise<Void>();
}
}
if (recordTeamCollectionInfo) {
self->traceTeamCollectionInfo();
}
}
} catch( Error &e ) {
if (e.code() != error_code_actor_cancelled && errorOut.canBeSet())
@ -3120,7 +3249,7 @@ ACTOR Future<Void> monitorStorageServerRecruitment(DDTeamCollection* self) {
loop {
choose {
when( wait( self->recruitingStream.onChange() ) ) {}
when( wait( self->recruitingStream.get() == 0 ? delay(SERVER_KNOBS->RECRUITMENT_IDLE_DELAY, TaskDataDistribution) : Future<Void>(Never()) ) ) { break; }
when( wait( self->recruitingStream.get() == 0 ? delay(SERVER_KNOBS->RECRUITMENT_IDLE_DELAY, TaskPriority::DataDistribution) : Future<Void>(Never()) ) ) { break; }
}
}
TraceEvent("StorageServerRecruitment", self->distributorId)
@ -3147,12 +3276,12 @@ ACTOR Future<Void> initializeStorage( DDTeamCollection* self, RecruitStorageRepl
self->recruitingIds.insert(interfaceId);
self->recruitingLocalities.insert(candidateWorker.worker.address());
state ErrorOr<InitializeStorageReply> newServer = wait( candidateWorker.worker.storage.tryGetReply( isr, TaskDataDistribution ) );
state ErrorOr<InitializeStorageReply> newServer = wait( candidateWorker.worker.storage.tryGetReply( isr, TaskPriority::DataDistribution ) );
if(newServer.isError()) {
TraceEvent(SevWarn, "DDRecruitmentError").error(newServer.getError());
if( !newServer.isError( error_code_recruitment_failed ) && !newServer.isError( error_code_request_maybe_delivered ) )
throw newServer.getError();
wait( delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY, TaskDataDistribution) );
wait( delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY, TaskPriority::DataDistribution) );
}
self->recruitingIds.erase(interfaceId);
self->recruitingLocalities.erase(candidateWorker.worker.address());
@ -3217,7 +3346,7 @@ ACTOR Future<Void> storageRecruiter( DDTeamCollection* self, Reference<AsyncVar<
if(!fCandidateWorker.isValid() || fCandidateWorker.isReady() || rsr.excludeAddresses != lastRequest.excludeAddresses || rsr.criticalRecruitment != lastRequest.criticalRecruitment) {
lastRequest = rsr;
fCandidateWorker = brokenPromiseToNever( db->get().clusterInterface.recruitStorage.getReply( rsr, TaskDataDistribution ) );
fCandidateWorker = brokenPromiseToNever( db->get().clusterInterface.recruitStorage.getReply( rsr, TaskPriority::DataDistribution ) );
}
choose {
@ -3388,7 +3517,7 @@ ACTOR Future<Void> dataDistributionTeamCollection(
ACTOR Future<Void> waitForDataDistributionEnabled( Database cx ) {
state Transaction tr(cx);
loop {
wait(delay(SERVER_KNOBS->DD_ENABLED_CHECK_DELAY, TaskDataDistribution));
wait(delay(SERVER_KNOBS->DD_ENABLED_CHECK_DELAY, TaskPriority::DataDistribution));
try {
Optional<Value> mode = wait( tr.get( dataDistributionModeKey ) );
@ -3516,7 +3645,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self)
state double lastLimited = 0;
self->addActor.send( monitorBatchLimitedTime(self->dbInfo, &lastLimited) );
state Database cx = openDBOnServer(self->dbInfo, TaskDataDistributionLaunch, true, true);
state Database cx = openDBOnServer(self->dbInfo, TaskPriority::DataDistributionLaunch, true, true);
cx->locationCacheSize = SERVER_KNOBS->DD_LOCATION_CACHE_SIZE;
//cx->setOption( FDBDatabaseOptions::LOCATION_CACHE_SIZE, StringRef((uint8_t*) &SERVER_KNOBS->DD_LOCATION_CACHE_SIZE, 8) );
@ -3646,7 +3775,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self)
}
output.send( RelocateShard( keys, unhealthy ? PRIORITY_TEAM_UNHEALTHY : PRIORITY_RECOVER_MOVE ) );
}
wait( yield(TaskDataDistribution) );
wait( yield(TaskPriority::DataDistribution) );
}
vector<TeamCollectionInterface> tcis;
@ -3718,7 +3847,7 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
state Future<Void> collection = actorCollection( self->addActor.getFuture() );
try {
TraceEvent("DataDistributor_Running", di.id());
TraceEvent("DataDistributorRunning", di.id());
self->addActor.send( waitFailureServer(di.waitFailure.getFuture()) );
state Future<Void> distributor = reportErrorsExcept( dataDistribution(self), "DataDistribution", di.id(), &normalDataDistributorErrors() );
@ -3736,10 +3865,10 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
}
catch ( Error &err ) {
if ( normalDataDistributorErrors().count(err.code()) == 0 ) {
TraceEvent("DataDistributor_Error", di.id()).error(err, true);
TraceEvent("DataDistributorError", di.id()).error(err, true);
throw err;
}
TraceEvent("DataDistributor_Died", di.id()).error(err, true);
TraceEvent("DataDistributorDied", di.id()).error(err, true);
}
return Void();
@ -3842,7 +3971,7 @@ TEST_CASE("DataDistribution/AddTeamsBestOf/UseMachineID") {
Reference<IReplicationPolicy> policy = Reference<IReplicationPolicy>(new PolicyAcross(teamSize, "zoneid", Reference<IReplicationPolicy>(new PolicyOne())));
state DDTeamCollection* collection = testMachineTeamCollection(teamSize, policy, processSize);
collection->addTeamsBestOf(30, desiredTeams, maxTeams);
collection->addTeamsBestOf(30, desiredTeams, maxTeams, 30);
ASSERT(collection->sanityCheckTeams() == true);
@ -3867,8 +3996,8 @@ TEST_CASE("DataDistribution/AddTeamsBestOf/NotUseMachineID") {
return Void();
}
collection->addBestMachineTeams(30); // Create machine teams to help debug
collection->addTeamsBestOf(30, desiredTeams, maxTeams);
collection->addBestMachineTeams(30, 30); // Create machine teams to help debug
collection->addTeamsBestOf(30, desiredTeams, maxTeams, 30);
collection->sanityCheckTeams(); // Server team may happen to be on the same machine team, although unlikely
if (collection) delete (collection);
@ -3883,7 +4012,7 @@ TEST_CASE("DataDistribution/AddAllTeams/isExhaustive") {
state int maxTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * processSize;
state DDTeamCollection* collection = testTeamCollection(3, policy, processSize);
int result = collection->addTeamsBestOf(200, desiredTeams, maxTeams);
int result = collection->addTeamsBestOf(200, desiredTeams, maxTeams, 200);
delete(collection);
@ -3903,11 +4032,11 @@ TEST_CASE("/DataDistribution/AddAllTeams/withLimit") {
state DDTeamCollection* collection = testTeamCollection(3, policy, processSize);
int result = collection->addTeamsBestOf(10, desiredTeams, maxTeams);
int result = collection->addTeamsBestOf(10, desiredTeams, maxTeams, 10);
delete(collection);
ASSERT(result == 10);
ASSERT(result >= 10);
return Void();
}
@ -3923,9 +4052,9 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/SkippingBusyServers") {
collection->addTeam(std::set<UID>({ UID(1, 0), UID(2, 0), UID(3, 0) }), true);
collection->addTeam(std::set<UID>({ UID(1, 0), UID(3, 0), UID(4, 0) }), true);
int result = collection->addTeamsBestOf(8, desiredTeams, maxTeams);
int result = collection->addTeamsBestOf(8, desiredTeams, maxTeams, 8);
ASSERT(result == 8);
ASSERT(result >= 8);
for(auto process = collection->server_info.begin(); process != collection->server_info.end(); process++) {
auto teamCount = process->second->teams.size();
@ -3953,8 +4082,8 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/NotEnoughServers") {
collection->addTeam(std::set<UID>({ UID(1, 0), UID(2, 0), UID(3, 0) }), true);
collection->addTeam(std::set<UID>({ UID(1, 0), UID(3, 0), UID(4, 0) }), true);
collection->addBestMachineTeams(10);
int result = collection->addTeamsBestOf(10, desiredTeams, maxTeams);
collection->addBestMachineTeams(10, 10);
int result = collection->addTeamsBestOf(10, desiredTeams, maxTeams, 10);
if (collection->machineTeams.size() != 10 || result != 8) {
collection->traceAllInfo(true); // Debug message


@ -512,9 +512,9 @@ struct DDQueueData {
// FIXME: is the merge case needed
if( input.priority == PRIORITY_MERGE_SHARD ) {
wait( delay( 0.5, TaskDataDistribution - 2 ) );
wait( delay( 0.5, decrementPriority(decrementPriority(TaskPriority::DataDistribution)) ) );
} else {
wait( delay( 0.0001, TaskDataDistributionLaunch ) );
wait( delay( 0.0001, TaskPriority::DataDistributionLaunch ) );
}
loop {
@ -933,7 +933,7 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
.detail("Count", stuckCount)
.detail("TeamCollectionId", tciIndex)
.detail("NumOfTeamCollections", self->teamCollections.size());
wait( delay( SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskDataDistributionLaunch ) );
wait( delay( SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskPriority::DataDistributionLaunch ) );
}
state std::vector<UID> destIds;
@ -993,7 +993,7 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
state Error error = success();
state Promise<Void> dataMovementComplete;
state Future<Void> doMoveKeys = moveKeys(self->cx, rd.keys, destIds, healthyIds, self->lock, dataMovementComplete, &self->startMoveKeysParallelismLock, &self->finishMoveKeysParallelismLock, self->teamCollections.size() > 1, relocateShardInterval.pairID );
state Future<Void> pollHealth = signalledTransferComplete ? Never() : delay( SERVER_KNOBS->HEALTH_POLL_TIME, TaskDataDistributionLaunch );
state Future<Void> pollHealth = signalledTransferComplete ? Never() : delay( SERVER_KNOBS->HEALTH_POLL_TIME, TaskPriority::DataDistributionLaunch );
try {
loop {
choose {
@ -1016,7 +1016,7 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
self->dataTransferComplete.send(rd);
}
}
pollHealth = signalledTransferComplete ? Never() : delay( SERVER_KNOBS->HEALTH_POLL_TIME, TaskDataDistributionLaunch );
pollHealth = signalledTransferComplete ? Never() : delay( SERVER_KNOBS->HEALTH_POLL_TIME, TaskPriority::DataDistributionLaunch );
}
when( wait( signalledTransferComplete ? Never() : dataMovementComplete.getFuture() ) ) {
self->fetchKeysComplete.insert( rd );
@ -1066,7 +1066,7 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
} else {
TEST(true); // move to removed server
healthyDestinations.addDataInFlightToTeam( -metrics.bytes );
wait( delay( SERVER_KNOBS->RETRY_RELOCATESHARD_DELAY, TaskDataDistributionLaunch ) );
wait( delay( SERVER_KNOBS->RETRY_RELOCATESHARD_DELAY, TaskPriority::DataDistributionLaunch ) );
}
}
} catch (Error& e) {
@ -1125,7 +1125,7 @@ ACTOR Future<Void> BgDDMountainChopper( DDQueueData* self, int teamCollectionInd
state double checkDelay = SERVER_KNOBS->BG_DD_POLLING_INTERVAL;
state int resetCount = SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT;
loop {
wait( delay(checkDelay, TaskDataDistributionLaunch) );
wait( delay(checkDelay, TaskPriority::DataDistributionLaunch) );
if (self->priority_relocations[PRIORITY_REBALANCE_OVERUTILIZED_TEAM] < SERVER_KNOBS->DD_REBALANCE_PARALLELISM) {
state Optional<Reference<IDataDistributionTeam>> randomTeam = wait( brokenPromiseToNever( self->teamCollections[teamCollectionIndex].getTeam.getReply( GetTeamRequest( true, false, true ) ) ) );
if( randomTeam.present() ) {
@ -1160,7 +1160,7 @@ ACTOR Future<Void> BgDDValleyFiller( DDQueueData* self, int teamCollectionIndex)
state double checkDelay = SERVER_KNOBS->BG_DD_POLLING_INTERVAL;
state int resetCount = SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT;
loop {
wait( delay(checkDelay, TaskDataDistributionLaunch) );
wait( delay(checkDelay, TaskPriority::DataDistributionLaunch) );
if (self->priority_relocations[PRIORITY_REBALANCE_UNDERUTILIZED_TEAM] < SERVER_KNOBS->DD_REBALANCE_PARALLELISM) {
state Optional<Reference<IDataDistributionTeam>> randomTeam = wait( brokenPromiseToNever( self->teamCollections[teamCollectionIndex].getTeam.getReply( GetTeamRequest( true, false, false ) ) ) );
if( randomTeam.present() ) {
@ -1244,7 +1244,7 @@ ACTOR Future<Void> dataDistributionQueue(
bool wasEmpty = serversToLaunchFrom.empty();
self.queueRelocation( rs, serversToLaunchFrom );
if(wasEmpty && !serversToLaunchFrom.empty())
launchQueuedWorkTimeout = delay(0, TaskDataDistributionLaunch);
launchQueuedWorkTimeout = delay(0, TaskPriority::DataDistributionLaunch);
}
when ( wait(launchQueuedWorkTimeout) ) {
self.launchQueuedWork( serversToLaunchFrom );
@ -1258,7 +1258,7 @@ ACTOR Future<Void> dataDistributionQueue(
when ( RelocateData done = waitNext( self.dataTransferComplete.getFuture() ) ) {
complete( done, self.busymap );
if(serversToLaunchFrom.empty() && !done.src.empty())
launchQueuedWorkTimeout = delay(0, TaskDataDistributionLaunch);
launchQueuedWorkTimeout = delay(0, TaskPriority::DataDistributionLaunch);
serversToLaunchFrom.insert(done.src.begin(), done.src.end());
}
when ( RelocateData done = waitNext( self.relocationComplete.getFuture() ) ) {
@ -1266,7 +1266,7 @@ ACTOR Future<Void> dataDistributionQueue(
self.finishRelocation(done.priority);
self.fetchKeysComplete.erase( done );
//self.logRelocation( done, "ShardRelocatorDone" );
actors.add( tag( delay(0, TaskDataDistributionLaunch), done.keys, rangesComplete ) );
actors.add( tag( delay(0, TaskPriority::DataDistributionLaunch), done.keys, rangesComplete ) );
if( g_network->isSimulated() && debug_isCheckRelocationDuration() && now() - done.startTime > 60 ) {
TraceEvent(SevWarnAlways, "RelocationDurationTooLong").detail("Duration", now() - done.startTime);
debug_setCheckRelocationDuration(false);


@ -140,7 +140,7 @@ ACTOR Future<Void> trackShardBytes(
Reference<AsyncVar<Optional<StorageMetrics>>> shardSize,
bool addToSizeEstimate = true)
{
wait( delay( 0, TaskDataDistribution ) );
wait( delay( 0, TaskPriority::DataDistribution ) );
/*TraceEvent("TrackShardBytesStarting")
.detail("TrackerID", trackerID)
@ -260,7 +260,7 @@ ACTOR Future<Void> changeSizes( DataDistributionTracker* self, KeyRangeRef keys,
}
wait( waitForAll( sizes ) );
wait( yield(TaskDataDistribution) );
wait( yield(TaskPriority::DataDistribution) );
int64_t newShardsStartingSize = 0;
for ( int i = 0; i < sizes.size(); i++ )
@ -281,7 +281,7 @@ struct HasBeenTrueFor : NonCopyable {
Future<Void> set() {
if( !trigger.isValid() ) {
cleared = Promise<Void>();
trigger = delayJittered( SERVER_KNOBS->DD_MERGE_COALESCE_DELAY, TaskDataDistribution - 1 ) || cleared.getFuture();
trigger = delayJittered( SERVER_KNOBS->DD_MERGE_COALESCE_DELAY, decrementPriority(TaskPriority::DataDistribution) ) || cleared.getFuture();
}
return trigger;
}
@ -361,7 +361,7 @@ ACTOR Future<Void> shardSplitter(
self->sizeChanges.add( changeSizes( self, keys, shardSize->get().get().bytes ) );
} else {
wait( delay(1.0, TaskDataDistribution) ); //In case the split point was off due to a discrepancy between storage servers
wait( delay(1.0, TaskPriority::DataDistribution) ); //In case the split point was off due to a discrepancy between storage servers
}
return Void();
}
@ -529,7 +529,7 @@ ACTOR Future<Void> shardTracker(
wait( yieldedFuture(self->maxShardSize->onChange()) );
// Since maxShardSize will become present for all shards at once, avoid slow tasks with a short delay
wait( delay( 0, TaskDataDistribution ) );
wait( delay( 0, TaskPriority::DataDistribution ) );
/*TraceEvent("ShardTracker", self->distributorId)
.detail("Begin", keys.begin)
@ -546,7 +546,7 @@ ACTOR Future<Void> shardTracker(
// We could have a lot of actors being released from the previous wait at the same time. Immediately calling
// delay(0) mitigates the resulting SlowTask
wait( delay(0, TaskDataDistribution) );
wait( delay(0, TaskPriority::DataDistribution) );
}
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled)
@ -593,12 +593,12 @@ ACTOR Future<Void> trackInitialShards(DataDistributionTracker *self, Reference<I
//This line reduces the priority of shard initialization to prevent interference with failure monitoring.
//SOMEDAY: Figure out what this priority should actually be
wait( delay( 0.0, TaskDataDistribution ) );
wait( delay( 0.0, TaskPriority::DataDistribution ) );
state int s;
for(s=0; s<initData->shards.size()-1; s++) {
restartShardTrackers( self, KeyRangeRef( initData->shards[s].key, initData->shards[s+1].key ) );
wait( yield( TaskDataDistribution ) );
wait( yield( TaskPriority::DataDistribution ) );
}
Future<Void> initialSize = changeSizes( self, KeyRangeRef(allKeys.begin, allKeys.end), 0 );


@ -1937,8 +1937,8 @@ KeyValueStoreSQLite::KeyValueStoreSQLite(std::string const& filename, UID id, Ke
readCursors.resize(64); //< number of read threads
sqlite3_soft_heap_limit64( SERVER_KNOBS->SOFT_HEAP_LIMIT ); // SOMEDAY: Is this a performance issue? Should we drop the cache sizes for individual threads?
int taskId = g_network->getCurrentTask();
g_network->setCurrentTask(TaskDiskWrite);
TaskPriority taskId = g_network->getCurrentTask();
g_network->setCurrentTask(TaskPriority::DiskWrite);
writeThread->addThread( new Writer(filename, type==KeyValueStoreType::SSD_BTREE_V2, checkChecksums, checkIntegrity, writesComplete, springCleaningStats, diskBytesUsed, freeListPages, id, &readCursors) );
g_network->setCurrentTask(taskId);
auto p = new Writer::InitAction();
@ -1963,8 +1963,8 @@ StorageBytes KeyValueStoreSQLite::getStorageBytes() {
void KeyValueStoreSQLite::startReadThreads() {
int nReadThreads = readCursors.size();
int taskId = g_network->getCurrentTask();
g_network->setCurrentTask(TaskDiskRead);
TaskPriority taskId = g_network->getCurrentTask();
g_network->setCurrentTask(TaskPriority::DiskRead);
for(int i=0; i<nReadThreads; i++)
readThreads->addThread( new Reader(filename, type==KeyValueStoreType::SSD_BTREE_V2, readsComplete, logID, &readCursors[i]) );
g_network->setCurrentTask(taskId);
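Both KeyValueStoreSQLite call sites follow the same save/set/restore dance around thread creation. A hypothetical RAII guard, not present in this diff, would make the restore automatic; sketch under that assumption:

// Hypothetical helper, not in the codebase as far as this diff shows; it
// relies only on the getCurrentTask()/setCurrentTask() calls used above.
struct TaskPriorityGuard {
	TaskPriority saved;
	explicit TaskPriorityGuard(TaskPriority p) : saved(g_network->getCurrentTask()) {
		g_network->setCurrentTask(p); // run the enclosed scope at priority p
	}
	~TaskPriorityGuard() { g_network->setCurrentTask(saved); } // always restore
};

// Usage at the call sites above would collapse to:
//   { TaskPriorityGuard g(TaskPriority::DiskRead); readThreads->addThread(...); }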


@ -68,6 +68,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( MAX_QUEUE_COMMIT_BYTES, 15e6 ); if( randomize && BUGGIFY ) MAX_QUEUE_COMMIT_BYTES = 5000;
init( VERSIONS_PER_BATCH, VERSIONS_PER_SECOND/20 ); if( randomize && BUGGIFY ) VERSIONS_PER_BATCH = std::max<int64_t>(1,VERSIONS_PER_SECOND/1000);
init( CONCURRENT_LOG_ROUTER_READS, 1 );
init( LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED, 1 ); if( randomize && BUGGIFY ) LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED = 0;
init( DISK_QUEUE_ADAPTER_MIN_SWITCH_TIME, 1.0 );
init( DISK_QUEUE_ADAPTER_MAX_SWITCH_TIME, 5.0 );
init( TLOG_SPILL_REFERENCE_MAX_PEEK_MEMORY_BYTES, 2e9 ); if ( randomize && BUGGIFY ) TLOG_SPILL_REFERENCE_MAX_PEEK_MEMORY_BYTES = 2e6;
@ -282,6 +283,8 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( PROXY_SPIN_DELAY, 0.01 );
init( UPDATE_REMOTE_LOG_VERSION_INTERVAL, 2.0 );
init( MAX_TXS_POP_VERSION_HISTORY, 1e5 );
init( PROXY_FORWARD_DELAY, 10.0 );
init( MAX_FORWARD_MESSAGES, 1e6 ); if( randomize && BUGGIFY ) MAX_FORWARD_MESSAGES = 10;
// Master Server
// masterCommitter() in the master server will allow lower priority tasks (e.g. DataDistribution)
@ -393,6 +396,15 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( MAX_TL_SS_VERSION_DIFFERENCE_BATCH, 1e99 );
init( MAX_MACHINES_FALLING_BEHIND, 1 );
init( MAX_TPS_HISTORY_SAMPLES, 600 );
init( NEEDED_TPS_HISTORY_SAMPLES, 200 );
init( TARGET_DURABILITY_LAG_VERSIONS, 200e6 );
init( TARGET_DURABILITY_LAG_VERSIONS_BATCH, 100e6 );
init( DURABILITY_LAG_UNLIMITED_THRESHOLD, 50e6 );
init( INITIAL_DURABILITY_LAG_MULTIPLIER, 1.02 );
init( DURABILITY_LAG_REDUCTION_RATE, 0.9999 );
init( DURABILITY_LAG_INCREASE_RATE, 1.001 );
//Storage Metrics
init( STORAGE_METRICS_AVERAGE_INTERVAL, 120.0 );
init( STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS, 1000.0 / STORAGE_METRICS_AVERAGE_INTERVAL ); // milliHz!
@ -411,6 +423,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( BUGGIFY_BLOCK_BYTES, 10000 );
init( STORAGE_COMMIT_BYTES, 10000000 ); if( randomize && BUGGIFY ) STORAGE_COMMIT_BYTES = 2000000;
init( STORAGE_DURABILITY_LAG_REJECT_THRESHOLD, 0.25 );
init( STORAGE_DURABILITY_LAG_MIN_RATE, 0.1 );
init( STORAGE_COMMIT_INTERVAL, 0.5 ); if( randomize && BUGGIFY ) STORAGE_COMMIT_INTERVAL = 2.0;
init( UPDATE_SHARD_VERSION_INTERVAL, 0.25 ); if( randomize && BUGGIFY ) UPDATE_SHARD_VERSION_INTERVAL = 1.0;
init( BYTE_SAMPLING_FACTOR, 250 ); //cannot buggify because of differences in restarting tests
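The new DURABILITY_LAG_* knobs parameterize a multiplicative-decrease/increase control loop over a durability-lag-based rate limit. The ratekeeper code is not part of this excerpt, so the following is only a sketch of the assumed controller shape:

#include <cstdint>
#include <limits>

// Assumed structure: `limit` starts at INITIAL_DURABILITY_LAG_MULTIPLIER times
// some base rate, and worstLag is the worst storage-server durability lag in
// versions, sampled once per ratekeeper iteration.
void updateDurabilityLagLimit(double& limit, int64_t worstLag) {
	if (worstLag < SERVER_KNOBS->DURABILITY_LAG_UNLIMITED_THRESHOLD) {
		limit = std::numeric_limits<double>::infinity();      // stop throttling
	} else if (worstLag > SERVER_KNOBS->TARGET_DURABILITY_LAG_VERSIONS) {
		limit *= SERVER_KNOBS->DURABILITY_LAG_REDUCTION_RATE; // back off (0.9999)
	} else {
		limit *= SERVER_KNOBS->DURABILITY_LAG_INCREASE_RATE;  // recover (1.001)
	}
}

Assuming each rate is applied once per iteration, the asymmetry (0.01% down, 0.1% up) means the limit decays slowly while lag exceeds the target and recovers roughly ten times faster once lag falls back into the 50e6..200e6 band.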


@ -72,6 +72,7 @@ public:
int64_t MAX_QUEUE_COMMIT_BYTES;
int64_t VERSIONS_PER_BATCH;
int CONCURRENT_LOG_ROUTER_READS;
int LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED; // 0==peek from primary, non-zero==peek from satellites
double DISK_QUEUE_ADAPTER_MIN_SWITCH_TIME;
double DISK_QUEUE_ADAPTER_MAX_SWITCH_TIME;
int64_t TLOG_SPILL_REFERENCE_MAX_PEEK_MEMORY_BYTES;
@ -227,6 +228,8 @@ public:
double PROXY_SPIN_DELAY;
double UPDATE_REMOTE_LOG_VERSION_INTERVAL;
int MAX_TXS_POP_VERSION_HISTORY;
double PROXY_FORWARD_DELAY;
int MAX_FORWARD_MESSAGES;
// Master Server
double COMMIT_SLEEP_TIME;
@ -329,6 +332,15 @@ public:
double MAX_TL_SS_VERSION_DIFFERENCE_BATCH;
int MAX_MACHINES_FALLING_BEHIND;
int MAX_TPS_HISTORY_SAMPLES;
int NEEDED_TPS_HISTORY_SAMPLES;
int64_t TARGET_DURABILITY_LAG_VERSIONS;
int64_t TARGET_DURABILITY_LAG_VERSIONS_BATCH;
int64_t DURABILITY_LAG_UNLIMITED_THRESHOLD;
double INITIAL_DURABILITY_LAG_MULTIPLIER;
double DURABILITY_LAG_REDUCTION_RATE;
double DURABILITY_LAG_INCREASE_RATE;
//Storage Metrics
double STORAGE_METRICS_AVERAGE_INTERVAL;
double STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS;
@ -349,6 +361,7 @@ public:
int64_t STORAGE_DURABILITY_LAG_HARD_MAX;
int64_t STORAGE_DURABILITY_LAG_SOFT_MAX;
double STORAGE_DURABILITY_LAG_REJECT_THRESHOLD;
double STORAGE_DURABILITY_LAG_MIN_RATE;
int STORAGE_COMMIT_BYTES;
double STORAGE_COMMIT_INTERVAL;
double UPDATE_SHARD_VERSION_INTERVAL;


@ -30,7 +30,7 @@ Optional<std::pair<LeaderInfo, bool>> getLeader( const vector<Optional<LeaderInf
ACTOR Future<Void> submitCandidacy( Key key, LeaderElectionRegInterface coord, LeaderInfo myInfo, UID prevChangeID, Reference<AsyncVar<vector<Optional<LeaderInfo>>>> nominees, int index ) {
loop {
auto const& nom = nominees->get()[index];
Optional<LeaderInfo> li = wait( retryBrokenPromise( coord.candidacy, CandidacyRequest( key, myInfo, nom.present() ? nom.get().changeID : UID(), prevChangeID ), TaskCoordinationReply ) );
Optional<LeaderInfo> li = wait( retryBrokenPromise( coord.candidacy, CandidacyRequest( key, myInfo, nom.present() ? nom.get().changeID : UID(), prevChangeID ), TaskPriority::CoordinationReply ) );
if (li != nominees->get()[index]) {
vector<Optional<LeaderInfo>> v = nominees->get();
@ -150,7 +150,7 @@ ACTOR Future<Void> tryBecomeLeaderInternal(ServerCoordinators coordinators, Valu
// we might be breaking the leader election process for someone with better communications but lower ID, so change IDs.
if ((!leader.present() || !leader.get().second) && std::count( nominees->get().begin(), nominees->get().end(), myInfo )) {
if (!badCandidateTimeout.isValid())
badCandidateTimeout = delay( SERVER_KNOBS->POLLING_FREQUENCY*2, TaskCoordinationReply );
badCandidateTimeout = delay( SERVER_KNOBS->POLLING_FREQUENCY*2, TaskPriority::CoordinationReply );
} else
badCandidateTimeout = Future<Void>();
@ -183,12 +183,12 @@ ACTOR Future<Void> tryBecomeLeaderInternal(ServerCoordinators coordinators, Valu
state vector<Future<Void>> true_heartbeats;
state vector<Future<Void>> false_heartbeats;
for(int i=0; i<coordinators.leaderElectionServers.size(); i++) {
Future<bool> hb = retryBrokenPromise( coordinators.leaderElectionServers[i].leaderHeartbeat, LeaderHeartbeatRequest( coordinators.clusterKey, myInfo, prevChangeID ), TaskCoordinationReply );
Future<bool> hb = retryBrokenPromise( coordinators.leaderElectionServers[i].leaderHeartbeat, LeaderHeartbeatRequest( coordinators.clusterKey, myInfo, prevChangeID ), TaskPriority::CoordinationReply );
true_heartbeats.push_back( onEqual(hb, true) );
false_heartbeats.push_back( onEqual(hb, false) );
}
state Future<Void> rate = delay( SERVER_KNOBS->HEARTBEAT_FREQUENCY, TaskCoordinationReply ) || asyncPriorityInfo->onChange(); // SOMEDAY: Move to server side?
state Future<Void> rate = delay( SERVER_KNOBS->HEARTBEAT_FREQUENCY, TaskPriority::CoordinationReply ) || asyncPriorityInfo->onChange(); // SOMEDAY: Move to server side?
choose {
when ( wait( quorum( true_heartbeats, true_heartbeats.size()/2+1 ) ) ) {


@ -51,7 +51,7 @@ struct LogRouterData {
}
// Erase messages not needed to update *from* versions >= before (thus, messages with toversion <= before)
ACTOR Future<Void> eraseMessagesBefore( TagData *self, Version before, LogRouterData *tlogData, int taskID ) {
ACTOR Future<Void> eraseMessagesBefore( TagData *self, Version before, LogRouterData *tlogData, TaskPriority taskID ) {
while(!self->version_messages.empty() && self->version_messages.front().first < before) {
Version version = self->version_messages.front().first;
int64_t messagesErased = 0;
@ -68,7 +68,7 @@ struct LogRouterData {
return Void();
}
Future<Void> eraseMessagesBefore(Version before, LogRouterData *tlogData, int taskID) {
Future<Void> eraseMessagesBefore(Version before, LogRouterData *tlogData, TaskPriority taskID) {
return eraseMessagesBefore(this, before, tlogData, taskID);
}
};
@ -197,7 +197,7 @@ ACTOR Future<Void> waitForVersion( LogRouterData *self, Version ver ) {
while(self->minPopped.get() + SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS < ver) {
if(self->minPopped.get() + SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS > self->version.get()) {
self->version.set( self->minPopped.get() + SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS );
wait(yield(TaskTLogCommit));
wait(yield(TaskPriority::TLogCommit));
} else {
wait(self->minPopped.whenAtLeast((self->minPopped.get()+1)));
}
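For intuition, a worked example of the window logic above, with an assumed (not verified here) knob value:

// Assuming SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS == 5e6:
// with minPopped == 100e6 and an incoming commit at ver == 110e6, the loop
// first advances self->version only to 105e6 (minPopped + window) and then
// blocks on minPopped.whenAtLeast() until consumers pop; this bounds how
// much unpopped data the log router buffers ahead of its slowest consumer.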
@ -220,7 +220,7 @@ ACTOR Future<Void> pullAsyncData( LogRouterData *self ) {
loop {
loop {
choose {
when(wait( r ? r->getMore(TaskTLogCommit) : Never() ) ) {
when(wait( r ? r->getMore(TaskPriority::TLogCommit) : Never() ) ) {
break;
}
when( wait( dbInfoChange ) ) { //FIXME: does this actually happen?
@ -247,7 +247,7 @@ ACTOR Future<Void> pullAsyncData( LogRouterData *self ) {
commitMessages(self, ver, messages);
self->version.set( ver );
wait(yield(TaskTLogCommit));
wait(yield(TaskPriority::TLogCommit));
//TraceEvent("LogRouterVersion").detail("Ver",ver);
}
lastVer = ver;
@ -260,7 +260,7 @@ ACTOR Future<Void> pullAsyncData( LogRouterData *self ) {
wait( waitForVersion(self, ver) );
self->version.set( ver );
wait(yield(TaskTLogCommit));
wait(yield(TaskPriority::TLogCommit));
}
break;
}
@ -371,7 +371,7 @@ ACTOR Future<Void> logRouterPop( LogRouterData* self, TLogPopRequest req ) {
} else if (req.to > tagData->popped) {
tagData->popped = req.to;
tagData->durableKnownCommittedVersion = req.durableKnownCommittedVersion;
wait(tagData->eraseMessagesBefore( req.to, self, TaskTLogPop ));
wait(tagData->eraseMessagesBefore( req.to, self, TaskPriority::TLogPop ));
}
state Version minPopped = std::numeric_limits<Version>::max();
@ -385,7 +385,7 @@ ACTOR Future<Void> logRouterPop( LogRouterData* self, TLogPopRequest req ) {
while(!self->messageBlocks.empty() && self->messageBlocks.front().first < minPopped) {
self->messageBlocks.pop_front();
wait(yield(TaskTLogPop));
wait(yield(TaskPriority::TLogPop));
}
self->poppedVersion = std::min(minKnownCommittedVersion, self->minKnownCommittedVersion);


@ -89,9 +89,9 @@ public:
return result;
}
void populateSatelliteTagLocations(int logRouterTags, int oldLogRouterTags) {
void populateSatelliteTagLocations(int logRouterTags, int oldLogRouterTags, int txsTags, int oldTxsTags) {
satelliteTagLocations.clear();
satelliteTagLocations.resize(std::max(logRouterTags,oldLogRouterTags) + 1);
satelliteTagLocations.resize(std::max({logRouterTags,oldLogRouterTags,txsTags,oldTxsTags})+1);
std::map<int,int> server_usedBest;
std::set<std::pair<int,int>> used_servers;
@ -235,7 +235,7 @@ public:
bool allLocations = false) {
if(locality == tagLocalitySatellite) {
for(auto& t : tags) {
if(t == txsTag || t.locality == tagLocalityLogRouter) {
if(t == txsTag || t.locality == tagLocalityTxs || t.locality == tagLocalityLogRouter) {
for(int loc : satelliteTagLocations[t == txsTag ? 0 : t.id + 1]) {
locations.push_back(locationOffset + loc);
}
@ -341,7 +341,7 @@ struct ILogSystem {
//returns immediately if hasMessage() returns true.
//returns when either the result of hasMessage() or version() has changed, or a cursor has internally been exhausted.
virtual Future<Void> getMore(int taskID = TaskTLogPeekReply) = 0;
virtual Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply) = 0;
//returns when the failure monitor detects that the servers associated with the cursor are failed
virtual Future<Void> onFailed() = 0;
@ -407,7 +407,7 @@ struct ILogSystem {
virtual StringRef getMessageWithTags();
virtual const std::vector<Tag>& getTags();
virtual void advanceTo(LogMessageVersion n);
virtual Future<Void> getMore(int taskID = TaskTLogPeekReply);
virtual Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply);
virtual Future<Void> onFailed();
virtual bool isActive();
virtual bool isExhausted();
@ -455,7 +455,7 @@ struct ILogSystem {
virtual StringRef getMessageWithTags();
virtual const std::vector<Tag>& getTags();
virtual void advanceTo(LogMessageVersion n);
virtual Future<Void> getMore(int taskID = TaskTLogPeekReply);
virtual Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply);
virtual Future<Void> onFailed();
virtual bool isActive();
virtual bool isExhausted();
@ -500,7 +500,7 @@ struct ILogSystem {
virtual StringRef getMessageWithTags();
virtual const std::vector<Tag>& getTags();
virtual void advanceTo(LogMessageVersion n);
virtual Future<Void> getMore(int taskID = TaskTLogPeekReply);
virtual Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply);
virtual Future<Void> onFailed();
virtual bool isActive();
virtual bool isExhausted();
@ -521,8 +521,9 @@ struct ILogSystem {
std::vector<Reference<IPeekCursor>> cursors;
std::vector<LogMessageVersion> epochEnds;
Version poppedVersion;
bool needsPopped;
MultiCursor( std::vector<Reference<IPeekCursor>> cursors, std::vector<LogMessageVersion> epochEnds );
MultiCursor( std::vector<Reference<IPeekCursor>> cursors, std::vector<LogMessageVersion> epochEnds, bool needsPopped = true );
virtual Reference<IPeekCursor> cloneNoMore();
virtual void setProtocolVersion( ProtocolVersion version );
@ -534,7 +535,7 @@ struct ILogSystem {
virtual StringRef getMessageWithTags();
virtual const std::vector<Tag>& getTags();
virtual void advanceTo(LogMessageVersion n);
virtual Future<Void> getMore(int taskID = TaskTLogPeekReply);
virtual Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply);
virtual Future<Void> onFailed();
virtual bool isActive();
virtual bool isExhausted();
@ -576,13 +577,14 @@ struct ILogSystem {
LogMessageVersion messageVersion;
Version end;
bool hasNextMessage;
bool withTags;
//FIXME: collectTags is needed to support upgrades from 5.X to 6.0. Remove this code when we no longer support that upgrade.
bool collectTags;
std::vector<Tag> tags;
void combineMessages();
BufferedCursor( std::vector<Reference<IPeekCursor>> cursors, Version begin, Version end, bool collectTags );
BufferedCursor( std::vector<Reference<IPeekCursor>> cursors, Version begin, Version end, bool withTags, bool collectTags = false );
virtual Reference<IPeekCursor> cloneNoMore();
virtual void setProtocolVersion( ProtocolVersion version );
@ -594,7 +596,7 @@ struct ILogSystem {
virtual StringRef getMessageWithTags();
virtual const std::vector<Tag>& getTags();
virtual void advanceTo(LogMessageVersion n);
virtual Future<Void> getMore(int taskID = TaskTLogPeekReply);
virtual Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply);
virtual Future<Void> onFailed();
virtual bool isActive();
virtual bool isExhausted();
@ -653,13 +655,15 @@ struct ILogSystem {
// Same contract as peek(), but can only peek from the logs elected in the same generation.
// If the preferred log server is down, a different log from the same generation will merge results locally before sending them to the log router.
virtual Reference<IPeekCursor> peekSpecial( UID dbgid, Version begin, Tag tag, int8_t peekLocality, Version localEnd ) = 0;
// Same contract as peek(), but it allows specifying a preferred peek locality for tags that do not have locality
virtual Reference<IPeekCursor> peekTxs( UID dbgid, Version begin, int8_t peekLocality, Version localEnd ) = 0;
// Same contract as peek(), but only for peeking the txsLocality. It allows specifying a preferred peek locality.
virtual Version getKnownCommittedVersion() = 0;
virtual Future<Void> onKnownCommittedVersionChange() = 0;
virtual void popTxs( Version upTo, int8_t popLocality = tagLocalityInvalid ) = 0;
virtual void pop( Version upTo, Tag tag, Version knownCommittedVersion = 0, int8_t popLocality = tagLocalityInvalid ) = 0;
// Permits, but does not require, the log subsystem to strip `tag` from any or all messages with message versions < (upTo,0)
// The popping of any given message may be arbitrarily delayed.
@ -706,6 +710,8 @@ struct ILogSystem {
virtual Tag getRandomRouterTag() = 0;
virtual Tag getRandomTxsTag() = 0;
virtual void stopRejoins() = 0;
// Returns the pseudo tag to be popped for the given process class. If the
@ -753,6 +759,10 @@ struct LogPushData : NonCopyable {
}
}
void addTxsTag() {
next_message_tags.push_back( logSystem->getRandomTxsTag() );
}
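A hypothetical call site for the new helper; LogPushData's exact message-writing API is not shown in this excerpt, so the final call below is illustrative only:

// Hypothetical usage sketch: tag the next transaction-state-store mutation
// with one randomly chosen txs tag, spreading txs traffic across all txsTags
// instead of funneling it through the single legacy txsTag.
LogPushData toCommit( logSystem );
toCommit.addTxsTag();                 // internally pushes logSystem->getRandomTxsTag()
toCommit.writeMessage( txsMutation ); // illustrative call name; see note above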
// addTag() adds a tag for the *next* message to be added
void addTag( Tag tag ) {
next_message_tags.push_back( tag );


@ -157,9 +157,10 @@ struct OldTLogConf {
std::vector<TLogSet> tLogs;
Version epochEnd;
int32_t logRouterTags;
int32_t txsTags;
std::set<int8_t> pseudoLocalities;
OldTLogConf() : epochEnd(0), logRouterTags(0) {}
OldTLogConf() : epochEnd(0), logRouterTags(0), txsTags(0) {}
explicit OldTLogConf(const OldLogData&);
std::string toString() const {
@ -167,7 +168,7 @@ struct OldTLogConf {
}
bool operator == ( const OldTLogConf& rhs ) const {
return tLogs == rhs.tLogs && epochEnd == rhs.epochEnd && logRouterTags == rhs.logRouterTags && pseudoLocalities == rhs.pseudoLocalities;
return tLogs == rhs.tLogs && epochEnd == rhs.epochEnd && logRouterTags == rhs.logRouterTags && txsTags == rhs.txsTags && pseudoLocalities == rhs.pseudoLocalities;
}
bool isEqualIds(OldTLogConf const& r) const {
@ -184,7 +185,7 @@ struct OldTLogConf {
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, tLogs, epochEnd, logRouterTags, pseudoLocalities);
serializer(ar, tLogs, epochEnd, logRouterTags, pseudoLocalities, txsTags);
}
};
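Note that txsTags is serialized after pseudoLocalities even though it is declared earlier in the struct. A plausible reading, stated here as an assumption rather than fact, is that the serializer derives field identity from position, so new fields must be appended at the end to stay compatible with data written before the field existed:

// Convention sketch (assumption, not taken from this diff):
struct Example {
	int a, b;
	int c; // field added in a later version
	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, a, b, c); // c appended last; never inserted between a and b
	}
};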
@ -199,6 +200,7 @@ struct LogSystemConfig {
LogSystemType logSystemType;
std::vector<TLogSet> tLogs;
int32_t logRouterTags;
int32_t txsTags;
std::vector<OldTLogConf> oldTLogs;
int32_t expectedLogSets;
UID recruitmentID;
@ -206,7 +208,7 @@ struct LogSystemConfig {
Optional<Version> recoveredAt;
std::set<int8_t> pseudoLocalities;
LogSystemConfig() : logSystemType(LogSystemType::empty), logRouterTags(0), expectedLogSets(0), stopped(false) {}
LogSystemConfig() : logSystemType(LogSystemType::empty), logRouterTags(0), txsTags(0), expectedLogSets(0), stopped(false) {}
std::string toString() const {
return format("type: %d oldGenerations: %d tags: %d %s", logSystemType, oldTLogs.size(), logRouterTags, describe(tLogs).c_str());
@ -327,7 +329,7 @@ struct LogSystemConfig {
bool operator == ( const LogSystemConfig& rhs ) const { return isEqual(rhs); }
bool isEqual(LogSystemConfig const& r) const {
return logSystemType == r.logSystemType && tLogs == r.tLogs && oldTLogs == r.oldTLogs && expectedLogSets == r.expectedLogSets && logRouterTags == r.logRouterTags && recruitmentID == r.recruitmentID && stopped == r.stopped && recoveredAt == r.recoveredAt && pseudoLocalities == r.pseudoLocalities;
return logSystemType == r.logSystemType && tLogs == r.tLogs && oldTLogs == r.oldTLogs && expectedLogSets == r.expectedLogSets && logRouterTags == r.logRouterTags && txsTags == r.txsTags && recruitmentID == r.recruitmentID && stopped == r.stopped && recoveredAt == r.recoveredAt && pseudoLocalities == r.pseudoLocalities;
}
bool isEqualIds(LogSystemConfig const& r) const {
@ -358,7 +360,7 @@ struct LogSystemConfig {
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, logSystemType, tLogs, logRouterTags, oldTLogs, expectedLogSets, recruitmentID, stopped, recoveredAt, pseudoLocalities);
serializer(ar, logSystemType, tLogs, logRouterTags, oldTLogs, expectedLogSets, recruitmentID, stopped, recoveredAt, pseudoLocalities, txsTags);
}
};


@ -42,30 +42,32 @@ public:
break;
}
when( wait( self->localityChanged ) ) {
self->cursor = self->logSystem->peekSpecial( UID(), self->recoveryLoc, self->tag, self->peekLocality ? self->peekLocality->get().primaryLocality : tagLocalityInvalid, self->peekLocality ? self->peekLocality->get().knownCommittedVersion : invalidVersion );
self->cursor = self->logSystem->peekTxs( UID(), self->recoveryLoc, self->peekLocality ? self->peekLocality->get().primaryLocality : tagLocalityInvalid, self->peekLocality ? self->peekLocality->get().knownCommittedVersion : invalidVersion );
self->localityChanged = self->peekLocality->onChange();
}
when( wait( delay(self->peekTypeSwitches==0 ? SERVER_KNOBS->DISK_QUEUE_ADAPTER_MIN_SWITCH_TIME : SERVER_KNOBS->DISK_QUEUE_ADAPTER_MAX_SWITCH_TIME)) ) {
self->peekTypeSwitches++;
if(self->peekTypeSwitches%3==1) {
self->cursor = self->logSystem->peek( UID(), self->recoveryLoc, self->tag, true );
self->cursor = self->logSystem->peekTxs( UID(), self->recoveryLoc, tagLocalityInvalid, invalidVersion );
self->localityChanged = Never();
} else if(self->peekTypeSwitches%3==2) {
self->cursor = self->logSystem->peekSpecial( UID(), self->recoveryLoc, self->tag, self->peekLocality ? self->peekLocality->get().secondaryLocality : tagLocalityInvalid, self->peekLocality ? self->peekLocality->get().knownCommittedVersion : invalidVersion );
self->cursor = self->logSystem->peekTxs( UID(), self->recoveryLoc, self->peekLocality ? self->peekLocality->get().secondaryLocality : tagLocalityInvalid, self->peekLocality ? self->peekLocality->get().knownCommittedVersion : invalidVersion );
self->localityChanged = self->peekLocality->onChange();
} else {
self->cursor = self->logSystem->peekSpecial( UID(), self->recoveryLoc, self->tag, self->peekLocality ? self->peekLocality->get().primaryLocality : tagLocalityInvalid, self->peekLocality ? self->peekLocality->get().knownCommittedVersion : invalidVersion );
self->cursor = self->logSystem->peekTxs( UID(), self->recoveryLoc, self->peekLocality ? self->peekLocality->get().primaryLocality : tagLocalityInvalid, self->peekLocality ? self->peekLocality->get().knownCommittedVersion : invalidVersion );
self->localityChanged = self->peekLocality->onChange();
}
}
}
}
TraceEvent("PeekNextGetMore").detail("Queue", self->recoveryQueue.size()).detail("Bytes", bytes).detail("Loc", self->recoveryLoc).detail("End", self->logSystem->getEnd());
TraceEvent("PeekNextGetMore").detail("Queue", self->recoveryQueue.size()).detail("Bytes", bytes).detail("Loc", self->recoveryLoc)
.detail("End", self->logSystem->getEnd()).detail("HasMessage", self->cursor->hasMessage()).detail("Version", self->cursor->version().version);
if(self->recoveryQueueDataSize == 0) {
self->recoveryQueueLoc = self->recoveryLoc;
}
if(!self->cursor->hasMessage()) {
self->recoveryLoc = self->cursor->version().version;
wait(yield());
continue;
}
}
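The modulo-3 rotation above cycles the txs peek between localities so that a peek wedged in one locality cannot stall recovery indefinitely. A standalone restatement of the policy (helper name and signature are mine):

	// tagLocalityInvalid means "no locality preference"; the rotation visits
	// it first so a stuck locality-specific cursor gets replaced quickly.
	int8_t chooseTxsPeekLocality(int peekTypeSwitches, int8_t primary, int8_t secondary) {
		switch (peekTypeSwitches % 3) {
		case 1:  return tagLocalityInvalid; // peek with no locality hint
		case 2:  return secondary;          // then try the other side
		default: return primary;            // steady state: primary locality
		}
	}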
@ -168,6 +170,6 @@ Future<LogSystemDiskQueueAdapter::CommitMessage> LogSystemDiskQueueAdapter::getC
return pcm.getFuture();
}
LogSystemDiskQueueAdapter* openDiskQueueAdapter( Reference<ILogSystem> logSystem, Tag tag, Reference<AsyncVar<PeekSpecialInfo>> peekLocality ) {
return new LogSystemDiskQueueAdapter( logSystem, tag, peekLocality );
LogSystemDiskQueueAdapter* openDiskQueueAdapter( Reference<ILogSystem> logSystem, Reference<AsyncVar<PeekTxsInfo>> peekLocality ) {
return new LogSystemDiskQueueAdapter( logSystem, peekLocality );
}

View File

@ -25,16 +25,16 @@
#include "fdbclient/FDBTypes.h"
#include "fdbserver/IDiskQueue.h"
struct PeekSpecialInfo {
struct PeekTxsInfo {
int8_t primaryLocality;
int8_t secondaryLocality;
Version knownCommittedVersion;
bool operator == (const PeekSpecialInfo& r) const {
bool operator == (const PeekTxsInfo& r) const {
return primaryLocality == r.primaryLocality && secondaryLocality == r.secondaryLocality && knownCommittedVersion == r.knownCommittedVersion;
}
PeekSpecialInfo(int8_t primaryLocality, int8_t secondaryLocality, Version knownCommittedVersion) : primaryLocality(primaryLocality), secondaryLocality(secondaryLocality), knownCommittedVersion(knownCommittedVersion) {}
PeekTxsInfo(int8_t primaryLocality, int8_t secondaryLocality, Version knownCommittedVersion) : primaryLocality(primaryLocality), secondaryLocality(secondaryLocality), knownCommittedVersion(knownCommittedVersion) {}
};
class LogSystemDiskQueueAdapter : public IDiskQueue {
@ -52,10 +52,10 @@ public:
// It does, however, peek the specified tag directly at recovery time.
LogSystemDiskQueueAdapter( Reference<ILogSystem> logSystem, Tag tag, Reference<AsyncVar<PeekSpecialInfo>> peekLocality, bool recover=true ) : logSystem(logSystem), tag(tag), peekLocality(peekLocality), enableRecovery(recover), recoveryLoc(1), recoveryQueueLoc(1), poppedUpTo(0), nextCommit(1), recoveryQueueDataSize(0), peekTypeSwitches(0) {
LogSystemDiskQueueAdapter( Reference<ILogSystem> logSystem, Reference<AsyncVar<PeekTxsInfo>> peekLocality, bool recover=true ) : logSystem(logSystem), peekLocality(peekLocality), enableRecovery(recover), recoveryLoc(1), recoveryQueueLoc(1), poppedUpTo(0), nextCommit(1), recoveryQueueDataSize(0), peekTypeSwitches(0) {
if (enableRecovery) {
localityChanged = peekLocality ? peekLocality->onChange() : Never();
cursor = logSystem->peekSpecial( UID(), 1, tag, peekLocality ? peekLocality->get().primaryLocality : tagLocalityInvalid, peekLocality ? peekLocality->get().knownCommittedVersion : invalidVersion );
cursor = logSystem->peekTxs( UID(), 1, peekLocality ? peekLocality->get().primaryLocality : tagLocalityInvalid, peekLocality ? peekLocality->get().knownCommittedVersion : invalidVersion );
}
}
@ -92,11 +92,10 @@ public:
virtual int getCommitOverhead() { return 0; } //SOMEDAY: could this be more accurate?
private:
Reference<AsyncVar<PeekSpecialInfo>> peekLocality;
Reference<AsyncVar<PeekTxsInfo>> peekLocality;
Future<Void> localityChanged;
Reference<ILogSystem::IPeekCursor> cursor;
int peekTypeSwitches;
Tag tag;
// Recovery state (used while readNext() is being called repeatedly)
bool enableRecovery;
@ -114,6 +113,6 @@ private:
friend class LogSystemDiskQueueAdapterImpl;
};
LogSystemDiskQueueAdapter* openDiskQueueAdapter( Reference<ILogSystem> logSystem, Tag tag, Reference<AsyncVar<PeekSpecialInfo>> peekLocality );
LogSystemDiskQueueAdapter* openDiskQueueAdapter( Reference<ILogSystem> logSystem, Reference<AsyncVar<PeekTxsInfo>> peekLocality );
#endif

View File

@ -133,7 +133,7 @@ void ILogSystem::ServerPeekCursor::advanceTo(LogMessageVersion n) {
}
}
ACTOR Future<Void> serverPeekParallelGetMore( ILogSystem::ServerPeekCursor* self, int taskID ) {
ACTOR Future<Void> serverPeekParallelGetMore( ILogSystem::ServerPeekCursor* self, TaskPriority taskID ) {
if( !self->interf || self->messageVersion >= self->end ) {
wait( Future<Void>(Never()));
throw internal_error();
@ -199,7 +199,7 @@ ACTOR Future<Void> serverPeekParallelGetMore( ILogSystem::ServerPeekCursor* self
}
}
ACTOR Future<Void> serverPeekGetMore( ILogSystem::ServerPeekCursor* self, int taskID ) {
ACTOR Future<Void> serverPeekGetMore( ILogSystem::ServerPeekCursor* self, TaskPriority taskID ) {
if( !self->interf || self->messageVersion >= self->end ) {
wait( Future<Void>(Never()));
throw internal_error();
@ -235,7 +235,7 @@ ACTOR Future<Void> serverPeekGetMore( ILogSystem::ServerPeekCursor* self, int ta
}
}
Future<Void> ILogSystem::ServerPeekCursor::getMore(int taskID) {
Future<Void> ILogSystem::ServerPeekCursor::getMore(TaskPriority taskID) {
//TraceEvent("SPC_GetMore", randomID).detail("HasMessage", hasMessage()).detail("More", !more.isValid() || more.isReady()).detail("MessageVersion", messageVersion.toString()).detail("End", end.toString());
if( hasMessage() )
return Void();
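The int-to-TaskPriority signature changes here repeat across the whole commit: task IDs become a scoped enum, so bare integers no longer compile at call sites. A minimal sketch of the pattern, using only priority names that appear in this diff (the real enum lives in flow and is much larger; membership and ordering here are illustrative):

	// Sketch, not the real definition.
	enum class TaskPriority {
		Low,
		DefaultOnMainThread,
		TLogSpilledPeekReply,
		UpdateStorage,
		FlushTrace,
	};
	// Call sites migrate mechanically: yield(TaskUpdateStorage) becomes
	// yield(TaskPriority::UpdateStorage).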
@ -252,7 +252,7 @@ Future<Void> ILogSystem::ServerPeekCursor::getMore(int taskID) {
ACTOR Future<Void> serverPeekOnFailed( ILogSystem::ServerPeekCursor* self ) {
loop {
choose {
when( wait( self->interf->get().present() ? IFailureMonitor::failureMonitor().onDisconnectOrFailure( self->interf->get().interf().peekMessages.getEndpoint() ) : Never() ) ) { return Void(); }
when( wait( self->interf->get().present() ? IFailureMonitor::failureMonitor().onStateEqual( self->interf->get().interf().peekMessages.getEndpoint(), FailureStatus() ) : Never() ) ) { return Void(); }
when( wait( self->interf->onChange() ) ) {}
}
}
@ -445,7 +445,7 @@ void ILogSystem::MergedPeekCursor::advanceTo(LogMessageVersion n) {
}
}
ACTOR Future<Void> mergedPeekGetMore(ILogSystem::MergedPeekCursor* self, LogMessageVersion startVersion, int taskID) {
ACTOR Future<Void> mergedPeekGetMore(ILogSystem::MergedPeekCursor* self, LogMessageVersion startVersion, TaskPriority taskID) {
loop {
//TraceEvent("MPC_GetMoreA", self->randomID).detail("Start", startVersion.toString());
if(self->bestServer >= 0 && self->serverCursors[self->bestServer]->isActive()) {
@ -466,7 +466,7 @@ ACTOR Future<Void> mergedPeekGetMore(ILogSystem::MergedPeekCursor* self, LogMess
}
}
Future<Void> ILogSystem::MergedPeekCursor::getMore(int taskID) {
Future<Void> ILogSystem::MergedPeekCursor::getMore(TaskPriority taskID) {
if(!serverCursors.size())
return Never();
@ -706,7 +706,7 @@ void ILogSystem::SetPeekCursor::advanceTo(LogMessageVersion n) {
}
}
ACTOR Future<Void> setPeekGetMore(ILogSystem::SetPeekCursor* self, LogMessageVersion startVersion, int taskID) {
ACTOR Future<Void> setPeekGetMore(ILogSystem::SetPeekCursor* self, LogMessageVersion startVersion, TaskPriority taskID) {
loop {
//TraceEvent("LPC_GetMore1", self->randomID).detail("Start", startVersion.toString()).detail("Tag", self->tag);
if(self->bestServer >= 0 && self->bestSet >= 0 && self->serverCursors[self->bestSet][self->bestServer]->isActive()) {
@ -767,7 +767,7 @@ ACTOR Future<Void> setPeekGetMore(ILogSystem::SetPeekCursor* self, LogMessageVer
}
}
Future<Void> ILogSystem::SetPeekCursor::getMore(int taskID) {
Future<Void> ILogSystem::SetPeekCursor::getMore(TaskPriority taskID) {
auto startVersion = version();
calcHasMessage();
if( hasMessage() )
@ -811,7 +811,7 @@ Version ILogSystem::SetPeekCursor::popped() {
return poppedVersion;
}
ILogSystem::MultiCursor::MultiCursor( std::vector<Reference<IPeekCursor>> cursors, std::vector<LogMessageVersion> epochEnds ) : cursors(cursors), epochEnds(epochEnds), poppedVersion(0) {
ILogSystem::MultiCursor::MultiCursor( std::vector<Reference<IPeekCursor>> cursors, std::vector<LogMessageVersion> epochEnds, bool needsPopped ) : cursors(cursors), epochEnds(epochEnds), needsPopped(needsPopped), poppedVersion(0) {
for(int i = 0; i < std::min<int>(cursors.size(),SERVER_KNOBS->MULTI_CURSOR_PRE_FETCH_LIMIT); i++) {
cursors[cursors.size()-i-1]->getMore();
}
@ -855,17 +855,17 @@ const std::vector<Tag>& ILogSystem::MultiCursor::getTags() {
void ILogSystem::MultiCursor::advanceTo(LogMessageVersion n) {
while( cursors.size() > 1 && n >= epochEnds.back() ) {
poppedVersion = std::max(poppedVersion, cursors.back()->popped());
if(needsPopped) poppedVersion = std::max(poppedVersion, cursors.back()->popped());
cursors.pop_back();
epochEnds.pop_back();
}
cursors.back()->advanceTo(n);
}
Future<Void> ILogSystem::MultiCursor::getMore(int taskID) {
Future<Void> ILogSystem::MultiCursor::getMore(TaskPriority taskID) {
LogMessageVersion startVersion = cursors.back()->version();
while( cursors.size() > 1 && cursors.back()->version() >= epochEnds.back() ) {
poppedVersion = std::max(poppedVersion, cursors.back()->popped());
if(needsPopped) poppedVersion = std::max(poppedVersion, cursors.back()->popped());
cursors.pop_back();
epochEnds.pop_back();
}
@ -896,10 +896,11 @@ Version ILogSystem::MultiCursor::getMinKnownCommittedVersion() {
}
Version ILogSystem::MultiCursor::popped() {
ASSERT(needsPopped);
return std::max(poppedVersion, cursors.back()->popped());
}
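needsPopped makes the MultiCursor's popped-version bookkeeping opt-in. A hedged summary of the contract the new flag enforces:

	// Hedged contract: cursors built with needsPopped=false (the txs recovery
	// path) skip the poppedVersion accounting as children retire, and the
	// ASSERT above turns any later popped() call into a loud failure instead
	// of a silently wrong zero.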
ILogSystem::BufferedCursor::BufferedCursor( std::vector<Reference<IPeekCursor>> cursors, Version begin, Version end, bool collectTags ) : cursors(cursors), messageVersion(begin), end(end), collectTags(collectTags), hasNextMessage(false), messageIndex(0) {
ILogSystem::BufferedCursor::BufferedCursor( std::vector<Reference<IPeekCursor>> cursors, Version begin, Version end, bool withTags, bool collectTags ) : cursors(cursors), messageVersion(begin), end(end), withTags(withTags), collectTags(collectTags), hasNextMessage(false), messageIndex(0) {
messages.reserve(10000);
}
@ -962,15 +963,17 @@ void ILogSystem::BufferedCursor::nextMessage() {
}
StringRef ILogSystem::BufferedCursor::getMessage() {
ASSERT(false);
return StringRef();
ASSERT(!withTags);
return messages[messageIndex].message;
}
StringRef ILogSystem::BufferedCursor::getMessageWithTags() {
ASSERT(withTags);
return messages[messageIndex].message;
}
const std::vector<Tag>& ILogSystem::BufferedCursor::getTags() {
ASSERT(withTags);
return messages[messageIndex].tags;
}
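getMessage() changes from unconditionally asserting to being the fast path for tag-free cursors. A hedged summary of the accessor contract:

	// Hedged contract: with withTags=false, messages are buffered without
	// their tags, getMessage() is the only valid accessor, and
	// getMessageWithTags()/getTags() assert; with withTags=true the
	// relationship inverts.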
@ -978,14 +981,14 @@ void ILogSystem::BufferedCursor::advanceTo(LogMessageVersion n) {
ASSERT(false);
}
ACTOR Future<Void> bufferedGetMoreLoader( ILogSystem::BufferedCursor* self, Reference<ILogSystem::IPeekCursor> cursor, Version maxVersion, int taskID ) {
ACTOR Future<Void> bufferedGetMoreLoader( ILogSystem::BufferedCursor* self, Reference<ILogSystem::IPeekCursor> cursor, Version maxVersion, TaskPriority taskID ) {
loop {
wait(yield());
if(cursor->version().version >= maxVersion) {
return Void();
}
while(cursor->hasMessage()) {
self->messages.push_back(ILogSystem::BufferedCursor::BufferedMessage(cursor->arena(), self->collectTags ? cursor->getMessage() : cursor->getMessageWithTags(), cursor->getTags(), cursor->version()));
self->messages.push_back(ILogSystem::BufferedCursor::BufferedMessage(cursor->arena(), (!self->withTags || self->collectTags) ? cursor->getMessage() : cursor->getMessageWithTags(), !self->withTags ? std::vector<Tag>() : cursor->getTags(), cursor->version()));
cursor->nextMessage();
if(cursor->version().version >= maxVersion) {
return Void();
@ -995,7 +998,7 @@ ACTOR Future<Void> bufferedGetMoreLoader( ILogSystem::BufferedCursor* self, Refe
}
}
ACTOR Future<Void> bufferedGetMore( ILogSystem::BufferedCursor* self, int taskID ) {
ACTOR Future<Void> bufferedGetMore( ILogSystem::BufferedCursor* self, TaskPriority taskID ) {
if( self->messageVersion.version >= self->end ) {
wait( Future<Void>(Never()));
throw internal_error();
@ -1019,7 +1022,11 @@ ACTOR Future<Void> bufferedGetMore( ILogSystem::BufferedCursor* self, int taskID
}
self->messageIndex = 0;
self->hasNextMessage = self->messages.size() > 0;
self->messageVersion = LogMessageVersion(targetVersion);
Version minVersion = self->end;
for(auto& cursor : self->cursors) {
minVersion = std::min(minVersion, cursor->version().version);
}
self->messageVersion = LogMessageVersion(minVersion);
if(self->collectTags) {
self->combineMessages();
@ -1029,7 +1036,7 @@ ACTOR Future<Void> bufferedGetMore( ILogSystem::BufferedCursor* self, int taskID
return Void();
}
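This is the substantive fix in the buffered-cursor loop: instead of declaring the merged position to be the requested targetVersion, it is now derived from where the child cursors actually ended up, so the buffered cursor can never claim to have covered versions a child has not delivered. A standalone illustration, with std types standing in for the cursor objects:

	#include <algorithm>
	#include <vector>

	// The merged cursor may only advance to the minimum version its children
	// have reached; 'end' caps the result exactly as self->end does above.
	long safeAdvance(const std::vector<long>& childVersions, long end) {
		long minVersion = end;
		for (long v : childVersions)
			minVersion = std::min(minVersion, v);
		return minVersion;
	}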
Future<Void> ILogSystem::BufferedCursor::getMore(int taskID) {
Future<Void> ILogSystem::BufferedCursor::getMore(TaskPriority taskID) {
if( hasMessage() )
return Void();
return bufferedGetMore(this, taskID);

View File

@ -50,7 +50,7 @@ struct MasterInterface {
}
void initEndpoints() {
getCommitVersion.getEndpoint( TaskProxyGetConsistentReadVersion );
getCommitVersion.getEndpoint( TaskPriority::ProxyGetConsistentReadVersion );
}
};

View File

@ -95,11 +95,11 @@ ACTOR Future<Void> getRate(UID myID, Reference<AsyncVar<ServerDBInfo>> db, int64
loop choose {
when ( wait( db->onChange() ) ) {
if ( db->get().ratekeeper.present() ) {
TraceEvent("Proxy_RatekeeperChanged", myID)
TraceEvent("ProxyRatekeeperChanged", myID)
.detail("RKID", db->get().ratekeeper.get().id());
nextRequestTimer = Void(); // trigger GetRate request
} else {
TraceEvent("Proxy_RatekeeperDied", myID);
TraceEvent("ProxyRatekeeperDied", myID);
nextRequestTimer = Never();
reply = Never();
}
@ -158,7 +158,7 @@ ACTOR Future<Void> queueTransactionStartRequests(
if (now() - *lastGRVTime > *GRVBatchTime)
*lastGRVTime = now() - *GRVBatchTime;
forwardPromise(GRVTimer, delayJittered(*GRVBatchTime - (now() - *lastGRVTime), TaskProxyGRVTimer));
forwardPromise(GRVTimer, delayJittered(*GRVBatchTime - (now() - *lastGRVTime), TaskPriority::ProxyGRVTimer));
}
transactionQueue->push(std::make_pair(req, counter--));
@ -263,7 +263,7 @@ struct ProxyCommitData {
lastVersionTime(0), commitVersionRequestNumber(1), mostRecentProcessedRequestNumber(0),
getConsistentReadVersion(getConsistentReadVersion), commit(commit), lastCoalesceTime(0),
localCommitBatchesStarted(0), locked(false), commitBatchInterval(SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_INTERVAL_MIN),
firstProxy(firstProxy), cx(openDBOnServer(db, TaskDefaultEndpoint, true, true)), db(db),
firstProxy(firstProxy), cx(openDBOnServer(db, TaskPriority::DefaultEndpoint, true, true)), db(db),
singleKeyMutationEvent(LiteralStringRef("SingleKeyMutation")), commitBatchesMemBytesCount(0), lastTxsPop(0)
{}
};
@ -350,7 +350,7 @@ struct ResolutionRequestBuilder {
};
ACTOR Future<Void> commitBatcher(ProxyCommitData *commitData, PromiseStream<std::pair<std::vector<CommitTransactionRequest>, int> > out, FutureStream<CommitTransactionRequest> in, int desiredBytes, int64_t memBytesLimit) {
wait(delayJittered(commitData->commitBatchInterval, TaskProxyCommitBatcher));
wait(delayJittered(commitData->commitBatchInterval, TaskPriority::ProxyCommitBatcher));
state double lastBatch = 0;
@ -363,7 +363,7 @@ ACTOR Future<Void> commitBatcher(ProxyCommitData *commitData, PromiseStream<std:
timeout = Never();
}
else {
timeout = delayJittered(SERVER_KNOBS->MAX_COMMIT_BATCH_INTERVAL, TaskProxyCommitBatcher);
timeout = delayJittered(SERVER_KNOBS->MAX_COMMIT_BATCH_INTERVAL, TaskPriority::ProxyCommitBatcher);
}
while(!timeout.isReady() && !(batch.size() == SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_COUNT_MAX || batchBytes >= desiredBytes)) {
@ -387,10 +387,10 @@ ACTOR Future<Void> commitBatcher(ProxyCommitData *commitData, PromiseStream<std:
if(!batch.size()) {
commitData->commitBatchStartNotifications.send(Void());
if(now() - lastBatch > commitData->commitBatchInterval) {
timeout = delayJittered(SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_INTERVAL_FROM_IDLE, TaskProxyCommitBatcher);
timeout = delayJittered(SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_INTERVAL_FROM_IDLE, TaskPriority::ProxyCommitBatcher);
}
else {
timeout = delayJittered(commitData->commitBatchInterval - (now() - lastBatch), TaskProxyCommitBatcher);
timeout = delayJittered(commitData->commitBatchInterval - (now() - lastBatch), TaskPriority::ProxyCommitBatcher);
}
}
@ -398,7 +398,7 @@ ACTOR Future<Void> commitBatcher(ProxyCommitData *commitData, PromiseStream<std:
out.send({ batch, batchBytes });
lastBatch = now();
commitData->commitBatchStartNotifications.send(Void());
timeout = delayJittered(commitData->commitBatchInterval, TaskProxyCommitBatcher);
timeout = delayJittered(commitData->commitBatchInterval, TaskPriority::ProxyCommitBatcher);
batch = std::vector<CommitTransactionRequest>();
batchBytes = 0;
}
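The batcher's several delayJittered() calls encode one timer policy. A standalone restatement (function name is mine; the interval values come from SERVER_KNOBS):

	// After shipping a batch the timer restarts at the full interval. An empty
	// pass restarts it at the shorter from-idle interval if the proxy has been
	// idle longer than one interval, otherwise at whatever remains of it.
	double nextBatchTimeout(double now, double lastBatch, double interval, double fromIdle) {
		if (now - lastBatch > interval)
			return fromIdle;
		return interval - (now - lastBatch);
	}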
@ -457,7 +457,7 @@ ACTOR Future<Void> commitBatch(
ASSERT(SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS <= SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT); // since we are using just the former to limit the number of versions actually in flight!
// Active load balancing runs at a very high priority (to obtain an accurate estimate of the memory used by commit batches), so we need to downgrade here
wait(delay(0, TaskProxyCommit));
wait(delay(0, TaskPriority::ProxyCommit));
self->lastVersionTime = t1;
@ -534,7 +534,7 @@ ACTOR Future<Void> commitBatch(
vector< Future<ResolveTransactionBatchReply> > replies;
for (int r = 0; r<self->resolvers.size(); r++) {
requests.requests[r].debugID = debugID;
replies.push_back(brokenPromiseToNever(self->resolvers[r].resolve.getReply(requests.requests[r], TaskProxyResolverReply)));
replies.push_back(brokenPromiseToNever(self->resolvers[r].resolve.getReply(requests.requests[r], TaskPriority::ProxyResolverReply)));
}
state vector<vector<int>> transactionResolverMap = std::move( requests.transactionResolverMap );
@ -965,7 +965,7 @@ ACTOR Future<Void> commitBatch(
break;
}
when(GetReadVersionReply v = wait(self->getConsistentReadVersion.getReply(GetReadVersionRequest(0, GetReadVersionRequest::PRIORITY_SYSTEM_IMMEDIATE | GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY)))) {
if(v.version > self->committedVersion.get()) {
if(!v.newClientInfo.present() && v.version > self->committedVersion.get()) {
self->locked = v.locked;
self->metadataVersion = v.metadataVersion;
self->committedVersion.set(v.version);
@ -986,7 +986,7 @@ ACTOR Future<Void> commitBatch(
bool firstMessage = true;
for(auto m : msg.messages) {
if(firstMessage) {
toCommit.addTag(txsTag);
toCommit.addTxsTag();
}
toCommit.addMessage(StringRef(m.begin(), m.size()), !firstMessage);
firstMessage = false;
@ -1033,7 +1033,7 @@ ACTOR Future<Void> commitBatch(
self->txsPopVersions.emplace_back(commitVersion, msg.popTo);
}
self->logSystem->pop(msg.popTo, txsTag);
self->logSystem->popTxs(msg.popTo);
/////// Phase 5: Replies (CPU bound; no particular order required, though ordered execution would be best for latency)
if ( prevVersion && commitVersion - prevVersion < SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT/2 )
@ -1135,7 +1135,7 @@ ACTOR Future<GetReadVersionReply> getLiveCommittedVersion(ProxyCommitData* commi
state vector<Future<GetReadVersionReply>> proxyVersions;
for (auto const& p : *otherProxies)
proxyVersions.push_back(brokenPromiseToNever(p.getRawCommittedVersion.getReply(GetRawCommittedVersionRequest(debugID), TaskTLogConfirmRunningReply)));
proxyVersions.push_back(brokenPromiseToNever(p.getRawCommittedVersion.getReply(GetRawCommittedVersionRequest(debugID), TaskPriority::TLogConfirmRunningReply)));
if (!(flags&GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY))
{
@ -1292,7 +1292,7 @@ ACTOR static Future<Void> transactionStarter(
}
if (!transactionQueue.empty())
forwardPromise(GRVTimer, delayJittered(SERVER_KNOBS->START_TRANSACTION_BATCH_QUEUE_CHECK_INTERVAL, TaskProxyGRVTimer));
forwardPromise(GRVTimer, delayJittered(SERVER_KNOBS->START_TRANSACTION_BATCH_QUEUE_CHECK_INTERVAL, TaskPriority::ProxyGRVTimer));
/*TraceEvent("GRVBatch", proxy.id())
.detail("Elapsed", elapsed)
@ -1505,7 +1505,7 @@ ACTOR Future<Void> monitorRemoteCommitted(ProxyCommitData* self) {
while(self->txsPopVersions.size() && self->txsPopVersions.front().first <= minVersion) {
self->lastTxsPop = self->txsPopVersions.front().second;
self->logSystem->pop(self->txsPopVersions.front().second, txsTag, 0, tagLocalityRemoteLog);
self->logSystem->popTxs(self->txsPopVersions.front().second, tagLocalityRemoteLog);
self->txsPopVersions.pop_front();
}
@ -1563,7 +1563,7 @@ ACTOR Future<Void> masterProxyServerCore(
r->value().emplace_back(0,0);
commitData.logSystem = ILogSystem::fromServerDBInfo(proxy.id(), commitData.db->get(), false, addActor);
commitData.logAdapter = new LogSystemDiskQueueAdapter(commitData.logSystem, txsTag, Reference<AsyncVar<PeekSpecialInfo>>(), false);
commitData.logAdapter = new LogSystemDiskQueueAdapter(commitData.logSystem, Reference<AsyncVar<PeekTxsInfo>>(), false);
commitData.txnStateStore = keyValueStoreLogSystem(commitData.logAdapter, proxy.id(), 2e9, true, true, true);
createWhitelistBinPathVec(whitelistBinPaths, commitData.whitelistedBinPathVec);
@ -1595,7 +1595,7 @@ ACTOR Future<Void> masterProxyServerCore(
for(auto it : commitData.tag_popped) {
commitData.logSystem->pop(it.second, it.first);
}
commitData.logSystem->pop(commitData.lastTxsPop, txsTag, 0, tagLocalityRemoteLog);
commitData.logSystem->popTxs(commitData.lastTxsPop, tagLocalityRemoteLog);
}
Optional<LatencyBandConfig> newLatencyBandConfig = commitData.db->get().latencyBandConfig;
@ -1782,33 +1782,85 @@ ACTOR Future<Void> masterProxyServerCore(
ACTOR Future<Void> checkRemoved(Reference<AsyncVar<ServerDBInfo>> db, uint64_t recoveryCount, MasterProxyInterface myInterface) {
loop{
if (db->get().recoveryCount >= recoveryCount && !std::count(db->get().client.proxies.begin(), db->get().client.proxies.end(), myInterface))
throw worker_removed();
if (db->get().recoveryCount >= recoveryCount && !std::count(db->get().client.proxies.begin(), db->get().client.proxies.end(), myInterface)) {
throw worker_removed();
}
wait(db->onChange());
}
}
ACTOR template <class X> Future<Void> stripRequests( RequestStream<X> in, PromiseStream<ReplyPromise<REPLY_TYPE(X)>> out, int* count) {
loop {
X req = waitNext(in.getFuture());
out.send(req.reply);
if((*count) >= 0 && ++(*count) >= SERVER_KNOBS->MAX_FORWARD_MESSAGES) {
TraceEvent(SevWarnAlways, "TooManyProxyForwardRequests");
return Void();
}
}
}
ACTOR Future<Void> forwardProxy(ClientDBInfo info, PromiseStream<ReplyPromise<CommitID>> commitReplies, PromiseStream<ReplyPromise<GetReadVersionReply>> grvReplies, PromiseStream<ReplyPromise<GetKeyServerLocationsReply>> locationReplies) {
loop {
choose {
when(ReplyPromise<CommitID> req = waitNext(commitReplies.getFuture())) {
CommitID rep;
rep.newClientInfo = info;
req.send(rep);
}
when(ReplyPromise<GetReadVersionReply> req = waitNext(grvReplies.getFuture())) {
GetReadVersionReply rep;
rep.newClientInfo = info;
req.send(rep);
}
when(ReplyPromise<GetKeyServerLocationsReply> req = waitNext(locationReplies.getFuture())) {
GetKeyServerLocationsReply rep;
rep.newClientInfo = info;
req.send(rep);
}
}
wait(yield());
}
}
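forwardProxy() answers every stripped request with an otherwise-empty reply whose newClientInfo field carries the fresh ClientDBInfo. The client half of the handshake is not in this file; a hedged sketch of the assumed reaction:

	// Assumed client handling, for illustration only: a reply carrying
	// newClientInfo means "this proxy is gone", so adopt the new proxy list
	// and retry instead of waiting out a timeout.
	if (reply.newClientInfo.present()) {
		clientInfo->set(reply.newClientInfo.get());
		continue; // re-issue the request against the new proxies
	}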
ACTOR Future<Void> masterProxyServer(
MasterProxyInterface proxy,
InitializeMasterProxyRequest req,
Reference<AsyncVar<ServerDBInfo>> db,
std::string whitelistBinPaths)
{
state Future<Void> core;
try {
state Future<Void> core = masterProxyServerCore(proxy, req.master, db, req.recoveryCount, req.recoveryTransactionVersion, req.firstProxy, whitelistBinPaths);
loop choose{
when(wait(core)) { return Void(); }
when(wait(checkRemoved(db, req.recoveryCount, proxy))) {}
}
core = masterProxyServerCore(proxy, req.master, db, req.recoveryCount, req.recoveryTransactionVersion, req.firstProxy, whitelistBinPaths);
wait(core || checkRemoved(db, req.recoveryCount, proxy));
}
catch (Error& e) {
if (e.code() == error_code_actor_cancelled || e.code() == error_code_worker_removed || e.code() == error_code_tlog_stopped ||
e.code() == error_code_master_tlog_failed || e.code() == error_code_coordinators_changed || e.code() == error_code_coordinated_state_conflict ||
e.code() == error_code_new_coordinators_timed_out)
{
TraceEvent("MasterProxyTerminated", proxy.id()).error(e, true);
TraceEvent("MasterProxyTerminated", proxy.id()).error(e, true);
if (e.code() != error_code_worker_removed && e.code() != error_code_tlog_stopped &&
e.code() != error_code_master_tlog_failed && e.code() != error_code_coordinators_changed &&
e.code() != error_code_coordinated_state_conflict && e.code() != error_code_new_coordinators_timed_out) {
throw;
}
}
core.cancel();
state PromiseStream<ReplyPromise<CommitID>> commitReplies;
state PromiseStream<ReplyPromise<GetReadVersionReply>> grvReplies;
state PromiseStream<ReplyPromise<GetKeyServerLocationsReply>> locationReplies;
state int replyCount = 0;
state Future<Void> finishForward = delay(SERVER_KNOBS->PROXY_FORWARD_DELAY) || stripRequests(proxy.commit, commitReplies, &replyCount) || stripRequests(proxy.getConsistentReadVersion, grvReplies, &replyCount) || stripRequests(proxy.getKeyServersLocations, locationReplies, &replyCount);
proxy = MasterProxyInterface();
loop {
if(finishForward.isReady()) {
return Void();
}
throw;
if(db->get().client.proxies.size() > 0 && !db->get().client.proxies[0].provisional && db->get().recoveryCount >= req.recoveryCount
&& !std::count(db->get().client.proxies.begin(), db->get().client.proxies.end(), proxy)) {
replyCount = -1;
core = forwardProxy(db->get().client, commitReplies, grvReplies, locationReplies);
wait(finishForward);
return Void();
}
wait(db->onChange() || finishForward);
}
}
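The rewritten error path keeps a removed proxy briefly useful instead of letting queued client requests time out. A hedged summary of the sequence above:

	// 1. core fails with one of the "normal" termination errors; cancel it.
	// 2. stripRequests() keeps draining commit/GRV/location requests, up to
	//    MAX_FORWARD_MESSAGES of them, for at most PROXY_FORWARD_DELAY seconds.
	// 3. once ClientDBInfo shows a real (non-provisional) successor set that
	//    no longer contains this proxy, forwardProxy() answers the drained
	//    requests with newClientInfo.
	// 4. finishForward fires and the actor returns cleanly.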

View File

@ -130,12 +130,12 @@ ACTOR Future<vector<UID>> addReadWriteDestinations(KeyRangeRef shard, vector<Sto
state vector< Future<Optional<UID>> > srcChecks;
for(int s=0; s<srcInterfs.size(); s++) {
srcChecks.push_back( checkReadWrite( srcInterfs[s].getShardState.getReplyUnlessFailedFor( GetShardStateRequest( shard, GetShardStateRequest::NO_WAIT), SERVER_KNOBS->SERVER_READY_QUORUM_INTERVAL, 0, TaskMoveKeys ), srcInterfs[s].id(), 0 ) );
srcChecks.push_back( checkReadWrite( srcInterfs[s].getShardState.getReplyUnlessFailedFor( GetShardStateRequest( shard, GetShardStateRequest::NO_WAIT), SERVER_KNOBS->SERVER_READY_QUORUM_INTERVAL, 0, TaskPriority::MoveKeys ), srcInterfs[s].id(), 0 ) );
}
state vector< Future<Optional<UID>> > destChecks;
for(int s=0; s<destInterfs.size(); s++) {
destChecks.push_back( checkReadWrite( destInterfs[s].getShardState.getReplyUnlessFailedFor( GetShardStateRequest( shard, GetShardStateRequest::NO_WAIT), SERVER_KNOBS->SERVER_READY_QUORUM_INTERVAL, 0, TaskMoveKeys ), destInterfs[s].id(), version ) );
destChecks.push_back( checkReadWrite( destInterfs[s].getShardState.getReplyUnlessFailedFor( GetShardStateRequest( shard, GetShardStateRequest::NO_WAIT), SERVER_KNOBS->SERVER_READY_QUORUM_INTERVAL, 0, TaskPriority::MoveKeys ), destInterfs[s].id(), version ) );
}
wait( waitForAll(srcChecks) && waitForAll(destChecks) );
@ -225,7 +225,7 @@ ACTOR Future<Void> startMoveKeys( Database occ, KeyRange keys, vector<UID> serve
state TraceInterval interval("RelocateShard_StartMoveKeys");
//state TraceInterval waitInterval("");
wait( startMoveKeysLock->take( TaskDataDistributionLaunch ) );
wait( startMoveKeysLock->take( TaskPriority::DataDistributionLaunch ) );
state FlowLock::Releaser releaser( *startMoveKeysLock );
TraceEvent(SevDebug, interval.begin(), relocationIntervalId);
@ -255,7 +255,7 @@ ACTOR Future<Void> startMoveKeys( Database occ, KeyRange keys, vector<UID> serve
//Keep track of shards for all src servers so that we can preserve their values in serverKeys
state Map<UID, VectorRef<KeyRangeRef>> shardMap;
tr.info.taskID = TaskMoveKeys;
tr.info.taskID = TaskPriority::MoveKeys;
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
wait( checkMoveKeysLock(&tr, lock) );
@ -394,11 +394,11 @@ ACTOR Future<Void> startMoveKeys( Database occ, KeyRange keys, vector<UID> serve
ACTOR Future<Void> waitForShardReady( StorageServerInterface server, KeyRange keys, Version minVersion, GetShardStateRequest::waitMode mode ) {
loop {
try {
std::pair<Version,Version> rep = wait( server.getShardState.getReply( GetShardStateRequest(keys, mode), TaskMoveKeys ) );
std::pair<Version,Version> rep = wait( server.getShardState.getReply( GetShardStateRequest(keys, mode), TaskPriority::MoveKeys ) );
if (rep.first >= minVersion) {
return Void();
}
wait( delayJittered( SERVER_KNOBS->SHARD_READY_DELAY, TaskMoveKeys ) );
wait( delayJittered( SERVER_KNOBS->SHARD_READY_DELAY, TaskPriority::MoveKeys ) );
}
catch (Error& e) {
if( e.code() != error_code_timed_out ) {
@ -419,7 +419,7 @@ ACTOR Future<Void> checkFetchingState( Database cx, vector<UID> dest, KeyRange k
try {
if (BUGGIFY) wait(delay(5));
tr.info.taskID = TaskMoveKeys;
tr.info.taskID = TaskPriority::MoveKeys;
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
vector< Future< Optional<Value> > > serverListEntries;
@ -439,7 +439,7 @@ ACTOR Future<Void> checkFetchingState( Database cx, vector<UID> dest, KeyRange k
}
wait( timeoutError( waitForAll( requests ),
SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT, TaskMoveKeys ) );
SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT, TaskPriority::MoveKeys ) );
dataMovementComplete.send(Void());
return Void();
@ -480,11 +480,11 @@ ACTOR Future<Void> finishMoveKeys( Database occ, KeyRange keys, vector<UID> dest
//printf("finishMoveKeys( '%s'-'%s' )\n", keys.begin.toString().c_str(), keys.end.toString().c_str());
loop {
try {
tr.info.taskID = TaskMoveKeys;
tr.info.taskID = TaskPriority::MoveKeys;
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
releaser.release();
wait( finishMoveKeysParallelismLock->take( TaskDataDistributionLaunch ) );
wait( finishMoveKeysParallelismLock->take( TaskPriority::DataDistributionLaunch ) );
releaser = FlowLock::Releaser( *finishMoveKeysParallelismLock );
wait( checkMoveKeysLock(&tr, lock) );
@ -632,7 +632,7 @@ ACTOR Future<Void> finishMoveKeys( Database occ, KeyRange keys, vector<UID> dest
for(int s=0; s<storageServerInterfaces.size(); s++)
serverReady.push_back( waitForShardReady( storageServerInterfaces[s], keys, tr.getReadVersion().get(), GetShardStateRequest::READABLE) );
wait( timeout( waitForAll( serverReady ), SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT, Void(), TaskMoveKeys ) );
wait( timeout( waitForAll( serverReady ), SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT, Void(), TaskPriority::MoveKeys ) );
int count = dest.size() - newDestinations.size();
for(int s=0; s<serverReady.size(); s++)
count += serverReady[s].isReady() && !serverReady[s].isError();
@ -808,7 +808,7 @@ ACTOR Future<Void> removeStorageServer( Database cx, UID serverID, MoveKeysLock
if (!canRemove) {
TEST(true); // The caller had a transaction in flight that assigned keys to the server. Wait for it to reverse its mistake.
TraceEvent(SevWarn,"NoCanRemove").detail("Count", noCanRemoveCount++).detail("ServerID", serverID);
wait( delayJittered(SERVER_KNOBS->REMOVE_RETRY_DELAY, TaskDataDistributionLaunch) );
wait( delayJittered(SERVER_KNOBS->REMOVE_RETRY_DELAY, TaskPriority::DataDistributionLaunch) );
tr.reset();
TraceEvent("RemoveStorageServerRetrying").detail("CanRemove", canRemove);
} else {

View File

@ -48,7 +48,7 @@ namespace oldTLog_4_6 {
typedef int16_t OldTag;
OldTag convertTag( Tag tag ) {
if(tag == invalidTag) return invalidTagOld;
if(tag == invalidTag || tag.locality == tagLocalityTxs) return invalidTagOld;
if(tag == txsTag) return txsTagOld;
ASSERT(tag.id >= 0);
return tag.id;
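convertTag() is the 4.6-compatibility shim, and the new clause keeps the new tag space invisible to it. A hedged summary of the mapping:

	// invalidTag                 -> invalidTagOld
	// locality == tagLocalityTxs -> invalidTagOld (new: 4.6 has no txs locality)
	// txsTag (legacy singleton)  -> txsTagOld
	// any ordinary tag           -> tag.id (plain int16)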
@ -333,7 +333,7 @@ namespace oldTLog_4_6 {
}
// Erase messages not needed to update *from* versions >= before (thus, messages with to-version <= before)
ACTOR Future<Void> eraseMessagesBefore( TagData *self, Version before, int64_t* gBytesErased, Reference<LogData> tlogData, int taskID ) {
ACTOR Future<Void> eraseMessagesBefore( TagData *self, Version before, int64_t* gBytesErased, Reference<LogData> tlogData, TaskPriority taskID ) {
while(!self->version_messages.empty() && self->version_messages.front().first < before) {
Version version = self->version_messages.front().first;
std::pair<int, int> &sizes = tlogData->version_sizes[version];
@ -359,7 +359,7 @@ namespace oldTLog_4_6 {
return Void();
}
Future<Void> eraseMessagesBefore(Version before, int64_t* gBytesErased, Reference<LogData> tlogData, int taskID) {
Future<Void> eraseMessagesBefore(Version before, int64_t* gBytesErased, Reference<LogData> tlogData, TaskPriority taskID) {
return eraseMessagesBefore(this, before, gBytesErased, tlogData, taskID);
}
};
@ -526,21 +526,21 @@ namespace oldTLog_4_6 {
self->persistentData->set( KeyValueRef( persistTagMessagesKey( logData->logId, tag->key, currentVersion ), wr.toValue() ) );
Future<Void> f = yield(TaskUpdateStorage);
Future<Void> f = yield(TaskPriority::UpdateStorage);
if(!f.isReady()) {
wait(f);
msg = std::upper_bound(tag->value.version_messages.begin(), tag->value.version_messages.end(), std::make_pair(currentVersion, LengthPrefixedStringRef()), CompareFirst<std::pair<Version, LengthPrefixedStringRef>>());
}
}
wait(yield(TaskUpdateStorage));
wait(yield(TaskPriority::UpdateStorage));
}
self->persistentData->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistCurrentVersionKeys.begin), BinaryWriter::toValue(newPersistentDataVersion, Unversioned()) ) );
logData->persistentDataVersion = newPersistentDataVersion;
wait( self->persistentData->commit() ); // SOMEDAY: This seems to be running pretty often, should we slow it down???
wait( delay(0, TaskUpdateStorage) );
wait( delay(0, TaskPriority::UpdateStorage) );
// Now that the changes we made to persistentData are durable, erase the data we moved from memory and the queue, increase bytesDurable accordingly, and update persistentDataDurableVersion.
@ -548,20 +548,20 @@ namespace oldTLog_4_6 {
logData->persistentDataDurableVersion = newPersistentDataVersion;
for(tag = logData->tag_data.begin(); tag != logData->tag_data.end(); ++tag) {
wait(tag->value.eraseMessagesBefore( newPersistentDataVersion+1, &self->bytesDurable, logData, TaskUpdateStorage ));
wait(yield(TaskUpdateStorage));
wait(tag->value.eraseMessagesBefore( newPersistentDataVersion+1, &self->bytesDurable, logData, TaskPriority::UpdateStorage ));
wait(yield(TaskPriority::UpdateStorage));
}
logData->version_sizes.erase(logData->version_sizes.begin(), logData->version_sizes.lower_bound(logData->persistentDataDurableVersion));
wait(yield(TaskUpdateStorage));
wait(yield(TaskPriority::UpdateStorage));
while(!logData->messageBlocks.empty() && logData->messageBlocks.front().first <= newPersistentDataVersion) {
int64_t bytesErased = int64_t(logData->messageBlocks.front().second.size()) * SERVER_KNOBS->TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR;
logData->bytesDurable += bytesErased;
self->bytesDurable += bytesErased;
logData->messageBlocks.pop_front();
wait(yield(TaskUpdateStorage));
wait(yield(TaskPriority::UpdateStorage));
}
if(logData->bytesDurable.getValue() > logData->bytesInput.getValue() || self->bytesDurable > self->bytesInput) {
@ -586,7 +586,7 @@ namespace oldTLog_4_6 {
}
if(!self->queueOrder.size()) {
wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) );
wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) );
return Void();
}
@ -621,14 +621,14 @@ namespace oldTLog_4_6 {
}
wait( logData->queueCommittedVersion.whenAtLeast( nextVersion ) );
wait( delay(0, TaskUpdateStorage) );
wait( delay(0, TaskPriority::UpdateStorage) );
//TraceEvent("TlogUpdatePersist", self->dbgid).detail("LogId", logData->logId).detail("NextVersion", nextVersion).detail("Version", logData->version.get()).detail("PersistentDataDurableVer", logData->persistentDataDurableVersion).detail("QueueCommitVer", logData->queueCommittedVersion.get()).detail("PersistDataVer", logData->persistentDataVersion);
if (nextVersion > logData->persistentDataVersion) {
self->updatePersist = updatePersistentData(self, logData, nextVersion);
wait( self->updatePersist );
} else {
wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) );
wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) );
}
if( logData->removed.isReady() ) {
@ -639,9 +639,9 @@ namespace oldTLog_4_6 {
if(logData->persistentDataDurableVersion == logData->version.get()) {
self->queueOrder.pop_front();
}
wait( delay(0.0, TaskUpdateStorage) );
wait( delay(0.0, TaskPriority::UpdateStorage) );
} else {
wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) );
wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) );
}
}
else if(logData->initialized) {
@ -650,7 +650,7 @@ namespace oldTLog_4_6 {
while( totalSize < SERVER_KNOBS->UPDATE_STORAGE_BYTE_LIMIT && sizeItr != logData->version_sizes.end()
&& (logData->bytesInput.getValue() - logData->bytesDurable.getValue() - totalSize >= SERVER_KNOBS->TLOG_SPILL_THRESHOLD || sizeItr->value.first == 0) )
{
wait( yield(TaskUpdateStorage) );
wait( yield(TaskPriority::UpdateStorage) );
++sizeItr;
nextVersion = sizeItr == logData->version_sizes.end() ? logData->version.get() : sizeItr->key;
@ -662,7 +662,7 @@ namespace oldTLog_4_6 {
totalSize += it->second.expectedSize();
}
wait(yield(TaskUpdateStorage));
wait(yield(TaskPriority::UpdateStorage));
}
prevVersion = nextVersion;
@ -673,7 +673,7 @@ namespace oldTLog_4_6 {
//TraceEvent("UpdateStorageVer", logData->logId).detail("NextVersion", nextVersion).detail("PersistentDataVersion", logData->persistentDataVersion).detail("TotalSize", totalSize);
wait( logData->queueCommittedVersion.whenAtLeast( nextVersion ) );
wait( delay(0, TaskUpdateStorage) );
wait( delay(0, TaskPriority::UpdateStorage) );
if (nextVersion > logData->persistentDataVersion) {
self->updatePersist = updatePersistentData(self, logData, nextVersion);
@ -681,21 +681,21 @@ namespace oldTLog_4_6 {
}
if( totalSize < SERVER_KNOBS->UPDATE_STORAGE_BYTE_LIMIT ) {
wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) );
wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) );
}
else {
//recovery wants to commit to persistent data when updatePersistentData is not active; this delay ensures that immediately after
//updatePersist returns, another one has not been started yet.
wait( delay(0.0, TaskUpdateStorage) );
wait( delay(0.0, TaskPriority::UpdateStorage) );
}
} else {
wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) );
wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) );
}
return Void();
}
ACTOR Future<Void> updateStorageLoop( TLogData* self ) {
wait(delay(0, TaskUpdateStorage));
wait(delay(0, TaskPriority::UpdateStorage));
loop {
wait( updateStorage(self) );
@ -823,7 +823,7 @@ namespace oldTLog_4_6 {
ti->value.popped_recently = true;
//if (to.epoch == self->epoch())
if ( req.to > logData->persistentDataDurableVersion )
wait(ti->value.eraseMessagesBefore( req.to, &self->bytesDurable, logData, TaskTLogPop ));
wait(ti->value.eraseMessagesBefore( req.to, &self->bytesDurable, logData, TaskPriority::TLogPop ));
}
req.reply.send(Void());

View File

@ -195,6 +195,7 @@ static const KeyRangeRef persistCurrentVersionKeys = KeyRangeRef( LiteralStringR
static const KeyRangeRef persistKnownCommittedVersionKeys = KeyRangeRef( LiteralStringRef( "knownCommitted/" ), LiteralStringRef( "knownCommitted0" ) );
static const KeyRangeRef persistLocalityKeys = KeyRangeRef( LiteralStringRef( "Locality/" ), LiteralStringRef( "Locality0" ) );
static const KeyRangeRef persistLogRouterTagsKeys = KeyRangeRef( LiteralStringRef( "LogRouterTags/" ), LiteralStringRef( "LogRouterTags0" ) );
static const KeyRangeRef persistTxsTagsKeys = KeyRangeRef( LiteralStringRef( "TxsTags/" ), LiteralStringRef( "TxsTags0" ) );
static const KeyRange persistTagMessagesKeys = prefixRange(LiteralStringRef("TagMsg/"));
static const KeyRange persistTagPoppedKeys = prefixRange(LiteralStringRef("TagPop/"));
@ -291,7 +292,7 @@ struct TLogData : NonCopyable {
concurrentLogRouterReads(SERVER_KNOBS->CONCURRENT_LOG_ROUTER_READS),
ignorePopRequest(false), ignorePopDeadline(), ignorePopUid(), dataFolder(folder), toBePopped()
{
cx = openDBOnServer(dbInfo, TaskDefaultEndpoint, true, true);
cx = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, true, true);
}
};
@ -317,7 +318,7 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
}
// Erase messages not needed to update *from* versions >= before (thus, messages with to-version <= before)
ACTOR Future<Void> eraseMessagesBefore( TagData *self, Version before, TLogData *tlogData, Reference<LogData> logData, int taskID ) {
ACTOR Future<Void> eraseMessagesBefore( TagData *self, Version before, TLogData *tlogData, Reference<LogData> logData, TaskPriority taskID ) {
while(!self->versionMessages.empty() && self->versionMessages.front().first < before) {
Version version = self->versionMessages.front().first;
std::pair<int,int> &sizes = logData->version_sizes[version];
@ -327,7 +328,7 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
auto const& m = self->versionMessages.front();
++messagesErased;
if(self->tag != txsTag) {
if(self->tag.locality != tagLocalityTxs && self->tag != txsTag) {
sizes.first -= m.second.expectedSize();
} else {
sizes.second -= m.second.expectedSize();
@ -346,7 +347,7 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
return Void();
}
Future<Void> eraseMessagesBefore(Version before, TLogData *tlogData, Reference<LogData> logData, int taskID) {
Future<Void> eraseMessagesBefore(Version before, TLogData *tlogData, Reference<LogData> logData, TaskPriority taskID) {
return eraseMessagesBefore(this, before, tlogData, logData, taskID);
}
};
@ -378,7 +379,7 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
Version knownCommittedVersion, durableKnownCommittedVersion, minKnownCommittedVersion;
struct PeekTrackerData {
std::map<int, Promise<Version>> sequence_version;
std::map<int, Promise<std::pair<Version, bool>>> sequence_version;
double lastUpdate;
};
@ -401,7 +402,7 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
//only callable after getTagData returns a null reference
Reference<TagData> createTagData(Tag tag, Version popped, bool nothingPersistent, bool poppedRecently, bool unpoppedRecovered) {
if(tag.locality != tagLocalityLogRouter && allTags.size() && !allTags.count(tag) && popped <= recoveredAt) {
if(tag.locality != tagLocalityLogRouter && tag.locality != tagLocalityTxs && allTags.size() && !allTags.count(tag) && popped <= recoveredAt) {
popped = recoveredAt + 1;
}
Reference<TagData> newTagData = Reference<TagData>( new TagData(tag, popped, nothingPersistent, poppedRecently, unpoppedRecovered) );
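The extra condition exempts txs-locality tags, like log-router tags, from the popped floor of recoveredAt + 1. A hedged restatement as a predicate (name is mine):

	// Tags that must stay peekable across recovery itself cannot have their
	// popped version bumped past the recovery point.
	bool exemptFromRecoveryFloor(Tag tag) {
		return tag.locality == tagLocalityLogRouter || tag.locality == tagLocalityTxs;
	}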
@ -434,9 +435,10 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
Future<Void> terminated;
FlowLock execOpLock;
bool execOpCommitInProgress;
int txsTags;
explicit LogData(TLogData* tLogData, TLogInterface interf, Tag remoteTag, bool isPrimary, int logRouterTags, UID recruitmentID, std::vector<Tag> tags) : tLogData(tLogData), knownCommittedVersion(0), logId(interf.id()),
cc("TLog", interf.id().toString()), bytesInput("BytesInput", cc), bytesDurable("BytesDurable", cc), remoteTag(remoteTag), isPrimary(isPrimary), logRouterTags(logRouterTags), recruitmentID(recruitmentID),
explicit LogData(TLogData* tLogData, TLogInterface interf, Tag remoteTag, bool isPrimary, int logRouterTags, int txsTags, UID recruitmentID, std::vector<Tag> tags) : tLogData(tLogData), knownCommittedVersion(0), logId(interf.id()),
cc("TLog", interf.id().toString()), bytesInput("BytesInput", cc), bytesDurable("BytesDurable", cc), remoteTag(remoteTag), isPrimary(isPrimary), logRouterTags(logRouterTags), txsTags(txsTags), recruitmentID(recruitmentID),
logSystem(new AsyncVar<Reference<ILogSystem>>()), logRouterPoppedVersion(0), durableKnownCommittedVersion(0), minKnownCommittedVersion(0), allTags(tags.begin(), tags.end()), terminated(tLogData->terminated.getFuture()),
// These are initialized differently on init() or recovery
recoveryCount(), stopped(false), initialized(false), queueCommittingVersion(0), newPersistentDataVersion(invalidVersion), unrecoveredBefore(1), recoveredAt(1), unpoppedRecoveredTags(0),
@ -483,6 +485,7 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistKnownCommittedVersionKeys.begin)) );
tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistLocalityKeys.begin)) );
tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistLogRouterTagsKeys.begin)) );
tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistTxsTagsKeys.begin)) );
tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistRecoveryCountKeys.begin)) );
Key msgKey = logIdKey.withPrefix(persistTagMessagesKeys.begin);
tLogData->persistentData->clear( KeyRangeRef( msgKey, strinc(msgKey) ) );
@ -616,14 +619,14 @@ ACTOR Future<Void> updatePersistentData( TLogData* self, Reference<LogData> logD
self->persistentData->set( KeyValueRef( persistTagMessagesKey( logData->logId, tagData->tag, currentVersion ), wr.toValue() ) );
Future<Void> f = yield(TaskUpdateStorage);
Future<Void> f = yield(TaskPriority::UpdateStorage);
if(!f.isReady()) {
wait(f);
msg = std::upper_bound(tagData->versionMessages.begin(), tagData->versionMessages.end(), std::make_pair(currentVersion, LengthPrefixedStringRef()), CompareFirst<std::pair<Version, LengthPrefixedStringRef>>());
}
}
wait(yield(TaskUpdateStorage));
wait(yield(TaskPriority::UpdateStorage));
}
}
}
@ -633,7 +636,7 @@ ACTOR Future<Void> updatePersistentData( TLogData* self, Reference<LogData> logD
logData->persistentDataVersion = newPersistentDataVersion;
wait( self->persistentData->commit() ); // SOMEDAY: This seems to be running pretty often, should we slow it down???
wait( delay(0, TaskUpdateStorage) );
wait( delay(0, TaskPriority::UpdateStorage) );
// Now that the changes we made to persistentData are durable, erase the data we moved from memory and the queue, increase bytesDurable accordingly, and update persistentDataDurableVersion.
@ -643,22 +646,22 @@ ACTOR Future<Void> updatePersistentData( TLogData* self, Reference<LogData> logD
for(tagLocality = 0; tagLocality < logData->tag_data.size(); tagLocality++) {
for(tagId = 0; tagId < logData->tag_data[tagLocality].size(); tagId++) {
if(logData->tag_data[tagLocality][tagId]) {
wait(logData->tag_data[tagLocality][tagId]->eraseMessagesBefore( newPersistentDataVersion+1, self, logData, TaskUpdateStorage ));
wait(yield(TaskUpdateStorage));
wait(logData->tag_data[tagLocality][tagId]->eraseMessagesBefore( newPersistentDataVersion+1, self, logData, TaskPriority::UpdateStorage ));
wait(yield(TaskPriority::UpdateStorage));
}
}
}
logData->version_sizes.erase(logData->version_sizes.begin(), logData->version_sizes.lower_bound(logData->persistentDataDurableVersion));
wait(yield(TaskUpdateStorage));
wait(yield(TaskPriority::UpdateStorage));
while(!logData->messageBlocks.empty() && logData->messageBlocks.front().first <= newPersistentDataVersion) {
int64_t bytesErased = int64_t(logData->messageBlocks.front().second.size()) * SERVER_KNOBS->TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR;
logData->bytesDurable += bytesErased;
self->bytesDurable += bytesErased;
logData->messageBlocks.pop_front();
wait(yield(TaskUpdateStorage));
wait(yield(TaskPriority::UpdateStorage));
}
if(logData->bytesDurable.getValue() > logData->bytesInput.getValue() || self->bytesDurable > self->bytesInput) {
@ -683,7 +686,7 @@ ACTOR Future<Void> updateStorage( TLogData* self ) {
}
if(!self->queueOrder.size()) {
wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) );
wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) );
return Void();
}
@ -707,7 +710,7 @@ ACTOR Future<Void> updateStorage( TLogData* self ) {
}
wait( logData->queueCommittedVersion.whenAtLeast( nextVersion ) );
wait( delay(0, TaskUpdateStorage) );
wait( delay(0, TaskPriority::UpdateStorage) );
//TraceEvent("TlogUpdatePersist", self->dbgid).detail("LogId", logData->logId).detail("NextVersion", nextVersion).detail("Version", logData->version.get()).detail("PersistentDataDurableVer", logData->persistentDataDurableVersion).detail("QueueCommitVer", logData->queueCommittedVersion.get()).detail("PersistDataVer", logData->persistentDataVersion);
if (nextVersion > logData->persistentDataVersion) {
@ -716,7 +719,7 @@ ACTOR Future<Void> updateStorage( TLogData* self ) {
wait( updatePersistentData(self, logData, nextVersion) );
commitLockReleaser.release();
} else {
wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) );
wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) );
}
if( logData->removed.isReady() ) {
@ -727,9 +730,9 @@ ACTOR Future<Void> updateStorage( TLogData* self ) {
if(logData->persistentDataDurableVersion == logData->version.get()) {
self->queueOrder.pop_front();
}
wait( delay(0.0, TaskUpdateStorage) );
wait( delay(0.0, TaskPriority::UpdateStorage) );
} else {
wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) );
wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) );
}
}
else if(logData->initialized) {
@ -750,7 +753,7 @@ ACTOR Future<Void> updateStorage( TLogData* self ) {
//TraceEvent("UpdateStorageVer", logData->logId).detail("NextVersion", nextVersion).detail("PersistentDataVersion", logData->persistentDataVersion).detail("TotalSize", totalSize);
wait( logData->queueCommittedVersion.whenAtLeast( nextVersion ) );
wait( delay(0, TaskUpdateStorage) );
wait( delay(0, TaskPriority::UpdateStorage) );
if (nextVersion > logData->persistentDataVersion) {
wait( self->persistentDataCommitLock.take() );
@ -760,21 +763,21 @@ ACTOR Future<Void> updateStorage( TLogData* self ) {
}
if( totalSize < SERVER_KNOBS->UPDATE_STORAGE_BYTE_LIMIT ) {
wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) );
wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) );
}
else {
//recovery wants to commit to persistent data when updatePersistentData is not active; this delay ensures that immediately after
//updatePersist returns, another one has not been started yet.
wait( delay(0.0, TaskUpdateStorage) );
wait( delay(0.0, TaskPriority::UpdateStorage) );
}
} else {
wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) );
wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) );
}
return Void();
}
ACTOR Future<Void> updateStorageLoop( TLogData* self ) {
wait(delay(0, TaskUpdateStorage));
wait(delay(0, TaskPriority::UpdateStorage));
loop {
wait( updateStorage(self) );
@ -823,7 +826,7 @@ void commitMessages( TLogData* self, Reference<LogData> logData, Version version
block.append(block.arena(), msg.message.begin(), msg.message.size());
for(auto tag : msg.tags) {
if(logData->locality == tagLocalitySatellite) {
if(!(tag == txsTag || tag.locality == tagLocalityLogRouter)) {
if(!(tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || tag == txsTag)) {
continue;
}
} else if(!(logData->locality == tagLocalitySpecial || logData->locality == tag.locality || tag.locality < 0)) {
@ -836,6 +839,9 @@ void commitMessages( TLogData* self, Reference<LogData> logData, Version version
}
tag.id = tag.id % logData->logRouterTags;
}
if(tag.locality == tagLocalityTxs) {
tag.id = tag.id % logData->txsTags;
}
Reference<LogData::TagData> tagData = logData->getTagData(tag);
if(!tagData) {
tagData = logData->createTagData(tag, 0, true, true, false);
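Incoming txs tags are folded into this tlog's configured bucket count, exactly as log-router tags are folded modulo logRouterTags just above. A standalone, runnable illustration (the txsTags value is hypothetical):

	#include <cstdio>

	int main() {
		const int txsTags = 4; // hypothetical cluster configuration
		for (int id = 0; id < 8; ++id)
			std::printf("txs tag %d -> local tag %d\n", id, id % txsTags);
		return 0;
	}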
@ -846,7 +852,7 @@ void commitMessages( TLogData* self, Reference<LogData> logData, Version version
if(tagData->versionMessages.back().second.expectedSize() > SERVER_KNOBS->MAX_MESSAGE_SIZE) {
TraceEvent(SevWarnAlways, "LargeMessage").detail("Size", tagData->versionMessages.back().second.expectedSize());
}
if (tag != txsTag) {
if (tag.locality != tagLocalityTxs && tag != txsTag) {
expectedBytes += tagData->versionMessages.back().second.expectedSize();
} else {
txsBytes += tagData->versionMessages.back().second.expectedSize();
@ -914,7 +920,7 @@ std::deque<std::pair<Version, LengthPrefixedStringRef>> & getVersionMessages( Re
};
ACTOR Future<Void> tLogPopCore( TLogData* self, Tag inputTag, Version to, Reference<LogData> logData ) {
if (self->ignorePopRequest && inputTag != txsTag) {
if (self->ignorePopRequest && inputTag.locality != tagLocalityTxs && inputTag != txsTag) {
TraceEvent("IgnoringPopRequest").detail("IgnorePopDeadline", self->ignorePopDeadline);
if (self->toBePopped.find(inputTag) == self->toBePopped.end()
@ -952,7 +958,7 @@ ACTOR Future<Void> tLogPopCore( TLogData* self, Tag inputTag, Version to, Refere
}
if (upTo > logData->persistentDataDurableVersion)
wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskTLogPop));
wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop));
//TraceEvent("TLogPop", self->dbgid).detail("Tag", tag.toString()).detail("To", upTo);
}
return Void();
@ -1025,9 +1031,9 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
sequence = req.sequence.get().second;
auto& trackerData = logData->peekTracker[peekId];
if (sequence == 0) {
Promise<Version> firstRequest;
Promise<std::pair<Version, bool>> firstRequest;
trackerData.sequence_version[0] = firstRequest;
firstRequest.send(req.begin);
firstRequest.send(std::make_pair(req.begin, req.onlySpilled));
}
auto seqBegin = trackerData.sequence_version.begin();
while(trackerData.sequence_version.size() && seqBegin->first <= sequence - SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS) {
@ -1038,7 +1044,7 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
seqBegin = trackerData.sequence_version.begin();
}
if (seqBegin != trackerData.sequence_version.end() && !seqBegin->second.isSet()) {
seqBegin->second.send(req.begin);
seqBegin->second.send(std::make_pair(req.begin, req.onlySpilled));
}
if(trackerData.sequence_version.size() && sequence < seqBegin->first) {
@ -1046,8 +1052,9 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
}
trackerData.lastUpdate = now();
Version ver = wait(trackerData.sequence_version[sequence].getFuture());
req.begin = ver;
std::pair<Version, bool> prevPeekData = wait(trackerData.sequence_version[sequence].getFuture());
req.begin = prevPeekData.first;
req.onlySpilled = prevPeekData.second;
wait(yield());
} catch( Error &e ) {
if(e.code() == error_code_timed_out) {
@ -1065,7 +1072,7 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
auto& trackerData = logData->peekTracker[peekId];
auto& sequenceData = trackerData.sequence_version[sequence+1];
if (!sequenceData.isSet()) {
sequenceData.send(req.begin);
sequenceData.send(std::make_pair(req.begin, req.onlySpilled));
}
}
return Void();
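For context on the Promise<Version> to Promise<std::pair<Version, bool>> hunks above: each peek in a sequence now hands its successor both the next begin version and the onlySpilled flag, so a follow-up peek knows whether it is still draining spilled data. A minimal sketch of that handoff using std::promise in place of flow's Promise (types and field names are taken from the diff; everything else is a stand-in):

#include <cstdint>
#include <future>
#include <iostream>
#include <map>
#include <utility>

using Version = int64_t;

int main() {
	// Sketch of trackerData.sequence_version: each sequence number carries
	// (begin version, onlySpilled) to the next peek, not just the version.
	std::map<int, std::promise<std::pair<Version, bool>>> sequence_version;

	// Sequence 0 seeds itself, as in the `if (sequence == 0)` branch above.
	sequence_version[0].set_value({ 100 /*req.begin*/, false /*req.onlySpilled*/ });

	// A peek at sequence 0 adopts both fields from its predecessor.
	std::pair<Version, bool> prevPeekData = sequence_version[0].get_future().get();
	Version begin = prevPeekData.first;
	bool onlySpilled = prevPeekData.second;

	// After replying, it seeds sequence + 1 with where the next peek starts
	// (rep.end) and whether that reply came purely from spilled data.
	sequence_version[1].set_value({ begin + 50 /*rep.end*/, onlySpilled });
	std::cout << begin << " " << onlySpilled << std::endl;
	return 0;
}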
@ -1081,16 +1088,16 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
if( req.tag.locality == tagLocalityLogRouter ) {
wait( self->concurrentLogRouterReads.take() );
state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads);
wait( delay(0.0, TaskLowPriority) );
wait( delay(0.0, TaskPriority::Low) );
}
if( req.begin <= logData->persistentDataDurableVersion && req.tag != txsTag) {
if( req.begin <= logData->persistentDataDurableVersion && req.tag.locality != tagLocalityTxs && req.tag != txsTag) {
// Reading spilled data will almost always imply that the storage server is >5s behind the rest
// of the cluster. We shouldn't prioritize spending CPU on helping this server catch up
// slightly faster over keeping the rest of the cluster operating normally.
// txsTag is only ever peeked on recovery, and we would still wish to prioritize requests
// that impact recovery duration.
wait(delay(0, TaskTLogSpilledPeekReply));
wait(delay(0, TaskPriority::TLogSpilledPeekReply));
}
Version poppedVer = poppedVersion(logData, req.tag);
@ -1113,13 +1120,13 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
return Void();
}
if(sequenceData.isSet()) {
if(sequenceData.getFuture().get() != rep.end) {
if(sequenceData.getFuture().get().first != rep.end) {
TEST(true); //tlog peek second attempt ended at a different version
req.reply.sendError(timed_out());
return Void();
}
} else {
sequenceData.send(rep.end);
sequenceData.send(std::make_pair(rep.end, rep.onlySpilled));
}
rep.begin = req.begin;
}
@ -1189,13 +1196,13 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
return Void();
}
if(sequenceData.isSet()) {
if(sequenceData.getFuture().get() != reply.end) {
if(sequenceData.getFuture().get().first != reply.end) {
TEST(true); //tlog peek second attempt ended at a different version
req.reply.sendError(timed_out());
return Void();
}
} else {
sequenceData.send(reply.end);
sequenceData.send(std::make_pair(reply.end, reply.onlySpilled));
}
reply.begin = req.begin;
}
@ -1208,7 +1215,7 @@ ACTOR Future<Void> watchDegraded(TLogData* self) {
//This delay is divided into multiple delays to avoid marking the tlog as degraded because of a single SlowTask
state int loopCount = 0;
while(loopCount < SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT) {
wait(delay(SERVER_KNOBS->TLOG_DEGRADED_DURATION/SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT, TaskLowPriority));
wait(delay(SERVER_KNOBS->TLOG_DEGRADED_DURATION/SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT, TaskPriority::Low));
loopCount++;
}
TraceEvent(SevWarnAlways, "TLogDegraded", self->dbgid);
@ -1338,7 +1345,7 @@ void execProcessingHelper(TLogData* self,
rd >> messageLength >> sub >> tagCount;
for (int i = 0; i < tagCount; i++) {
rd >> tmpTag;
if (tmpTag == txsTag) {
if (tmpTag.locality == tagLocalityTxs || tmpTag == txsTag) {
hasTxsTag = true;
}
execTags->push_back(execTags->arena(), tmpTag);
@ -1544,7 +1551,7 @@ ACTOR Future<Void> tLogCommit(
.detail("PersistentDataDurableVersion", logData->persistentDataDurableVersion);
waitStartT = now();
}
wait( delayJittered(.005, TaskTLogCommit) );
wait( delayJittered(.005, TaskPriority::TLogCommit) );
}
// while exec op is being committed, no new transactions will be admitted.
@ -1667,6 +1674,7 @@ ACTOR Future<Void> initPersistentState( TLogData* self, Reference<LogData> logDa
storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistKnownCommittedVersionKeys.begin), BinaryWriter::toValue(logData->knownCommittedVersion, Unversioned()) ) );
storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistLocalityKeys.begin), BinaryWriter::toValue(logData->locality, Unversioned()) ) );
storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistLogRouterTagsKeys.begin), BinaryWriter::toValue(logData->logRouterTags, Unversioned()) ) );
storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistTxsTagsKeys.begin), BinaryWriter::toValue(logData->txsTags, Unversioned()) ) );
storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistRecoveryCountKeys.begin), BinaryWriter::toValue(logData->recoveryCount, Unversioned()) ) );
for(auto tag : logData->allTags) {
@ -1884,7 +1892,7 @@ ACTOR Future<Void> pullAsyncData( TLogData* self, Reference<LogData> logData, st
while (!endVersion.present() || logData->version.get() < endVersion.get()) {
loop {
choose {
when(wait( r ? r->getMore(TaskTLogCommit) : Never() ) ) {
when(wait( r ? r->getMore(TaskPriority::TLogCommit) : Never() ) ) {
break;
}
when( wait( dbInfoChange ) ) {
@ -1907,7 +1915,7 @@ ACTOR Future<Void> pullAsyncData( TLogData* self, Reference<LogData> logData, st
.detail("PersistentDataDurableVersion", logData->persistentDataDurableVersion);
waitStartT = now();
}
wait( delayJittered(.005, TaskTLogCommit) );
wait( delayJittered(.005, TaskPriority::TLogCommit) );
}
state Version ver = 0;
@ -1947,7 +1955,7 @@ ACTOR Future<Void> pullAsyncData( TLogData* self, Reference<LogData> logData, st
// Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors
logData->version.set( ver );
wait( yield(TaskTLogCommit) );
wait( yield(TaskPriority::TLogCommit) );
}
lastVer = ver;
ver = r->version().version;
@ -1984,7 +1992,7 @@ ACTOR Future<Void> pullAsyncData( TLogData* self, Reference<LogData> logData, st
// Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors
logData->version.set( ver );
wait( yield(TaskTLogCommit) );
wait( yield(TaskPriority::TLogCommit) );
}
break;
}
@ -2075,12 +2083,13 @@ ACTOR Future<Void> restorePersistentState( TLogData* self, LocalityData locality
state Future<Standalone<VectorRef<KeyValueRef>>> fKnownCommitted = storage->readRange(persistKnownCommittedVersionKeys);
state Future<Standalone<VectorRef<KeyValueRef>>> fLocality = storage->readRange(persistLocalityKeys);
state Future<Standalone<VectorRef<KeyValueRef>>> fLogRouterTags = storage->readRange(persistLogRouterTagsKeys);
state Future<Standalone<VectorRef<KeyValueRef>>> fTxsTags = storage->readRange(persistTxsTagsKeys);
state Future<Standalone<VectorRef<KeyValueRef>>> fRecoverCounts = storage->readRange(persistRecoveryCountKeys);
// FIXME: metadata in queue?
wait( waitForAll( (vector<Future<Optional<Value>>>(), fFormat ) ) );
wait( waitForAll( (vector<Future<Standalone<VectorRef<KeyValueRef>>>>(), fVers, fKnownCommitted, fLocality, fLogRouterTags, fRecoverCounts) ) );
wait( waitForAll( (vector<Future<Standalone<VectorRef<KeyValueRef>>>>(), fVers, fKnownCommitted, fLocality, fLogRouterTags, fTxsTags, fRecoverCounts) ) );
if (fFormat.get().present() && !persistFormatReadableRange.contains( fFormat.get().get() )) {
//FIXME: remove when we no longer need to test upgrades from 4.X releases
@ -2132,6 +2141,11 @@ ACTOR Future<Void> restorePersistentState( TLogData* self, LocalityData locality
id_logRouterTags[ BinaryReader::fromStringRef<UID>(it.key.removePrefix(persistLogRouterTagsKeys.begin), Unversioned())] = BinaryReader::fromStringRef<int>( it.value, Unversioned() );
}
state std::map<UID, int> id_txsTags;
for(auto it : fTxsTags.get()) {
id_txsTags[ BinaryReader::fromStringRef<UID>(it.key.removePrefix(persistTxsTagsKeys.begin), Unversioned())] = BinaryReader::fromStringRef<int>( it.value, Unversioned() );
}
state std::map<UID, Version> id_knownCommitted;
for(auto it : fKnownCommitted.get()) {
id_knownCommitted[ BinaryReader::fromStringRef<UID>(it.key.removePrefix(persistKnownCommittedVersionKeys.begin), Unversioned())] = BinaryReader::fromStringRef<Version>( it.value, Unversioned() );
@ -2157,7 +2171,7 @@ ACTOR Future<Void> restorePersistentState( TLogData* self, LocalityData locality
DUMPTOKEN( recruited.confirmRunning );
//We do not need the remoteTag, because we will not be loading any additional data
logData = Reference<LogData>( new LogData(self, recruited, Tag(), true, id_logRouterTags[id1], UID(), std::vector<Tag>()) );
logData = Reference<LogData>( new LogData(self, recruited, Tag(), true, id_logRouterTags[id1], id_txsTags[id1], UID(), std::vector<Tag>()) );
logData->locality = id_locality[id1];
logData->stopped = true;
self->id_data[id1] = logData;
@ -2340,7 +2354,7 @@ ACTOR Future<Void> tLogStart( TLogData* self, InitializeTLogRequest req, Localit
it.second->stopCommit.trigger();
}
state Reference<LogData> logData = Reference<LogData>( new LogData(self, recruited, req.remoteTag, req.isPrimary, req.logRouterTags, req.recruitmentID, req.allTags) );
state Reference<LogData> logData = Reference<LogData>( new LogData(self, recruited, req.remoteTag, req.isPrimary, req.logRouterTags, req.txsTags, req.recruitmentID, req.allTags) );
self->id_data[recruited.id()] = logData;
logData->locality = req.locality;
logData->recoveryCount = req.epoch;

View File

@ -38,7 +38,7 @@ public:
ready = NotifiedVersion(s);
started = false;
}
Future<bool> order( Seq s, int taskID = TaskDefaultYield ) {
Future<bool> order( Seq s, TaskPriority taskID = TaskPriority::DefaultYield ) {
if ( ready.get() < s )
return waitAndOrder( this, s, taskID );
else
@ -54,7 +54,7 @@ public:
return ready.whenAtLeast(v);
}
private:
ACTOR static Future<bool> waitAndOrder( Orderer<Seq>* self, Seq s, int taskID ) {
ACTOR static Future<bool> waitAndOrder( Orderer<Seq>* self, Seq s, TaskPriority taskID ) {
wait( self->ready.whenAtLeast(s) );
wait( yield( taskID ) || self->shutdown.getFuture() );
return self->dedup(s);
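These Orderer changes are part of the repo-wide migration from plain integer task IDs (TaskDefaultYield, TaskTLogCommit, ...) to a scoped TaskPriority enum class, which stops priorities from being silently mixed with arbitrary ints. A small sketch of what the stronger typing buys (the enumerators and values below are assumptions for illustration; the real ones live in flow):

#include <iostream>

// Assumed shape of flow's scoped priority enum; values illustrative only.
enum class TaskPriority {
	Low = 2000,
	DefaultYield = 7000,
	TLogCommit = 8500
};

// Callers must now name a TaskPriority explicitly...
void yieldAt(TaskPriority priority = TaskPriority::DefaultYield) {
	std::cout << "yield at " << static_cast<int>(priority) << std::endl;
}

int main() {
	yieldAt(TaskPriority::TLogCommit); // OK
	// yieldAt(8500);                  // ...while a bare int no longer compiles.
	return 0;
}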

View File

@ -291,6 +291,15 @@ ACTOR Future<bool> getTeamCollectionValid(Database cx, WorkerInterface dataDistr
int64_t desiredMachineTeamNumber = boost::lexical_cast<int64_t>(teamCollectionInfoMessage.getValue("DesiredMachineTeams"));
int64_t maxMachineTeamNumber = boost::lexical_cast<int64_t>(teamCollectionInfoMessage.getValue("MaxMachineTeams"));
int64_t minServerTeamOnServer =
boost::lexical_cast<int64_t>(teamCollectionInfoMessage.getValue("MinTeamNumberOnServer"));
int64_t maxServerTeamOnServer =
boost::lexical_cast<int64_t>(teamCollectionInfoMessage.getValue("MaxTeamNumberOnServer"));
int64_t minMachineTeamOnMachine =
boost::lexical_cast<int64_t>(teamCollectionInfoMessage.getValue("MinMachineTeamNumberOnMachine"));
int64_t maxMachineTeamOnMachine =
boost::lexical_cast<int64_t>(teamCollectionInfoMessage.getValue("MaxMachineTeamNumberOnMachine"));
// Team number is always valid when we disable teamRemover. This avoids false positives in simulation tests
if (SERVER_KNOBS->TR_FLAG_DISABLE_TEAM_REMOVER) {
TraceEvent("GetTeamCollectionValid")
@ -300,7 +309,10 @@ ACTOR Future<bool> getTeamCollectionValid(Database cx, WorkerInterface dataDistr
// The if condition should be consistent with the condition in teamRemover() that decides
// if redundant teams exist.
if (healthyMachineTeamCount > desiredMachineTeamNumber) {
if (healthyMachineTeamCount > desiredMachineTeamNumber ||
(minMachineTeamOnMachine <= 0 && SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER == 3)) {
// When DESIRED_TEAMS_PER_SERVER == 1, we have seen minMachineTeamOnMachine be 0 in one out of 30k test
// cases. Only check DESIRED_TEAMS_PER_SERVER == 3 for now, since it is the most commonly used configuration.
TraceEvent("GetTeamCollectionValid")
.detail("CurrentTeamNumber", currentTeamNumber)
.detail("DesiredTeamNumber", desiredTeamNumber)
@ -308,7 +320,13 @@ ACTOR Future<bool> getTeamCollectionValid(Database cx, WorkerInterface dataDistr
.detail("CurrentHealthyMachineTeamNumber", healthyMachineTeamCount)
.detail("DesiredMachineTeams", desiredMachineTeamNumber)
.detail("CurrentMachineTeamNumber", currentMachineTeamNumber)
.detail("MaxMachineTeams", maxMachineTeamNumber);
.detail("MaxMachineTeams", maxMachineTeamNumber)
.detail("MinTeamNumberOnServer", minServerTeamOnServer)
.detail("MaxTeamNumberOnServer", maxServerTeamOnServer)
.detail("MinMachineTeamNumberOnMachine", minMachineTeamOnMachine)
.detail("MaxMachineTeamNumberOnMachine", maxMachineTeamOnMachine)
.detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER)
.detail("MaxTeamsPerServer", SERVER_KNOBS->MAX_TEAMS_PER_SERVER);
return false;
} else {
return true;

View File

@ -41,6 +41,7 @@ enum limitReason_t {
storage_server_min_free_space_ratio, // a storage server's normal limits are being reduced by a low free space ratio
log_server_min_free_space,
log_server_min_free_space_ratio,
storage_server_durability_lag,
limitReason_t_end
};
@ -56,7 +57,8 @@ const char* limitReasonName[] = {
"storage_server_min_free_space",
"storage_server_min_free_space_ratio",
"log_server_min_free_space",
"log_server_min_free_space_ratio"
"log_server_min_free_space_ratio",
"storage_server_durability_lag"
};
static_assert(sizeof(limitReasonName) / sizeof(limitReasonName[0]) == limitReason_t_end, "limitReasonName table size");
@ -72,7 +74,8 @@ const char* limitReasonDesc[] = {
"Storage server running out of space (approaching 100MB limit).",
"Storage server running out of space (approaching 5% limit).",
"Log server running out of space (approaching 100MB limit).",
"Log server running out of space (approaching 5% limit)."
"Log server running out of space (approaching 5% limit).",
"Storage server durable version falling behind."
};
static_assert(sizeof(limitReasonDesc) / sizeof(limitReasonDesc[0]) == limitReason_t_end, "limitReasonDesc table size");
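The new storage_server_durability_lag entries above lean on a simple but effective pattern: one enum, parallel string tables, and static_asserts tying the table sizes to the end-of-enum sentinel, so a new reason cannot be added without its strings. A condensed, self-contained sketch of the pattern (entries abbreviated):

enum limitReason_t {
	unlimited,
	storage_server_durability_lag, // the reason added in this diff
	limitReason_t_end
};

const char* limitReasonName[] = {
	"unlimited",
	"storage_server_durability_lag"
};

// Dropping a string while adding an enumerator now fails at compile time,
// which is exactly what the static_asserts in this file enforce.
static_assert(sizeof(limitReasonName) / sizeof(limitReasonName[0]) == limitReason_t_end,
              "limitReasonName table size");

int main() { return 0; }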
@ -84,14 +87,14 @@ struct StorageQueueInfo {
StorageQueuingMetricsReply lastReply;
StorageQueuingMetricsReply prevReply;
Smoother smoothDurableBytes, smoothInputBytes, verySmoothDurableBytes;
Smoother smoothDurableVersion, smoothLatestVersion;
Smoother verySmoothDurableVersion, smoothLatestVersion;
Smoother smoothFreeSpace;
Smoother smoothTotalSpace;
double localRateLimit;
limitReason_t limitReason;
StorageQueueInfo(UID id, LocalityData locality) : valid(false), id(id), locality(locality), smoothDurableBytes(SERVER_KNOBS->SMOOTHING_AMOUNT),
smoothInputBytes(SERVER_KNOBS->SMOOTHING_AMOUNT), verySmoothDurableBytes(SERVER_KNOBS->SLOW_SMOOTHING_AMOUNT),
smoothDurableVersion(1.), smoothLatestVersion(1.), smoothFreeSpace(SERVER_KNOBS->SMOOTHING_AMOUNT),
verySmoothDurableVersion(SERVER_KNOBS->SLOW_SMOOTHING_AMOUNT), smoothLatestVersion(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothFreeSpace(SERVER_KNOBS->SMOOTHING_AMOUNT),
smoothTotalSpace(SERVER_KNOBS->SMOOTHING_AMOUNT)
{
// FIXME: this is a tacky workaround for a potential uninitialized use in trackStorageServerQueueInfo
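The rename to verySmoothDurableVersion also swaps the folding time from 1 second to SLOW_SMOOTHING_AMOUNT, so ratekeeper reacts to sustained durability lag rather than momentary dips in the durable version. A rough sketch of the kind of exponential smoother involved; flow's Smoother carries more state, so treat this as the core idea only:

#include <cmath>
#include <cstdio>

// The estimate chases the latest total with time constant eFoldingTime.
// A longer constant (SLOW_SMOOTHING_AMOUNT above) means brief dips are ignored.
struct SmootherSketch {
	double eFoldingTime; // seconds
	double total = 0, estimate = 0, lastTime = 0;
	explicit SmootherSketch(double t) : eFoldingTime(t) {}
	void setTotal(double newTotal, double now) {
		double dt = now - lastTime;
		estimate += (total - estimate) * (1.0 - std::exp(-dt / eFoldingTime));
		total = newTotal;
		lastTime = now;
	}
	double smoothTotal() const { return estimate; }
};

int main() {
	SmootherSketch fast(1.0), slow(10.0);
	for (int s = 1; s <= 5; s++) { fast.setTotal(100, s); slow.setTotal(100, s); }
	printf("%.1f vs %.1f\n", fast.smoothTotal(), slow.smoothTotal()); // ~98 vs ~33
	return 0;
}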
@ -126,9 +129,13 @@ struct RatekeeperLimits {
int64_t logSpringBytes;
double maxVersionDifference;
int64_t durabilityLagTargetVersions;
int64_t lastDurabilityLag;
double durabilityLagLimit;
std::string context;
RatekeeperLimits(std::string context, int64_t storageTargetBytes, int64_t storageSpringBytes, int64_t logTargetBytes, int64_t logSpringBytes, double maxVersionDifference) :
RatekeeperLimits(std::string context, int64_t storageTargetBytes, int64_t storageSpringBytes, int64_t logTargetBytes, int64_t logSpringBytes, double maxVersionDifference, int64_t durabilityLagTargetVersions) :
tpsLimit(std::numeric_limits<double>::infinity()),
tpsLimitMetric(StringRef("Ratekeeper.TPSLimit" + context)),
reasonMetric(StringRef("Ratekeeper.Reason" + context)),
@ -137,6 +144,9 @@ struct RatekeeperLimits {
logTargetBytes(logTargetBytes),
logSpringBytes(logSpringBytes),
maxVersionDifference(maxVersionDifference),
durabilityLagTargetVersions(durabilityLagTargetVersions),
durabilityLagLimit(std::numeric_limits<double>::infinity()),
lastDurabilityLag(0),
context(context)
{}
};
@ -167,11 +177,13 @@ struct RatekeeperData {
RatekeeperLimits normalLimits;
RatekeeperLimits batchLimits;
Deque<double> actualTpsHistory;
RatekeeperData() : smoothReleasedTransactions(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothBatchReleasedTransactions(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothTotalDurableBytes(SERVER_KNOBS->SLOW_SMOOTHING_AMOUNT),
actualTpsMetric(LiteralStringRef("Ratekeeper.ActualTPS")),
lastWarning(0),
normalLimits("", SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER, SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER, SERVER_KNOBS->TARGET_BYTES_PER_TLOG, SERVER_KNOBS->SPRING_BYTES_TLOG, SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE),
batchLimits("Batch", SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER_BATCH, SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER_BATCH, SERVER_KNOBS->TARGET_BYTES_PER_TLOG_BATCH, SERVER_KNOBS->SPRING_BYTES_TLOG_BATCH, SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE_BATCH)
normalLimits("", SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER, SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER, SERVER_KNOBS->TARGET_BYTES_PER_TLOG, SERVER_KNOBS->SPRING_BYTES_TLOG, SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE, SERVER_KNOBS->TARGET_DURABILITY_LAG_VERSIONS),
batchLimits("Batch", SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER_BATCH, SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER_BATCH, SERVER_KNOBS->TARGET_BYTES_PER_TLOG_BATCH, SERVER_KNOBS->SPRING_BYTES_TLOG_BATCH, SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE_BATCH, SERVER_KNOBS->TARGET_DURABILITY_LAG_VERSIONS_BATCH)
{}
};
@ -194,7 +206,7 @@ ACTOR Future<Void> trackStorageServerQueueInfo( RatekeeperData* self, StorageSer
myQueueInfo->value.smoothInputBytes.reset(reply.get().bytesInput);
myQueueInfo->value.smoothFreeSpace.reset(reply.get().storageBytes.available);
myQueueInfo->value.smoothTotalSpace.reset(reply.get().storageBytes.total);
myQueueInfo->value.smoothDurableVersion.reset(reply.get().durableVersion);
myQueueInfo->value.verySmoothDurableVersion.reset(reply.get().durableVersion);
myQueueInfo->value.smoothLatestVersion.reset(reply.get().version);
} else {
self->smoothTotalDurableBytes.addDelta( reply.get().bytesDurable - myQueueInfo->value.prevReply.bytesDurable );
@ -203,7 +215,7 @@ ACTOR Future<Void> trackStorageServerQueueInfo( RatekeeperData* self, StorageSer
myQueueInfo->value.smoothInputBytes.setTotal( reply.get().bytesInput );
myQueueInfo->value.smoothFreeSpace.setTotal( reply.get().storageBytes.available );
myQueueInfo->value.smoothTotalSpace.setTotal( reply.get().storageBytes.total );
myQueueInfo->value.smoothDurableVersion.setTotal(reply.get().durableVersion);
myQueueInfo->value.verySmoothDurableVersion.setTotal(reply.get().durableVersion);
myQueueInfo->value.smoothLatestVersion.setTotal(reply.get().version);
}
} else {
@ -297,7 +309,7 @@ ACTOR Future<Void> trackEachStorageServer(
ACTOR Future<Void> monitorServerListChange(
Reference<AsyncVar<ServerDBInfo>> dbInfo,
PromiseStream< std::pair<UID, Optional<StorageServerInterface>> > serverChanges) {
state Database db = openDBOnServer(dbInfo, TaskRatekeeper, true, true);
state Database db = openDBOnServer(dbInfo, TaskPriority::Ratekeeper, true, true);
state std::map<UID, StorageServerInterface> oldServers;
state Transaction tr(db);
@ -341,6 +353,11 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) {
self->actualTpsMetric = (int64_t)actualTps;
// SOMEDAY: Remove the max( 1.0, ... ) since the below calculations _should_ be able to recover back up from this value
actualTps = std::max( std::max( 1.0, actualTps ), self->smoothTotalDurableBytes.smoothRate() / CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT );
if(self->actualTpsHistory.size() > SERVER_KNOBS->MAX_TPS_HISTORY_SAMPLES) {
self->actualTpsHistory.pop_front();
}
self->actualTpsHistory.push_back(actualTps);
limits->tpsLimit = std::numeric_limits<double>::infinity();
UID reasonID = UID();
@ -350,12 +367,14 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) {
int64_t worstFreeSpaceStorageServer = std::numeric_limits<int64_t>::max();
int64_t worstStorageQueueStorageServer = 0;
int64_t worstStorageDurabilityLagStorageServer = 0;
int64_t limitingStorageQueueStorageServer = 0;
int64_t worstDurabilityLag = 0;
double worstStorageLocalLimit = 0;
double limitingStorageLocalLimit = 0;
std::multimap<double, StorageQueueInfo*> storageTpsLimitReverseIndex;
std::multimap<int64_t, StorageQueueInfo*> storageDurabilityLagReverseIndex;
std::map<UID, limitReason_t> ssReasons;
// Look at each storage server's write queue and local rate, compute and store the desired rate ratio
@ -384,8 +403,10 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) {
worstStorageQueueStorageServer = std::max(worstStorageQueueStorageServer, storageQueue);
worstStorageLocalLimit = std::min(worstStorageLocalLimit, ss.localRateLimit);
int64_t storageDurabilityLag = ss.smoothLatestVersion.smoothTotal() - ss.smoothDurableVersion.smoothTotal();
worstStorageDurabilityLagStorageServer = std::max(worstStorageDurabilityLagStorageServer, storageDurabilityLag);
int64_t storageDurabilityLag = ss.smoothLatestVersion.smoothTotal() - ss.verySmoothDurableVersion.smoothTotal();
worstDurabilityLag = std::max(worstDurabilityLag, storageDurabilityLag);
storageDurabilityLagReverseIndex.insert(std::make_pair(-1*storageDurabilityLag, &ss));
auto& ssMetrics = self->healthMetrics.storageStats[ss.id];
ssMetrics.storageQueue = storageQueue;
@ -446,9 +467,6 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) {
ssReasons[ss.id] = ssLimitReason;
}
self->healthMetrics.worstStorageQueue = worstStorageQueueStorageServer;
self->healthMetrics.worstStorageDurabilityLag = worstStorageDurabilityLagStorageServer;
std::set<Optional<Standalone<StringRef>>> ignoredMachines;
for (auto ss = storageTpsLimitReverseIndex.begin(); ss != storageTpsLimitReverseIndex.end() && ss->first < limits->tpsLimit; ++ss) {
if (ignoredMachines.size() < std::min(self->configuration.storageTeamSize - 1, SERVER_KNOBS->MAX_MACHINES_FALLING_BEHIND)) {
@ -468,6 +486,46 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) {
break;
}
int64_t limitingDurabilityLag = 0;
std::set<Optional<Standalone<StringRef>>> ignoredDurabilityLagMachines;
for (auto ss = storageDurabilityLagReverseIndex.begin(); ss != storageDurabilityLagReverseIndex.end(); ++ss) {
if (ignoredDurabilityLagMachines.size() < std::min(self->configuration.storageTeamSize - 1, SERVER_KNOBS->MAX_MACHINES_FALLING_BEHIND)) {
ignoredDurabilityLagMachines.insert(ss->second->locality.zoneId());
continue;
}
if (ignoredDurabilityLagMachines.count(ss->second->locality.zoneId()) > 0) {
continue;
}
limitingDurabilityLag = -1*ss->first;
if(limitingDurabilityLag > limits->durabilityLagTargetVersions && self->actualTpsHistory.size() > SERVER_KNOBS->NEEDED_TPS_HISTORY_SAMPLES) {
if(limits->durabilityLagLimit == std::numeric_limits<double>::infinity()) {
double maxTps = 0;
for(int i = 0; i < self->actualTpsHistory.size(); i++) {
maxTps = std::max(maxTps, self->actualTpsHistory[i]);
}
limits->durabilityLagLimit = SERVER_KNOBS->INITIAL_DURABILITY_LAG_MULTIPLIER*maxTps;
}
if( limitingDurabilityLag > limits->lastDurabilityLag ) {
limits->durabilityLagLimit = SERVER_KNOBS->DURABILITY_LAG_REDUCTION_RATE*limits->durabilityLagLimit;
}
if(limits->durabilityLagLimit < limits->tpsLimit) {
limits->tpsLimit = limits->durabilityLagLimit;
limitReason = limitReason_t::storage_server_durability_lag;
}
} else if(limits->durabilityLagLimit != std::numeric_limits<double>::infinity() && limitingDurabilityLag > limits->durabilityLagTargetVersions - SERVER_KNOBS->DURABILITY_LAG_UNLIMITED_THRESHOLD) {
limits->durabilityLagLimit = SERVER_KNOBS->DURABILITY_LAG_INCREASE_RATE*limits->durabilityLagLimit;
} else {
limits->durabilityLagLimit = std::numeric_limits<double>::infinity();
}
limits->lastDurabilityLag = limitingDurabilityLag;
break;
}
self->healthMetrics.worstStorageQueue = worstStorageQueueStorageServer;
self->healthMetrics.worstStorageDurabilityLag = worstDurabilityLag;
double writeToReadLatencyLimit = 0;
Version worstVersionLag = 0;
Version limitingVersionLag = 0;
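The durability-lag block added above is a small feedback controller: once the worst non-ignored storage server's lag passes durabilityLagTargetVersions (and enough TPS history exists), it seeds a cap from recent peak throughput, ratchets the cap down while lag keeps growing, lets it drift back up inside the recovery band, and releases it entirely once lag clears the band. A condensed, standalone restatement of that loop, with knob values assumed purely for illustration:

#include <algorithm>
#include <cstdio>
#include <limits>

int main() {
	const double kInf = std::numeric_limits<double>::infinity();
	// Stand-ins for SERVER_KNOBS (values assumed for illustration only):
	const double target = 5e6;       // TARGET_DURABILITY_LAG_VERSIONS
	const double releaseBand = 1e6;  // DURABILITY_LAG_UNLIMITED_THRESHOLD
	const double initMult = 0.8;     // INITIAL_DURABILITY_LAG_MULTIPLIER
	const double down = 0.9999;      // DURABILITY_LAG_REDUCTION_RATE
	const double up = 1.001;         // DURABILITY_LAG_INCREASE_RATE

	double durabilityLagLimit = kInf, lastDurabilityLag = 0;
	double maxRecentTps = 10000;     // max over actualTpsHistory in the real code
	double lags[] = { 6e6, 7e6, 7e6, 4.5e6, 3.5e6 }; // simulated limiting lag per tick

	for (double limitingLag : lags) {
		double tpsLimit = kInf;
		if (limitingLag > target) {
			if (durabilityLagLimit == kInf)
				durabilityLagLimit = initMult * maxRecentTps; // seed from peak TPS
			if (limitingLag > lastDurabilityLag)
				durabilityLagLimit *= down;                   // lag still growing: tighten
			tpsLimit = std::min(tpsLimit, durabilityLagLimit);
		} else if (durabilityLagLimit != kInf && limitingLag > target - releaseBand) {
			durabilityLagLimit *= up;                         // recovering: loosen
		} else {
			durabilityLagLimit = kInf;                        // recovered: release the cap
		}
		lastDurabilityLag = limitingLag;
		printf("lag=%.1e tpsLimit=%g\n", limitingLag, tpsLimit);
	}
	return 0;
}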
@ -613,12 +671,14 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) {
.detail("TotalDiskUsageBytes", totalDiskUsageBytes)
.detail("WorstStorageServerVersionLag", worstVersionLag)
.detail("LimitingStorageServerVersionLag", limitingVersionLag)
.detail("WorstDurabilityLag", worstDurabilityLag)
.detail("LimitingDurabilityLag", limitingDurabilityLag)
.trackLatest(name.c_str());
}
}
ACTOR Future<Void> configurationMonitor(Reference<AsyncVar<ServerDBInfo>> dbInfo, DatabaseConfiguration* conf) {
state Database cx = openDBOnServer(dbInfo, TaskDefaultEndpoint, true, true);
state Database cx = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, true, true);
loop {
state ReadYourWritesTransaction tr(cx);
@ -650,7 +710,7 @@ ACTOR Future<Void> ratekeeper(RatekeeperInterface rkInterf, Reference<AsyncVar<S
state Promise<Void> err;
state Future<Void> collection = actorCollection( self.addActor.getFuture() );
TraceEvent("Ratekeeper_Starting", rkInterf.id());
TraceEvent("RatekeeperStarting", rkInterf.id());
self.addActor.send( waitFailureServer(rkInterf.waitFailure.getFuture()) );
self.addActor.send( configurationMonitor(dbInfo, &self.configuration) );
@ -732,7 +792,7 @@ ACTOR Future<Void> ratekeeper(RatekeeperInterface rkInterf, Reference<AsyncVar<S
}
}
catch (Error& err) {
TraceEvent("Ratekeeper_Died", rkInterf.id()).error(err, true);
TraceEvent("RatekeeperDied", rkInterf.id()).error(err, true);
}
return Void();
}

View File

@ -114,9 +114,9 @@ ACTOR Future<Void> resolveBatch(
}
}
if (check_yield(TaskDefaultEndpoint)) {
wait( delay( 0, TaskLowPriority ) || delay( SERVER_KNOBS->COMMIT_SLEEP_TIME ) ); // FIXME: Is this still right?
g_network->setCurrentTask(TaskDefaultEndpoint);
if (check_yield(TaskPriority::DefaultEndpoint)) {
wait( delay( 0, TaskPriority::Low ) || delay( SERVER_KNOBS->COMMIT_SLEEP_TIME ) ); // FIXME: Is this still right?
g_network->setCurrentTask(TaskPriority::DefaultEndpoint);
}
if (self->version.get() == req.prevVersion) { // Not a duplicate (check relies on no waiting between here and self->version.set() below!)

View File

@ -44,8 +44,8 @@ struct ResolverInterface {
bool operator != ( ResolverInterface const& r ) const { return id() != r.id(); }
NetworkAddress address() const { return resolve.getEndpoint().getPrimaryAddress(); }
void initEndpoints() {
metrics.getEndpoint( TaskResolutionMetrics );
split.getEndpoint( TaskResolutionMetrics );
metrics.getEndpoint( TaskPriority::ResolutionMetrics );
split.getEndpoint( TaskPriority::ResolutionMetrics );
}
template <class Ar>

View File

@ -24,7 +24,7 @@
#include "flow/actorcompiler.h" // This must be the last #include.
ACTOR Future<Void> restoreWorker(Reference<ClusterConnectionFile> ccf, LocalityData locality) {
state Database cx = Database::createDatabase(ccf->getFilename(), Database::API_VERSION_LATEST,locality);
state Database cx = Database::createDatabase(ccf->getFilename(), Database::API_VERSION_LATEST, true, locality);
state RestoreInterface interf;
interf.initEndpoints();
state Optional<RestoreInterface> leaderInterf;
