Merge branch 'main' of github.com:apple/foundationdb into monitorusage

Ankita Kejriwal 2022-11-15 18:32:21 -08:00
commit 959bf9f4e7
111 changed files with 1443 additions and 714 deletions


@ -82,7 +82,8 @@ extern "C" DLLEXPORT fdb_bool_t fdb_error_predicate(int predicate_test, fdb_erro
code == error_code_grv_proxy_memory_limit_exceeded ||
code == error_code_commit_proxy_memory_limit_exceeded ||
code == error_code_batch_transaction_throttled || code == error_code_process_behind ||
code == error_code_tag_throttled || code == error_code_unknown_tenant;
code == error_code_tag_throttled || code == error_code_unknown_tenant ||
code == error_code_proxy_tag_throttled;
}
return false;
}
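This hunk adds error_code_proxy_tag_throttled to the set of errors the C binding reports as retryable. A minimal sketch (not part of this commit) of how a client might consult that predicate when deciding whether to retry; the API version value is an assumption:

    #define FDB_API_VERSION 720 // assumption: any version exposing the predicate
    #include <foundationdb/fdb_c.h>

    // After this change, proxy_tag_throttled also satisfies the retryable predicate.
    bool is_retryable(fdb_error_t err) {
        return fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE, err) != 0;
    }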


@ -70,10 +70,13 @@ void ApiWorkload::start() {
schedule([this]() {
// 1. Clear data
clearData([this]() {
// 2. Populate initial data
populateData([this]() {
// 3. Generate random workload
runTests();
// 2. Workload setup
setup([this]() {
// 3. Populate initial data
populateData([this]() {
// 4. Generate random workload
runTests();
});
});
});
});
@ -249,6 +252,10 @@ void ApiWorkload::populateData(TTaskFct cont) {
}
}
void ApiWorkload::setup(TTaskFct cont) {
schedule(cont);
}
void ApiWorkload::randomInsertOp(TTaskFct cont, std::optional<int> tenantId) {
int numKeys = Random::get().randomInt(1, maxKeysPerTransaction);
auto kvPairs = std::make_shared<std::vector<fdb::KeyValue>>();
@ -322,4 +329,85 @@ std::optional<fdb::BytesRef> ApiWorkload::getTenant(std::optional<int> tenantId)
}
}
std::string ApiWorkload::debugTenantStr(std::optional<int> tenantId) {
return tenantId.has_value() ? fmt::format("(tenant {0})", tenantId.value()) : "()";
}
// BlobGranule setup.
// This blobbifies ['\x00', '\xff') per tenant or for the whole database if there are no tenants.
void ApiWorkload::setupBlobGranules(TTaskFct cont) {
// This count is used to synchronize the # of tenant blobbifyRange() calls to ensure
// we only start the workload once blobbification has fully finished.
auto blobbifiedCount = std::make_shared<std::atomic<int>>(1);
if (tenants.empty()) {
blobbifiedCount->store(1);
blobbifyTenant({}, blobbifiedCount, cont);
} else {
blobbifiedCount->store(tenants.size());
for (int i = 0; i < tenants.size(); i++) {
schedule([=]() { blobbifyTenant(i, blobbifiedCount, cont); });
}
}
}
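The shared atomic above works as a countdown latch: it is preloaded with the number of tenants, each tenant's verification decrements it, and only the decrement that reaches zero schedules cont (see the fetch_sub below). A standalone sketch of the same pattern, with hypothetical names:

    #include <atomic>
    #include <functional>
    #include <memory>

    // Invoke `done` exactly once, after all `n` asynchronous tasks complete.
    void forkJoin(int n, const std::function<void(std::function<void()>)>& startTask,
                  std::function<void()> done) {
        auto remaining = std::make_shared<std::atomic<int>>(n);
        for (int i = 0; i < n; i++) {
            startTask([remaining, done]() {
                if (remaining->fetch_sub(1) == 1) // the last finisher fires the continuation
                    done();
            });
        }
    }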
void ApiWorkload::blobbifyTenant(std::optional<int> tenantId,
std::shared_ptr<std::atomic<int>> blobbifiedCount,
TTaskFct cont) {
auto retBlobbifyRange = std::make_shared<bool>(false);
execOperation(
[=](auto ctx) {
fdb::Key begin(1, '\x00');
fdb::Key end(1, '\xff');
info(fmt::format("setup: blobbifying {}: [\\x00 - \\xff)\n", debugTenantStr(tenantId)));
fdb::Future f = ctx->dbOps()->blobbifyRange(begin, end).eraseType();
ctx->continueAfter(f, [ctx, retBlobbifyRange, f]() {
*retBlobbifyRange = f.get<fdb::future_var::Bool>();
ctx->done();
});
},
[=]() {
if (!*retBlobbifyRange) {
schedule([=]() { blobbifyTenant(tenantId, blobbifiedCount, cont); });
} else {
schedule([=]() { verifyTenant(tenantId, blobbifiedCount, cont); });
}
},
/*tenant=*/getTenant(tenantId),
/* failOnError = */ false);
}
void ApiWorkload::verifyTenant(std::optional<int> tenantId,
std::shared_ptr<std::atomic<int>> blobbifiedCount,
TTaskFct cont) {
auto retVerifyVersion = std::make_shared<int64_t>(-1);
execOperation(
[=](auto ctx) {
fdb::Key begin(1, '\x00');
fdb::Key end(1, '\xff');
info(fmt::format("setup: verifying {}: [\\x00 - \\xff)\n", debugTenantStr(tenantId)));
fdb::Future f = ctx->dbOps()->verifyBlobRange(begin, end, /*latest_version*/ -2).eraseType();
ctx->continueAfter(f, [ctx, retVerifyVersion, f]() {
*retVerifyVersion = f.get<fdb::future_var::Int64>();
ctx->done();
});
},
[=]() {
if (*retVerifyVersion == -1) {
schedule([=]() { verifyTenant(tenantId, blobbifiedCount, cont); });
} else {
if (blobbifiedCount->fetch_sub(1) == 1) {
schedule(cont);
}
}
},
/*tenant=*/getTenant(tenantId),
/* failOnError = */ false);
}
} // namespace FdbApiTester


@ -41,6 +41,9 @@ public:
virtual void checkProgress() override;
// Workload specific setup phase.
virtual void setup(TTaskFct cont);
// Running specific tests
// The default implementation generates a workload consisting of
// random operations generated by randomOperation
@ -126,6 +129,12 @@ protected:
void randomClearRangeOp(TTaskFct cont, std::optional<int> tenantId);
std::optional<fdb::BytesRef> getTenant(std::optional<int> tenantId);
std::string debugTenantStr(std::optional<int> tenantId);
// Generic BlobGranules setup.
void setupBlobGranules(TTaskFct cont);
void blobbifyTenant(std::optional<int> tenantId, std::shared_ptr<std::atomic<int>> blobbifiedCount, TTaskFct cont);
void verifyTenant(std::optional<int> tenantId, std::shared_ptr<std::atomic<int>> blobbifiedCount, TTaskFct cont);
private:
void populateDataTx(TTaskFct cont, std::optional<int> tenantId);


@ -52,26 +52,23 @@ private:
};
std::vector<OpType> excludedOpTypes;
void setup(TTaskFct cont) override { setupBlobGranules(cont); }
// Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet
// FIXME: should still guarantee a read succeeds eventually somehow
// FIXME: this needs to be per tenant if tenant ids are set
std::unordered_set<std::optional<int>> tenantsWithReadSuccess;
inline void setReadSuccess(std::optional<int> tenantId) { tenantsWithReadSuccess.insert(tenantId); }
inline bool seenReadSuccess(std::optional<int> tenantId) { return tenantsWithReadSuccess.count(tenantId); }
std::string tenantDebugString(std::optional<int> tenantId) {
return tenantId.has_value() ? fmt::format(" (tenant {0})", tenantId.value()) : "";
}
void debugOp(std::string opName, fdb::Key begin, fdb::Key end, std::optional<int> tenantId, std::string message) {
if (BG_API_DEBUG_VERBOSE) {
info(fmt::format("{0}: [{1} - {2}){3}: {4}",
info(fmt::format("{0}: [{1} - {2}) {3}: {4}",
opName,
fdb::toCharsRef(begin),
fdb::toCharsRef(end),
tenantDebugString(tenantId),
debugTenantStr(tenantId),
message));
}
}
@ -117,7 +114,7 @@ private:
results.get()->assign(resVector.begin(), resVector.end());
bool previousSuccess = seenReadSuccess(tenantId);
if (!previousSuccess) {
info(fmt::format("Read{0}: first success\n", tenantDebugString(tenantId)));
info(fmt::format("Read {0}: first success\n", debugTenantStr(tenantId)));
setReadSuccess(tenantId);
} else {
debugOp("Read", begin, end, tenantId, "complete");
@ -289,20 +286,19 @@ private:
}
// TODO: tenant support
void randomGetBlobRangesOp(TTaskFct cont) {
void randomGetBlobRangesOp(TTaskFct cont, std::optional<int> tenantId) {
fdb::Key begin = randomKeyName();
fdb::Key end = randomKeyName();
auto results = std::make_shared<std::vector<fdb::KeyRange>>();
if (begin > end) {
std::swap(begin, end);
}
std::optional<int> tenantId = {};
debugOp("GetBlobRanges", begin, end, tenantId, "starting");
execOperation(
[begin, end, results](auto ctx) {
fdb::Future f = ctx->db().listBlobbifiedRanges(begin, end, 1000).eraseType();
fdb::Future f = ctx->dbOps()->listBlobbifiedRanges(begin, end, 1000).eraseType();
ctx->continueAfter(f, [ctx, f, results]() {
*results = copyKeyRangeArray(f.get<fdb::future_var::KeyRangeRefArray>());
ctx->done();
@ -314,25 +310,24 @@ private:
this->validateRanges(results, begin, end, seenReadSuccess(tenantId));
schedule(cont);
},
getTenant(tenantId),
/* failOnError = */ false);
}
// TODO: tenant support
void randomVerifyOp(TTaskFct cont) {
void randomVerifyOp(TTaskFct cont, std::optional<int> tenantId) {
fdb::Key begin = randomKeyName();
fdb::Key end = randomKeyName();
std::optional<int> tenantId;
if (begin > end) {
std::swap(begin, end);
}
auto verifyVersion = std::make_shared<int64_t>(false);
debugOp("Verify", begin, end, tenantId, "starting");
auto verifyVersion = std::make_shared<int64_t>(-1);
execOperation(
[begin, end, verifyVersion](auto ctx) {
fdb::Future f = ctx->db().verifyBlobRange(begin, end, -2 /* latest version*/).eraseType();
fdb::Future f = ctx->dbOps()->verifyBlobRange(begin, end, -2 /* latest version*/).eraseType();
ctx->continueAfter(f, [ctx, verifyVersion, f]() {
*verifyVersion = f.get<fdb::future_var::Int64>();
ctx->done();
@ -344,15 +339,16 @@ private:
if (*verifyVersion == -1) {
ASSERT(!previousSuccess);
} else if (!previousSuccess) {
info(fmt::format("Verify{0}: first success\n", tenantDebugString(tenantId)));
info(fmt::format("Verify {0}: first success\n", debugTenantStr(tenantId)));
setReadSuccess(tenantId);
}
schedule(cont);
},
getTenant(tenantId),
/* failOnError = */ false);
}
void randomOperation(TTaskFct cont) {
void randomOperation(TTaskFct cont) override {
std::optional<int> tenantId = randomTenant();
OpType txType = (stores[tenantId].size() == 0) ? OP_INSERT : (OpType)Random::get().randomInt(0, OP_LAST);
@ -380,10 +376,10 @@ private:
randomSummarizeOp(cont, tenantId);
break;
case OP_GET_BLOB_RANGES:
randomGetBlobRangesOp(cont);
randomGetBlobRangesOp(cont, tenantId);
break;
case OP_VERIFY:
randomVerifyOp(cont);
randomVerifyOp(cont, tenantId);
break;
}
}


@ -47,6 +47,8 @@ private:
OP_LAST = OP_CANCEL_PURGE
};
void setup(TTaskFct cont) override { setupBlobGranules(cont); }
// could add summarize too old and verify too old as ops if desired but those are lower value
// Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet


@ -91,13 +91,15 @@ public:
fdbDb = executor->selectDatabase();
}
if (tenantName) {
fdbTenant = fdbDb.openTenant(*tenantName);
fdbDbOps = std::make_shared<fdb::Tenant>(fdbTenant);
} else {
fdbDbOps = std::make_shared<fdb::Database>(fdbDb);
}
if (transactional) {
if (tenantName) {
fdb::Tenant tenant = fdbDb.openTenant(*tenantName);
fdbTx = tenant.createTransaction();
} else {
fdbTx = fdbDb.createTransaction();
}
fdbTx = fdbDbOps->createTransaction();
}
}
@ -109,6 +111,10 @@ public:
fdb::Database db() override { return fdbDb.atomic_load(); }
fdb::Tenant tenant() override { return fdbTenant.atomic_load(); }
std::shared_ptr<fdb::IDatabaseOps> dbOps() override { return std::atomic_load(&fdbDbOps); }
fdb::Transaction tx() override { return fdbTx.atomic_load(); }
// Set a continuation to be executed when a future gets ready
@ -272,13 +278,17 @@ protected:
scheduler->schedule([thisRef]() {
fdb::Database db = thisRef->executor->selectDatabase();
thisRef->fdbDb.atomic_store(db);
if (thisRef->tenantName) {
fdb::Tenant tenant = db.openTenant(*thisRef->tenantName);
thisRef->fdbTenant.atomic_store(tenant);
std::atomic_store(&thisRef->fdbDbOps,
std::dynamic_pointer_cast<fdb::IDatabaseOps>(std::make_shared<fdb::Tenant>(tenant)));
} else {
std::atomic_store(&thisRef->fdbDbOps,
std::dynamic_pointer_cast<fdb::IDatabaseOps>(std::make_shared<fdb::Database>(db)));
}
if (thisRef->transactional) {
if (thisRef->tenantName) {
fdb::Tenant tenant = db.openTenant(*thisRef->tenantName);
thisRef->fdbTx.atomic_store(tenant.createTransaction());
} else {
thisRef->fdbTx.atomic_store(db.createTransaction());
}
thisRef->fdbTx.atomic_store(thisRef->fdbDbOps->createTransaction());
}
thisRef->restartTransaction();
});
@ -317,6 +327,14 @@ protected:
// Provides a thread safe interface by itself (no need for mutex)
fdb::Database fdbDb;
// FDB tenant
// Provides a thread safe interface by itself (no need for mutex)
fdb::Tenant fdbTenant;
// FDB IDatabaseOps to hide database/tenant accordingly.
// Provides a shared pointer to database functions based on if db or tenant.
std::shared_ptr<fdb::IDatabaseOps> fdbDbOps;
// FDB transaction
// Provides a thread safe interface by itself (no need for mutex)
fdb::Transaction fdbTx;


@ -41,6 +41,12 @@ public:
// Current FDB database
virtual fdb::Database db() = 0;
// Current FDB tenant
virtual fdb::Tenant tenant() = 0;
// Current FDB IDatabaseOps
virtual std::shared_ptr<fdb::IDatabaseOps> dbOps() = 0;
// Current FDB transaction
virtual fdb::Transaction tx() = 0;


@ -117,8 +117,11 @@ void WorkloadBase::execTransaction(TOpStartFct startFct,
}
// Execute a non-transactional database operation within the workload
void WorkloadBase::execOperation(TOpStartFct startFct, TTaskFct cont, bool failOnError) {
doExecute(startFct, cont, {}, failOnError, false);
void WorkloadBase::execOperation(TOpStartFct startFct,
TTaskFct cont,
std::optional<fdb::BytesRef> tenant,
bool failOnError) {
doExecute(startFct, cont, tenant, failOnError, false);
}
void WorkloadBase::doExecute(TOpStartFct startFct,


@ -125,7 +125,10 @@ protected:
bool failOnError = true);
// Execute a non-transactional database operation within the workload
void execOperation(TOpStartFct startFct, TTaskFct cont, bool failOnError = true);
void execOperation(TOpStartFct startFct,
TTaskFct cont,
std::optional<fdb::BytesRef> tenant = std::optional<fdb::BytesRef>(),
bool failOnError = true);
// Log an error message, increase error counter
void error(const std::string& msg);


@ -677,7 +677,28 @@ public:
}
};
class Tenant final {
// Handle this as an abstract class instead of an interface to preserve the lifetime of fdb objects owned by
// Tenant and Database.
class IDatabaseOps {
public:
virtual ~IDatabaseOps() = default;
virtual Transaction createTransaction() = 0;
virtual TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) = 0;
virtual TypedFuture<future_var::Bool> unblobbifyRange(KeyRef begin, KeyRef end) = 0;
virtual TypedFuture<future_var::KeyRangeRefArray> listBlobbifiedRanges(KeyRef begin,
KeyRef end,
int rangeLimit) = 0;
virtual TypedFuture<future_var::Int64> verifyBlobRange(KeyRef begin, KeyRef end, int64_t version) = 0;
virtual TypedFuture<future_var::KeyRef> purgeBlobGranules(KeyRef begin,
KeyRef end,
int64_t version,
bool force) = 0;
virtual TypedFuture<future_var::None> waitPurgeGranulesComplete(KeyRef purgeKey) = 0;
};
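Because Tenant and Database both implement IDatabaseOps, callers can hold one std::shared_ptr<fdb::IDatabaseOps> and stay agnostic about tenancy; that is how the tester's fdbDbOps member is wired elsewhere in this commit. A hedged sketch of the selection logic:

    // Sketch only; mirrors the fdbDbOps setup in this commit.
    std::shared_ptr<fdb::IDatabaseOps> makeDbOps(fdb::Database db,
                                                 std::optional<fdb::BytesRef> tenantName) {
        if (tenantName)
            return std::make_shared<fdb::Tenant>(db.openTenant(*tenantName));
        return std::make_shared<fdb::Database>(db);
    }

Calls such as blobbifyRange() or verifyBlobRange() then dispatch to the tenant- or database-scoped native function automatically.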
class Tenant final : public IDatabaseOps {
friend class Database;
std::shared_ptr<native::FDBTenant> tenant;
@ -694,6 +715,14 @@ public:
Tenant& operator=(const Tenant&) noexcept = default;
Tenant() noexcept : tenant(nullptr) {}
void atomic_store(Tenant other) { std::atomic_store(&tenant, other.tenant); }
Tenant atomic_load() {
Tenant retVal;
retVal.tenant = std::atomic_load(&tenant);
return retVal;
}
static void createTenant(Transaction tr, BytesRef name) {
tr.setOption(FDBTransactionOption::FDB_TR_OPTION_SPECIAL_KEY_SPACE_ENABLE_WRITES, BytesRef());
tr.setOption(FDBTransactionOption::FDB_TR_OPTION_LOCK_AWARE, BytesRef());
@ -715,7 +744,7 @@ public:
return tr.get(toBytesRef(fmt::format("{}{}", tenantManagementMapPrefix, toCharsRef(name))), false);
}
Transaction createTransaction() {
Transaction createTransaction() override {
auto tx_native = static_cast<native::FDBTransaction*>(nullptr);
auto err = Error(native::fdb_tenant_create_transaction(tenant.get(), &tx_native));
if (err)
@ -723,14 +752,49 @@ public:
return Transaction(tx_native);
}
TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) {
TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) override {
if (!tenant)
throw std::runtime_error("blobbifyRange from null tenant");
throw std::runtime_error("blobbifyRange() from null tenant");
return native::fdb_tenant_blobbify_range(tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end));
}
TypedFuture<future_var::Bool> unblobbifyRange(KeyRef begin, KeyRef end) override {
if (!tenant)
throw std::runtime_error("unblobbifyRange() from null tenant");
return native::fdb_tenant_unblobbify_range(
tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end));
}
TypedFuture<future_var::KeyRangeRefArray> listBlobbifiedRanges(KeyRef begin, KeyRef end, int rangeLimit) override {
if (!tenant)
throw std::runtime_error("listBlobbifiedRanges() from null tenant");
return native::fdb_tenant_list_blobbified_ranges(
tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end), rangeLimit);
}
TypedFuture<future_var::Int64> verifyBlobRange(KeyRef begin, KeyRef end, int64_t version) override {
if (!tenant)
throw std::runtime_error("verifyBlobRange() from null tenant");
return native::fdb_tenant_verify_blob_range(
tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end), version);
}
TypedFuture<future_var::KeyRef> purgeBlobGranules(KeyRef begin, KeyRef end, int64_t version, bool force) override {
if (!tenant)
throw std::runtime_error("purgeBlobGranules() from null tenant");
native::fdb_bool_t forceBool = force;
return native::fdb_tenant_purge_blob_granules(
tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end), version, forceBool);
}
TypedFuture<future_var::None> waitPurgeGranulesComplete(KeyRef purgeKey) override {
if (!tenant)
throw std::runtime_error("waitPurgeGranulesComplete() from null tenant");
return native::fdb_tenant_wait_purge_granules_complete(tenant.get(), purgeKey.data(), intSize(purgeKey));
}
};
class Database {
class Database : public IDatabaseOps {
friend class Tenant;
std::shared_ptr<native::FDBDatabase> db;
@ -789,7 +853,7 @@ public:
return Tenant(tenant_native);
}
Transaction createTransaction() {
Transaction createTransaction() override {
if (!db)
throw std::runtime_error("create_transaction from null database");
auto tx_native = static_cast<native::FDBTransaction*>(nullptr);
@ -799,33 +863,33 @@ public:
return Transaction(tx_native);
}
TypedFuture<future_var::KeyRangeRefArray> listBlobbifiedRanges(KeyRef begin, KeyRef end, int rangeLimit) {
TypedFuture<future_var::KeyRangeRefArray> listBlobbifiedRanges(KeyRef begin, KeyRef end, int rangeLimit) override {
if (!db)
throw std::runtime_error("listBlobbifiedRanges from null database");
return native::fdb_database_list_blobbified_ranges(
db.get(), begin.data(), intSize(begin), end.data(), intSize(end), rangeLimit);
}
TypedFuture<future_var::Int64> verifyBlobRange(KeyRef begin, KeyRef end, int64_t version) {
TypedFuture<future_var::Int64> verifyBlobRange(KeyRef begin, KeyRef end, int64_t version) override {
if (!db)
throw std::runtime_error("verifyBlobRange from null database");
return native::fdb_database_verify_blob_range(
db.get(), begin.data(), intSize(begin), end.data(), intSize(end), version);
}
TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) {
TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) override {
if (!db)
throw std::runtime_error("blobbifyRange from null database");
return native::fdb_database_blobbify_range(db.get(), begin.data(), intSize(begin), end.data(), intSize(end));
}
TypedFuture<future_var::Bool> unblobbifyRange(KeyRef begin, KeyRef end) {
TypedFuture<future_var::Bool> unblobbifyRange(KeyRef begin, KeyRef end) override {
if (!db)
throw std::runtime_error("unblobbifyRange from null database");
return native::fdb_database_unblobbify_range(db.get(), begin.data(), intSize(begin), end.data(), intSize(end));
}
TypedFuture<future_var::KeyRef> purgeBlobGranules(KeyRef begin, KeyRef end, int64_t version, bool force) {
TypedFuture<future_var::KeyRef> purgeBlobGranules(KeyRef begin, KeyRef end, int64_t version, bool force) override {
if (!db)
throw std::runtime_error("purgeBlobGranules from null database");
native::fdb_bool_t forceBool = force;
@ -833,7 +897,7 @@ public:
db.get(), begin.data(), intSize(begin), end.data(), intSize(end), version, forceBool);
}
TypedFuture<future_var::None> waitPurgeGranulesComplete(KeyRef purgeKey) {
TypedFuture<future_var::None> waitPurgeGranulesComplete(KeyRef purgeKey) override {
if (!db)
throw std::runtime_error("purgeBlobGranules from null database");
return native::fdb_database_wait_purge_granules_complete(db.get(), purgeKey.data(), intSize(purgeKey));


@ -497,6 +497,11 @@ func (o TransactionOptions) SetRawAccess() error {
return o.setOpt(303, nil)
}
// Allows this transaction to bypass storage quota enforcement. Should only be used for transactions that directly or indirectly decrease the size of the tenant group's data.
func (o TransactionOptions) SetBypassStorageQuota() error {
return o.setOpt(304, nil)
}
// Not yet implemented.
func (o TransactionOptions) SetDebugRetryLogging(param string) error {
return o.setOpt(401, []byte(param))


@ -54,7 +54,7 @@ def write_coverage_chunk(tr, path: Tuple[str, ...], metadata: Tuple[str, ...],
initialized = v.present()
for cov, covered in coverage:
if not initialized or covered:
tr.add(cov_dir.pack((cov.file, cov.line, cov.comment)), struct.pack('<I', 1 if covered else 0))
tr.add(cov_dir.pack((cov.file, cov.line, cov.comment, cov.rare)), struct.pack('<I', 1 if covered else 0))
return initialized
@ -80,9 +80,9 @@ def _read_coverage(tr, cov_path: Tuple[str, ...]) -> OrderedDict[Coverage, int]:
res = collections.OrderedDict()
cov_dir = fdb.directory.create_or_open(tr, cov_path)
for k, v in tr[cov_dir.range()]:
file, line, comment = cov_dir.unpack(k)
file, line, comment, rare = cov_dir.unpack(k)
count = struct.unpack('<I', v)[0]
res[Coverage(file, line, comment)] = count
res[Coverage(file, line, comment, rare)] = count
return res


@ -19,6 +19,7 @@ class GlobalStatistics:
self.total_cpu_time: int = 0
self.total_test_runs: int = 0
self.total_missed_probes: int = 0
self.total_missed_nonrare_probes: int = 0
class EnsembleResults:
@ -40,6 +41,8 @@ class EnsembleResults:
self.coverage.append((cov, count))
if count <= self.ratio:
self.global_statistics.total_missed_probes += 1
if not cov.rare:
self.global_statistics.total_missed_nonrare_probes += 1
if self.min_coverage_hit is None or self.min_coverage_hit > count:
self.min_coverage_hit = count
self.coverage.sort(key=lambda x: (x[1], x[0].file, x[0].line))
@ -63,9 +66,12 @@ class EnsembleResults:
out.attributes['MinProbeHit'] = str(self.min_coverage_hit)
out.attributes['TotalProbes'] = str(len(self.coverage))
out.attributes['MissedProbes'] = str(self.global_statistics.total_missed_probes)
out.attributes['MissedNonRareProbes'] = str(self.global_statistics.total_missed_nonrare_probes)
for cov, count in self.coverage:
severity = 10 if count > self.ratio else 40
severity = 10
if count <= self.ratio:
severity = 30 if cov.rare else 40
if severity == 40:
errors += 1
if (severity == 40 and errors <= config.max_errors) or config.details:
@ -75,6 +81,7 @@ class EnsembleResults:
child.attributes['Line'] = str(cov.line)
child.attributes['Comment'] = '' if cov.comment is None else cov.comment
child.attributes['HitCount'] = str(count)
child.attributes['Rare'] = str(cov.rare)
out.append(child)
if config.details:


@ -193,16 +193,17 @@ class JsonParser(Parser):
class Coverage:
def __init__(self, file: str, line: str | int, comment: str | None = None):
def __init__(self, file: str, line: str | int, comment: str | None = None, rare: bool = False):
self.file = file
self.line = int(line)
self.comment = comment
self.rare = rare
def to_tuple(self) -> Tuple[str, int, str | None, bool]:
return self.file, self.line, self.comment
return self.file, self.line, self.comment, self.rare
def __eq__(self, other) -> bool:
if isinstance(other, tuple) and len(other) == 3:
if isinstance(other, tuple) and len(other) == 4:
return self.to_tuple() == other
elif isinstance(other, Coverage):
return self.to_tuple() == other.to_tuple()
@ -210,7 +211,7 @@ class Coverage:
return False
def __lt__(self, other) -> bool:
if isinstance(other, tuple) and len(other) == 3:
if isinstance(other, tuple) and len(other) == 4:
return self.to_tuple() < other
elif isinstance(other, Coverage):
return self.to_tuple() < other.to_tuple()
@ -218,7 +219,7 @@ class Coverage:
return False
def __le__(self, other) -> bool:
if isinstance(other, tuple) and len(other) == 3:
if isinstance(other, tuple) and len(other) == 4:
return self.to_tuple() <= other
elif isinstance(other, Coverage):
return self.to_tuple() <= other.to_tuple()
@ -226,7 +227,7 @@ class Coverage:
return False
def __gt__(self, other: Coverage) -> bool:
if isinstance(other, tuple) and len(other) == 3:
if isinstance(other, tuple) and len(other) == 4:
return self.to_tuple() > other
elif isinstance(other, Coverage):
return self.to_tuple() > other.to_tuple()
@ -234,7 +235,7 @@ class Coverage:
return False
def __ge__(self, other):
if isinstance(other, tuple) and len(other) == 3:
if isinstance(other, tuple) and len(other) == 4:
return self.to_tuple() >= other
elif isinstance(other, Coverage):
return self.to_tuple() >= other.to_tuple()
@ -242,7 +243,7 @@ class Coverage:
return False
def __hash__(self):
return hash((self.file, self.line, self.comment))
return hash((self.file, self.line, self.comment, self.rare))
class TraceFiles:
@ -378,6 +379,7 @@ class Summary:
child = SummaryTree('CodeCoverage')
child.attributes['File'] = k.file
child.attributes['Line'] = str(k.line)
child.attributes['Rare'] = str(k.rare)
if not v:
child.attributes['Covered'] = '0'
if k.comment is not None and len(k.comment):
@ -595,7 +597,10 @@ class Summary:
comment = ''
if 'Comment' in attrs:
comment = attrs['Comment']
c = Coverage(attrs['File'], attrs['Line'], comment)
rare = False
if 'Rare' in attrs:
rare = bool(int(attrs['Rare']))
c = Coverage(attrs['File'], attrs['Line'], comment, rare)
if covered or c not in self.coverage:
self.coverage[c] = covered


@ -116,12 +116,12 @@ If an individual zone is unhealthy, it may cause the throttling ratio for storag
### Client Rate Calculation
The smoothed per-client rate for each tag is tracked within `GlobalTagThrottlerImpl::PerTagStatistics`. Once a target rate has been computed, this is passed to `GlobalTagThrottlerImpl::PerTagStatistics::updateAndGetPerClientRate` which adjusts the per-client rate. The per-client rate is meant to limit the busiest clients, so that at equilibrium, the per-client rate will remain constant and the sum of throughput from all clients will match the target rate.
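As a rough, hypothetical illustration: with a target rate of 1000 cost units per second and three clients demanding 100, 300, and 900 units per second, a per-client limit near 600 leaves the two lighter clients unthrottled while capping the heaviest, so aggregate throughput is 100 + 300 + 600 = 1000; the limit then holds steady until client demand shifts.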
## Testing
The `GlobalTagThrottling.toml` test provides a simple end-to-end test using the global tag throttler. Quotas are set using the internal tag quota API in the `GlobalTagThrottling` workload. This is run in parallel with the `ReadWrite` workload, which tags transactions. The number of `transaction_tag_throttled` errors is reported, along with the throughput, which should be roughly predictable based on the quota parameters chosen.
## Simulation Testing
The `ThroughputQuota.toml` test provides a simple end-to-end test using the global tag throttler. Quotas are set using the internal tag quota API in the `ThroughputQuota` workload. This is run with the `Cycle` workload, which randomly tags transactions.
In addition to this end-to-end test, there is a suite of unit tests with the `/GlobalTagThrottler/` prefix. These tests run in a mock environment, with mock storage servers providing simulated storage queue statistics and tag busyness reports. Mock clients simulate workload on these mock storage servers, and get throttling feedback directly from a global tag throttler which is monitoring the mock storage servers.
In each test, the `GlobalTagThrottlerTesting::monitor` function is used to periodically check whether or not a desired equilibrium state has been reached. If the desired state is reached and maintained for a sufficient period of time, the test passes. If the unit test is unable to reach this desired equilibrium state before a timeout, the test will fail. Commonly, the desired state is for the global tag throttler to report a client rate sufficiently close to the desired rate specified as an input to the `GlobalTagThrottlerTesting::rateIsNear` function.
In each unit test, the `GlobalTagThrottlerTesting::monitor` function is used to periodically check whether or not a desired equilibrium state has been reached. If the desired state is reached and maintained for a sufficient period of time, the test passes. If the unit test is unable to reach this desired equilibrium state before a timeout, the test will fail. Commonly, the desired state is for the global tag throttler to report a client rate sufficiently close to the desired rate specified as an input to the `GlobalTagThrottlerTesting::rateIsNear` function.
## Visibility


@ -107,9 +107,9 @@ struct ConvertParams {
bool log_enabled = false;
std::string log_dir, trace_format, trace_log_group;
bool isValid() { return begin != invalidVersion && end != invalidVersion && !container_url.empty(); }
bool isValid() const { return begin != invalidVersion && end != invalidVersion && !container_url.empty(); }
std::string toString() {
std::string toString() const {
std::string s;
s.append("ContainerURL:");
s.append(container_url);


@ -19,11 +19,13 @@
*/
#include "fdbcli/fdbcli.actor.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/SystemData.h"
#include "flow/actorcompiler.h" // This must be the last include
namespace {
enum class LimitType { RESERVED, TOTAL };
enum class QuotaType { RESERVED, TOTAL, STORAGE };
Optional<TransactionTag> parseTag(StringRef token) {
if (token.size() > CLIENT_KNOBS->MAX_TRANSACTION_TAG_LENGTH) {
@ -33,17 +35,19 @@ Optional<TransactionTag> parseTag(StringRef token) {
}
}
Optional<LimitType> parseLimitType(StringRef token) {
Optional<QuotaType> parseQuotaType(StringRef token) {
if (token == "reserved_throughput"_sr) {
return LimitType::RESERVED;
return QuotaType::RESERVED;
} else if (token == "total_throughput"_sr) {
return LimitType::TOTAL;
return QuotaType::TOTAL;
} else if (token == "storage"_sr) {
return QuotaType::STORAGE;
} else {
return {};
}
}
Optional<int64_t> parseLimitValue(StringRef token) {
Optional<int64_t> parseQuotaValue(StringRef token) {
try {
return std::stol(token.toString());
} catch (...) {
@ -51,20 +55,26 @@ Optional<int64_t> parseLimitValue(StringRef token) {
}
}
ACTOR Future<Void> getQuota(Reference<IDatabase> db, TransactionTag tag, LimitType limitType) {
ACTOR Future<Void> getQuota(Reference<IDatabase> db, TransactionTag tag, QuotaType quotaType) {
state Reference<ITransaction> tr = db->createTransaction();
loop {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
try {
state ThreadFuture<Optional<Value>> resultFuture = tr->get(ThrottleApi::getTagQuotaKey(tag));
state ThreadFuture<Optional<Value>> resultFuture =
tr->get(quotaType == QuotaType::STORAGE ? storageQuotaKey(tag) : ThrottleApi::getTagQuotaKey(tag));
Optional<Value> v = wait(safeThreadFutureToFuture(resultFuture));
if (!v.present()) {
fmt::print("<empty>\n");
} else {
if (quotaType == QuotaType::STORAGE) {
int64_t storageQuota = BinaryReader::fromStringRef<int64_t>(v.get(), Unversioned());
fmt::print("{}\n", storageQuota);
return Void();
}
auto const quota = ThrottleApi::TagQuotaValue::fromValue(v.get());
if (limitType == LimitType::TOTAL) {
if (quotaType == QuotaType::TOTAL) {
fmt::print("{}\n", quota.totalQuota);
} else if (limitType == LimitType::RESERVED) {
} else if (quotaType == QuotaType::RESERVED) {
fmt::print("{}\n", quota.reservedQuota);
}
}
@ -75,32 +85,36 @@ ACTOR Future<Void> getQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
}
}
ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, LimitType limitType, int64_t value) {
ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, QuotaType quotaType, int64_t value) {
state Reference<ITransaction> tr = db->createTransaction();
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
state ThreadFuture<Optional<Value>> resultFuture = tr->get(ThrottleApi::getTagQuotaKey(tag));
Optional<Value> v = wait(safeThreadFutureToFuture(resultFuture));
ThrottleApi::TagQuotaValue quota;
if (v.present()) {
quota = ThrottleApi::TagQuotaValue::fromValue(v.get());
if (quotaType == QuotaType::STORAGE) {
tr->set(storageQuotaKey(tag), BinaryWriter::toValue<int64_t>(value, Unversioned()));
} else {
state ThreadFuture<Optional<Value>> resultFuture = tr->get(ThrottleApi::getTagQuotaKey(tag));
Optional<Value> v = wait(safeThreadFutureToFuture(resultFuture));
ThrottleApi::TagQuotaValue quota;
if (v.present()) {
quota = ThrottleApi::TagQuotaValue::fromValue(v.get());
}
// Internally, costs are stored in terms of pages, but in the API,
// costs are specified in terms of bytes
if (quotaType == QuotaType::TOTAL) {
// Round up to nearest page size
quota.totalQuota = ((value - 1) / CLIENT_KNOBS->TAG_THROTTLING_PAGE_SIZE + 1) *
CLIENT_KNOBS->TAG_THROTTLING_PAGE_SIZE;
} else if (quotaType == QuotaType::RESERVED) {
// Round up to nearest page size
quota.reservedQuota = ((value - 1) / CLIENT_KNOBS->TAG_THROTTLING_PAGE_SIZE + 1) *
CLIENT_KNOBS->TAG_THROTTLING_PAGE_SIZE;
}
if (!quota.isValid()) {
throw invalid_throttle_quota_value();
}
ThrottleApi::setTagQuota(tr, tag, quota.reservedQuota, quota.totalQuota);
}
// Internally, costs are stored in terms of pages, but in the API,
// costs are specified in terms of bytes
if (limitType == LimitType::TOTAL) {
// Round up to nearest page size
quota.totalQuota =
((value - 1) / CLIENT_KNOBS->TAG_THROTTLING_PAGE_SIZE + 1) * CLIENT_KNOBS->TAG_THROTTLING_PAGE_SIZE;
} else if (limitType == LimitType::RESERVED) {
// Round up to nearest page size
quota.reservedQuota =
((value - 1) / CLIENT_KNOBS->TAG_THROTTLING_PAGE_SIZE + 1) * CLIENT_KNOBS->TAG_THROTTLING_PAGE_SIZE;
}
if (!quota.isValid()) {
throw invalid_throttle_quota_value();
}
ThrottleApi::setTagQuota(tr, tag, quota.reservedQuota, quota.totalQuota);
wait(safeThreadFutureToFuture(tr->commit()));
fmt::print("Successfully updated quota.\n");
return Void();
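The rounding expression ((value - 1) / TAG_THROTTLING_PAGE_SIZE + 1) * TAG_THROTTLING_PAGE_SIZE is the usual integer ceiling-to-a-multiple trick. With a hypothetical page size of 4096, a requested quota of 10000 bytes is stored as ((10000 - 1) / 4096 + 1) * 4096 = 12288, while an exact multiple such as 8192 maps to itself.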
@ -115,6 +129,7 @@ ACTOR Future<Void> clearQuota(Reference<IDatabase> db, TransactionTag tag) {
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
tr->clear(storageQuotaKey(tag));
tr->clear(ThrottleApi::getTagQuotaKey(tag));
wait(safeThreadFutureToFuture(tr->commit()));
fmt::print("Successfully cleared quota.\n");
@ -125,8 +140,8 @@ ACTOR Future<Void> clearQuota(Reference<IDatabase> db, TransactionTag tag) {
}
}
constexpr auto usage = "quota [get <tag> [reserved_throughput|total_throughput] | set <tag> "
"[reserved_throughput|total_throughput] <value> | clear <tag>]";
constexpr auto usage = "quota [get <tag> [reserved_throughput|total_throughput|storage] | set <tag> "
"[reserved_throughput|total_throughput|storage] <value> | clear <tag>]";
bool exitFailure() {
fmt::print(usage);
@ -150,22 +165,22 @@ ACTOR Future<bool> quotaCommandActor(Reference<IDatabase> db, std::vector<String
if (tokens.size() != 4) {
return exitFailure();
}
auto const limitType = parseLimitType(tokens[3]);
if (!limitType.present()) {
auto const quotaType = parseQuotaType(tokens[3]);
if (!quotaType.present()) {
return exitFailure();
}
wait(getQuota(db, tag.get(), limitType.get()));
wait(getQuota(db, tag.get(), quotaType.get()));
return true;
} else if (tokens[1] == "set"_sr) {
if (tokens.size() != 5) {
return exitFailure();
}
auto const limitType = parseLimitType(tokens[3]);
auto const limitValue = parseLimitValue(tokens[4]);
if (!limitType.present() || !limitValue.present()) {
auto const quotaType = parseQuotaType(tokens[3]);
auto const quotaValue = parseQuotaValue(tokens[4]);
if (!quotaType.present() || !quotaValue.present()) {
return exitFailure();
}
wait(setQuota(db, tag.get(), limitType.get(), limitValue.get()));
wait(setQuota(db, tag.get(), quotaType.get(), quotaValue.get()));
return true;
} else if (tokens[1] == "clear"_sr) {
if (tokens.size() != 3) {

View File

@ -137,6 +137,11 @@ def quota(logger):
logger.debug(command + ' : ' + output)
assert output == 'Successfully updated quota.'
command = 'quota set green storage 98765'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == 'Successfully updated quota.'
command = 'quota get green total_throughput'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
@ -147,6 +152,11 @@ def quota(logger):
logger.debug(command + ' : ' + output)
assert output == '16384'
command = 'quota get green storage'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == '98765'
command = 'quota clear green'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
@ -157,6 +167,11 @@ def quota(logger):
logger.debug(command + ' : ' + output)
assert output == '<empty>'
command = 'quota get green storage'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == '<empty>'
# Too few arguments, should log help message
command = 'quota get green'
output = run_fdbcli_command(command)

View File

@ -971,6 +971,11 @@ void sortDeltasByKey(const Standalone<GranuleDeltas>& deltasByVersion,
// clearVersion as previous guy)
}
void sortDeltasByKey(const Standalone<GranuleDeltas>& deltasByVersion, const KeyRangeRef& fileRange) {
SortedDeltasT deltasByKey;
sortDeltasByKey(deltasByVersion, fileRange, deltasByKey);
}
// FIXME: Could maybe reduce duplicated code between this and chunkedSnapshot for chunking
Value serializeChunkedDeltaFile(const Standalone<StringRef>& fileNameRef,
const Standalone<GranuleDeltas>& deltas,


@ -5924,7 +5924,6 @@ public:
printf("Restoring backup to version: %lld\n", (long long)targetVersion);
}
state int retryCount = 0;
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
loop {
try {
@ -5948,17 +5947,9 @@ public:
wait(tr->commit());
break;
} catch (Error& e) {
if (e.code() == error_code_transaction_too_old) {
retryCount++;
}
if (e.code() == error_code_restore_duplicate_tag) {
throw;
}
if (g_network->isSimulated() && retryCount > 50) {
CODE_PROBE(true, "submitRestore simulation speedup");
// try to make the read window back to normal size (5 * version_per_sec)
g_simulator->speedUpSimulation = true;
}
wait(tr->onError(e));
}
}


@ -2145,6 +2145,9 @@ ACTOR Future<Void> lockDatabase(Reference<ReadYourWritesTransaction> tr, UID id)
ACTOR Future<Void> lockDatabase(Database cx, UID id) {
state Transaction tr(cx);
UID debugID = deterministicRandom()->randomUniqueID();
TraceEvent("LockDatabaseTransaction", debugID).log();
tr.debugTransaction(debugID);
loop {
try {
wait(lockDatabase(&tr, id));


@ -965,7 +965,8 @@ ACTOR Future<MonitorLeaderInfo> monitorProxiesOneGeneration(
allConnectionsFailed = false;
} else {
CODE_PROBE(rep.getError().code() == error_code_failed_to_progress,
"Coordinator cant talk to cluster controller");
"Coordinator cant talk to cluster controller",
probe::decoration::rare);
TraceEvent("MonitorProxiesConnectFailed")
.detail("Error", rep.getError().name())
.detail("Coordinator", clientLeaderServer.getAddressString());


@ -2175,7 +2175,7 @@ void DatabaseContext::removeWatch() {
ASSERT(outstandingWatches >= 0);
}
Future<Void> DatabaseContext::onConnected() {
Future<Void> DatabaseContext::onConnected() const {
return connected;
}
@ -2802,26 +2802,26 @@ void GetRangeLimits::decrement(MappedKeyValueRef const& data) {
}
// True if either the row or byte limit has been reached
bool GetRangeLimits::isReached() {
bool GetRangeLimits::isReached() const {
return rows == 0 || (bytes == 0 && minRows == 0);
}
// True if data would cause the row or byte limit to be reached
bool GetRangeLimits::reachedBy(VectorRef<KeyValueRef> const& data) {
bool GetRangeLimits::reachedBy(VectorRef<KeyValueRef> const& data) const {
return (rows != GetRangeLimits::ROW_LIMIT_UNLIMITED && data.size() >= rows) ||
(bytes != GetRangeLimits::BYTE_LIMIT_UNLIMITED &&
(int)data.expectedSize() + (8 - (int)sizeof(KeyValueRef)) * data.size() >= bytes && data.size() >= minRows);
}
bool GetRangeLimits::hasByteLimit() {
bool GetRangeLimits::hasByteLimit() const {
return bytes != GetRangeLimits::BYTE_LIMIT_UNLIMITED;
}
bool GetRangeLimits::hasRowLimit() {
bool GetRangeLimits::hasRowLimit() const {
return rows != GetRangeLimits::ROW_LIMIT_UNLIMITED;
}
bool GetRangeLimits::hasSatisfiedMinRows() {
bool GetRangeLimits::hasSatisfiedMinRows() const {
return hasByteLimit() && minRows == 0;
}
@ -4771,7 +4771,8 @@ static Future<Void> tssStreamComparison(Request request,
TSS_traceMismatch(mismatchEvent, request, ssReply.get(), tssReply.get());
CODE_PROBE(FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL,
"Tracing Full TSS Mismatch in stream comparison");
"Tracing Full TSS Mismatch in stream comparison",
probe::decoration::rare);
CODE_PROBE(!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL,
"Tracing Partial TSS Mismatch in stream comparison and storing the rest in FDB");
@ -4813,7 +4814,7 @@ maybeDuplicateTSSStreamFragment(Request& req, QueueModel* model, RequestStream<R
Optional<TSSEndpointData> tssData = model->getTssData(ssStream->getEndpoint().token.first());
if (tssData.present()) {
CODE_PROBE(true, "duplicating stream to TSS");
CODE_PROBE(true, "duplicating stream to TSS", probe::decoration::rare);
resetReply(req);
// FIXME: optimize to avoid creating new netNotifiedQueueWithAcknowledgements for each stream duplication
RequestStream<Request> tssRequestStream(tssData.get().endpoint);
@ -5952,6 +5953,7 @@ void TransactionOptions::clear() {
useGrvCache = false;
skipGrvCache = false;
rawAccess = false;
bypassStorageQuota = false;
}
TransactionOptions::TransactionOptions() {
@ -6693,6 +6695,9 @@ Future<Void> Transaction::commitMutations() {
if (trState->options.firstInBatch) {
tr.flags = tr.flags | CommitTransactionRequest::FLAG_FIRST_IN_BATCH;
}
if (trState->options.bypassStorageQuota) {
tr.flags = tr.flags | CommitTransactionRequest::FLAG_BYPASS_STORAGE_QUOTA;
}
if (trState->options.reportConflictingKeys) {
tr.transaction.report_conflicting_keys = true;
}
@ -6971,6 +6976,10 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optional<Strin
trState->options.rawAccess = true;
break;
case FDBTransactionOptions::BYPASS_STORAGE_QUOTA:
trState->options.bypassStorageQuota = true;
break;
case FDBTransactionOptions::AUTHORIZATION_TOKEN:
if (value.present())
trState->authToken = Standalone<StringRef>(value.get());
@ -9406,7 +9415,8 @@ void handleTSSChangeFeedMismatch(const ChangeFeedStreamRequest& request,
mismatchEvent.detail("TSSVersion", tssVersion);
CODE_PROBE(FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL,
"Tracing Full TSS Feed Mismatch in stream comparison");
"Tracing Full TSS Feed Mismatch in stream comparison",
probe::decoration::rare);
CODE_PROBE(!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL,
"Tracing Partial TSS Feed Mismatch in stream comparison and storing the rest in FDB");


@ -1654,7 +1654,7 @@ Future<RangeResult> ReadYourWritesTransaction::getRange(KeySelector begin,
// This optimization prevents nullptr operations from being added to the conflict range
if (limits.isReached()) {
CODE_PROBE(true, "RYW range read limit 0", probe::decoration::rare);
CODE_PROBE(true, "RYW range read limit 0");
return RangeResult();
}
@ -1668,7 +1668,7 @@ Future<RangeResult> ReadYourWritesTransaction::getRange(KeySelector begin,
end.removeOrEqual(end.arena());
if (begin.offset >= end.offset && begin.getKey() >= end.getKey()) {
CODE_PROBE(true, "RYW range inverted", probe::decoration::rare);
CODE_PROBE(true, "RYW range inverted");
return RangeResult();
}
@ -1698,7 +1698,7 @@ Future<MappedRangeResult> ReadYourWritesTransaction::getMappedRange(KeySelector
if (getDatabase()->apiVersionAtLeast(630)) {
if (specialKeys.contains(begin.getKey()) && specialKeys.begin <= end.getKey() &&
end.getKey() <= specialKeys.end) {
CODE_PROBE(true, "Special key space get range (getMappedRange)");
CODE_PROBE(true, "Special key space get range (getMappedRange)", probe::decoration::rare);
throw client_invalid_operation(); // Not support special keys.
}
} else {
@ -1720,7 +1720,7 @@ Future<MappedRangeResult> ReadYourWritesTransaction::getMappedRange(KeySelector
// This optimization prevents nullptr operations from being added to the conflict range
if (limits.isReached()) {
CODE_PROBE(true, "RYW range read limit 0 (getMappedRange)");
CODE_PROBE(true, "RYW range read limit 0 (getMappedRange)", probe::decoration::rare);
return MappedRangeResult();
}
@ -1734,7 +1734,7 @@ Future<MappedRangeResult> ReadYourWritesTransaction::getMappedRange(KeySelector
end.removeOrEqual(end.arena());
if (begin.offset >= end.offset && begin.getKey() >= end.getKey()) {
CODE_PROBE(true, "RYW range inverted (getMappedRange)");
CODE_PROBE(true, "RYW range inverted (getMappedRange)", probe::decoration::rare);
return MappedRangeResult();
}


@ -821,10 +821,14 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( QUICK_GET_KEY_VALUES_LIMIT, 2000 );
init( QUICK_GET_KEY_VALUES_LIMIT_BYTES, 1e7 );
init( STORAGE_FEED_QUERY_HARD_LIMIT, 100000 );
// Read priority definitions in the form of a list of their relative concurrency share weights
init( STORAGESERVER_READ_PRIORITIES, "120,10,20,40,60" );
// The total concurrency which will be shared by active priorities according to their relative weights
init( STORAGE_SERVER_READ_CONCURRENCY, 70 );
// Priorities which each ReadType maps to, in enumeration order
init( STORAGESERVER_READ_RANKS, "0,2,1,1,1" );
init( STORAGESERVER_READ_PRIORITIES, "48,32,8" );
// The priority number which each ReadType maps to in enumeration order
// This exists for flexibility but assigning each ReadType to its own unique priority number makes the most sense
// The enumeration is currently: eager, fetch, low, normal, high
init( STORAGESERVER_READTYPE_PRIORITY_MAP, "0,1,2,3,4" );
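Both knobs are comma-separated integer lists: the first gives each priority's relative concurrency share, the second maps each ReadType (eager, fetch, low, normal, high, in enumeration order) to a priority number. A generic sketch (helper name hypothetical) of parsing such a knob:

    #include <sstream>
    #include <string>
    #include <vector>

    // Parse a knob such as "48,32,8" or "0,1,2,3,4" into its integer entries.
    std::vector<int> parseIntList(const std::string& knob) {
        std::vector<int> out;
        std::stringstream ss(knob);
        std::string tok;
        while (std::getline(ss, tok, ','))
            out.push_back(std::stoi(tok));
        return out;
    }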
//Wait Failure
init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS, 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2;
@ -948,7 +952,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( REDWOOD_HISTOGRAM_INTERVAL, 30.0 );
init( REDWOOD_EVICT_UPDATED_PAGES, true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; }
init( REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT, 2 ); if( randomize && BUGGIFY ) { REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT = deterministicRandom()->randomInt(1, 7); }
init( REDWOOD_PRIORITY_LAUNCHS, "32,32,32,32" );
init( REDWOOD_IO_PRIORITIES, "32,32,32,32" );
init( REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT, false );
// Server request latency measurement
@ -1022,6 +1026,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BLOB_MANIFEST_BACKUP_INTERVAL, isSimulated ? 5.0 : 30.0 );
init( BLOB_FULL_RESTORE_MODE, false );
init( BLOB_MIGRATOR_CHECK_INTERVAL, isSimulated ? 1.0 : 5.0);
init( BLOB_MANIFEST_RW_ROWS, isSimulated ? 10 : 1000);
init( BGCC_TIMEOUT, isSimulated ? 10.0 : 120.0 );
init( BGCC_MIN_INTERVAL, isSimulated ? 1.0 : 10.0 );


@ -56,4 +56,7 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
std::string randomBGFilename(UID blobWorkerID, UID granuleID, Version version, std::string suffix);
#endif
// For benchmark testing only. It should never be called in prod.
void sortDeltasByKey(const Standalone<GranuleDeltas>& deltasByVersion, const KeyRangeRef& fileRange);
#endif


@ -196,10 +196,11 @@ struct CommitID {
struct CommitTransactionRequest : TimedRequest {
constexpr static FileIdentifier file_identifier = 93948;
enum { FLAG_IS_LOCK_AWARE = 0x1, FLAG_FIRST_IN_BATCH = 0x2 };
enum { FLAG_IS_LOCK_AWARE = 0x1, FLAG_FIRST_IN_BATCH = 0x2, FLAG_BYPASS_STORAGE_QUOTA = 0x4 };
bool isLockAware() const { return (flags & FLAG_IS_LOCK_AWARE) != 0; }
bool firstInBatch() const { return (flags & FLAG_FIRST_IN_BATCH) != 0; }
bool bypassStorageQuota() const { return (flags & FLAG_BYPASS_STORAGE_QUOTA) != 0; }
Arena arena;
SpanContext spanContext;


@ -353,8 +353,9 @@ public:
int apiVersionAtLeast(int minVersion) const { return apiVersion.version() >= minVersion; }
Future<Void> onConnected(); // Returns after a majority of coordination servers are available and have reported a
// leader. The cluster file therefore is valid, but the database might be unavailable.
Future<Void> onConnected()
const; // Returns after a majority of coordination servers are available and have reported a
// leader. The cluster file therefore is valid, but the database might be unavailable.
Reference<IClusterConnectionRecord> getConnectionRecord();
// Switch the database to use the new connection file, and recreate all pending watches for committed transactions.


@ -706,15 +706,15 @@ struct GetRangeLimits {
void decrement(MappedKeyValueRef const& data);
// True if either the row or byte limit has been reached
bool isReached();
bool isReached() const;
// True if data would cause the row or byte limit to be reached
bool reachedBy(VectorRef<KeyValueRef> const& data);
bool reachedBy(VectorRef<KeyValueRef> const& data) const;
bool hasByteLimit();
bool hasRowLimit();
bool hasByteLimit() const;
bool hasRowLimit() const;
bool hasSatisfiedMinRows();
bool hasSatisfiedMinRows() const;
bool isValid() const {
return (rows >= 0 || rows == ROW_LIMIT_UNLIMITED) && (bytes >= 0 || bytes == BYTE_LIMIT_UNLIMITED) &&
minRows >= 0 && (minRows <= rows || rows == ROW_LIMIT_UNLIMITED);


@ -161,6 +161,7 @@ struct TransactionOptions {
bool useGrvCache : 1;
bool skipGrvCache : 1;
bool rawAccess : 1;
bool bypassStorageQuota : 1;
TransactionPriority priority;


@ -772,9 +772,9 @@ public:
int QUICK_GET_KEY_VALUES_LIMIT;
int QUICK_GET_KEY_VALUES_LIMIT_BYTES;
int STORAGE_FEED_QUERY_HARD_LIMIT;
int STORAGE_SERVER_READ_CONCURRENCY;
std::string STORAGESERVER_READ_RANKS;
std::string STORAGESERVER_READ_PRIORITIES;
int STORAGE_SERVER_READ_CONCURRENCY;
std::string STORAGESERVER_READTYPE_PRIORITY_MAP;
// Wait Failure
int MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS;
@ -923,7 +923,7 @@ public:
int REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT; // Minimum height for which to keep and reuse page decode caches
bool REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT; // Whether to split pages by tenant if encryption is enabled
std::string REDWOOD_PRIORITY_LAUNCHS;
std::string REDWOOD_IO_PRIORITIES;
// Server request latency measurement
int LATENCY_SAMPLE_SIZE;
@ -998,6 +998,7 @@ public:
double BLOB_MANIFEST_BACKUP_INTERVAL;
bool BLOB_FULL_RESTORE_MODE;
double BLOB_MIGRATOR_CHECK_INTERVAL;
int BLOB_MANIFEST_RW_ROWS;
// Blob metadata
int64_t BLOB_METADATA_CACHE_TTL;


@ -253,6 +253,8 @@ description is not currently required but encouraged.
description="Allows this transaction to read system keys (those that start with the byte 0xFF). Implies raw_access."/>
<Option name="raw_access" code="303"
description="Allows this transaction to access the raw key-space when tenant mode is on."/>
<Option name="bypass_storage_quota" code="304"
description="Allows this transaction to bypass storage quota enforcement. Should only be used for transactions that directly or indirectly decrease the size of the tenant group's data."/>
<Option name="debug_dump" code="400"
hidden="true" />
<Option name="debug_retry_logging" code="401" paramType="String" paramDescription="Optional transaction name" />


@ -155,7 +155,15 @@ Future<Void> SimpleFailureMonitor::onDisconnectOrFailure(Endpoint const& endpoin
// If the endpoint or address is already failed, return right away
auto i = addressStatus.find(endpoint.getPrimaryAddress());
if (i == addressStatus.end() || i->second.isFailed() || failedEndpoints.count(endpoint)) {
TraceEvent("AlreadyDisconnected").detail("Addr", endpoint.getPrimaryAddress()).detail("Tok", endpoint.token);
TraceEvent event("AlreadyDisconnected");
if (endpoint.token.first() == 0xffffffffffffffff) {
// well known endpoint
event.suppressFor(5.0);
}
event.detail("Addr", endpoint.getPrimaryAddress())
.detail("Reason", i == addressStatus.end() || i->second.isFailed() ? "Disconnected" : "EndpointFailed")
.detail("Tok", endpoint.token)
.log();
return Void();
}


@ -69,7 +69,7 @@ TEST_CASE("/flow/buggifiedDelay") {
});
wait(f1 && f2);
if (last == 1) {
CODE_PROBE(true, "Delays can become ready out of order");
CODE_PROBE(true, "Delays can become ready out of order", probe::decoration::rare);
return Void();
}
}


@ -216,7 +216,7 @@ bool TokenCacheImpl::validateAndAdd(double currentTime, StringRef token, Network
Arena arena;
authz::jwt::TokenRef t;
if (!authz::jwt::parseToken(arena, t, token)) {
CODE_PROBE(true, "Token can't be parsed");
CODE_PROBE(true, "Token can't be parsed", probe::decoration::rare);
TraceEvent(SevWarn, "InvalidToken")
.detail("From", peer)
.detail("Reason", "ParseError")
@ -225,35 +225,35 @@ bool TokenCacheImpl::validateAndAdd(double currentTime, StringRef token, Network
}
auto key = FlowTransport::transport().getPublicKeyByName(t.keyId);
if (!key.present()) {
CODE_PROBE(true, "Token referencing non-existing key");
CODE_PROBE(true, "Token referencing non-existing key", probe::decoration::rare);
TRACE_INVALID_PARSED_TOKEN("UnknownKey", t);
return false;
} else if (!t.issuedAtUnixTime.present()) {
CODE_PROBE(true, "Token has no issued-at field");
CODE_PROBE(true, "Token has no issued-at field", probe::decoration::rare);
TRACE_INVALID_PARSED_TOKEN("NoIssuedAt", t);
return false;
} else if (!t.expiresAtUnixTime.present()) {
CODE_PROBE(true, "Token has no expiration time");
CODE_PROBE(true, "Token has no expiration time", probe::decoration::rare);
TRACE_INVALID_PARSED_TOKEN("NoExpirationTime", t);
return false;
} else if (double(t.expiresAtUnixTime.get()) <= currentTime) {
CODE_PROBE(true, "Expired token");
CODE_PROBE(true, "Expired token", probe::decoration::rare);
TRACE_INVALID_PARSED_TOKEN("Expired", t);
return false;
} else if (!t.notBeforeUnixTime.present()) {
CODE_PROBE(true, "Token has no not-before field");
CODE_PROBE(true, "Token has no not-before field", probe::decoration::rare);
TRACE_INVALID_PARSED_TOKEN("NoNotBefore", t);
return false;
} else if (double(t.notBeforeUnixTime.get()) > currentTime) {
CODE_PROBE(true, "Tokens not-before is in the future");
CODE_PROBE(true, "Tokens not-before is in the future", probe::decoration::rare);
TRACE_INVALID_PARSED_TOKEN("TokenNotYetValid", t);
return false;
} else if (!t.tenants.present()) {
CODE_PROBE(true, "Token with no tenants");
CODE_PROBE(true, "Token with no tenants", probe::decoration::rare);
TRACE_INVALID_PARSED_TOKEN("NoTenants", t);
return false;
} else if (!authz::jwt::verifyToken(token, key.get())) {
CODE_PROBE(true, "Token with invalid signature");
CODE_PROBE(true, "Token with invalid signature", probe::decoration::rare);
TRACE_INVALID_PARSED_TOKEN("InvalidSignature", t);
return false;
} else {
@ -300,7 +300,7 @@ bool TokenCacheImpl::validate(TenantNameRef name, StringRef token) {
}
}
if (!tenantFound) {
CODE_PROBE(true, "Valid token doesn't reference tenant");
CODE_PROBE(true, "Valid token doesn't reference tenant", probe::decoration::rare);
TraceEvent(SevWarn, "TenantTokenMismatch").detail("From", peer).detail("Tenant", name.toString());
return false;
}


@ -25,6 +25,7 @@
#include "flow/IAsyncFile.h"
#include "flow/network.h"
#include "flow/ActorCollection.h"
#include "fdbrpc/simulator.h"
// template <class AsyncFileType>
class AsyncFileChaos final : public IAsyncFile, public ReferenceCounted<AsyncFileChaos> {
@ -35,7 +36,8 @@ private:
public:
explicit AsyncFileChaos(Reference<IAsyncFile> file) : file(file) {
// We only allow chaos events on storage files
enabled = (file->getFilename().find("storage-") != std::string::npos);
enabled = file->getFilename().find("storage-") != std::string::npos &&
file->getFilename().find("sqlite-wal") == std::string::npos;
}
void addref() override { ReferenceCounted<AsyncFileChaos>::addref(); }
@ -79,6 +81,7 @@ public:
Future<Void> write(void const* data, int length, int64_t offset) override {
Arena arena;
char* pdata = nullptr;
unsigned corruptedBlock = 0;
// Check if a bit flip event was injected, if so, copy the buffer contents
// with a random bit flipped in a new buffer and use that for the write
@ -91,32 +94,38 @@ public:
pdata = (char*)arena.allocate4kAlignedBuffer(length);
memcpy(pdata, data, length);
// flip a random bit in the copied buffer
pdata[deterministicRandom()->randomInt(0, length)] ^= (1 << deterministicRandom()->randomInt(0, 8));
auto corruptedPos = deterministicRandom()->randomInt(0, length);
pdata[corruptedPos] ^= (1 << deterministicRandom()->randomInt(0, 8));
// mark the block as corrupted
corruptedBlock = (offset + corruptedPos) / 4096;
TraceEvent("CorruptedBlock")
.detail("Filename", file->getFilename())
.detail("Block", corruptedBlock)
.log();
// increment the metric for bit flips
auto res = g_network->global(INetwork::enChaosMetrics);
if (res) {
ChaosMetrics* chaosMetrics = static_cast<ChaosMetrics*>(res);
auto chaosMetricsPointer = g_network->global(INetwork::enChaosMetrics);
if (chaosMetricsPointer) {
ChaosMetrics* chaosMetrics = static_cast<ChaosMetrics*>(chaosMetricsPointer);
chaosMetrics->bitFlips++;
}
}
}
}
double diskDelay = getDelay();
if (diskDelay == 0.0) {
if (pdata)
return holdWhile(arena, file->write(pdata, length, offset));
return file->write(data, length, offset);
}
// Wait for diskDelay before submitting the I/O
// Capture file by value in case this is destroyed during the delay
return mapAsync<Void, std::function<Future<Void>(Void)>, Void>(
delay(diskDelay), [=, file = file](Void _) -> Future<Void> {
if (pdata)
return holdWhile(arena, file->write(pdata, length, offset));
delay(getDelay()), [=, file = file](Void _) -> Future<Void> {
if (pdata) {
return map(holdWhile(arena, file->write(pdata, length, offset)),
[corruptedBlock, file = file](auto res) {
if (g_network->isSimulated()) {
g_simulator->corruptedBlocks.emplace(file->getFilename(), corruptedBlock);
}
return res;
});
}
return file->write(data, length, offset);
});
@ -130,7 +139,16 @@ public:
// Wait for diskDelay before submitting the I/O
// Capture file by value in case this is destroyed during the delay
return mapAsync<Void, std::function<Future<Void>(Void)>, Void>(
delay(diskDelay), [=, file = file](Void _) -> Future<Void> { return file->truncate(size); });
delay(diskDelay), [size, file = file](Void _) -> Future<Void> {
constexpr auto maxBlockValue =
std::numeric_limits<decltype(g_simulator->corruptedBlocks)::key_type::second_type>::max();
auto firstDeletedBlock =
g_simulator->corruptedBlocks.lower_bound(std::make_pair(file->getFilename(), size / 4096));
auto lastFileBlock =
g_simulator->corruptedBlocks.upper_bound(std::make_pair(file->getFilename(), maxBlockValue));
g_simulator->corruptedBlocks.erase(firstDeletedBlock, lastFileBlock);
return file->truncate(size);
});
}
Future<Void> sync() override {
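Editor's note: the write path above maps the flipped byte to a 4 KiB file block via (offset + corruptedPos) / 4096 and records the (filename, block) pair in g_simulator->corruptedBlocks, so a later checksum failure on that block can be attributed to injected chaos rather than a real bug; truncate correspondingly drops entries at or past the new end of file. A standalone sketch of the bookkeeping, assuming only that the set is ordered by (filename, block):

#include <cstdint>
#include <iostream>
#include <limits>
#include <set>
#include <string>
#include <utility>

using CorruptedBlocks = std::set<std::pair<std::string, unsigned>>;

// Record the 4 KiB block containing the byte corrupted by a write at `offset`.
void recordCorruption(CorruptedBlocks& blocks, const std::string& file, int64_t offset, int corruptedPos) {
    unsigned block = (offset + corruptedPos) / 4096;
    blocks.emplace(file, block);
}

// Forget corruption at or past the new end of file (mirrors the truncate hunk).
void truncateCorruption(CorruptedBlocks& blocks, const std::string& file, int64_t size) {
    blocks.erase(blocks.lower_bound({ file, unsigned(size / 4096) }),
                 blocks.upper_bound({ file, std::numeric_limits<unsigned>::max() }));
}

int main() {
    CorruptedBlocks blocks;
    recordCorruption(blocks, "storage-1.fdb", 8192, 100); // byte 8292 lands in block 2
    truncateCorruption(blocks, "storage-1.fdb", 4096);    // new EOF comes before block 2
    std::cout << blocks.size() << "\n";                   // prints 0
}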

View File

@ -140,7 +140,7 @@ public:
sav->addFutureRef();
return Future<T>(sav);
}
bool isSet() { return sav->isSet(); }
bool isSet() const { return sav->isSet(); }
bool isValid() const { return sav != nullptr; }
ReplyPromise() : sav(new NetSAV<T>(0, 1)) {}
explicit ReplyPromise(const PeerCompatibilityPolicy& policy) : ReplyPromise() {
@ -515,7 +515,7 @@ public:
void setRequestStreamEndpoint(const Endpoint& endpoint) { queue->requestStreamEndpoint = endpoint; }
bool connected() { return queue->acknowledgements.getRawEndpoint().isValid() || queue->error.isValid(); }
bool connected() const { return queue->acknowledgements.getRawEndpoint().isValid() || queue->error.isValid(); }
Future<Void> onConnected() {
if (connected()) {

View File

@ -133,7 +133,7 @@ Future<REPLY_TYPE(Req)> retryGetReplyFromHostname(Req request, Hostname hostname
// Like tryGetReplyFromHostname, except that request_maybe_delivered results in re-resolving the hostname.
// Suitable for use with hostname, where RequestStream is NOT initialized yet.
// Not normally useful for endpoints initialized with NetworkAddress.
state double reconnetInterval = FLOW_KNOBS->HOSTNAME_RECONNECT_INIT_INTERVAL;
state double reconnectInterval = FLOW_KNOBS->HOSTNAME_RECONNECT_INIT_INTERVAL;
state std::unique_ptr<RequestStream<Req>> to;
loop {
NetworkAddress address = wait(hostname.resolveWithRetry());
@ -145,8 +145,8 @@ Future<REPLY_TYPE(Req)> retryGetReplyFromHostname(Req request, Hostname hostname
resetReply(request);
if (reply.getError().code() == error_code_request_maybe_delivered) {
// Connection failure.
wait(delay(reconnetInterval));
reconnetInterval = std::min(2 * reconnetInterval, FLOW_KNOBS->HOSTNAME_RECONNECT_MAX_INTERVAL);
wait(delay(reconnectInterval));
reconnectInterval = std::min(2 * reconnectInterval, FLOW_KNOBS->HOSTNAME_RECONNECT_MAX_INTERVAL);
INetworkConnections::net()->removeCachedDNS(hostname.host, hostname.service);
} else {
throw reply.getError();
@ -165,7 +165,7 @@ Future<REPLY_TYPE(Req)> retryGetReplyFromHostname(Req request,
// Like tryGetReplyFromHostname, except that request_maybe_delivered results in re-resolving the hostname.
// Suitable for use with hostname, where RequestStream is NOT initialized yet.
// Not normally useful for endpoints initialized with NetworkAddress.
state double reconnetInterval = FLOW_KNOBS->HOSTNAME_RECONNECT_INIT_INTERVAL;
state double reconnectInitInterval = FLOW_KNOBS->HOSTNAME_RECONNECT_INIT_INTERVAL;
state std::unique_ptr<RequestStream<Req>> to;
loop {
NetworkAddress address = wait(hostname.resolveWithRetry());
@ -177,8 +177,9 @@ Future<REPLY_TYPE(Req)> retryGetReplyFromHostname(Req request,
resetReply(request);
if (reply.getError().code() == error_code_request_maybe_delivered) {
// Connection failure.
wait(delay(reconnetInterval));
reconnetInterval = std::min(2 * reconnetInterval, FLOW_KNOBS->HOSTNAME_RECONNECT_MAX_INTERVAL);
wait(delay(reconnectInitInterval));
reconnectInitInterval =
std::min(2 * reconnectInitInterval, FLOW_KNOBS->HOSTNAME_RECONNECT_MAX_INTERVAL);
INetworkConnections::net()->removeCachedDNS(hostname.host, hostname.service);
} else {
throw reply.getError();
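Editor's note: both hunks apply the same retry policy around request_maybe_delivered: wait the current interval, double it, cap it at the max knob, and drop the cached DNS entry before re-resolving. A minimal sketch, with made-up values standing in for FLOW_KNOBS->HOSTNAME_RECONNECT_INIT_INTERVAL and HOSTNAME_RECONNECT_MAX_INTERVAL:

#include <algorithm>
#include <iostream>

int main() {
    // Hypothetical knob values for illustration; the real ones come from FLOW_KNOBS.
    const double initInterval = 0.05; // HOSTNAME_RECONNECT_INIT_INTERVAL
    const double maxInterval = 1.0;   // HOSTNAME_RECONNECT_MAX_INTERVAL
    double reconnectInterval = initInterval;
    for (int attempt = 0; attempt < 8; ++attempt) {
        std::cout << "attempt " << attempt << ": wait " << reconnectInterval << "s\n";
        reconnectInterval = std::min(2 * reconnectInterval, maxInterval); // double, then cap
    }
}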

View File

@ -26,6 +26,8 @@
#include <random>
#include <limits>
#include <boost/unordered_set.hpp>
#include "flow/flow.h"
#include "flow/Histogram.h"
#include "flow/ProtocolVersion.h"
@ -520,6 +522,8 @@ public:
std::unordered_map<Standalone<StringRef>, PrivateKey> authKeys;
std::set<std::pair<std::string, unsigned>> corruptedBlocks;
flowGlobalType global(int id) const final { return getCurrentProcess()->global(id); };
void setGlobal(size_t id, flowGlobalType v) final { getCurrentProcess()->setGlobal(id, v); };

View File

@ -784,11 +784,25 @@ private:
std::string sourceFilename = self->filename + ".part";
if (machineCache.count(sourceFilename)) {
// it seems gcc has some trouble with these types. Aliasing with typename is ugly, but seems to work.
using block_value_type = typename decltype(g_simulator->corruptedBlocks)::key_type::second_type;
TraceEvent("SimpleFileRename")
.detail("From", sourceFilename)
.detail("To", self->filename)
.detail("SourceCount", machineCache.count(sourceFilename))
.detail("FileCount", machineCache.count(self->filename));
auto maxBlockValue = std::numeric_limits<block_value_type>::max();
// first we have to delete all corruption of the destination, since the rename overwrites that file
g_simulator->corruptedBlocks.erase(
g_simulator->corruptedBlocks.lower_bound(std::make_pair(self->filename, 0u)),
g_simulator->corruptedBlocks.upper_bound(std::make_pair(self->filename, maxBlockValue)));
// next we need to re-key all corruption entries from the source file to the destination. In practice,
// the number of corruptions for a given file should be very small
auto begin = g_simulator->corruptedBlocks.lower_bound(std::make_pair(sourceFilename, 0u)),
end = g_simulator->corruptedBlocks.upper_bound(std::make_pair(sourceFilename, maxBlockValue));
for (auto iter = begin; iter != end; ++iter) {
g_simulator->corruptedBlocks.emplace(self->filename, iter->second);
}
g_simulator->corruptedBlocks.erase(begin, end);
renameFile(sourceFilename.c_str(), self->filename.c_str());
machineCache[self->filename] = machineCache[sourceFilename];
@ -1219,13 +1233,15 @@ public:
static void runLoop(Sim2* self) {
ISimulator::ProcessInfo* callingMachine = self->currentProcess;
int lastPrintTime = 0;
while (!self->isStopped) {
if (self->taskQueue.canSleep()) {
double sleepTime = self->taskQueue.getSleepTime(self->time);
self->time +=
sleepTime + FLOW_KNOBS->MAX_RUNLOOP_SLEEP_DELAY * pow(deterministicRandom()->random01(), 1000.0);
if (self->printSimTime) {
if (self->printSimTime && (int)self->time > lastPrintTime) {
printf("Time: %d\n", (int)self->time);
lastPrintTime = (int)self->time;
}
self->timerTime = std::max(self->timerTime, self->time);
}
@ -2361,7 +2377,7 @@ class UDPSimSocket : public IUDPSocket, ReferenceCounted<UDPSimSocket> {
NetworkAddress _localAddress;
bool randomDropPacket() {
auto res = deterministicRandom()->random01() < .000001;
CODE_PROBE(res, "UDP packet drop", probe::context::sim2, probe::assert::simOnly);
CODE_PROBE(res, "UDP packet drop", probe::context::sim2, probe::assert::simOnly, probe::decoration::rare);
return res;
}
@ -2744,6 +2760,22 @@ Future<Void> Sim2FileSystem::deleteFile(const std::string& filename, bool mustBe
ACTOR Future<Void> renameFileImpl(std::string from, std::string to) {
wait(delay(0.5 * deterministicRandom()->random01()));
// rename all keys in the corrupted list
// first we have to delete all corruption of the destination, since this file will be unlinked if it exists
TraceEvent("RenamingFile").detail("From", from).detail("To", to).log();
// it seems gcc has some trouble with these types. Aliasing with typename is ugly, but seems to work.
using block_value_type = typename decltype(g_simulator->corruptedBlocks)::key_type::second_type;
auto maxBlockValue = std::numeric_limits<block_value_type>::max();
g_simulator->corruptedBlocks.erase(g_simulator->corruptedBlocks.lower_bound(std::make_pair(to, 0u)),
g_simulator->corruptedBlocks.upper_bound(std::make_pair(to, maxBlockValue)));
// next we need to re-key all corruption entries from the old name to the new one. In practice, the number of corruptions for a given file should be very small
auto begin = g_simulator->corruptedBlocks.lower_bound(std::make_pair(from, 0u)),
end = g_simulator->corruptedBlocks.upper_bound(std::make_pair(from, maxBlockValue));
for (auto iter = begin; iter != end; ++iter) {
g_simulator->corruptedBlocks.emplace(to, iter->second);
}
g_simulator->corruptedBlocks.erase(begin, end);
// do the rename
::renameFile(from, to);
wait(delay(0.5 * deterministicRandom()->random01()));
return Void();
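Editor's note: because corruptedBlocks is an ordered set of (filename, block) pairs, all entries for one file form a contiguous run bounded by lower_bound((file, 0)) and upper_bound((file, maxBlock)). Renaming therefore means: erase the destination's run (the rename overwrites that file), copy the source's run under the destination key, then erase the source's run. A standalone sketch of that re-keying:

#include <cassert>
#include <limits>
#include <set>
#include <string>
#include <utility>

using CorruptedBlocks = std::set<std::pair<std::string, unsigned>>;

void renameCorruptedBlocks(CorruptedBlocks& blocks, const std::string& from, const std::string& to) {
    auto maxBlock = std::numeric_limits<unsigned>::max();
    // the destination is being overwritten, so its old corruption entries are gone
    blocks.erase(blocks.lower_bound({ to, 0u }), blocks.upper_bound({ to, maxBlock }));
    // re-key the source's entries to the destination, then drop the originals
    auto begin = blocks.lower_bound({ from, 0u }), end = blocks.upper_bound({ from, maxBlock });
    for (auto iter = begin; iter != end; ++iter)
        blocks.emplace(to, iter->second);
    blocks.erase(begin, end);
}

int main() {
    CorruptedBlocks blocks{ { "a.part", 3u }, { "a", 7u } };
    renameCorruptedBlocks(blocks, "a.part", "a");
    assert(blocks == (CorruptedBlocks{ { "a", 3u } })); // old "a" entry dropped, source entry re-keyed
}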

View File

@ -654,7 +654,7 @@ private:
TraceEvent("WriteRecoveryKeySet", dbgid).log();
if (!initialCommit)
txnStateStore->set(KeyValueRef(m.param1, m.param2));
CODE_PROBE(true, "Snapshot created, setting writeRecoveryKey in txnStateStore");
CODE_PROBE(true, "Snapshot created, setting writeRecoveryKey in txnStateStore", probe::decoration::rare);
}
void checkSetTenantMapPrefix(MutationRef m) {

View File

@ -32,6 +32,7 @@ struct ConnectionProviderTestSettings {
uint32_t maxFileMemory;
uint32_t maxFileSize;
uint32_t threads;
bool uniformProviderChoice;
double readWriteSplit;
@ -39,6 +40,7 @@ struct ConnectionProviderTestSettings {
int writeOps;
int readOps;
uint32_t targetBytesPerSec;
ConnectionProviderTestSettings() {
numProviders = deterministicRandom()->randomSkewedUInt32(1, 1000);
@ -56,6 +58,8 @@ struct ConnectionProviderTestSettings {
writeOps = 0;
readOps = 0;
targetBytesPerSec = 100 * 1024 * 1024;
}
};
@ -68,7 +72,7 @@ struct ProviderTestData {
explicit ProviderTestData(Reference<BlobConnectionProvider> provider) : provider(provider) {}
};
ACTOR Future<Void> createObject(ConnectionProviderTestSettings* settings, ProviderTestData* provider) {
ACTOR Future<int64_t> createObject(ConnectionProviderTestSettings* settings, ProviderTestData* provider) {
// pick object name before wait so no collisions between concurrent writes
std::string objName;
loop {
@ -98,10 +102,10 @@ ACTOR Future<Void> createObject(ConnectionProviderTestSettings* settings, Provid
// after write, put in the readable list
provider->data.push_back({ fullPath, data });
return Void();
return data.size();
}
ACTOR Future<Void> readAndVerifyObject(ProviderTestData* provider, std::string objFullPath, Value expectedData) {
ACTOR Future<int64_t> readAndVerifyObject(ProviderTestData* provider, std::string objFullPath, Value expectedData) {
Reference<BackupContainerFileSystem> bstore = provider->provider->getForRead(objFullPath);
state Reference<IAsyncFile> reader = wait(bstore->readFile(objFullPath));
@ -110,7 +114,7 @@ ACTOR Future<Void> readAndVerifyObject(ProviderTestData* provider, std::string o
ASSERT_EQ(expectedData.size(), readSize);
ASSERT(expectedData == actualData);
return Void();
return expectedData.size();
}
Future<Void> deleteObject(ProviderTestData* provider, std::string objFullPath) {
@ -119,6 +123,10 @@ Future<Void> deleteObject(ProviderTestData* provider, std::string objFullPath) {
}
ACTOR Future<Void> workerThread(ConnectionProviderTestSettings* settings, std::vector<ProviderTestData>* providers) {
// This worker should average settings->targetBytesPerSec / settings->threads.
// Then, because a random 50% of operations don't consult the rateLimiter at all, bring the rate limiter's rate down by 2 to compensate
state int targetBytesPerSec = std::max((uint32_t)1, settings->targetBytesPerSec / settings->threads / 2);
state Reference<IRateControl> rateLimiter = Reference<IRateControl>(new SpeedLimit(targetBytesPerSec, 1));
state double endTime = now() + settings->runtime;
try {
while (now() < endTime) {
@ -133,19 +141,24 @@ ACTOR Future<Void> workerThread(ConnectionProviderTestSettings* settings, std::v
// randomly pick create or read
bool doWrite = deterministicRandom()->random01() < settings->readWriteSplit;
state int64_t opSize = 0;
if (provider->usedNames.size() < settings->filesPerProvider && (provider->data.empty() || doWrite)) {
// create an object
wait(createObject(settings, provider));
wait(store(opSize, createObject(settings, provider)));
settings->writeOps++;
} else if (!provider->data.empty()) {
// read a random object
auto& readInfo = provider->data[deterministicRandom()->randomInt(0, provider->data.size())];
wait(readAndVerifyObject(provider, readInfo.first, readInfo.second));
wait(store(opSize, readAndVerifyObject(provider, readInfo.first, readInfo.second)));
settings->readOps++;
} else {
// other threads are creating files up to filesPerProvider limit, but none finished yet. Just wait
wait(delay(0.1));
}
if (opSize > 0 && deterministicRandom()->coinflip()) {
wait(rateLimiter->getAllowance(opSize) && delayJittered(0.01));
}
}
return Void();
@ -161,7 +174,7 @@ ACTOR Future<Void> checkAndCleanUp(ProviderTestData* provider) {
for (i = 0; i < provider->data.size(); i++) {
auto& readInfo = provider->data[i];
wait(readAndVerifyObject(provider, readInfo.first, readInfo.second));
wait(success(readAndVerifyObject(provider, readInfo.first, readInfo.second)));
wait(deleteObject(provider, provider->data[i].first));
}
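Editor's note on the worker's budget arithmetic: each thread targets targetBytesPerSec / threads, halved because roughly half the operations skip the limiter (the coinflip above), and each accounted operation requests an allowance equal to the bytes it just moved. A rough sketch with a token bucket standing in for FDB's SpeedLimit (the TokenBucket type and its method are hypothetical):

#include <algorithm>
#include <cstdint>
#include <iostream>

// Hypothetical token bucket standing in for FDB's SpeedLimit rate controller.
struct TokenBucket {
    double ratePerSec, tokens = 0, lastTime = 0;
    explicit TokenBucket(double rate) : ratePerSec(rate) {}
    // Returns how long the caller should wait before `bytes` is within budget.
    double secondsUntilAllowed(int64_t bytes, double now) {
        tokens = std::min(ratePerSec, tokens + (now - lastTime) * ratePerSec);
        lastTime = now;
        tokens -= bytes;
        return tokens >= 0 ? 0.0 : -tokens / ratePerSec;
    }
};

int main() {
    const uint32_t targetBytesPerSec = 100 * 1024 * 1024, threads = 10;
    // halve the per-thread rate: roughly half the ops skip the limiter entirely
    TokenBucket limiter(std::max(1u, targetBytesPerSec / threads / 2));
    std::cout << limiter.secondsUntilAllowed(16 << 20, /*now=*/0.0) << "s wait\n"; // 3.2s
}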

View File

@ -441,7 +441,7 @@ struct BlobManagerData : NonCopyable, ReferenceCounted<BlobManagerData> {
// if this granule is not an active granule, it can't be merged
auto gIt = workerAssignments.rangeContaining(range.begin);
if (gIt->begin() != range.begin || gIt->end() != range.end) {
CODE_PROBE(true, "non-active granule reported merge eligible, ignoring");
CODE_PROBE(true, "non-active granule reported merge eligible, ignoring", probe::decoration::rare);
if (BM_DEBUG) {
fmt::print(
"BM {0} Ignoring Merge Candidate [{1} - {2}): range mismatch with active granule [{3} - {4})\n",
@ -1035,7 +1035,7 @@ static bool handleRangeIsAssign(Reference<BlobManagerData> bmData, RangeAssignme
if (assignment.assign.get().type == AssignRequestType::Continue) {
ASSERT(assignment.worker.present());
if (i.range() != assignment.keyRange || i.cvalue() != assignment.worker.get()) {
CODE_PROBE(true, "BM assignment out of date");
CODE_PROBE(true, "BM assignment out of date", probe::decoration::rare);
if (BM_DEBUG) {
fmt::print("Out of date re-assign for ({0}, {1}). Assignment must have changed while "
"checking split.\n Reassign: [{2} - {3}): {4}\n Existing: [{5} - {6}): {7}\n",
@ -1602,10 +1602,10 @@ ACTOR Future<Void> reevaluateInitialSplit(Reference<BlobManagerData> bmData,
if (retried && prevOwnerEpoch == bmData->epoch && prevGranuleID == granuleID &&
prevOwnerSeqno == std::numeric_limits<int64_t>::max()) {
// owner didn't change, last iteration of this transaction just succeeded but threw an error.
CODE_PROBE(true, "split too big adjustment succeeded after retry");
CODE_PROBE(true, "split too big adjustment succeeded after retry", probe::decoration::rare);
break;
}
CODE_PROBE(true, "split too big was since moved to another worker");
CODE_PROBE(true, "split too big was since moved to another worker", probe::decoration::rare);
if (BM_DEBUG) {
fmt::print("BM {0} re-evaluating initial split [{1} - {2}) too big: moved to another worker\n",
bmData->epoch,
@ -1839,7 +1839,7 @@ ACTOR Future<Void> maybeSplitRange(Reference<BlobManagerData> bmData,
wait(checkManagerLock(tr, bmData));
ForcedPurgeState purgeState = wait(getForcePurgedState(&tr->getTransaction(), granuleRange));
if (purgeState != ForcedPurgeState::NonePurged) {
CODE_PROBE(true, "Split stopped because of force purge");
CODE_PROBE(true, "Split stopped because of force purge", probe::decoration::rare);
TraceEvent("GranuleSplitCancelledForcePurge", bmData->id)
.detail("Epoch", bmData->epoch)
.detail("GranuleRange", granuleRange);
@ -2635,7 +2635,9 @@ ACTOR Future<Void> attemptMerges(Reference<BlobManagerData> bmData,
currentBytes + metrics.bytes > SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES ||
currentKeySumBytes >= CLIENT_KNOBS->VALUE_SIZE_LIMIT / 2) {
ASSERT(currentBytes <= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES);
CODE_PROBE(currentKeySumBytes >= CLIENT_KNOBS->VALUE_SIZE_LIMIT / 2, "merge early because of key size");
CODE_PROBE(currentKeySumBytes >= CLIENT_KNOBS->VALUE_SIZE_LIMIT / 2,
"merge early because of key size",
probe::decoration::rare);
attemptStartMerge(bmData, currentCandidates);
currentCandidates.clear();
currentBytes = 0;
@ -3254,7 +3256,7 @@ static void addAssignment(KeyRangeMap<std::tuple<UID, int64_t, int64_t>>& map,
if (oldEpoch > newEpoch || (oldEpoch == newEpoch && oldSeqno > newSeqno)) {
newer.push_back(std::pair(old.range(), std::tuple(oldWorker, oldEpoch, oldSeqno)));
if (old.range() != newRange) {
CODE_PROBE(true, "BM Recovery: BWs disagree on range boundaries");
CODE_PROBE(true, "BM Recovery: BWs disagree on range boundaries", probe::decoration::rare);
anyConflicts = true;
}
} else {
@ -3288,7 +3290,8 @@ static void addAssignment(KeyRangeMap<std::tuple<UID, int64_t, int64_t>>& map,
std::get<0>(old.value()) = UID();
}
if (outOfDate.empty() || outOfDate.back() != std::pair(oldWorker, KeyRange(old.range()))) {
CODE_PROBE(true, "BM Recovery: Two workers claim ownership of same granule");
CODE_PROBE(
true, "BM Recovery: Two workers claim ownership of same granule", probe::decoration::rare);
outOfDate.push_back(std::pair(oldWorker, old.range()));
}
}

View File

@ -24,6 +24,7 @@
#include "fdbclient/BackupContainer.h"
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbclient/ClientBooleanParams.h"
#include "fdbserver/Knobs.h"
#include "flow/FastRef.h"
#include "flow/Trace.h"
@ -137,10 +138,23 @@ private:
blobRangeKeys // Key ranges managed by blob
};
for (auto range : ranges) {
// todo use getRangeStream for better performance
RangeResult result = wait(tr.getRange(range, GetRangeLimits::BYTE_LIMIT_UNLIMITED));
for (auto& row : result) {
rows.push_back_deep(rows.arena(), KeyValueRef(row.key, row.value));
state GetRangeLimits limits(SERVER_KNOBS->BLOB_MANIFEST_RW_ROWS);
limits.minRows = 0;
state KeySelectorRef begin = firstGreaterOrEqual(range.begin);
state KeySelectorRef end = firstGreaterOrEqual(range.end);
loop {
RangeResult result = wait(tr.getRange(begin, end, limits, Snapshot::True));
for (auto& row : result) {
rows.push_back_deep(rows.arena(), KeyValueRef(row.key, row.value));
}
if (!result.more) {
break;
}
if (result.readThrough.present()) {
begin = firstGreaterOrEqual(result.readThrough.get());
} else {
begin = firstGreaterThan(result.end()[-1].key);
}
}
}
return rows;
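Editor's note: this loop is the recurring pagination pattern in this file — read up to BLOB_MANIFEST_RW_ROWS rows, stop when more is false, otherwise restart the begin selector at readThrough when present (the read was cut short at a shard or byte boundary) or just past the last returned key. A schematic sketch of only the cursor-advance rule, with simplified stand-in types (not the real RangeResult API):

#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Simplified stand-ins for RangeResult and key selectors (hypothetical types).
struct Page {
    std::vector<std::string> keys;          // keys returned this round
    bool more = false;                      // server has more data past this page
    std::optional<std::string> readThrough; // furthest key the read actually covered
};

// Compute the next begin selector: prefer readThrough (firstGreaterOrEqual),
// else step just past the last key (firstGreaterThan). nullopt ends the scan.
std::optional<std::string> nextBegin(const Page& page) {
    if (!page.more)
        return std::nullopt;
    if (page.readThrough.has_value())
        return *page.readThrough;     // firstGreaterOrEqual(readThrough)
    return page.keys.back() + '\x00'; // smallest key strictly greater than the last key
}

int main() {
    Page p{ { "k1", "k2" }, true, std::string("k3") };
    std::cout << nextBegin(p).value() << "\n"; // k3
}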
@ -152,6 +166,13 @@ private:
// Write data to blob manifest file
ACTOR static Future<Void> writeToFile(Reference<BlobManifestDumper> self, Value data) {
static int32_t lastWrittenBytes = 0;
if (data.size() == lastWrittenBytes) {
dprint("Skip writting blob manifest with same size {}\n", lastWrittenBytes);
return Void();
}
lastWrittenBytes = data.size();
state Reference<BackupContainerFileSystem> writer;
state std::string fullPath;
@ -212,7 +233,7 @@ public:
ACTOR static Future<Void> execute(Reference<BlobManifestLoader> self) {
try {
Value data = wait(readFromFile(self));
Standalone<BlobManifest> manifest = decode(data);
state Standalone<BlobManifest> manifest = decode(data);
wait(writeSystemKeys(self, manifest.rows));
BlobGranuleRestoreVersionVector _ = wait(listGranules(self));
} catch (Error& e) {
@ -231,13 +252,32 @@ public:
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
std::vector<KeyRangeRef> granules;
state Standalone<VectorRef<KeyRef>> blobRanges;
// Read all granules
state GetRangeLimits limits(SERVER_KNOBS->BLOB_MANIFEST_RW_ROWS);
limits.minRows = 0;
state KeySelectorRef begin = firstGreaterOrEqual(blobGranuleMappingKeys.begin);
state KeySelectorRef end = firstGreaterOrEqual(blobGranuleMappingKeys.end);
loop {
RangeResult rows = wait(tr.getRange(begin, end, limits, Snapshot::True));
for (auto& row : rows) {
blobRanges.push_back_deep(blobRanges.arena(), row.key);
}
if (!rows.more) {
break;
}
if (rows.readThrough.present()) {
begin = firstGreaterOrEqual(rows.readThrough.get());
} else {
begin = firstGreaterThan(rows.end()[-1].key);
}
}
// check each granule range
state int i = 0;
auto limit = GetRangeLimits::BYTE_LIMIT_UNLIMITED;
state RangeResult blobRanges = wait(tr.getRange(blobGranuleMappingKeys, limit));
for (i = 0; i < blobRanges.size() - 1; i++) {
Key startKey = blobRanges[i].key.removePrefix(blobGranuleMappingKeys.begin);
Key endKey = blobRanges[i + 1].key.removePrefix(blobGranuleMappingKeys.begin);
Key startKey = blobRanges[i].removePrefix(blobGranuleMappingKeys.begin);
Key endKey = blobRanges[i + 1].removePrefix(blobGranuleMappingKeys.begin);
state KeyRange granuleRange = KeyRangeRef(startKey, endKey);
try {
Standalone<BlobGranuleRestoreVersion> granule = wait(getGranule(&tr, granuleRange));
@ -300,17 +340,32 @@ private:
// Write system keys to database
ACTOR static Future<Void> writeSystemKeys(Reference<BlobManifestLoader> self, VectorRef<KeyValueRef> rows) {
state int start = 0;
state int end = 0;
for (start = 0; start < rows.size(); start = end) {
end = std::min(start + SERVER_KNOBS->BLOB_MANIFEST_RW_ROWS, rows.size());
wait(writeSystemKeys(self, rows, start, end));
}
return Void();
}
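Editor's note: the chunking above walks half-open [start, end) windows of BLOB_MANIFEST_RW_ROWS rows, one transaction per window, so no single commit exceeds the transaction size limit. The index arithmetic in isolation, with an assumed chunk size:

#include <algorithm>
#include <iostream>

int main() {
    const int rows = 2500, chunk = 1000; // chunk plays the role of BLOB_MANIFEST_RW_ROWS
    for (int start = 0, end = 0; start < rows; start = end) {
        end = std::min(start + chunk, rows);
        std::cout << "commit rows [" << start << ", " << end << ")\n"; // one transaction per chunk
    }
    // prints [0, 1000), [1000, 2000), [2000, 2500)
}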
// Write system keys from the start index to end (exclusive), so that we don't exceed the transaction size limit
ACTOR static Future<Void> writeSystemKeys(Reference<BlobManifestLoader> self,
VectorRef<KeyValueRef> rows,
int start,
int end) {
state Transaction tr(self->db_);
loop {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
for (auto& row : rows) {
tr.set(row.key, row.value);
for (int i = start; i < end; ++i) {
tr.set(rows[i].key, rows[i].value);
}
wait(tr.commit());
dprint("Blob manifest loaded {} rows\n", rows.size());
dprint("Blob manifest loaded rows from {} to {}\n", start, end);
TraceEvent("BlobManifestLoader").detail("RowStart", start).detail("RowEnd", end);
return Void();
} catch (Error& e) {
wait(tr.onError(e));
@ -324,8 +379,7 @@ private:
KeyRange historyKeyRange = blobGranuleHistoryKeyRangeFor(range);
// reverse lookup so that the first row is the newest version
state RangeResult results =
wait(tr->getRange(historyKeyRange, GetRangeLimits::BYTE_LIMIT_UNLIMITED, Snapshot::False, Reverse::True));
wait(tr->getRange(historyKeyRange, GetRangeLimits::BYTE_LIMIT_UNLIMITED, Snapshot::True, Reverse::True));
for (KeyValueRef row : results) {
state KeyRange keyRange;
state Version version;
@ -367,24 +421,39 @@ private:
// List all files for given granule
ACTOR static Future<std::vector<GranuleFileVersion>> listGranuleFiles(Transaction* tr, UID granuleID) {
state std::vector<GranuleFileVersion> files;
state KeyRange fileKeyRange = blobGranuleFileKeyRangeFor(granuleID);
RangeResult results = wait(tr->getRange(fileKeyRange, GetRangeLimits::BYTE_LIMIT_UNLIMITED));
state GetRangeLimits limits(SERVER_KNOBS->BLOB_MANIFEST_RW_ROWS);
limits.minRows = 0;
state KeySelectorRef begin = firstGreaterOrEqual(fileKeyRange.begin);
state KeySelectorRef end = firstGreaterOrEqual(fileKeyRange.end);
loop {
RangeResult results = wait(tr->getRange(begin, end, limits, Snapshot::True));
for (auto& row : results) {
UID gid;
Version version;
uint8_t fileType;
Standalone<StringRef> filename;
int64_t offset;
int64_t length;
int64_t fullFileLength;
Optional<BlobGranuleCipherKeysMeta> cipherKeysMeta;
std::vector<GranuleFileVersion> files;
for (auto& row : results) {
UID gid;
Version version;
uint8_t fileType;
Standalone<StringRef> filename;
int64_t offset;
int64_t length;
int64_t fullFileLength;
Optional<BlobGranuleCipherKeysMeta> cipherKeysMeta;
std::tie(gid, version, fileType) = decodeBlobGranuleFileKey(row.key);
std::tie(filename, offset, length, fullFileLength, cipherKeysMeta) = decodeBlobGranuleFileValue(row.value);
GranuleFileVersion vs = { version, fileType, filename.toString(), length };
files.push_back(vs);
std::tie(gid, version, fileType) = decodeBlobGranuleFileKey(row.key);
std::tie(filename, offset, length, fullFileLength, cipherKeysMeta) =
decodeBlobGranuleFileValue(row.value);
GranuleFileVersion vs = { version, fileType, filename.toString(), length };
files.push_back(vs);
}
if (!results.more) {
break;
}
if (results.readThrough.present()) {
begin = firstGreaterOrEqual(results.readThrough.get());
} else {
begin = firstGreaterThan(results.end()[-1].key);
}
}
return files;
}
@ -466,12 +535,26 @@ ACTOR Future<bool> isFullRestoreMode(Database db, KeyRangeRef keys) {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
RangeResult ranges = wait(tr.getRange(blobRestoreCommandKeys, CLIENT_KNOBS->TOO_MANY));
for (auto& r : ranges) {
KeyRange keyRange = decodeBlobRestoreCommandKeyFor(r.key);
if (keyRange.contains(keys)) {
Standalone<BlobRestoreStatus> status = decodeBlobRestoreStatus(r.value);
return status.progress < 100; // progress is less than 100
state GetRangeLimits limits(SERVER_KNOBS->BLOB_MANIFEST_RW_ROWS);
limits.minRows = 0;
state KeySelectorRef begin = firstGreaterOrEqual(blobRestoreCommandKeys.begin);
state KeySelectorRef end = firstGreaterOrEqual(blobRestoreCommandKeys.end);
loop {
RangeResult ranges = wait(tr.getRange(begin, end, limits, Snapshot::True));
for (auto& r : ranges) {
KeyRange keyRange = decodeBlobRestoreCommandKeyFor(r.key);
if (keyRange.contains(keys)) {
Standalone<BlobRestoreStatus> status = decodeBlobRestoreStatus(r.value);
return status.progress < 100; // progress is less than 100
}
}
if (!ranges.more) {
break;
}
if (ranges.readThrough.present()) {
begin = firstGreaterOrEqual(ranges.readThrough.get());
} else {
begin = firstGreaterThan(ranges.end()[-1].key);
}
}
return false;

View File

@ -157,7 +157,7 @@ struct GranuleMetadata : NonCopyable, ReferenceCounted<GranuleMetadata> {
return (1.0 * readStats.deltaBytesRead) / (writeAmp * SERVER_KNOBS->BG_RDC_READ_FACTOR);
}
bool isEligibleRDC() {
bool isEligibleRDC() const {
// granule should be reasonably read-hot to be eligible
int64_t bytesWritten = bufferedDeltaBytes + bytesInNewDeltaFiles;
return bytesWritten * SERVER_KNOBS->BG_RDC_READ_FACTOR < readStats.deltaBytesRead;
@ -2173,13 +2173,16 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// will get an exception if we try to read any popped data, killing this actor
readOldChangeFeed = true;
// because several feeds will be reading the same version range of this change feed at the same time, set
// cache result to true
oldChangeFeedFuture = bwData->db->getChangeFeedStream(cfData,
oldCFKey.get(),
startVersion + 1,
startState.changeFeedStartVersion,
metadata->keyRange,
bwData->changeFeedStreamReplyBufferSize,
false);
false,
{ ReadType::NORMAL, CacheResult::True });
} else {
readOldChangeFeed = false;
@ -2283,7 +2286,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// popped up to V+1 is ok. Or in other words, if the last delta @ V, we only missed data
// at V+1 onward if popVersion >= V+2
if (metadata->bufferedDeltaVersion < metadata->activeCFData.get()->popVersion - 1) {
CODE_PROBE(true, "Blob Worker detected popped");
CODE_PROBE(true, "Blob Worker detected popped", probe::decoration::rare);
TraceEvent("BlobWorkerChangeFeedPopped", bwData->id)
.detail("Granule", metadata->keyRange)
.detail("GranuleID", startState.granuleID)
@ -2462,6 +2465,8 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
if (readOldChangeFeed) {
ASSERT(cfRollbackVersion + 1 < startState.changeFeedStartVersion);
ASSERT(oldCFKey.present());
// because several feeds will be reading the same version range of this change
// feed at the same time, set cache result to true
oldChangeFeedFuture =
bwData->db->getChangeFeedStream(cfData,
oldCFKey.get(),
@ -2469,7 +2474,8 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
startState.changeFeedStartVersion,
metadata->keyRange,
bwData->changeFeedStreamReplyBufferSize,
false);
false,
{ ReadType::NORMAL, CacheResult::True });
} else {
if (cfRollbackVersion + 1 < startState.changeFeedStartVersion) {
@ -3987,7 +3993,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
ForcedPurgeState purgeState = wait(fForcedPurgeState);
if (purgeState != ForcedPurgeState::NonePurged) {
CODE_PROBE(true, "Worker trying to open force purged granule");
CODE_PROBE(true, "Worker trying to open force purged granule", probe::decoration::rare);
if (BW_DEBUG) {
fmt::print("Granule [{0} - {1}) is force purged on BW {2}, abandoning\n",
req.keyRange.begin.printable(),

View File

@ -414,7 +414,7 @@ ACTOR Future<Void> commitBatcher(ProxyCommitData* commitData,
}
Optional<TenantNameRef> const& tenantName = req.tenantInfo.name;
if (SERVER_KNOBS->STORAGE_QUOTA_ENABLED && tenantName.present() &&
if (SERVER_KNOBS->STORAGE_QUOTA_ENABLED && !req.bypassStorageQuota() && tenantName.present() &&
commitData->tenantsOverStorageQuota.count(tenantName.get()) > 0) {
req.reply.sendError(storage_quota_exceeded());
continue;
@ -1310,7 +1310,7 @@ ACTOR Future<WriteMutationRefVar> writeMutationFetchEncryptKey(CommitBatchContex
wait(getLatestEncryptCipherKey(self->pProxyCommitData->db, domainId, p.first, BlobCipherMetrics::TLOG));
self->cipherKeys[domainId] = cipherKey;
CODE_PROBE(true, "Raw access mutation encryption");
CODE_PROBE(true, "Raw access mutation encryption", probe::decoration::rare);
ASSERT_NE(domainId, INVALID_ENCRYPT_DOMAIN_ID);
encryptedMutation = mutation->encrypt(self->cipherKeys, domainId, *arena, BlobCipherMetrics::TLOG);
self->toCommit.writeTypedMessage(encryptedMutation);
@ -1436,11 +1436,13 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
double prob = mul * cost / totalCosts;
if (deterministicRandom()->random01() < prob) {
for (const auto& ssInfo : pProxyCommitData->keyInfo[m.param1].src_info) {
const auto& storageServers = pProxyCommitData->keyInfo[m.param1].src_info;
for (const auto& ssInfo : storageServers) {
auto id = ssInfo->interf.id();
// scale cost
cost = cost < CLIENT_KNOBS->COMMIT_SAMPLE_COST ? CLIENT_KNOBS->COMMIT_SAMPLE_COST : cost;
pProxyCommitData->updateSSTagCost(id, trs[self->transactionNum].tagSet.get(), m, cost);
pProxyCommitData->updateSSTagCost(
id, trs[self->transactionNum].tagSet.get(), m, cost / storageServers.size());
}
}
}
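Editor's note: the second hunk changes cost attribution — instead of charging the full sampled cost to every source storage server of the shard (inflating the tag's total by the replication factor), each replica is charged an even share. For example (assumed numbers), a 3-replica shard with a sampled cost of 300 now charges 100 per server rather than 300 each:

#include <iostream>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> storageServers{ "ss1", "ss2", "ss3" };
    double cost = 300.0;
    for (const auto& id : storageServers) // attribute an even share to each replica
        std::cout << id << " += " << cost / storageServers.size() << "\n"; // 100 each
}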

View File

@ -316,7 +316,7 @@ class ConfigNodeImpl {
ACTOR static Future<Void> getConfigClasses(ConfigNodeImpl* self, ConfigTransactionGetConfigClassesRequest req) {
state Optional<CoordinatorsHash> locked = wait(getLocked(self));
if (locked.present()) {
CODE_PROBE(true, "attempting to read config classes from locked ConfigNode");
CODE_PROBE(true, "attempting to read config classes from locked ConfigNode", probe::decoration::rare);
req.reply.sendError(coordinators_changed());
return Void();
}
@ -360,7 +360,7 @@ class ConfigNodeImpl {
ACTOR static Future<Void> getKnobs(ConfigNodeImpl* self, ConfigTransactionGetKnobsRequest req) {
state Optional<CoordinatorsHash> locked = wait(getLocked(self));
if (locked.present()) {
CODE_PROBE(true, "attempting to read knobs from locked ConfigNode");
CODE_PROBE(true, "attempting to read knobs from locked ConfigNode", probe::decoration::rare);
req.reply.sendError(coordinators_changed());
return Void();
}

View File

@ -531,7 +531,7 @@ struct LeaderRegisterCollection {
return Void();
}
Future<Void> onError() { return actors.getResult(); }
Future<Void> onError() const { return actors.getResult(); }
// Check if this coordinator is no longer the leader, and whether the new one was stored in the "forward" keyspace.
// If the "forward" keyspace was set some time ago (as configured by knob), log an error to indicate the client is

View File

@ -697,6 +697,9 @@ struct DDQueue : public IDDRelocationQueue {
RemoteTeamIsFull,
RemoteTeamIsNotHealthy,
NoAvailablePhysicalShard,
UnknownForceNew,
NoAnyHealthy,
DstOverloaded,
NumberOfTypes,
};
std::vector<int> retryFindDstReasonCount;
@ -1423,6 +1426,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
state double startTime = now();
state std::vector<UID> destIds;
state uint64_t debugID = deterministicRandom()->randomUInt64();
state bool enableShardMove = SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD;
try {
if (now() - self->lastInterval < 1.0) {
@ -1539,8 +1543,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
req.src = rd.src;
req.completeSources = rd.completeSources;
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
tciIndex == 1) {
if (enableShardMove && tciIndex == 1) {
ASSERT(physicalShardIDCandidate != UID().first() &&
physicalShardIDCandidate != anonymousShardId.first());
Optional<ShardsAffectedByTeamFailure::Team> remoteTeamWithPhysicalShard =
@ -1587,64 +1590,65 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
anyWithSource = true;
}
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
// critical to the correctness of team selection by PhysicalShardCollection
// tryGetAvailableRemoteTeamWith() enforce to select a remote team paired with a primary
// team Thus, tryGetAvailableRemoteTeamWith() may select an almost full remote team In this
// case, we must re-select a remote team We set foundTeams = false to avoid finishing team
// selection Then, forceToUseNewPhysicalShard is set, which enforce to use getTeam to select
// a remote team
if (enableShardMove) {
if (tciIndex == 1 && !forceToUseNewPhysicalShard) {
// critical to the correctness of team selection by PhysicalShardCollection:
// tryGetAvailableRemoteTeamWith() is forced to select a remote team paired with a
// primary team, so it may select an almost-full remote team. In this case we must
// re-select a remote team: we set foundTeams = false to avoid finishing team
// selection, and forceToUseNewPhysicalShard is then set, which forces getTeam to
// select a remote team
double minAvailableSpaceRatio = bestTeam.first.get()->getMinAvailableSpaceRatio(true);
if (minAvailableSpaceRatio < SERVER_KNOBS->TARGET_AVAILABLE_SPACE_RATIO) {
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsFull;
foundTeams = false;
break;
}
}
}
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
// critical to the correctness of team selection by PhysicalShardCollection:
// tryGetAvailableRemoteTeamWith() is forced to select a remote team paired with a
// primary team, so it may select an unhealthy remote team. In this case we must
// re-select a remote team: we set foundTeams = false to avoid finishing team
// selection, and forceToUseNewPhysicalShard is then set, which forces getTeam to
// select a remote team
if (!bestTeam.first.get()->isHealthy()) {
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy;
foundTeams = false;
break;
}
}
bestTeams.emplace_back(bestTeam.first.get(), true);
// Always set bestTeams[i].second = true to disable optimization in data move between DCs
// for the correctness of PhysicalShardCollection
// Currently, enabling the optimization will break the invariant of PhysicalShardCollection
// Invariant: once a physical shard is created with a specific set of SSes, this SS set will
// never get changed.
if (tciIndex == 0) {
ASSERT(foundTeams);
ShardsAffectedByTeamFailure::Team primaryTeam =
ShardsAffectedByTeamFailure::Team(bestTeams[0].first->getServerIDs(), true);
if (forceToUseNewPhysicalShard &&
retryFindDstReason == DDQueue::RetryFindDstReason::None) {
// This is an abnormal state where we are forced to create a new physical shard, but we
// don't know why. This reason tracks force-creation of a new physical shard for
// unknown causes.
retryFindDstReason = DDQueue::RetryFindDstReason::UnknownForceNew;
}
physicalShardIDCandidate =
self->physicalShardCollection->determinePhysicalShardIDGivenPrimaryTeam(
primaryTeam, metrics, forceToUseNewPhysicalShard, debugID);
ASSERT(physicalShardIDCandidate != UID().first() &&
physicalShardIDCandidate != anonymousShardId.first());
}
} else {
bestTeams.emplace_back(bestTeam.first.get(), bestTeam.second);
}
// get physicalShardIDCandidate
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
tciIndex == 0) {
ASSERT(foundTeams);
ShardsAffectedByTeamFailure::Team primaryTeam =
ShardsAffectedByTeamFailure::Team(bestTeams[0].first->getServerIDs(), true);
physicalShardIDCandidate =
self->physicalShardCollection->determinePhysicalShardIDGivenPrimaryTeam(
primaryTeam, metrics, forceToUseNewPhysicalShard, debugID);
ASSERT(physicalShardIDCandidate != UID().first() &&
physicalShardIDCandidate != anonymousShardId.first());
}
}
tciIndex++;
}
// critical to the correctness of team selection by PhysicalShardCollection
// tryGetAvailableRemoteTeamWith() enforce to select a remote team paired with a primary team
// Thus, tryGetAvailableRemoteTeamWith() may select an unhealthy remote team
// In this case, we must re-select a remote team
// We set foundTeams = false to avoid finishing team selection
// Then, forceToUseNewPhysicalShard is set, which enforce to use getTeam to select a remote team
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
bestTeams.size() > 1 && !forceToUseNewPhysicalShard) {
if (!bestTeams[1].first->isHealthy()) {
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy;
foundTeams = false;
}
}
// once we've found healthy candidate teams, make sure they're not overloaded with outstanding moves
// already
anyDestOverloaded = !canLaunchDest(bestTeams, rd.priority, self->destBusymap);
@ -1654,6 +1658,14 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
break;
}
if (retryFindDstReason == DDQueue::RetryFindDstReason::None && foundTeams) {
if (!anyHealthy) {
retryFindDstReason = DDQueue::RetryFindDstReason::NoAnyHealthy;
} else if (anyDestOverloaded) {
retryFindDstReason = DDQueue::RetryFindDstReason::DstOverloaded;
}
}
if (anyDestOverloaded) {
CODE_PROBE(true, "Destination overloaded throttled move");
destOverloadedCount++;
@ -1665,7 +1677,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
.detail("AnyDestOverloaded", anyDestOverloaded)
.detail("NumOfTeamCollections", self->teamCollections.size())
.detail("Servers", destServersString(bestTeams));
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
if (enableShardMove) {
if (rd.isRestore() && destOverloadedCount > 50) {
throw data_move_dest_team_not_found();
}
@ -1689,14 +1701,14 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
// When forceToUseNewPhysicalShard = false, we get paired primary team and remote team
// However, this may be failed
// Any retry triggers to use new physicalShard which enters the normal routine
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
if (enableShardMove) {
forceToUseNewPhysicalShard = true;
}
// TODO different trace event + knob for overloaded? Could wait on an async var for done moves
}
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
if (enableShardMove) {
if (!rd.isRestore()) {
// when !rd.isRestore(), dataMoveId is just decided as physicalShardIDCandidate
// thus, update the physicalShardIDCandidate to related data structures
@ -1954,7 +1966,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
self->shardsAffectedByTeamFailure->finishMove(rd.keys);
relocationComplete.send(rd);
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
if (enableShardMove) {
// update physical shard collection
std::vector<ShardsAffectedByTeamFailure::Team> selectedTeams;
for (int i = 0; i < bestTeams.size(); i++) {
@ -2525,6 +2537,12 @@ ACTOR Future<Void> dataDistributionQueue(Reference<IDDTxnProcessor> db,
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteTeamIsFull])
.detail("RemoteTeamIsNotHealthy",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy])
.detail("UnknownForceNew",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::UnknownForceNew])
.detail("NoAnyHealthy",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAnyHealthy])
.detail("DstOverloaded",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::DstOverloaded])
.detail(
"NoAvailablePhysicalShard",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAvailablePhysicalShard]);
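Editor's note: RetryFindDstReason is used as a dense index — NumberOfTypes is the final enumerator, retryFindDstReasonCount is sized to it, and each retry bumps one slot that the periodic trace event reads out. A minimal sketch of the pattern (enumerator list abridged):

#include <iostream>
#include <vector>

// Abridged: the real enum has more reasons; NumberOfTypes must stay last.
enum RetryFindDstReason {
    None = 0,
    RemoteTeamIsFull,
    RemoteTeamIsNotHealthy,
    UnknownForceNew,
    NoAnyHealthy,
    DstOverloaded,
    NoAvailablePhysicalShard,
    NumberOfTypes,
};

int main() {
    std::vector<int> retryFindDstReasonCount(NumberOfTypes, 0); // one counter per reason
    retryFindDstReasonCount[DstOverloaded]++;
    retryFindDstReasonCount[NoAnyHealthy]++;
    std::cout << "DstOverloaded=" << retryFindDstReasonCount[DstOverloaded]
              << " NoAnyHealthy=" << retryFindDstReasonCount[NoAnyHealthy] << "\n";
}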

View File

@ -623,7 +623,9 @@ std::vector<RangeToSplit> findTenantShardBoundaries(KeyRangeMap<ShardTrackedData
result.emplace_back(shardContainingTenantEnd, faultLines);
}
} else {
CODE_PROBE(true, "Shards that contain tenant key range not split since shard stats are unavailable");
CODE_PROBE(true,
"Shards that contain tenant key range not split since shard stats are unavailable",
probe::decoration::rare);
}
}
@ -1358,7 +1360,7 @@ ACTOR Future<Void> fetchTopKShardMetrics(DataDistributionTracker* self, GetTopKM
when(wait(g_network->isSimulated() && BUGGIFY_WITH_PROB(0.01) ? Never()
: fetchTopKShardMetrics_impl(self, req))) {}
when(wait(delay(SERVER_KNOBS->DD_SHARD_METRICS_TIMEOUT))) {
CODE_PROBE(true, "TopK DD_SHARD_METRICS_TIMEOUT", probe::decoration::rare);
CODE_PROBE(true, "TopK DD_SHARD_METRICS_TIMEOUT");
req.reply.send(GetTopKMetricsReply());
}
}
@ -2087,4 +2089,4 @@ TEST_CASE("/DataDistributor/Tracker/FetchTopK") {
ASSERT(reply.minReadLoad == -1);
return Void();
}
}

View File

@ -1538,14 +1538,18 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
when(DistributorSnapRequest snapReq = waitNext(di.distributorSnapReq.getFuture())) {
auto& snapUID = snapReq.snapUID;
if (ddSnapReqResultMap.count(snapUID)) {
CODE_PROBE(true, "Data distributor received a duplicate finished snapshot request");
CODE_PROBE(true,
"Data distributor received a duplicate finished snapshot request",
probe::decoration::rare);
auto result = ddSnapReqResultMap[snapUID];
result.isError() ? snapReq.reply.sendError(result.getError()) : snapReq.reply.send(result.get());
TraceEvent("RetryFinishedDistributorSnapRequest")
.detail("SnapUID", snapUID)
.detail("Result", result.isError() ? result.getError().code() : 0);
} else if (ddSnapReqMap.count(snapReq.snapUID)) {
CODE_PROBE(true, "Data distributor received a duplicate ongoing snapshot request");
CODE_PROBE(true,
"Data distributor received a duplicate ongoing snapshot request",
probe::decoration::rare);
TraceEvent("RetryOngoingDistributorSnapRequest").detail("SnapUID", snapUID);
ASSERT(snapReq.snapPayload == ddSnapReqMap[snapUID].snapPayload);
ddSnapReqMap[snapUID] = snapReq;

View File

@ -184,7 +184,7 @@ struct BlobMetadataCacheEntry {
explicit BlobMetadataCacheEntry(Standalone<BlobMetadataDetailsRef> metadataDetails)
: metadataDetails(metadataDetails), creationTimeSec(now()) {}
bool isValid() { return (now() - creationTimeSec) < SERVER_KNOBS->BLOB_METADATA_CACHE_TTL; }
bool isValid() const { return (now() - creationTimeSec) < SERVER_KNOBS->BLOB_METADATA_CACHE_TTL; }
};
// TODO: Bound the size of the cache (implement LRU/LFU...)

View File

@ -107,7 +107,7 @@ class GlobalTagThrottlerImpl {
if (opType == OpType::READ) {
readCost.setTotal(newCost);
} else {
writeCost.setTotal(CLIENT_KNOBS->GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO * newCost);
writeCost.setTotal(newCost);
}
}
@ -226,7 +226,9 @@ class GlobalTagThrottlerImpl {
return {};
}
auto const transactionRate = stats.get().getTransactionRate();
if (transactionRate == 0.0) {
// If there is less than one transaction per second, we do not have enough data
// to accurately compute an average transaction cost.
if (transactionRate < 1.0) {
return {};
} else {
return std::max(static_cast<double>(CLIENT_KNOBS->TAG_THROTTLING_PAGE_SIZE), cost.get() / transactionRate);
@ -475,7 +477,7 @@ public:
if (targetTps.present()) {
auto const smoothedTargetTps = stats.updateAndGetTargetLimit(targetTps.get());
te.detail("SmoothedTargetTps", smoothedTargetTps).detail("NumProxies", numProxies);
result[tag] = smoothedTargetTps / numProxies;
result[tag] = std::max(1.0, smoothedTargetTps / numProxies);
} else {
te.disable();
}
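Editor's note: two guards appear in this file — the average cost per transaction is only computed once the rate reaches 1 tps and is floored at the page size, and the per-proxy target rate is floored at 1 tps so a throttled tag is never starved completely. Worked numbers under assumed knob values:

#include <algorithm>
#include <iostream>
#include <optional>

std::optional<double> costPerTransaction(double cost, double transactionRate, double pageSize) {
    if (transactionRate < 1.0)
        return std::nullopt; // too little data for a meaningful average
    return std::max(pageSize, cost / transactionRate);
}

int main() {
    const double pageSize = 4096.0; // assumed stand-in for TAG_THROTTLING_PAGE_SIZE
    std::cout << costPerTransaction(40960.0, 20.0, pageSize).value() << "\n";  // 4096 (floored)
    std::cout << costPerTransaction(409600.0, 20.0, pageSize).value() << "\n"; // 20480
    const double smoothedTargetTps = 5.0, numProxies = 10.0;
    std::cout << std::max(1.0, smoothedTargetTps / numProxies) << "\n"; // floored at 1 tps per proxy
}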

View File

@ -49,7 +49,7 @@ bool GrvProxyTagThrottler::TagQueue::isMaxThrottled(double maxThrottleDuration)
}
void GrvProxyTagThrottler::TagQueue::rejectRequests(LatencyBandsMap& latencyBandsMap) {
CODE_PROBE(true, "GrvProxyTagThrottler rejecting requests");
CODE_PROBE(true, "GrvProxyTagThrottler rejecting requests", probe::decoration::rare);
while (!requests.empty()) {
auto& delayedReq = requests.front();
delayedReq.updateProxyTagThrottledDuration(latencyBandsMap);
@ -58,6 +58,14 @@ void GrvProxyTagThrottler::TagQueue::rejectRequests(LatencyBandsMap& latencyBand
}
}
void GrvProxyTagThrottler::TagQueue::endReleaseWindow(int64_t numStarted, double elapsed) {
if (rateInfo.present()) {
CODE_PROBE(requests.empty(), "Tag queue ending release window with empty request queue");
CODE_PROBE(!requests.empty(), "Tag queue ending release window with requests still queued");
rateInfo.get().endReleaseWindow(numStarted, requests.empty(), elapsed);
}
}
GrvProxyTagThrottler::GrvProxyTagThrottler(double maxThrottleDuration)
: maxThrottleDuration(maxThrottleDuration),
latencyBandsMap("GrvProxyTagThrottler",
@ -202,16 +210,14 @@ void GrvProxyTagThrottler::releaseTransactions(double elapsed,
}
}
// End release windows for queues with valid rateInfo
// End release windows for all tag queues
{
TransactionTagMap<uint32_t> transactionsReleasedMap;
for (const auto& [tag, count] : transactionsReleased) {
transactionsReleasedMap[tag] = count;
}
for (auto& [tag, queue] : queues) {
if (queue.rateInfo.present()) {
queue.rateInfo.get().endReleaseWindow(transactionsReleasedMap[tag], false, elapsed);
}
queue.endReleaseWindow(transactionsReleasedMap[tag], elapsed);
}
}
// If the capacity is increased, that means the vector has been illegally resized, potentially
@ -438,3 +444,33 @@ TEST_CASE("/GrvProxyTagThrottler/Fifo") {
wait(mockFifoClient(&throttler));
return Void();
}
// Tests that while throughput is low, the tag throttler
// does not accumulate too much budget.
//
// A server is set up to serve 10 transactions per second,
// then runs idly for 60 seconds. Then a client starts
// and attempts 20 transactions per second for 60 seconds.
// The server throttles the client to only achieve
// 10 transactions per second during this 60 second window.
// If the throttler is allowed to accumulate budget indefinitely
// during the idle 60 seconds, this test will fail.
TEST_CASE("/GrvProxyTagThrottler/LimitedIdleBudget") {
state GrvProxyTagThrottler throttler(5.0);
state TagSet tagSet;
state TransactionTagMap<uint32_t> counters;
{
TransactionTagMap<double> rates;
rates["sampleTag"_sr] = 10.0;
throttler.updateRates(rates);
}
tagSet.addTag("sampleTag"_sr);
state Future<Void> server = mockServer(&throttler);
wait(delay(60.0));
state Future<Void> client = mockClient(&throttler, TransactionPriority::DEFAULT, tagSet, 1, 20.0, &counters);
wait(timeout(client && server, 60.0, Void()));
TraceEvent("TagQuotaTest_LimitedIdleBudget").detail("Counter", counters["sampleTag"_sr]);
ASSERT(isNear(counters["sampleTag"_sr], 60.0 * 10.0));
return Void();
}

View File

@ -35,7 +35,7 @@ bool GrvTransactionRateInfo::canStart(int64_t numAlreadyStarted, int64_t count)
std::min(limit + budget, SERVER_KNOBS->START_TRANSACTION_MAX_TRANSACTIONS_TO_START);
}
void GrvTransactionRateInfo::endReleaseWindow(int64_t numStartedAtPriority, bool queueEmptyAtPriority, double elapsed) {
void GrvTransactionRateInfo::endReleaseWindow(int64_t numStarted, bool queueEmpty, double elapsed) {
// Update the budget to accumulate any extra capacity available or remove any excess that was used.
// The actual delta is the portion of the limit we didn't use multiplied by the fraction of the rate window that
// elapsed.
@ -52,16 +52,15 @@ void GrvTransactionRateInfo::endReleaseWindow(int64_t numStartedAtPriority, bool
//
// Note that "rate window" here indicates a period of SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW seconds,
// whereas "release window" is the period between wait statements, with duration indicated by "elapsed."
budget =
std::max(0.0, budget + elapsed * (limit - numStartedAtPriority) / SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW);
budget = std::max(0.0, budget + elapsed * (limit - numStarted) / SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW);
// If we are emptying out the queue of requests, then we don't need to carry much budget forward
// If we did keep accumulating budget, then our responsiveness to changes in workflow could be compromised
if (queueEmptyAtPriority) {
if (queueEmpty) {
budget = std::min(budget, SERVER_KNOBS->START_TRANSACTION_MAX_EMPTY_QUEUE_BUDGET);
}
smoothReleased.addDelta(numStartedAtPriority);
smoothReleased.addDelta(numStarted);
}
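Editor's note: the update above is budget = max(0, budget + elapsed * (limit - numStarted) / START_TRANSACTION_RATE_WINDOW), clamped to START_TRANSACTION_MAX_EMPTY_QUEUE_BUDGET whenever the queue drained; this clamp is what the LimitedIdleBudget test exercises. Worked numbers under assumed knob values:

#include <algorithm>
#include <iostream>

int main() {
    // Assumed knob values for illustration only.
    const double rateWindow = 2.0;      // START_TRANSACTION_RATE_WINDOW
    const double maxEmptyBudget = 10.0; // START_TRANSACTION_MAX_EMPTY_QUEUE_BUDGET
    double budget = 0.0, limit = 10.0;
    // 60 idle 1-second release windows: nothing started, queue always empty.
    for (int i = 0; i < 60; ++i) {
        budget = std::max(0.0, budget + 1.0 * (limit - 0) / rateWindow);
        budget = std::min(budget, maxEmptyBudget); // queueEmpty: cap the carried budget
    }
    std::cout << budget << "\n"; // 10, not 300: idle time cannot bank unbounded capacity
}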
void GrvTransactionRateInfo::disable() {

View File

@ -740,7 +740,8 @@ private:
}
CODE_PROBE(self->enableEncryption && self->uncommittedBytes() > 0,
"KeyValueStoreMemory recovered partial transaction while encryption-at-rest is enabled");
"KeyValueStoreMemory recovered partial transaction while encryption-at-rest is enabled",
probe::decoration::rare);
self->semiCommit();
return Void();

View File

@ -149,7 +149,22 @@ struct PageChecksumCodec {
}
if (!silent) {
TraceEvent trEvent(SevError, "SQLitePageChecksumFailure");
auto severity = SevError;
if (g_network->isSimulated()) {
auto firstBlock = pageNumber == 1 ? 0 : ((pageNumber - 1) * pageLen) / 4096,
lastBlock = (pageNumber * pageLen) / 4096;
auto iter = g_simulator->corruptedBlocks.lower_bound(std::make_pair(filename, firstBlock));
if (iter != g_simulator->corruptedBlocks.end() && iter->first == filename && iter->second < lastBlock) {
severity = SevWarnAlways;
// only dereference iter inside the guard; it may be end()
TraceEvent("CheckCorruption")
.detail("Filename", filename)
.detail("NextFile", iter->first)
.detail("FirstBlock", firstBlock)
.detail("LastBlock", lastBlock)
.detail("NextBlock", iter->second);
}
}
TraceEvent trEvent(severity, "SQLitePageChecksumFailure");
trEvent.error(checksum_failed())
.detail("CodecPageSize", pageSize)
.detail("CodecReserveSize", reserveSize)

View File

@ -321,7 +321,7 @@ void LogPushData::writeMessage(StringRef rawMessageWithoutLength, bool usePrevio
}
}
std::vector<Standalone<StringRef>> LogPushData::getAllMessages() {
std::vector<Standalone<StringRef>> LogPushData::getAllMessages() const {
std::vector<Standalone<StringRef>> results;
results.reserve(messagesWriter.size());
for (int loc = 0; loc < messagesWriter.size(); loc++) {

View File

@ -451,7 +451,7 @@ Future<Void> ILogSystem::ServerPeekCursor::getMore(TaskPriority taskID) {
return more;
}
ACTOR Future<Void> serverPeekOnFailed(ILogSystem::ServerPeekCursor* self) {
ACTOR Future<Void> serverPeekOnFailed(ILogSystem::ServerPeekCursor const* self) {
loop {
choose {
when(wait(self->interf->get().present()
@ -471,7 +471,7 @@ ACTOR Future<Void> serverPeekOnFailed(ILogSystem::ServerPeekCursor* self) {
}
}
Future<Void> ILogSystem::ServerPeekCursor::onFailed() {
Future<Void> ILogSystem::ServerPeekCursor::onFailed() const {
return serverPeekOnFailed(this);
}
@ -757,7 +757,7 @@ Future<Void> ILogSystem::MergedPeekCursor::getMore(TaskPriority taskID) {
return more;
}
Future<Void> ILogSystem::MergedPeekCursor::onFailed() {
Future<Void> ILogSystem::MergedPeekCursor::onFailed() const {
ASSERT(false);
return Never();
}
@ -1114,7 +1114,7 @@ Future<Void> ILogSystem::SetPeekCursor::getMore(TaskPriority taskID) {
return more;
}
Future<Void> ILogSystem::SetPeekCursor::onFailed() {
Future<Void> ILogSystem::SetPeekCursor::onFailed() const {
ASSERT(false);
return Never();
}
@ -1226,7 +1226,7 @@ Future<Void> ILogSystem::MultiCursor::getMore(TaskPriority taskID) {
return cursors.back()->getMore(taskID);
}
Future<Void> ILogSystem::MultiCursor::onFailed() {
Future<Void> ILogSystem::MultiCursor::onFailed() const {
return cursors.back()->onFailed();
}
@ -1503,7 +1503,7 @@ Future<Void> ILogSystem::BufferedCursor::getMore(TaskPriority taskID) {
return more;
}
Future<Void> ILogSystem::BufferedCursor::onFailed() {
Future<Void> ILogSystem::BufferedCursor::onFailed() const {
ASSERT(false);
return Never();
}

View File

@ -131,16 +131,16 @@ void MockStorageServer::setShardStatus(KeyRangeRef range, MockShardStatus status
auto ranges = serverKeys.intersectingRanges(range);
ASSERT(!ranges.empty());
if (ranges.begin().range().contains(range)) {
CODE_PROBE(true, "Implicitly split single shard to 3 pieces");
CODE_PROBE(true, "Implicitly split single shard to 3 pieces", probe::decoration::rare);
threeWayShardSplitting(ranges.begin().range(), range, ranges.begin().cvalue().shardSize, restrictSize);
return;
}
if (ranges.begin().begin() < range.begin) {
CODE_PROBE(true, "Implicitly split begin range to 2 pieces");
CODE_PROBE(true, "Implicitly split begin range to 2 pieces", probe::decoration::rare);
twoWayShardSplitting(ranges.begin().range(), range.begin, ranges.begin().cvalue().shardSize, restrictSize);
}
if (ranges.end().end() > range.end) {
CODE_PROBE(true, "Implicitly split end range to 2 pieces");
CODE_PROBE(true, "Implicitly split end range to 2 pieces", probe::decoration::rare);
twoWayShardSplitting(ranges.end().range(), range.end, ranges.end().cvalue().shardSize, restrictSize);
}
ranges = serverKeys.containedRanges(range);
@ -156,7 +156,7 @@ void MockStorageServer::setShardStatus(KeyRangeRef range, MockShardStatus status
if (isStatusTransitionValid(oldStatus, status)) {
it.value() = ShardInfo{ status, newSize };
} else if (oldStatus == MockShardStatus::COMPLETED && status == MockShardStatus::INFLIGHT) {
CODE_PROBE(true, "Shard already on server");
CODE_PROBE(true, "Shard already on server", probe::decoration::rare);
} else {
TraceEvent(SevError, "MockShardStatusTransitionError")
.detail("From", oldStatus)
@ -382,7 +382,7 @@ Future<std::vector<KeyRangeLocationInfo>> MockGlobalState::getKeyRangeLocations(
ASSERT_EQ(srcTeam.size(), 1);
rep.results.emplace_back(it->range(), extractStorageServerInterfaces(srcTeam.front().servers));
}
CODE_PROBE(it != ranges.end(), "getKeyRangeLocations is limited", probe::decoration::rare);
CODE_PROBE(it != ranges.end(), "getKeyRangeLocations is limited");
std::vector<KeyRangeLocationInfo> results;
for (int shard = 0; shard < rep.results.size(); shard++) {

View File

@ -802,11 +802,13 @@ ACTOR Future<Void> waitForShardReady(StorageServerInterface server,
try {
GetShardStateReply rep =
wait(server.getShardState.getReply(GetShardStateRequest(keys, mode), TaskPriority::MoveKeys));
TraceEvent("GetShardStateReadyDD").detail("RepVersion", rep.first).detail("MinVersion", rep.second).log();
if (rep.first >= minVersion) {
return Void();
}
wait(delayJittered(SERVER_KNOBS->SHARD_READY_DELAY, TaskPriority::MoveKeys));
} catch (Error& e) {
TraceEvent("GetShardStateReadyError").error(e).log();
if (e.code() != error_code_timed_out) {
if (e.code() != error_code_broken_promise)
throw e;
@ -1699,7 +1701,9 @@ ACTOR static Future<Void> finishMoveShards(Database occ,
state std::vector<UID> newDestinations;
std::set<UID> completeSrcSet(completeSrc.begin(), completeSrc.end());
for (const UID& id : destServers) {
newDestinations.push_back(id);
if (!hasRemote || !completeSrcSet.count(id)) {
newDestinations.push_back(id);
}
}
state std::vector<StorageServerInterface> storageServerInterfaces;
@ -1743,7 +1747,8 @@ ACTOR static Future<Void> finishMoveShards(Database occ,
TraceEvent(SevVerbose, "FinishMoveShardsWaitedServers", relocationIntervalId)
.detail("DataMoveID", dataMoveId)
.detail("ReadyServers", describe(readyServers));
.detail("ReadyServers", describe(readyServers))
.detail("NewDestinations", describe(newDestinations));
if (readyServers.size() == newDestinations.size()) {
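Editor's note: the filter above shrinks the wait set — when the configuration has a remote region (hasRemote) and a destination server already holds a complete copy (membership in completeSrcSet), there is nothing to wait for on that server. The filter in isolation:

#include <iostream>
#include <set>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> destServers{ "a", "b", "c" };
    std::set<std::string> completeSrcSet{ "b" }; // already holds a complete copy
    bool hasRemote = true;
    std::vector<std::string> newDestinations;
    for (const auto& id : destServers)
        if (!hasRemote || !completeSrcSet.count(id)) // skip servers that already have the data
            newDestinations.push_back(id);
    for (const auto& id : newDestinations)
        std::cout << id << "\n"; // a, c
}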

View File

@ -196,7 +196,7 @@ private:
Standalone<StringRef> e = wait(self->queue->readNext(payloadSize + 1));
if (e.size() != payloadSize + 1) {
CODE_PROBE(true, "Zero fill within payload");
CODE_PROBE(true, "Zero fill within payload", probe::decoration::rare);
zeroFillSize = payloadSize + 1 - e.size();
break;
}
@ -210,7 +210,7 @@ private:
}
}
if (zeroFillSize) {
CODE_PROBE(true, "Fixing a partial commit at the end of the tlog queue");
CODE_PROBE(true, "Fixing a partial commit at the end of the tlog queue", probe::decoration::rare);
for (int i = 0; i < zeroFillSize; i++)
self->queue->push(StringRef((const uint8_t*)"", 1));
}
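Editor's note: this recovery idiom (repeated in the other TLog versions below) expects payloadSize + 1 bytes — payload plus a trailing validity byte. A short read means the commit was torn at the end of the queue, so the missing bytes are pushed back as zeros to leave a well-formed but invalid final entry. Schematically:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    const int payloadSize = 8;
    std::vector<uint8_t> entry{ 1, 2, 3, 4, 5 }; // only 5 of payloadSize + 1 bytes survived
    int zeroFillSize = payloadSize + 1 - (int)entry.size();
    for (int i = 0; i < zeroFillSize; i++)
        entry.push_back(0); // zero-fill so the queue ends on a complete, invalid entry
    std::cout << "filled " << zeroFillSize << " bytes, entry is now " << entry.size() << " bytes\n";
}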

View File

@ -170,7 +170,7 @@ private:
Standalone<StringRef> e = wait(self->queue->readNext(payloadSize + 1));
if (e.size() != payloadSize + 1) {
CODE_PROBE(true, "Zero fill within payload");
CODE_PROBE(true, "Zero fill within payload", probe::decoration::rare);
zeroFillSize = payloadSize + 1 - e.size();
break;
}
@ -186,7 +186,7 @@ private:
}
}
if (zeroFillSize) {
CODE_PROBE(true, "Fixing a partial commit at the end of the tlog queue");
CODE_PROBE(true, "Fixing a partial commit at the end of the tlog queue", probe::decoration::rare);
for (int i = 0; i < zeroFillSize; i++)
self->queue->push(StringRef((const uint8_t*)"", 1));
}

View File

@ -289,11 +289,7 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self,
// Detect conflicts
double expire = now() + SERVER_KNOBS->SAMPLE_EXPIRATION_TIME;
ConflictBatch conflictBatch(self->conflictSet, &reply.conflictingKeyRangeMap, &reply.arena);
Version newOldestVersion = req.version - SERVER_KNOBS->MAX_WRITE_TRANSACTION_LIFE_VERSIONS;
if (g_network->isSimulated() && g_simulator->speedUpSimulation) {
newOldestVersion = req.version - std::max(5 * SERVER_KNOBS->VERSIONS_PER_SECOND,
SERVER_KNOBS->MAX_WRITE_TRANSACTION_LIFE_VERSIONS);
}
const Version newOldestVersion = req.version - SERVER_KNOBS->MAX_WRITE_TRANSACTION_LIFE_VERSIONS;
for (int t = 0; t < req.transactions.size(); t++) {
conflictBatch.addTransaction(req.transactions[t], newOldestVersion);
self->resolvedReadConflictRanges += req.transactions[t].read_conflict_ranges.size();
@ -372,7 +368,7 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self,
isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION) ? &cipherKeys
: nullptr);
}
CODE_PROBE(self->forceRecovery, "Resolver detects forced recovery");
CODE_PROBE(self->forceRecovery, "Resolver detects forced recovery", probe::decoration::rare);
}
self->resolvedStateTransactions += req.txnStateTransactions.size();

View File

@ -2283,6 +2283,19 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
}
deterministicRandom()->randomShuffle(coordinatorAddresses);
for (const auto& coordinators : extraCoordinatorAddresses) {
for (int i = 0; i < (coordinators.size() / 2) + 1; i++) {
TraceEvent("ProtectCoordinator")
.detail("Address", coordinators[i])
.detail("Coordinators", describe(coordinators));
g_simulator->protectedAddresses.insert(
NetworkAddress(coordinators[i].ip, coordinators[i].port, true, coordinators[i].isTLS()));
if (coordinators[i].port == 2) {
g_simulator->protectedAddresses.insert(NetworkAddress(coordinators[i].ip, 1, true, true));
}
}
}
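The loop above shields a majority of each extra cluster's coordinators from simulated kills; the quorum arithmetic is plain integer majority:
// For n coordinators, protecting (n / 2) + 1 of them guarantees the simulator
// can never kill enough coordinators to break the quorum.
int protectedCoordinatorCount(int n) {
    return n / 2 + 1; // e.g. n = 5 -> protect 3
}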
ASSERT_EQ(coordinatorAddresses.size(), coordinatorCount);
ClusterConnectionString conn(coordinatorAddresses, "TestCluster:0"_sr);
if (useHostname) {

View File

@ -172,7 +172,7 @@ private:
Standalone<StringRef> e = wait(self->queue->readNext(payloadSize + 1));
if (e.size() != payloadSize + 1) {
CODE_PROBE(true, "Zero fill within payload");
CODE_PROBE(true, "Zero fill within payload", probe::decoration::rare);
zeroFillSize = payloadSize + 1 - e.size();
break;
}
@ -188,7 +188,7 @@ private:
}
}
if (zeroFillSize) {
CODE_PROBE(true, "Fixing a partial commit at the end of the tlog queue");
CODE_PROBE(true, "Fixing a partial commit at the end of the tlog queue", probe::decoration::rare);
for (int i = 0; i < zeroFillSize; i++)
self->queue->push(StringRef((const uint8_t*)"", 1));
}
@ -1262,7 +1262,7 @@ ACTOR Future<Void> processPopRequests(TLogData* self, Reference<LogData> logData
TraceEvent("PlayIgnoredPop", logData->logId).detail("Tag", tag.toString()).detail("Version", version);
ignoredPops.push_back(tLogPopCore(self, tag, version, logData));
if (++ignoredPopsPlayed % SERVER_KNOBS->TLOG_POP_BATCH_SIZE == 0) {
CODE_PROBE(true, "Yielding while processing pop requests");
CODE_PROBE(true, "Yielding while processing pop requests", probe::decoration::rare);
wait(yield());
}
}
@ -1857,7 +1857,8 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
}
if (sequenceData.isSet()) {
if (sequenceData.getFuture().get().first != rep.end) {
CODE_PROBE(true, "tlog peek second attempt ended at a different version");
CODE_PROBE(
true, "tlog peek second attempt ended at a different version", probe::decoration::rare);
replyPromise.sendError(operation_obsolete());
return Void();
}

View File

@ -305,7 +305,7 @@ Reference<ILogSystem> TagPartitionedLogSystem::fromOldLogSystemConfig(UID const&
return logSystem;
}
void TagPartitionedLogSystem::toCoreState(DBCoreState& newState) {
void TagPartitionedLogSystem::toCoreState(DBCoreState& newState) const {
if (recoveryComplete.isValid() && recoveryComplete.isError())
throw recoveryComplete.getError();
@ -343,11 +343,11 @@ void TagPartitionedLogSystem::toCoreState(DBCoreState& newState) {
newState.logSystemType = logSystemType;
}
bool TagPartitionedLogSystem::remoteStorageRecovered() {
bool TagPartitionedLogSystem::remoteStorageRecovered() const {
return remoteRecoveryComplete.isValid() && remoteRecoveryComplete.isReady();
}
Future<Void> TagPartitionedLogSystem::onCoreStateChanged() {
Future<Void> TagPartitionedLogSystem::onCoreStateChanged() const {
std::vector<Future<Void>> changes;
changes.push_back(Never());
if (recoveryComplete.isValid() && !recoveryComplete.isReady()) {
@ -376,11 +376,11 @@ void TagPartitionedLogSystem::coreStateWritten(DBCoreState const& newState) {
}
}
Future<Void> TagPartitionedLogSystem::onError() {
Future<Void> TagPartitionedLogSystem::onError() const {
return onError_internal(this);
}
ACTOR Future<Void> TagPartitionedLogSystem::onError_internal(TagPartitionedLogSystem* self) {
ACTOR Future<Void> TagPartitionedLogSystem::onError_internal(TagPartitionedLogSystem const* self) {
// Never returns normally, but throws an error if the subsystem stops working
loop {
std::vector<Future<Void>> failed;

View File

@ -92,6 +92,8 @@ static FILE* g_debugStream = stdout;
#define TRACE \
debug_printf_always("%s: %s line %d %s\n", __FUNCTION__, __FILE__, __LINE__, platform::get_backtrace().c_str());
using namespace std::string_view_literals;
// Returns a string where every line in lines is prefixed with prefix
std::string addPrefix(std::string prefix, std::string lines) {
StringRef m = lines;
@ -489,7 +491,7 @@ public:
}
// Returns true if the mutex cannot be immediately taken.
bool isBusy() { return !mutex.available(); }
bool isBusy() const { return !mutex.available(); }
// Wait for all operations started before now to be ready, which is done by
// obtaining and releasing the mutex.
@ -1026,17 +1028,31 @@ public:
// These pages are not encrypted
page->postReadPayload(c.pageID);
} catch (Error& e) {
TraceEvent(SevError, "RedwoodChecksumFailed")
bool isInjected = false;
if (g_network->isSimulated()) {
auto num4kBlocks = std::max(self->pager->getPhysicalPageSize() / 4096, 1);
auto startBlock = (c.pageID * self->pager->getPhysicalPageSize()) / 4096;
auto iter = g_simulator->corruptedBlocks.lower_bound(
std::make_pair(self->pager->getName(), startBlock));
if (iter->first == self->pager->getName() && iter->second < startBlock + num4kBlocks) {
isInjected = true;
}
}
TraceEvent(isInjected ? SevWarnAlways : SevError, "RedwoodChecksumFailed")
.error(e)
.detail("PageID", c.pageID)
.detail("PageSize", self->pager->getPhysicalPageSize())
.detail("Offset", c.pageID * self->pager->getPhysicalPageSize());
.detail("Offset", c.pageID * self->pager->getPhysicalPageSize())
.detail("Filename", self->pager->getName());
debug_printf("FIFOQueue::Cursor(%s) peekALLExt getSubPage error=%s for %s. Offset %d ",
c.toString().c_str(),
e.what(),
toString(c.pageID).c_str(),
c.pageID * self->pager->getPhysicalPageSize());
if (isInjected) {
throw e.asInjectedFault();
}
throw;
}
@ -1168,7 +1184,7 @@ public:
headWriter.write(item);
}
bool isBusy() {
bool isBusy() const {
return headWriter.isBusy() || headReader.isBusy() || tailWriter.isBusy() || !newTailPage.isReady();
}
@ -2025,7 +2041,8 @@ public:
bool memoryOnly,
Reference<IPageEncryptionKeyProvider> keyProvider,
Promise<Void> errorPromise = {})
: keyProvider(keyProvider), ioLock(FLOW_KNOBS->MAX_OUTSTANDING, SERVER_KNOBS->REDWOOD_PRIORITY_LAUNCHS),
: keyProvider(keyProvider),
ioLock(makeReference<PriorityMultiLock>(FLOW_KNOBS->MAX_OUTSTANDING, SERVER_KNOBS->REDWOOD_IO_PRIORITIES)),
pageCacheBytes(pageCacheSizeBytes), desiredPageSize(desiredPageSize), desiredExtentSize(desiredExtentSize),
filename(filename), memoryOnly(memoryOnly), errorPromise(errorPromise),
remapCleanupWindowBytes(remapCleanupWindowBytes), concurrentExtentReads(new FlowLock(concurrentExtentReads)) {
@ -2037,7 +2054,7 @@ public:
// This sets the page cache size for all PageCacheT instances using the same evictor
pageCache.evictor().sizeLimit = pageCacheBytes;
g_redwoodMetrics.ioLock = &ioLock;
g_redwoodMetrics.ioLock = ioLock.getPtr();
if (!g_redwoodMetricsActor.isValid()) {
g_redwoodMetricsActor = redwoodMetricsLogger();
}
@ -2499,7 +2516,7 @@ public:
unsigned int level,
bool header) {
state PriorityMultiLock::Lock lock = wait(self->ioLock.lock(header ? ioMaxPriority : ioMinPriority));
state PriorityMultiLock::Lock lock = wait(self->ioLock->lock(header ? ioMaxPriority : ioMinPriority));
++g_redwoodMetrics.metric.pagerDiskWrite;
g_redwoodMetrics.level(level).metrics.events.addEventReason(PagerEvents::PageWrite, reason);
if (self->memoryOnly) {
@ -2779,7 +2796,7 @@ public:
int blockSize,
int64_t offset,
int priority) {
state PriorityMultiLock::Lock lock = wait(self->ioLock.lock(std::min(priority, ioMaxPriority)));
state PriorityMultiLock::Lock lock = wait(self->ioLock->lock(std::min(priority, ioMaxPriority)));
++g_redwoodMetrics.metric.pagerDiskRead;
int bytes = wait(self->pageFile->read(pageBuffer->rawData() + pageOffset, blockSize, offset));
return bytes;
@ -3593,7 +3610,7 @@ public:
// The next section explicitly cancels all pending operations held in the pager
debug_printf("DWALPager(%s) shutdown kill ioLock\n", self->filename.c_str());
self->ioLock.kill();
self->ioLock->kill();
debug_printf("DWALPager(%s) shutdown cancel recovery\n", self->filename.c_str());
self->recoverFuture.cancel();
@ -3802,7 +3819,7 @@ private:
Reference<IPageEncryptionKeyProvider> keyProvider;
PriorityMultiLock ioLock;
Reference<PriorityMultiLock> ioLock;
int64_t pageCacheBytes;
@ -8894,32 +8911,25 @@ void RedwoodMetrics::getIOLockFields(TraceEvent* e, std::string* s) {
int maxPriority = ioLock->maxPriority();
if (e != nullptr) {
e->detail("ActiveReads", ioLock->totalRunners());
e->detail("AwaitReads", ioLock->totalWaiters());
e->detail("IOActiveTotal", ioLock->getRunnersCount());
e->detail("IOWaitingTotal", ioLock->getWaitersCount());
for (int priority = 0; priority <= maxPriority; ++priority) {
e->detail(format("ActiveP%d", priority), ioLock->numRunners(priority));
e->detail(format("AwaitP%d", priority), ioLock->numWaiters(priority));
e->detail(format("IOActiveP%d", priority), ioLock->getRunnersCount(priority));
e->detail(format("IOWaitingP%d", priority), ioLock->getWaitersCount(priority));
}
}
if (s != nullptr) {
std::string active = "Active";
std::string await = "Await";
*s += "\n";
*s += format("%-15s %-8u ", "ActiveReads", ioLock->totalRunners());
*s += format("%-15s %-8u ", "AwaitReads", ioLock->totalWaiters());
*s += "\n";
*s += format("%-15s %-8u ", "IOActiveTotal", ioLock->getRunnersCount());
for (int priority = 0; priority <= maxPriority; ++priority) {
*s +=
format("%-15s %-8u ", (active + 'P' + std::to_string(priority)).c_str(), ioLock->numRunners(priority));
*s += format("IOActiveP%-6d %-8u ", priority, ioLock->getRunnersCount(priority));
}
*s += "\n";
*s += format("%-15s %-8u ", "IOWaitingTotal", ioLock->getWaitersCount());
for (int priority = 0; priority <= maxPriority; ++priority) {
*s +=
format("%-15s %-8u ", (await + 'P' + std::to_string(priority)).c_str(), ioLock->numWaiters(priority));
*s += format("IOWaitingP%-5d %-8u ", priority, ioLock->getWaitersCount(priority));
}
}
}
@ -11407,57 +11417,3 @@ TEST_CASE(":/redwood/performance/histograms") {
return Void();
}
ACTOR Future<Void> waitLockIncrement(PriorityMultiLock* pml, int priority, int* pout) {
state PriorityMultiLock::Lock lock = wait(pml->lock(priority));
wait(delay(deterministicRandom()->random01() * .1));
++*pout;
return Void();
}
TEST_CASE("/redwood/PriorityMultiLock") {
state std::vector<int> priorities = { 10, 20, 40 };
state int concurrency = 25;
state PriorityMultiLock* pml = new PriorityMultiLock(concurrency, priorities);
state std::vector<int> counts;
counts.resize(priorities.size(), 0);
// Clog the lock by taking concurrency locks at each level
state std::vector<Future<PriorityMultiLock::Lock>> lockFutures;
for (int i = 0; i < priorities.size(); ++i) {
for (int j = 0; j < concurrency; ++j) {
lockFutures.push_back(pml->lock(i));
}
}
// Wait for n = concurrency locks to be acquired
wait(quorum(lockFutures, concurrency));
state std::vector<Future<Void>> futures;
for (int i = 0; i < 10e3; ++i) {
int p = i % priorities.size();
futures.push_back(waitLockIncrement(pml, p, &counts[p]));
}
state Future<Void> f = waitForAll(futures);
// Release the locks
lockFutures.clear();
// Print stats and wait for all futures to be ready
loop {
choose {
when(wait(delay(1))) {
printf("counts: ");
for (auto c : counts) {
printf("%d ", c);
}
printf(" pml: %s\n", pml->toString().c_str());
}
when(wait(f)) { break; }
}
}
delete pml;
return Void();
}

View File

@ -60,6 +60,7 @@ class GrvProxyTagThrottler {
void setRate(double rate);
bool isMaxThrottled(double maxThrottleDuration) const;
void rejectRequests(LatencyBandsMap&);
void endReleaseWindow(int64_t numStarted, double elapsed);
};
// Track the budgets for each tag

View File

@ -55,7 +55,7 @@ public:
// Updates the budget to accumulate any extra capacity available or remove any excess that was used.
// Call at the end of a release window.
void endReleaseWindow(int64_t numStartedAtPriority, bool queueEmptyAtPriority, double elapsed);
void endReleaseWindow(int64_t numStarted, bool queueEmpty, double elapsed);
// Smoothly sets rate. If currently disabled, reenable
void setRate(double rate);

View File

@ -163,7 +163,7 @@ struct ILogSystem {
virtual Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply) = 0;
// returns when the failure monitor detects that the servers associated with the cursor are failed
virtual Future<Void> onFailed() = 0;
virtual Future<Void> onFailed() const = 0;
// returns false if:
// (1) the failure monitor detects that the servers associated with the cursor is failed
@ -251,7 +251,7 @@ struct ILogSystem {
VectorRef<Tag> getTags() const override;
void advanceTo(LogMessageVersion n) override;
Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply) override;
Future<Void> onFailed() override;
Future<Void> onFailed() const override;
bool isActive() const override;
bool isExhausted() const override;
const LogMessageVersion& version() const override;
@ -313,7 +313,7 @@ struct ILogSystem {
VectorRef<Tag> getTags() const override;
void advanceTo(LogMessageVersion n) override;
Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply) override;
Future<Void> onFailed() override;
Future<Void> onFailed() const override;
bool isActive() const override;
bool isExhausted() const override;
const LogMessageVersion& version() const override;
@ -369,7 +369,7 @@ struct ILogSystem {
VectorRef<Tag> getTags() const override;
void advanceTo(LogMessageVersion n) override;
Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply) override;
Future<Void> onFailed() override;
Future<Void> onFailed() const override;
bool isActive() const override;
bool isExhausted() const override;
const LogMessageVersion& version() const override;
@ -401,7 +401,7 @@ struct ILogSystem {
VectorRef<Tag> getTags() const override;
void advanceTo(LogMessageVersion n) override;
Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply) override;
Future<Void> onFailed() override;
Future<Void> onFailed() const override;
bool isActive() const override;
bool isExhausted() const override;
const LogMessageVersion& version() const override;
@ -480,7 +480,7 @@ struct ILogSystem {
VectorRef<Tag> getTags() const override;
void advanceTo(LogMessageVersion n) override;
Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply) override;
Future<Void> onFailed() override;
Future<Void> onFailed() const override;
bool isActive() const override;
bool isExhausted() const override;
const LogMessageVersion& version() const override;
@ -500,18 +500,18 @@ struct ILogSystem {
virtual std::string describe() const = 0;
virtual UID getDebugID() const = 0;
virtual void toCoreState(DBCoreState&) = 0;
virtual void toCoreState(DBCoreState&) const = 0;
virtual bool remoteStorageRecovered() = 0;
virtual bool remoteStorageRecovered() const = 0;
virtual Future<Void> onCoreStateChanged() = 0;
virtual Future<Void> onCoreStateChanged() const = 0;
// Returns if and when the output of toCoreState() would change (for example, when older logs can be discarded from
// the state)
virtual void coreStateWritten(DBCoreState const& newState) = 0;
// Called when a core state has been written to the coordinators
virtual Future<Void> onError() = 0;
virtual Future<Void> onError() const = 0;
// Never returns normally, but throws an error if the subsystem stops working
// Future<Void> push( UID bundle, int64_t seq, VectorRef<TaggedMessageRef> messages );
@ -791,10 +791,10 @@ struct LogPushData : NonCopyable {
template <class T>
void writeTypedMessage(T const& item, bool metadataMessage = false, bool allLocations = false);
Standalone<StringRef> getMessages(int loc) { return messagesWriter[loc].toValue(); }
Standalone<StringRef> getMessages(int loc) const { return messagesWriter[loc].toValue(); }
// Returns all locations' messages, including empty ones.
std::vector<Standalone<StringRef>> getAllMessages();
std::vector<Standalone<StringRef>> getAllMessages() const;
// Records if a tlog (specified by "loc") will receive an empty version batch message.
// "value" is the message returned by getMessages() call.

View File

@ -208,7 +208,7 @@ class Ratekeeper {
Deque<std::pair<double, Version>> blobWorkerVersionHistory;
Optional<Key> remoteDC;
double getRecoveryDuration(Version ver) {
double getRecoveryDuration(Version ver) const {
auto it = version_recovery.lower_bound(ver);
double recoveryDuration = 0;
while (it != version_recovery.end()) {

View File

@ -185,7 +185,7 @@ struct StagingKey {
}
// Does the key have at least 1 set or clear mutation to get the base value
bool hasBaseValue() {
bool hasBaseValue() const {
if (version.version > 0) {
ASSERT(type == MutationRef::SetValue || type == MutationRef::ClearRange);
}
@ -193,12 +193,12 @@ struct StagingKey {
}
// Have all pendingMutations been pre-applied to the val?
bool hasPrecomputed() {
bool hasPrecomputed() const {
ASSERT(pendingMutations.empty() || pendingMutations.rbegin()->first >= pendingMutations.begin()->first);
return pendingMutations.empty() || version >= pendingMutations.rbegin()->first;
}
int totalSize() { return MutationRef::OVERHEAD_BYTES + key.size() + val.size(); }
int totalSize() const { return MutationRef::OVERHEAD_BYTES + key.size() + val.size(); }
};
// The range mutation received on applier.
@ -231,7 +231,7 @@ public:
void operator=(int newState) override { vbState = newState; }
int get() override { return vbState; }
int get() const override { return vbState; }
};
struct ApplierBatchData : public ReferenceCounted<ApplierBatchData> {
@ -324,7 +324,7 @@ struct ApplierBatchData : public ReferenceCounted<ApplierBatchData> {
dbApplier = Optional<Future<Void>>();
}
void sanityCheckMutationOps() {
void sanityCheckMutationOps() const {
if (kvOps.empty())
return;
@ -332,7 +332,7 @@ struct ApplierBatchData : public ReferenceCounted<ApplierBatchData> {
ASSERT_WE_THINK(allOpsAreKnown());
}
bool isKVOpsSorted() {
bool isKVOpsSorted() const {
auto prev = kvOps.begin();
for (auto it = kvOps.begin(); it != kvOps.end(); ++it) {
if (prev->first > it->first) {
@ -343,7 +343,7 @@ struct ApplierBatchData : public ReferenceCounted<ApplierBatchData> {
return true;
}
bool allOpsAreKnown() {
bool allOpsAreKnown() const {
for (auto it = kvOps.begin(); it != kvOps.end(); ++it) {
for (auto m = it->second.begin(); m != it->second.end(); ++m) {
if (m->type == MutationRef::SetValue || m->type == MutationRef::ClearRange ||
@ -380,8 +380,8 @@ struct RestoreApplierData : RestoreRoleData, public ReferenceCounted<RestoreAppl
// getVersionBatchState may be called periodically to dump version batch state,
// even when no version batch has been started.
int getVersionBatchState(int batchIndex) final {
std::map<int, Reference<ApplierBatchData>>::iterator item = batch.find(batchIndex);
int getVersionBatchState(int batchIndex) const final {
auto item = batch.find(batchIndex);
if (item == batch.end()) { // Batch has not been initialized when we blindly profile the state
return ApplierVersionBatchState::INVALID;
} else {
@ -404,7 +404,7 @@ struct RestoreApplierData : RestoreRoleData, public ReferenceCounted<RestoreAppl
finishedBatch = NotifiedVersion(0);
}
std::string describeNode() override {
std::string describeNode() const override {
std::stringstream ss;
ss << "NodeID:" << nodeID.toString() << " nodeIndex:" << nodeIndex;
return ss.str();

View File

@ -56,7 +56,7 @@ struct VersionBatch {
std::tie(rhs.batchIndex, rhs.beginVersion, rhs.endVersion, rhs.logFiles, rhs.rangeFiles, rhs.size);
}
bool isEmpty() { return logFiles.empty() && rangeFiles.empty(); }
bool isEmpty() const { return logFiles.empty() && rangeFiles.empty(); }
void reset() {
beginVersion = 0;
endVersion = 0;
@ -164,7 +164,7 @@ struct RestoreControllerData : RestoreRoleData, public ReferenceCounted<RestoreC
~RestoreControllerData() override = default;
int getVersionBatchState(int batchIndex) final { return RoleVersionBatchState::INVALID; }
int getVersionBatchState(int batchIndex) const final { return RoleVersionBatchState::INVALID; }
void setVersionBatchState(int batchIndex, int vbState) final {}
void initVersionBatch(int batchIndex) override {
@ -182,13 +182,13 @@ struct RestoreControllerData : RestoreRoleData, public ReferenceCounted<RestoreC
ASSERT(runningVersionBatches.get() == 0);
}
std::string describeNode() override {
std::string describeNode() const override {
std::stringstream ss;
ss << "Controller";
return ss.str();
}
void dumpVersionBatches(const std::map<Version, VersionBatch>& versionBatches) {
void dumpVersionBatches(const std::map<Version, VersionBatch>& versionBatches) const {
int i = 1;
double rangeFiles = 0;
double rangeSize = 0;

View File

@ -56,7 +56,7 @@ public:
void operator=(int newState) override { vbState = newState; }
int get() override { return vbState; }
int get() const override { return vbState; }
};
struct LoaderBatchData : public ReferenceCounted<LoaderBatchData> {
@ -193,15 +193,15 @@ struct RestoreLoaderData : RestoreRoleData, public ReferenceCounted<RestoreLoade
~RestoreLoaderData() override = default;
std::string describeNode() override {
std::string describeNode() const override {
std::stringstream ss;
ss << "[Role: Loader] [NodeID:" << nodeID.toString().c_str() << "] [NodeIndex:" << std::to_string(nodeIndex)
<< "]";
return ss.str();
}
int getVersionBatchState(int batchIndex) final {
std::map<int, Reference<LoaderBatchData>>::iterator item = batch.find(batchIndex);
int getVersionBatchState(int batchIndex) const final {
auto item = batch.find(batchIndex);
if (item == batch.end()) { // Batch has not been initialized when we blindly profile the state
return LoaderVersionBatchState::INVALID;
} else {

View File

@ -69,7 +69,7 @@ class RoleVersionBatchState {
public:
static const int INVALID = -1;
virtual int get() { return vbState; }
virtual int get() const { return vbState; }
virtual void operator=(int newState) { vbState = newState; }
@ -109,7 +109,7 @@ public:
virtual void initVersionBatch(int batchIndex) = 0;
virtual void resetPerRestoreRequest() = 0;
virtual int getVersionBatchState(int batchIndex) = 0;
virtual int getVersionBatchState(int batchIndex) const = 0;
virtual void setVersionBatchState(int batchIndex, int vbState) = 0;
void clearInterfaces() {
@ -117,7 +117,7 @@ public:
appliersInterf.clear();
}
virtual std::string describeNode() = 0;
virtual std::string describeNode() const = 0;
};
void updateProcessStats(Reference<RestoreRoleData> self);

View File

@ -189,7 +189,7 @@ Future<Void> serveStorageMetricsRequests(ServiceType* self, StorageServerInterfa
choose {
when(state WaitMetricsRequest req = waitNext(ssi.waitMetrics.getFuture())) {
if (!req.tenantInfo.present() && !self->isReadable(req.keys)) {
CODE_PROBE(true, "waitMetrics immediate wrong_shard_server()");
CODE_PROBE(true, "waitMetrics immediate wrong_shard_server()", probe::decoration::rare);
self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
} else {
self->addActor(self->waitMetricsTenantAware(req));
@ -231,4 +231,4 @@ Future<Void> serveStorageMetricsRequests(ServiceType* self, StorageServerInterfa
}
#include "flow/unactorcompiler.h"
#endif // FDBSERVER_STORAGEMETRICS_H
#endif // FDBSERVER_STORAGEMETRICS_H

View File

@ -170,17 +170,17 @@ struct TagPartitionedLogSystem final : ILogSystem, ReferenceCounted<TagPartition
LogSystemConfig const& lsConf);
// Convert TagPartitionedLogSystem to DBCoreState and override input newState as return value
void toCoreState(DBCoreState& newState) final;
void toCoreState(DBCoreState& newState) const final;
bool remoteStorageRecovered() final;
bool remoteStorageRecovered() const final;
Future<Void> onCoreStateChanged() final;
Future<Void> onCoreStateChanged() const final;
void coreStateWritten(DBCoreState const& newState) final;
Future<Void> onError() final;
Future<Void> onError() const final;
ACTOR static Future<Void> onError_internal(TagPartitionedLogSystem* self);
ACTOR static Future<Void> onError_internal(TagPartitionedLogSystem const* self);
ACTOR static Future<Void> pushResetChecker(Reference<ConnectionResetInfo> self, NetworkAddress addr);

View File

@ -344,7 +344,7 @@ struct ApiWorkload : TestWorkload {
virtual Future<Void> performTest(Database const& cx, Standalone<VectorRef<KeyValueRef>> const& data) = 0;
// Returns whether or not success is false
bool hasFailed();
bool hasFailed() const;
// Clears the keyspace used by this test
Future<Void> clearKeyspace();

View File

@ -567,6 +567,8 @@ struct ChangeFeedInfo : ReferenceCounted<ChangeFeedInfo> {
// back, we can avoid notifying other SS of change feeds that don't durably exist
Version metadataCreateVersion = invalidVersion;
FlowLock fetchLock = FlowLock(1);
bool removing = false;
bool destroyed = false;
@ -1004,7 +1006,7 @@ public:
// investigate, but preventing a new storage process from replacing the TSS on the worker. It will still get removed
// from the cluster if it falls behind on the mutation stream, or if its tss pair gets removed and its tag is no
// longer valid.
bool isTSSInQuarantine() { return tssPairID.present() && tssInQuarantine; }
bool isTSSInQuarantine() const { return tssPairID.present() && tssInQuarantine; }
void startTssQuarantine() {
if (!tssInQuarantine) {
@ -1054,6 +1056,11 @@ public:
// when the disk permits
NotifiedVersion oldestVersion; // See also storageVersion()
NotifiedVersion durableVersion; // At least this version will be readable from storage after a power failure
// In the event of disk corruption, sqlite and redwood will either not recover, recover to durableVersion
// but be unable to read some data, or lose the last commit. If the last commit is lost, the storage server
// might not be able to peek from the tlog (depending on when it sent the last pop). So this version just keeps
// track of the version we committed to the storage engine before we committed durableVersion.
Version storageMinRecoverVersion = 0;
Version rebootAfterDurableVersion;
int8_t primaryLocality;
NotifiedVersion knownCommittedVersion;
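The new member's invariant, condensed from the comment above and from the changeDurableVersion / popVersion call sites later in this diff (illustrative struct, not the real StorageServer):
#include <cstdint>
// storageMinRecoverVersion always trails durableVersion by one commit: it is
// set to the previous durable version just before durableVersion advances.
// After disk corruption loses the last commit, this is still a version the
// storage engine had durably reached, so the tlog is popped no further.
struct VersionTrackingSketch {
    int64_t durableVersion = 0;
    int64_t storageMinRecoverVersion = 0;
    void advanceDurable(int64_t nextDurableVersion) {
        storageMinRecoverVersion = durableVersion; // version of the prior commit
        durableVersion = nextDurableVersion;
    }
    int64_t tlogPopVersion() const { return storageMinRecoverVersion + 1; }
};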
@ -1110,15 +1117,13 @@ public:
FlowLock serveFetchCheckpointParallelismLock;
PriorityMultiLock ssLock;
Reference<PriorityMultiLock> ssLock;
std::vector<int> readPriorityRanks;
Future<PriorityMultiLock::Lock> getReadLock(const Optional<ReadOptions>& options) {
// TODO: Fix perf regression in 100% cache read case where taking this lock adds too much overhead
return PriorityMultiLock::Lock();
// int readType = (int)(options.present() ? options.get().type : ReadType::NORMAL);
// readType = std::clamp<int>(readType, 0, readPriorityRanks.size() - 1);
// return ssLock.lock(readPriorityRanks[readType]);
int readType = (int)(options.present() ? options.get().type : ReadType::NORMAL);
readType = std::clamp<int>(readType, 0, readPriorityRanks.size() - 1);
return ssLock->lock(readPriorityRanks[readType]);
}
FlowLock serveAuditStorageParallelismLock;
@ -1407,7 +1412,8 @@ public:
fetchKeysParallelismFullLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_FULL),
fetchKeysBytesBudget(SERVER_KNOBS->STORAGE_FETCH_BYTES), fetchKeysBudgetUsed(false),
serveFetchCheckpointParallelismLock(SERVER_KNOBS->SERVE_FETCH_CHECKPOINT_PARALLELISM),
ssLock(SERVER_KNOBS->STORAGE_SERVER_READ_CONCURRENCY, SERVER_KNOBS->STORAGESERVER_READ_PRIORITIES),
ssLock(makeReference<PriorityMultiLock>(SERVER_KNOBS->STORAGE_SERVER_READ_CONCURRENCY,
SERVER_KNOBS->STORAGESERVER_READ_PRIORITIES)),
serveAuditStorageParallelismLock(SERVER_KNOBS->SERVE_AUDIT_STORAGE_PARALLELISM),
instanceID(deterministicRandom()->randomUniqueID().first()), shuttingDown(false), behind(false),
versionBehind(false), debug_inApplyUpdate(false), debug_lastValidateTime(0), lastBytesInputEBrake(0),
@ -1415,7 +1421,7 @@ public:
busiestWriteTagContext(ssi.id()), counters(this),
storageServerSourceTLogIDEventHolder(
makeReference<EventCacheHolder>(ssi.id().toString() + "/StorageServerSourceTLogID")) {
readPriorityRanks = parseStringToVector<int>(SERVER_KNOBS->STORAGESERVER_READ_RANKS, ',');
readPriorityRanks = parseStringToVector<int>(SERVER_KNOBS->STORAGESERVER_READTYPE_PRIORITY_MAP, ',');
ASSERT(readPriorityRanks.size() > (int)ReadType::MAX);
version.initMetric("StorageServer.Version"_sr, counters.cc.getId());
oldestVersion.initMetric("StorageServer.OldestVersion"_sr, counters.cc.getId());
@ -1509,6 +1515,7 @@ public:
desiredOldestVersion = ver;
oldestVersion = ver;
durableVersion = ver;
storageMinRecoverVersion = ver;
lastVersionWithData = ver;
restoredVersion = ver;
@ -5687,6 +5694,7 @@ bool changeDurableVersion(StorageServer* data, Version desiredDurableVersion) {
data->freeable.erase(data->freeable.begin(), data->freeable.lower_bound(nextDurableVersion));
Future<Void> checkFatalError = data->otherError.getFuture();
data->storageMinRecoverVersion = data->durableVersion.get();
data->durableVersion.set(nextDurableVersion);
setDataDurableVersion(data->thisServerID, data->durableVersion.get());
if (checkFatalError.isReady())
@ -6309,6 +6317,15 @@ ACTOR Future<Version> fetchChangeFeedApplier(StorageServer* data,
Version beginVersion,
Version endVersion,
ReadOptions readOptions) {
state FlowLock::Releaser feedFetchReleaser;
// avoid fetching the same version range of the same change feed multiple times.
choose {
when(wait(changeFeedInfo->fetchLock.take())) {
feedFetchReleaser = FlowLock::Releaser(changeFeedInfo->fetchLock);
}
when(wait(changeFeedInfo->durableFetchVersion.whenAtLeast(endVersion))) { return invalidVersion; }
}
state Version startVersion = beginVersion;
startVersion = std::max(startVersion, emptyVersion + 1);
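The choose/when guard at the top of this hunk deduplicates concurrent fetches of one feed: take the single-slot fetchLock, or bail out as soon as another fetch has already made the range durable. A deliberately sequential approximation (the real actor races both waits concurrently, so there is no check-then-lock gap):
#include <cstdint>
#include <mutex>
struct FeedFetchStateSketch {
    std::mutex fetchLock;            // stands in for FlowLock(1)
    int64_t durableFetchVersion = 0; // advanced by whichever fetch completes
};
// Returns false (skip the fetch) if a concurrent fetch already covered
// everything up to endVersion; otherwise acquires the lock for this fetch.
bool beginFeedFetch(FeedFetchStateSketch& feed, int64_t endVersion, std::unique_lock<std::mutex>& held) {
    if (feed.durableFetchVersion >= endVersion) {
        return false;
    }
    held = std::unique_lock<std::mutex>(feed.fetchLock);
    return true;
}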
@ -6328,6 +6345,7 @@ ACTOR Future<Version> fetchChangeFeedApplier(StorageServer* data,
return invalidVersion;
}
// FIXME: if this feed range is not wholly contained within the shard, set cache to true on reading
state Reference<ChangeFeedData> feedResults = makeReference<ChangeFeedData>();
state Future<Void> feed = data->cx->getChangeFeedStream(feedResults,
rangeId,
@ -6843,6 +6861,16 @@ ACTOR Future<std::vector<Key>> fetchChangeFeedMetadata(StorageServer* data,
return feedIds;
}
ReadOptions readOptionsForFeedFetch(const ReadOptions& options, const KeyRangeRef& keys, const KeyRangeRef& feedRange) {
if (!feedRange.contains(keys)) {
return options;
}
// If feed range wholly contains shard range, cache on fetch because other shards will likely also fetch it
ReadOptions newOptions = options;
newOptions.cacheResult = true;
return newOptions;
}
// returns max version fetched for each feed
// newFeedIds is used for the second fetch to get data for new feeds that weren't there for the first fetch
ACTOR Future<std::unordered_map<Key, Version>> dispatchChangeFeeds(StorageServer* data,
@ -6867,8 +6895,9 @@ ACTOR Future<std::unordered_map<Key, Version>> dispatchChangeFeeds(StorageServer
auto feedIt = data->uidChangeFeed.find(feedId);
// feed may have been moved away or deleted after move was scheduled, do nothing in that case
if (feedIt != data->uidChangeFeed.end() && !feedIt->second->removing) {
ReadOptions fetchReadOptions = readOptionsForFeedFetch(readOptions, keys, feedIt->second->range);
feedFetches[feedIt->second->id] =
fetchChangeFeed(data, feedIt->second, beginVersion, endVersion, readOptions);
fetchChangeFeed(data, feedIt->second, beginVersion, endVersion, fetchReadOptions);
}
}
for (auto& feedId : newFeedIds) {
@ -6876,7 +6905,8 @@ ACTOR Future<std::unordered_map<Key, Version>> dispatchChangeFeeds(StorageServer
// we just read the change feed data map earlier in fetchKeys without yielding, so these feeds must exist
ASSERT(feedIt != data->uidChangeFeed.end());
ASSERT(!feedIt->second->removing);
feedFetches[feedIt->second->id] = fetchChangeFeed(data, feedIt->second, 0, endVersion, readOptions);
ReadOptions fetchReadOptions = readOptionsForFeedFetch(readOptions, keys, feedIt->second->range);
feedFetches[feedIt->second->id] = fetchChangeFeed(data, feedIt->second, 0, endVersion, fetchReadOptions);
}
loop {
@ -9426,7 +9456,7 @@ ACTOR Future<Void> updateStorage(StorageServer* data) {
wait(ioTimeoutError(durable, SERVER_KNOBS->MAX_STORAGE_COMMIT_TIME));
data->storageCommitLatencyHistogram->sampleSeconds(now() - beforeStorageCommit);
debug_advanceMinCommittedVersion(data->thisServerID, newOldestVersion);
debug_advanceMinCommittedVersion(data->thisServerID, data->storageMinRecoverVersion);
if (removeKVSRanges) {
TraceEvent(SevDebug, "RemoveKVSRangesComitted", data->thisServerID)
@ -9568,7 +9598,7 @@ ACTOR Future<Void> updateStorage(StorageServer* data) {
// loaded.
state double beforeSSDurableVersionUpdate = now();
wait(data->durableVersionLock.take());
data->popVersion(data->durableVersion.get() + 1);
data->popVersion(data->storageMinRecoverVersion + 1);
while (!changeDurableVersion(data, newOldestVersion)) {
if (g_network->check_yield(TaskPriority::UpdateStorage)) {
@ -10431,20 +10461,20 @@ ACTOR Future<Void> metricsCore(StorageServer* self, StorageServerInterface ssi)
te.detail("StorageEngine", self->storage.getKeyValueStoreType().toString());
te.detail("Tag", self->tag.toString());
std::vector<int> rpr = self->readPriorityRanks;
te.detail("ReadsActive", self->ssLock.totalRunners());
te.detail("ReadsWaiting", self->ssLock.totalWaiters());
te.detail("ReadsTotalActive", self->ssLock->getRunnersCount());
te.detail("ReadsTotalWaiting", self->ssLock->getWaitersCount());
int type = (int)ReadType::FETCH;
te.detail("ReadFetchActive", self->ssLock.numRunners(rpr[type]));
te.detail("ReadFetchWaiting", self->ssLock.numWaiters(rpr[type]));
te.detail("ReadFetchActive", self->ssLock->getRunnersCount(rpr[type]));
te.detail("ReadFetchWaiting", self->ssLock->getWaitersCount(rpr[type]));
type = (int)ReadType::LOW;
te.detail("ReadLowActive", self->ssLock.numRunners(rpr[type]));
te.detail("ReadLowWaiting", self->ssLock.numWaiters(rpr[type]));
te.detail("ReadLowActive", self->ssLock->getRunnersCount(rpr[type]));
te.detail("ReadLowWaiting", self->ssLock->getWaitersCount(rpr[type]));
type = (int)ReadType::NORMAL;
te.detail("ReadNormalActive", self->ssLock.numRunners(rpr[type]));
te.detail("ReadNormalWaiting", self->ssLock.numWaiters(rpr[type]));
te.detail("ReadNormalActive", self->ssLock->getRunnersCount(rpr[type]));
te.detail("ReadNormalWaiting", self->ssLock->getWaitersCount(rpr[type]));
type = (int)ReadType::HIGH;
te.detail("ReadHighActive", self->ssLock.numRunners(rpr[type]));
te.detail("ReadHighWaiting", self->ssLock.numWaiters(rpr[type]));
te.detail("ReadHighActive", self->ssLock->getRunnersCount(rpr[type]));
te.detail("ReadHighWaiting", self->ssLock->getWaitersCount(rpr[type]));
StorageBytes sb = self->storage.getStorageBytes();
te.detail("KvstoreBytesUsed", sb.used);
te.detail("KvstoreBytesFree", sb.free);
@ -10821,7 +10851,7 @@ ACTOR Future<Void> storageServerCore(StorageServer* self, StorageServerInterface
}
self->logCursor = self->logSystem->peekSingle(
self->thisServerID, self->version.get() + 1, self->tag, self->history);
self->popVersion(self->durableVersion.get() + 1, true);
self->popVersion(self->storageMinRecoverVersion + 1, true);
}
// If update() is waiting for results from the tlog, it might never get them, so needs to be
// cancelled. But if it is waiting later, cancelling it could cause problems (e.g. fetchKeys
@ -11260,7 +11290,7 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
// If the storage server dies while something that uses self is still on the stack,
// we want that actor to complete before we terminate and that memory goes out of scope
self.ssLock.kill();
self.ssLock->kill();
state Error err = e;
if (storageServerTerminated(self, persistentData, err)) {
@ -11358,7 +11388,7 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
throw internal_error();
} catch (Error& e) {
self.ssLock.kill();
self.ssLock->kill();
if (self.byteSampleRecovery.isValid()) {
self.byteSampleRecovery.cancel();

View File

@ -2331,7 +2331,7 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
recruited.initEndpoints();
if (blobMigratorInterf->get().present()) {
recruited = blobMigratorInterf->get().get();
CODE_PROBE(true, "Recruited while already a blob migrator.");
CODE_PROBE(true, "Recruited while already a blob migrator.", probe::decoration::rare);
} else {
startRole(Role::BLOB_MIGRATOR, recruited.id(), interf.id());
DUMPTOKEN(recruited.haltBlobMigrator);
@ -2796,7 +2796,7 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
when(state WorkerSnapRequest snapReq = waitNext(interf.workerSnapReq.getFuture())) {
std::string snapReqKey = snapReq.snapUID.toString() + snapReq.role.toString();
if (snapReqResultMap.count(snapReqKey)) {
CODE_PROBE(true, "Worker received a duplicate finished snapshot request");
CODE_PROBE(true, "Worker received a duplicate finished snapshot request", probe::decoration::rare);
auto result = snapReqResultMap[snapReqKey];
result.isError() ? snapReq.reply.sendError(result.getError()) : snapReq.reply.send(result.get());
TraceEvent("RetryFinishedWorkerSnapRequest")
@ -2804,7 +2804,7 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
.detail("Role", snapReq.role)
.detail("Result", result.isError() ? result.getError().code() : success().code());
} else if (snapReqMap.count(snapReqKey)) {
CODE_PROBE(true, "Worker received a duplicate ongoing snapshot request");
CODE_PROBE(true, "Worker received a duplicate ongoing snapshot request", probe::decoration::rare);
TraceEvent("RetryOngoingWorkerSnapRequest")
.detail("SnapUID", snapReq.snapUID.toString())
.detail("Role", snapReq.role);

View File

@ -328,6 +328,6 @@ Reference<TransactionWrapper> ApiWorkload::createTransaction() {
return transactionFactory->createTransaction();
}
bool ApiWorkload::hasFailed() {
bool ApiWorkload::hasFailed() const {
return !success;
}

View File

@ -172,6 +172,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
}
}
}
void disableFailureInjectionWorkloads(std::set<std::string>& out) const override { out.emplace("Attrition"); }
Future<Void> setup(Database const& cx) override { return _setup(cx, this); }

View File

@ -62,7 +62,9 @@ struct DataLossRecoveryWorkload : TestWorkload {
Future<Void> setup(Database const& cx) override { return Void(); }
void disableFailureInjectionWorkloads(std::set<std::string>& out) const override { out.insert("RandomMoveKeys"); }
void disableFailureInjectionWorkloads(std::set<std::string>& out) const override {
out.insert({ "RandomMoveKeys", "Attrition" });
}
Future<Void> start(Database const& cx) override {
if (!enabled) {

View File

@ -23,7 +23,6 @@
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbrpc/simulator.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/QuietDatabase.h"
#include "fdbserver/Status.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
@ -43,7 +42,6 @@ struct DiskFailureInjectionWorkload : FailureInjectionWorkload {
double percentBitFlips = 10;
double periodicBroadcastInterval = 5.0;
std::vector<NetworkAddress> chosenWorkers;
std::vector<Future<Void>> clients;
// Verification Mode: We run the workload indefinitely in this mode.
// The idea is to keep going until we get a non-zero chaosMetric to ensure
// that we haven't lost the chaos event. testDuration is ignored in this mode
@ -76,23 +74,20 @@ struct DiskFailureInjectionWorkload : FailureInjectionWorkload {
// 2. Starting the actor that injects failures on chosen storage servers
Future<Void> start(Database const& cx) override {
if (enabled) {
clients.push_back(timeout(diskFailureInjectionClient<WorkerInterface>(cx, this), testDuration, Void()));
// In verification mode, we want to wait until periodicEventBroadcast actor returns which indicates that
// a non-zero chaosMetric was found.
auto result = diskFailureInjectionClient<WorkerInterface>(cx, this);
// In verification mode, we want to wait until periodicEventBroadcast actor returns which indicates that
// a non-zero chaosMetric was found.
if (verificationMode) {
clients.push_back(periodicEventBroadcast(this));
} else
return (periodicEventBroadcast(this) && delay(testDuration)) || result;
} else {
// Else we honor the testDuration
clients.push_back(timeout(periodicEventBroadcast(this), testDuration, Void()));
return waitForAll(clients);
return timeout(periodicEventBroadcast(this) && result, testDuration, Void());
}
} else
return Void();
}
Future<bool> check(Database const& cx) override {
clients.clear();
return true;
}
Future<bool> check(Database const& cx) override { return true; }
void getMetrics(std::vector<PerfMetric>& m) override {}
@ -160,6 +155,7 @@ struct DiskFailureInjectionWorkload : FailureInjectionWorkload {
} catch (Error& e) {
// If we failed to get a complete list of storage servers, we can't inject failure events
// But don't throw the error in that case
TraceEvent("ChaosCouldNotGetStorages").error(e);
continue;
}
auto machine = deterministicRandom()->randomChoice(machines);
@ -192,11 +188,20 @@ struct DiskFailureInjectionWorkload : FailureInjectionWorkload {
for (auto worker : workers) {
workersMap[worker.interf.address()] = worker.interf;
}
TraceEvent("ResendChaos")
.detail("ChosenWorkersSize", self->chosenWorkers.size())
.detail("FoundWorkers", workersMap.size())
.detail(
"ResendToNumber",
std::count_if(self->chosenWorkers.begin(),
self->chosenWorkers.end(),
[&map = std::as_const(workersMap)](auto const& addr) { return map.count(addr) > 0; }));
for (auto& workerAddress : self->chosenWorkers) {
auto itr = workersMap.find(workerAddress);
if (itr != workersMap.end()) {
if (self->throttleDisk && (throttledWorkers++ < self->workersToThrottle))
if (self->throttleDisk && (throttledWorkers++ < self->workersToThrottle)) {
self->injectDiskDelays(itr->second, self->stallInterval, self->stallPeriod, self->throttlePeriod);
}
if (self->corruptFile && (corruptedWorkers++ < self->workersToCorrupt)) {
if (g_simulator == g_network)
g_simulator->corruptWorkerMap[workerAddress] = true;

View File

@ -154,7 +154,7 @@ struct EncryptionOpsWorkload : TestWorkload {
~EncryptionOpsWorkload() { TraceEvent("EncryptionOpsWorkloadDone").log(); }
bool isFixedSizePayload() { return mode == 1; }
bool isFixedSizePayload() const { return mode == 1; }
std::string getModeStr() const {
if (mode == 1) {
@ -166,7 +166,7 @@ struct EncryptionOpsWorkload : TestWorkload {
throw internal_error();
}
void generateRandomBaseCipher(const int maxLen, uint8_t* buff, int* retLen) {
static void generateRandomBaseCipher(const int maxLen, uint8_t* buff, int* retLen) {
memset(buff, 0, maxLen);
*retLen = deterministicRandom()->randomInt(maxLen / 2, maxLen);
deterministicRandom()->randomBytes(buff, *retLen);

View File

@ -43,6 +43,12 @@ struct FastTriggeredWatchesWorkload : TestWorkload {
keyBytes = std::max(getOption(options, "keyBytes"_sr, 16), 16);
}
void disableFailureInjectionWorkloads(std::set<std::string>& out) const override {
// This test asserts that watches fire within a certain version range. Attrition will make this assertion fail
// since it can cause recoveries which will bump the cluster version significantly
out.emplace("Attrition");
}
Future<Void> setup(Database const& cx) override {
if (clientId == 0)
return _setup(cx, this);

View File

@ -18,10 +18,7 @@
* limitations under the License.
*/
#include <cstdint>
#include <limits>
#include <algorithm>
#include "fdbrpc/simulator.h"
#include "fdbclient/MutationLogReader.actor.h"
#include "fdbclient/Tuple.h"
#include "fdbserver/workloads/ApiWorkload.h"
@ -55,6 +52,9 @@ struct GetMappedRangeWorkload : ApiWorkload {
enabled = !clientId; // only do this on the "first" client
}
// TODO: Currently this workload doesn't play well with MachineAttrition, but it probably should
void disableFailureInjectionWorkloads(std::set<std::string>& out) const override { out.insert("Attrition"); }
Future<Void> start(Database const& cx) override {
// This workload is generated different from typical ApiWorkload. So don't use ApiWorkload::_start.
if (enabled) {

View File

@ -228,7 +228,7 @@ struct IDDTxnProcessorApiWorkload : TestWorkload {
// test finish or started but cancelled movement
if (deterministicRandom()->coinflip()) {
CODE_PROBE(true, "RawMovementApi partial started");
CODE_PROBE(true, "RawMovementApi partial started", probe::decoration::rare);
return Void();
}
@ -320,4 +320,4 @@ struct IDDTxnProcessorApiWorkload : TestWorkload {
void getMetrics(std::vector<PerfMetric>& m) override {}
};
WorkloadFactory<IDDTxnProcessorApiWorkload> IDDTxnProcessorApiWorkload;
WorkloadFactory<IDDTxnProcessorApiWorkload> IDDTxnProcessorApiWorkload;

View File

@ -49,6 +49,8 @@ struct LowLatencyWorkload : TestWorkload {
testKey = getOption(options, "testKey"_sr, "testKey"_sr);
}
void disableFailureInjectionWorkloads(std::set<std::string>& out) const override { out.insert("Attrition"); }
Future<Void> setup(Database const& cx) override {
if (g_network->isSimulated()) {
IKnobCollection::getMutableGlobalKnobCollection().setKnob("min_delay_cc_worst_fit_candidacy_seconds",

View File

@ -121,6 +121,10 @@ struct MachineAttritionWorkload : FailureInjectionWorkload {
bool shouldInject(DeterministicRandom& random,
const WorkloadRequest& work,
const unsigned alreadyAdded) const override {
if (g_network->isSimulated() && !g_simulator->extraDatabases.empty()) {
// Remove this as soon as we track extra databases properly
return false;
}
return work.useDatabase && random.random01() < 1.0 / (2.0 + alreadyAdded);
}
@ -482,5 +486,4 @@ struct MachineAttritionWorkload : FailureInjectionWorkload {
};
WorkloadFactory<MachineAttritionWorkload> MachineAttritionWorkloadFactory;
// TODO: Enable MachineAttritionWorkload injection once this is bug-free
// FailureInjectorFactory<MachineAttritionWorkload> MachineAttritionFailureWorkloadFactory;
FailureInjectorFactory<MachineAttritionWorkload> MachineAttritionFailureWorkloadFactory;

View File

@ -70,7 +70,10 @@ struct PhysicalShardMoveWorkLoad : TestWorkload {
return _start(this, cx);
}
void disableFailureInjectionWorkloads(std::set<std::string>& out) const override { out.insert("RandomMoveKeys"); }
void disableFailureInjectionWorkloads(std::set<std::string>& out) const override {
out.insert("RandomMoveKeys");
out.insert("Attrition");
}
ACTOR Future<Void> _start(PhysicalShardMoveWorkLoad* self, Database cx) {
int ignore = wait(setDDMode(cx, 0));

View File

@ -1143,6 +1143,8 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
state KeyRange coordinators_key_range =
KeyRangeRef("process/"_sr, "process0"_sr)
.withPrefix(SpecialKeySpace::getManagementApiCommandPrefix("coordinators"));
state unsigned retries = 0;
state bool changeCoordinatorsSucceeded = true;
loop {
try {
tx->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
@ -1222,11 +1224,18 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
.detail("ErrorMessage", valueObj["message"].get_str());
ASSERT(valueObj["command"].get_str() == "coordinators");
if (valueObj["retriable"].get_bool()) { // coordinators not reachable, retry
if (++retries >= 10) {
CODE_PROBE(true, "ChangeCoordinators Exceeded retry limit");
changeCoordinatorsSucceeded = false;
tx->reset();
break;
}
tx->reset();
} else {
ASSERT(valueObj["message"].get_str() ==
"No change (existing configuration satisfies request)");
tx->reset();
CODE_PROBE(true, "Successfully changed coordinators");
break;
}
} else {
@ -1242,8 +1251,10 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
ASSERT(res.present()); // Otherwise, database is in a bad state
ClusterConnectionString csNew(res.get().toString());
// verify the cluster description
ASSERT(new_cluster_description == csNew.clusterKeyName().toString());
ASSERT(csNew.hostnames.size() + csNew.coords.size() == old_coordinators_processes.size() + 1);
ASSERT(!changeCoordinatorsSucceeded ||
new_cluster_description == csNew.clusterKeyName().toString());
ASSERT(!changeCoordinatorsSucceeded ||
csNew.hostnames.size() + csNew.coords.size() == old_coordinators_processes.size() + 1);
std::vector<NetworkAddress> newCoordinators = wait(csNew.tryResolveHostnames());
// verify the coordinators' addresses
for (const auto& network_address : newCoordinators) {
@ -1259,7 +1270,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
}
// change back to original settings
loop {
while (changeCoordinatorsSucceeded) {
try {
std::string new_processes_key;
tx->setOption(FDBTransactionOptions::RAW_ACCESS);

View File

@ -92,11 +92,15 @@ struct StorageQuotaWorkload : TestWorkload {
}
// Check that writes to both the tenants are rejected when the group is over quota.
state bool rejected1 = wait(tryWrite(self, cx, self->tenant, /*expectOk=*/false));
state bool rejected1 = wait(tryWrite(self, cx, self->tenant, /*bypassQuota=*/false, /*expectOk=*/false));
ASSERT(rejected1);
state bool rejected2 = wait(tryWrite(self, cx, self->emptyTenant, /*expectOk=*/false));
state bool rejected2 = wait(tryWrite(self, cx, self->emptyTenant, /*bypassQuota=*/false, /*expectOk=*/false));
ASSERT(rejected2);
// Check that transaction is able to commit if we use the FDBTransactionOptions to bypass quota.
state bool bypassed = wait(tryWrite(self, cx, self->tenant, /*bypassQuota=*/true, /*expectOk=*/true));
ASSERT(bypassed);
// Increase the quota or clear the quota. Check that writes to both the tenants are now able to commit.
if (deterministicRandom()->coinflip()) {
quota = size * 2;
@ -104,9 +108,9 @@ struct StorageQuotaWorkload : TestWorkload {
} else {
wait(clearStorageQuotaHelper(cx, self->group));
}
state bool committed1 = wait(tryWrite(self, cx, self->tenant, /*expectOk=*/true));
state bool committed1 = wait(tryWrite(self, cx, self->tenant, /*bypassQuota=*/false, /*expectOk=*/true));
ASSERT(committed1);
state bool committed2 = wait(tryWrite(self, cx, self->emptyTenant, /*expectOk=*/true));
state bool committed2 = wait(tryWrite(self, cx, self->emptyTenant, /*bypassQuota=*/false, /*expectOk=*/true));
ASSERT(committed2);
return Void();
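A fragment sketching the client-side option the new bypass check exercises (assumes an ACTOR context; tenant, key, and value names are placeholders):
// Inside an ACTOR, against a tenant whose group is over quota:
state Transaction tr(cx, "someTenant"_sr);
tr.setOption(FDBTransactionOptions::BYPASS_STORAGE_QUOTA);
tr.set("someKey"_sr, "someValue"_sr);
wait(tr.commit()); // commits despite the quota being exceeded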
@ -173,13 +177,20 @@ struct StorageQuotaWorkload : TestWorkload {
}
}
ACTOR static Future<bool> tryWrite(StorageQuotaWorkload* self, Database cx, TenantName tenant, bool expectOk) {
ACTOR static Future<bool> tryWrite(StorageQuotaWorkload* self,
Database cx,
TenantName tenant,
bool bypassQuota,
bool expectOk) {
state int i;
// Retry the transaction a few times if needed; this allows us to wait a while for all
// the storage usage and quota related monitors to fetch and propagate the latest information
// about the tenants that are over storage quota.
for (i = 0; i < 10; i++) {
state Transaction tr(cx, tenant);
if (bypassQuota) {
tr.setOption(FDBTransactionOptions::BYPASS_STORAGE_QUOTA);
}
loop {
try {
Standalone<KeyValueRef> kv =

View File

@ -66,7 +66,10 @@ struct SSCheckpointRestoreWorkload : TestWorkload {
return _start(this, cx);
}
void disableFailureInjectionWorkloads(std::set<std::string>& out) const override { out.insert("RandomMoveKeys"); }
void disableFailureInjectionWorkloads(std::set<std::string>& out) const override {
out.insert("RandomMoveKeys");
out.insert("Attrition");
}
ACTOR Future<Void> _start(SSCheckpointRestoreWorkload* self, Database cx) {
state Key key = "TestKey"_sr;

View File

@ -26,8 +26,8 @@
// This workload sets the throughput quota of a tag during the setup phase
class ThroughputQuotaWorkload : public TestWorkload {
TransactionTag transactionTag;
double reservedQuota{ 0.0 };
double totalQuota{ 0.0 };
int64_t reservedQuota{ 0 };
int64_t totalQuota{ 0 };
ACTOR static Future<Void> setup(ThroughputQuotaWorkload* self, Database cx) {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);

View File

@ -209,6 +209,7 @@ const std::set<int> transactionRetryableErrors = { error_code_not_committed,
error_code_batch_transaction_throttled,
error_code_tag_throttled,
error_code_unknown_tenant,
error_code_proxy_tag_throttled,
// maybe committed error
error_code_cluster_version_changed,
error_code_commit_unknown_result };

View File

@ -341,7 +341,7 @@ public:
BindPromise(BindPromise const& r) : p(r.p), errContext(r.errContext), errID(r.errID) {}
BindPromise(BindPromise&& r) noexcept : p(std::move(r.p)), errContext(r.errContext), errID(r.errID) {}
Future<Void> getFuture() { return p.getFuture(); }
Future<Void> getFuture() const { return p.getFuture(); }
void operator()(const boost::system::error_code& error, size_t bytesWritten = 0) {
try {

View File

@ -88,7 +88,7 @@ public:
sendError(broken_promise());
}
Future<T> getFuture() { // Call only on the originating thread!
Future<T> getFuture() const { // Call only on the originating thread!
return promise.getFuture();
}
@ -107,7 +107,7 @@ public:
g_network->isOnMainThread() ? incrementPriorityIfEven(g_network->getCurrentTask())
: TaskPriority::DefaultOnMainThread);
}
bool isValid() { return promise.isValid(); }
bool isValid() const { return promise.isValid(); }
private:
Promise<T> promise;

View File

@ -29,21 +29,25 @@
#define PRIORITYMULTILOCK_ACTOR_H
#include "flow/flow.h"
#include <boost/intrusive/list.hpp>
#include "flow/actorcompiler.h" // This must be the last #include.
#define PRIORITYMULTILOCK_DEBUG 0
#if PRIORITYMULTILOCK_DEBUG || !defined(NO_INTELLISENSE)
#define pml_debug_printf(...) \
if (now() > 0) \
printf(__VA_ARGS__)
if (now() > 0) { \
printf("pml line=%04d ", __LINE__); \
printf(__VA_ARGS__); \
}
#else
#define pml_debug_printf(...)
#endif
// A multi user lock with a concurrent holder limit where waiters request a lock with a priority
// id and are granted locks based on a total concurrency and relative weights of the current active
// priorities. Priority id's must start at 0 and are sequential integers.
// priorities. Priority ids must start at 0 and are sequential integers. Priority id numbers
// are not related to the importance of the priority in execution.
//
// Scheduling logic
// Let
@ -64,17 +68,17 @@
// The interface is similar to FlowMutex except that lock holders can just drop the lock to release it.
//
// Usage:
// Lock lock = wait(prioritylock.lock(priorityLevel));
// Lock lock = wait(prioritylock.lock(priority_id));
// lock.release(); // Explicit release, or
// // let lock and all copies of lock go out of scope to release
class PriorityMultiLock {
class PriorityMultiLock : public ReferenceCounted<PriorityMultiLock> {
public:
// Waiting on the lock returns a Lock, which is really just a Promise<Void>
// Calling release() is not necessary, it exists in case the Lock holder wants to explicitly release
// the Lock before it goes out of scope.
struct Lock {
void release() { promise.send(Void()); }
bool isLocked() const { return promise.canBeSet(); }
// This is exposed in case the caller wants to use/copy it directly
Promise<Void> promise;
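With the class now reference counted and the accessors renamed, a short usage fragment consistent with the DWALPager and storage server call sites in this diff (priority id and weights are illustrative; assumes an ACTOR context):
Reference<PriorityMultiLock> pml =
    makeReference<PriorityMultiLock>(/*concurrency*/ 25, /*weightsByPriority*/ "1,2,4");
state PriorityMultiLock::Lock lock = wait(pml->lock(/*priority id*/ 1));
// ... do work while holding one of the 25 slots ...
lock.release(); // optional; letting all copies of lock go out of scope also releases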
@ -84,10 +88,11 @@ public:
: PriorityMultiLock(concurrency, parseStringToVector<int>(weights, ',')) {}
PriorityMultiLock(int concurrency, std::vector<int> weightsByPriority)
: concurrency(concurrency), available(concurrency), waiting(0), totalPendingWeights(0), releaseDebugID(0) {
: concurrency(concurrency), available(concurrency), waiting(0), totalPendingWeights(0) {
priorities.resize(weightsByPriority.size());
for (int i = 0; i < priorities.size(); ++i) {
priorities[i].priority = i;
priorities[i].weight = weightsByPriority[i];
}
@ -102,7 +107,8 @@ public:
// If this priority currently has no waiters
if (q.empty()) {
// Add this priority's weight to the total for priorities with pending work
// Add this priority's weight to the total for priorities with pending work. This must be done
// so that currentCapacity() below will assign capacity to this priority.
totalPendingWeights += p.weight;
// If there are slots available and the priority has capacity then don't make the caller wait
@ -114,80 +120,71 @@ public:
Lock lock;
addRunner(lock, &p);
pml_debug_printf("lock nowait line %d priority %d %s\n", __LINE__, priority, toString().c_str());
pml_debug_printf("lock nowait priority %d %s\n", priority, toString().c_str());
return lock;
}
// If we didn't return above then add the priority to the waitingPriorities list
waitingPriorities.push_back(p);
}
Waiter w;
q.push_back(w);
Waiter& w = q.emplace_back();
++waiting;
pml_debug_printf("lock wait line %d priority %d %s\n", __LINE__, priority, toString().c_str());
pml_debug_printf("lock wait priority %d %s\n", priority, toString().c_str());
return w.lockPromise.getFuture();
}
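The pending-weight accounting in lock() drives how slots are apportioned. As a worked example (hedged: currentCapacity()'s body lies outside this hunk, so the weight-proportional formula below is an assumption suggested by the names and comments): with concurrency = 10 and waiters present only at two priorities weighted 1 and 4, totalPendingWeights = 5, so the weight-1 priority would be capped near 10 * 1 / 5 = 2 running tasks and the weight-4 priority near 10 * 4 / 5 = 8.

    // Hypothetical weight-proportional capacity, for illustration only:
    int exampleCapacity(int concurrency, int weight, int totalPendingWeights) {
        return concurrency * weight / totalPendingWeights; // e.g. 10 * 4 / 5 == 8
    }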
void kill() {
pml_debug_printf("kill %s\n", toString().c_str());
brokenOnDestruct.reset();
// handleRelease will not free up any execution slots when it ends via cancel
fRunner.cancel();
available = 0;
runners.clear();
priorities.clear();
waitingPriorities.clear();
for (auto& p : priorities) {
p.queue.clear();
}
}
std::string toString() const {
int runnersDone = 0;
for (int i = 0; i < runners.size(); ++i) {
if (runners[i].isReady()) {
++runnersDone;
}
}
std::string s = format("{ ptr=%p concurrency=%d available=%d running=%d waiting=%d runnersQueue=%d "
"runnersDone=%d pendingWeights=%d ",
std::string s = format("{ ptr=%p concurrency=%d available=%d running=%d waiting=%d "
"pendingWeights=%d ",
this,
concurrency,
available,
concurrency - available,
waiting,
runners.size(),
runnersDone,
totalPendingWeights);
for (int i = 0; i < priorities.size(); ++i) {
s += format("p%d:{%s} ", i, priorities[i].toString(this).c_str());
for (auto& p : priorities) {
s += format("{%s} ", p.toString(this).c_str());
}
s += "}";
if (concurrency - available != runners.size() - runnersDone) {
pml_debug_printf("%s\n", s.c_str());
ASSERT_EQ(concurrency - available, runners.size() - runnersDone);
}
return s;
}
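Given the format strings above, the debug output looks roughly like this (all values illustrative; here only priority 1 has waiters, so pendingWeights counts only its weight):

    { ptr=0x55d1c0 concurrency=16 available=12 running=4 waiting=2 pendingWeights=4 {priority=0 weight=1 run=1 wait=0 cap=4} {priority=1 weight=4 run=3 wait=2 cap=16} }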
int maxPriority() const { return priorities.size() - 1; }
int totalWaiters() const { return waiting; }
int getRunnersCount() const { return concurrency - available; }
int getWaitersCount() const { return waiting; }
int numWaiters(const unsigned int priority) const {
int getWaitersCount(const unsigned int priority) const {
ASSERT(priority < priorities.size());
return priorities[priority].queue.size();
}
int totalRunners() const { return concurrency - available; }
int numRunners(const unsigned int priority) const {
int getRunnersCount(const unsigned int priority) const {
ASSERT(priority < priorities.size());
return priorities[priority].runners;
}
private:
struct Waiter {
Waiter() {}
Promise<Lock> lockPromise;
};
@ -202,8 +199,8 @@ private:
typedef Deque<Waiter> Queue;
struct Priority {
Priority() : runners(0), weight(0) {}
struct Priority : boost::intrusive::list_base_hook<> {
Priority() : runners(0), weight(0), priority(-1) {}
// Queue of waiters at this priority
Queue queue;
@ -211,9 +208,12 @@ private:
int runners;
// Configured weight for this priority
int weight;
// Priority number for convenience; matches this Priority's index in the PML priorities vector
int priority;
std::string toString(const PriorityMultiLock* pml) const {
return format("weight=%d run=%d wait=%d cap=%d",
return format("priority=%d weight=%d run=%d wait=%d cap=%d",
priority,
weight,
runners,
queue.size(),
@ -222,51 +222,41 @@ private:
};
std::vector<Priority> priorities;
typedef boost::intrusive::list<Priority, boost::intrusive::constant_time_size<false>> WaitingPrioritiesList;
// Current or recent (ended) runners
Deque<Future<Void>> runners;
// List of all priorities with 1 or more waiters. This list exists so that the scheduling loop
// does not have to iterate over the priorities vector checking priorities without waiters.
WaitingPrioritiesList waitingPriorities;
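For readers new to the intrusive container introduced here, a self-contained sketch (the Node type is hypothetical): list_base_hook<> embeds the link fields in the element itself, so putting a Priority on the list allocates nothing and copies nothing, and constant_time_size<false> omits the stored element count (size() becomes O(n)) so unlinking never touches shared bookkeeping.

    #include <boost/intrusive/list.hpp>

    struct Node : boost::intrusive::list_base_hook<> {
        int value = 0;
    };

    using NodeList = boost::intrusive::list<Node, boost::intrusive::constant_time_size<false>>;

    int main() {
        Node a, b;
        NodeList l;
        l.push_back(a); // links a's embedded hook; the list never owns or copies nodes
        l.push_back(b);
        l.erase(l.iterator_to(a)); // O(1) unlink through the hook
        l.clear(); // unlink remaining nodes before they go out of scope
    }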
Future<Void> fRunner;
AsyncTrigger wakeRunner;
Promise<Void> brokenOnDestruct;
// Used for debugging, can roll over without issue
unsigned int releaseDebugID;
ACTOR static Future<Void> handleRelease(PriorityMultiLock* self, Future<Void> f, Priority* priority) {
state [[maybe_unused]] unsigned int id = self->releaseDebugID++;
pml_debug_printf("%f handleRelease self=%p id=%u start \n", now(), self, id);
ACTOR static void handleRelease(Reference<PriorityMultiLock> self, Priority* priority, Future<Void> holder) {
pml_debug_printf("%f handleRelease self=%p start\n", now(), self.getPtr());
try {
wait(f);
pml_debug_printf("%f handleRelease self=%p id=%u success\n", now(), self, id);
wait(holder);
pml_debug_printf("%f handleRelease self=%p success\n", now(), self.getPtr());
} catch (Error& e) {
pml_debug_printf("%f handleRelease self=%p id=%u error %s\n", now(), self, id, e.what());
if (e.code() == error_code_actor_cancelled) {
throw;
}
pml_debug_printf("%f handleRelease self=%p error %s\n", now(), self.getPtr(), e.what());
}
pml_debug_printf("lock release line %d priority %d %s\n",
__LINE__,
(int)(priority - &self->priorities.front()),
self->toString().c_str());
pml_debug_printf("lock release priority %d %s\n", (int)(priority->priority), self->toString().c_str());
pml_debug_printf("%f handleRelease self=%p id=%u releasing\n", now(), self, id);
pml_debug_printf("%f handleRelease self=%p releasing\n", now(), self.getPtr());
++self->available;
priority->runners -= 1;
// If there are any waiters, trigger the runner loop
if (self->waiting > 0 || self->runners.size() > 1000) {
if (self->waiting > 0) {
self->wakeRunner.trigger();
}
return Void();
}
void addRunner(Lock& lock, Priority* p) {
p->runners += 1;
void addRunner(Lock& lock, Priority* priority) {
priority->runners += 1;
--available;
runners.push_back(handleRelease(this, lock.promise.getFuture(), p));
handleRelease(Reference<PriorityMultiLock>::addRef(this), priority, lock.promise.getFuture());
}
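The addRef call in addRunner is what keeps the lock alive for the detached actor: an ACTOR declared void hands back no Future for anyone to hold, so handleRelease pins its owner itself via the Reference, and the PriorityMultiLock cannot be destroyed while a lock holder is still outstanding. A generic sketch of that pattern under the same assumptions (Owner and onDone are hypothetical names):

    ACTOR static void fireAndForget(Reference<Owner> self, Future<Void> work) {
        try {
            wait(work);
        } catch (Error& e) {
            if (e.code() == error_code_actor_cancelled) {
                throw;
            }
            // other errors just end the wait; the cleanup below still runs
        }
        self->onDone(); // safe: the Reference kept self alive
    }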
// Current maximum running tasks for the specified priority, which must have waiters
@ -278,76 +268,50 @@ private:
}
ACTOR static Future<Void> runner(PriorityMultiLock* self) {
state int sinceYield = 0;
state Future<Void> error = self->brokenOnDestruct.getFuture();
// Priority to try to run tasks from next
state int priority = 0;
state WaitingPrioritiesList::iterator p = self->waitingPriorities.end();
loop {
pml_debug_printf(
"runner loop start line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str());
// Cleanup finished runner futures at the front of the runner queue.
while (!self->runners.empty() && self->runners.front().isReady()) {
self->runners.pop_front();
}
pml_debug_printf("runner loop start priority=%d %s\n", p->priority, self->toString().c_str());
// Wait for a runner to release its lock
pml_debug_printf(
"runner loop waitTrigger line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str());
pml_debug_printf("runner loop waitTrigger priority=%d %s\n", p->priority, self->toString().c_str());
wait(self->wakeRunner.onTrigger());
pml_debug_printf(
"%f runner loop wake line %d priority=%d %s\n", now(), __LINE__, priority, self->toString().c_str());
if (++sinceYield == 100) {
sinceYield = 0;
pml_debug_printf(
" runner waitDelay line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str());
wait(delay(0));
pml_debug_printf(
" runner afterDelay line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str());
}
pml_debug_printf("%f runner loop wake priority=%d %s\n", now(), p->priority, self->toString().c_str());
// While there are available slots and there are waiters, launch tasks
while (self->available > 0 && self->waiting > 0) {
pml_debug_printf(
" launch loop start line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str());
Priority* pPriority;
pml_debug_printf(" launch loop start priority=%d %s\n", p->priority, self->toString().c_str());
// Find the next priority with waiters and capacity. There must be at least one.
loop {
// Rotate to next priority
if (++priority == self->priorities.size()) {
priority = 0;
if (p == self->waitingPriorities.end()) {
p = self->waitingPriorities.begin();
}
pPriority = &self->priorities[priority];
pml_debug_printf(" launch loop scan priority=%d %s\n", p->priority, self->toString().c_str());
pml_debug_printf(" launch loop scan line %d priority=%d %s\n",
__LINE__,
priority,
self->toString().c_str());
if (!pPriority->queue.empty() && pPriority->runners < self->currentCapacity(pPriority->weight)) {
if (!p->queue.empty() && p->runners < self->currentCapacity(p->weight)) {
break;
}
++p;
}
Queue& queue = pPriority->queue;
Queue& queue = p->queue;
Waiter w = queue.front();
queue.pop_front();
// If this priority is now empty, subtract its weight from the total pending weights
// If this priority is now empty, subtract its weight from the total pending weights and remove it
// from the waitingPriorities list
Priority* pPriority = &*p;
if (queue.empty()) {
p = self->waitingPriorities.erase(p);
self->totalPendingWeights -= pPriority->weight;
pml_debug_printf(" emptied priority line %d priority=%d %s\n",
__LINE__,
priority,
self->toString().c_str());
pml_debug_printf(
" emptied priority priority=%d %s\n", pPriority->priority, self->toString().c_str());
}
--self->waiting;
@ -365,10 +329,9 @@ private:
self->addRunner(lock, pPriority);
}
pml_debug_printf(" launched line %d alreadyDone=%d priority=%d %s\n",
__LINE__,
pml_debug_printf(" launched alreadyDone=%d priority=%d %s\n",
!lock.promise.canBeSet(),
priority,
pPriority->priority,
self->toString().c_str());
}
}
