Merge pull request #3785 from apple/release-6.3

Merge Release 6.3 to master
Meng Xu 2020-09-17 14:43:56 -07:00 committed by GitHub
commit cf69f455a9
9 changed files with 290 additions and 126 deletions

View File

@@ -1,6 +1,5 @@
#!/bin/bash
SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
pkill fdbserver
ulimit -S -c unlimited
unset FDB_NETWORK_OPTION_EXTERNAL_CLIENT_DIRECTORY
@@ -8,4 +7,4 @@ WORKDIR="$(pwd)/tmp/$$"
if [ ! -d "${WORKDIR}" ] ; then
mkdir -p "${WORKDIR}"
fi
DEBUGLEVEL=0 DISPLAYERROR=1 RANDOMTEST=1 WORKDIR="${WORKDIR}" FDBSERVERPORT="${PORT_FDBSERVER:-4500}" ${SCRIPTDIR}/bindingTestScript.sh 1
DEBUGLEVEL=0 DISPLAYERROR=1 RANDOMTEST=1 WORKDIR="${WORKDIR}" ${SCRIPTDIR}/bindingTestScript.sh 1

View File

@@ -7,7 +7,7 @@ SCRIPTID="${$}"
SAVEONERROR="${SAVEONERROR:-1}"
PYTHONDIR="${BINDIR}/tests/python"
testScript="${BINDIR}/tests/bindingtester/run_binding_tester.sh"
VERSION="1.6"
VERSION="1.8"
source ${SCRIPTDIR}/localClusterStart.sh
@@ -23,19 +23,22 @@ cycles="${1}"
if [ "${DEBUGLEVEL}" -gt 0 ]
then
echo "Work dir: ${WORKDIR}"
echo "Bin dir: ${BINDIR}"
echo "Log dir: ${LOGDIR}"
echo "Python path: ${PYTHONDIR}"
echo "Lib dir: ${LIBDIR}"
echo "Server port: ${FDBSERVERPORT}"
echo "Script Id: ${SCRIPTID}"
echo "Version: ${VERSION}"
echo "Work dir: ${WORKDIR}"
echo "Bin dir: ${BINDIR}"
echo "Log dir: ${LOGDIR}"
echo "Python path: ${PYTHONDIR}"
echo "Lib dir: ${LIBDIR}"
echo "Cluster String: ${CLUSTERSTRING}"
echo "Script Id: ${SCRIPTID}"
echo "Version: ${VERSION}"
fi
# Begin the cluster using the logic in localClusterStart.sh.
startCluster
# Stop the cluster on exit
trap "stopCluster" EXIT
# Display user message
if [ "${status}" -ne 0 ]; then
:
@@ -58,8 +61,8 @@ fi
# Display directory and log information, if an error occurred
if [ "${status}" -ne 0 ]
then
ls "${WORKDIR}" > "${LOGDIR}/dir.log"
ps -eafw > "${LOGDIR}/process-preclean.log"
ls "${WORKDIR}" &> "${LOGDIR}/dir.log"
ps -eafwH &> "${LOGDIR}/process-preclean.log"
if [ -f "${FDBCONF}" ]; then
cp -f "${FDBCONF}" "${LOGDIR}/"
fi
@@ -71,10 +74,15 @@ fi
# Save debug information files, environment, and log information, if an error occurred
if [ "${status}" -ne 0 ] && [ "${SAVEONERROR}" -gt 0 ]; then
ps -eafw > "${LOGDIR}/process-exit.log"
netstat -na > "${LOGDIR}/netstat.log"
df -h > "${LOGDIR}/disk.log"
env > "${LOGDIR}/env.log"
ps -eafwH &> "${LOGDIR}/process-exit.log"
netstat -na &> "${LOGDIR}/netstat.log"
df -h &> "${LOGDIR}/disk.log"
env &> "${LOGDIR}/env.log"
fi
# Stop the cluster
if stopCluster; then
unset FDBSERVERID
fi
exit "${status}"
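
The rewritten bindingTestScript.sh now tears the cluster down on every path: an EXIT trap registered right after startCluster, plus the explicit stopCluster call above, which also clears FDBSERVERID on success. A minimal sketch of that cleanup pattern, with stand-in start/stop functions rather than the real ones sourced from localClusterStart.sh:

#!/bin/bash
# --- illustrative sketch, not part of the diff ---
startCluster() { FDBSERVERID="12345"; echo "started (${FDBSERVERID})"; }
stopCluster()  { echo "stopped (${FDBSERVERID:-already gone})"; }

startCluster
# The trap fires when the script exits, including early error exits,
# so the cluster is stopped even if the tests below fail part way through.
trap "stopCluster" EXIT

status=0   # ... run tests, set status ...

# Explicit stop as well; clearing FDBSERVERID marks the server as already handled.
if stopCluster; then
    unset FDBSERVERID
fi
exit "${status}"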

View File

@@ -5,14 +5,26 @@ WORKDIR="${WORKDIR:-${SCRIPTDIR}/tmp/fdb.work}"
LOGDIR="${WORKDIR}/log"
ETCDIR="${WORKDIR}/etc"
BINDIR="${BINDIR:-${SCRIPTDIR}}"
FDBSERVERPORT="${FDBSERVERPORT:-4500}"
FDBPORTSTART="${FDBPORTSTART:-4000}"
SERVERCHECKS="${SERVERCHECKS:-10}"
CONFIGUREWAIT="${CONFIGUREWAIT:-240}"
FDBCONF="${ETCDIR}/fdb.cluster"
LOGFILE="${LOGFILE:-${LOGDIR}/startcluster.log}"
AUDITCLUSTER="${AUDITCLUSTER:-0}"
AUDITLOG="${AUDITLOG:-/tmp/audit-cluster.log}"
# Initialize the variables
status=0
messagetime=0
messagecount=0
let index2="${RANDOM} % 256"
let index3="${RANDOM} % 256"
let index4="(${RANDOM} % 255) + 1"
let FDBPORT="(${RANDOM} % 1000) + ${FDBPORTSTART}"
# Define a random ip address and port on localhost
IPADDRESS="127.${index2}.${index3}.${index4}"
CLUSTERSTRING="${IPADDRESS}:${FDBPORT}"
function log
{
@@ -92,29 +104,32 @@ function displayMessage
}
# Create the directories used by the server.
function createDirectories {
function createDirectories
{
local status=0
# Display user message
if ! displayMessage "Creating directories"
then
echo 'Failed to display user message'
let status="${status} + 1"
elif ! mkdir -p "${LOGDIR}" "${ETCDIR}"
then
log "Failed to create directories"
let status="${status} + 1"
# Display user message
elif ! displayMessage "Setting file permissions"
then
log 'Failed to display user message'
let status="${status} + 1"
elif ! chmod 755 "${BINDIR}/fdbserver" "${BINDIR}/fdbcli"
then
log "Failed to set file permissions"
let status="${status} + 1"
else
while read filepath
do
@@ -137,7 +152,10 @@ function createDirectories {
}
# Create a cluster file for the local cluster.
function createClusterFile {
function createClusterFile
{
local status=0
if [ "${status}" -ne 0 ]; then
:
# Display user message
@@ -148,7 +166,7 @@ function createClusterFile {
else
description=$(LC_CTYPE=C tr -dc A-Za-z0-9 < /dev/urandom 2> /dev/null | head -c 8)
random_str=$(LC_CTYPE=C tr -dc A-Za-z0-9 < /dev/urandom 2> /dev/null | head -c 8)
echo "$description:$random_str@127.0.0.1:${FDBSERVERPORT}" > "${FDBCONF}"
echo "${description}:${random_str}@${CLUSTERSTRING}" > "${FDBCONF}"
fi
if [ "${status}" -ne 0 ]; then
@@ -161,8 +179,51 @@ function createClusterFile {
return ${status}
}
# Stop the Cluster from running.
function stopCluster
{
local status=0
# Add an audit entry, if enabled
if [ "${AUDITCLUSTER}" -gt 0 ]; then
printf '%-15s (%6s) Stopping cluster %-20s (%6s)\n' "$(date +'%Y-%m-%d %H:%M:%S')" "${$}" "${CLUSTERSTRING}" "${FDBSERVERID}" >> "${AUDITLOG}"
fi
if [ -z "${FDBSERVERID}" ]; then
log 'FDB Server process is not defined'
let status="${status} + 1"
elif ! kill -0 "${FDBSERVERID}"; then
log "Failed to locate FDB Server process (${FDBSERVERID})"
let status="${status} + 1"
elif "${BINDIR}/fdbcli" -C "${FDBCONF}" --exec "kill; kill ${CLUSTERSTRING}; sleep 3" --timeout 120 &>> "${LOGDIR}/fdbcli-kill.log"
then
# Ensure that process is dead
if ! kill -0 "${FDBSERVERID}" 2> /dev/null; then
log "Killed cluster (${FDBSERVERID}) via cli"
elif ! kill -9 "${FDBSERVERID}"; then
log "Failed to kill FDB Server process (${FDBSERVERID}) via cli or kill command"
let status="${status} + 1"
else
log "Forcibly killed FDB Server process (${FDBSERVERID}) since cli failed"
fi
elif ! kill -9 "${FDBSERVERID}"; then
log "Failed to forcibly kill FDB Server process (${FDBSERVERID})"
let status="${status} + 1"
else
log "Forcibly killed FDB Server process (${FDBSERVERID})"
fi
return "${status}"
}
# Start the server running.
function startFdbServer {
function startFdbServer
{
local status=0
# Add an audit entry, if enabled
if [ "${AUDITCLUSTER}" -gt 0 ]; then
printf '%-15s (%6s) Starting cluster %-20s\n' "$(date +'%Y-%m-%d %H:%M:%S')" "${$}" "${CLUSTERSTRING}" >> "${AUDITLOG}"
fi
if [ "${status}" -ne 0 ]; then
:
elif ! displayMessage "Starting Fdb Server"
@@ -170,25 +231,29 @@ function startFdbServer {
log 'Failed to display user message'
let status="${status} + 1"
elif ! "${BINDIR}/fdbserver" -C "${FDBCONF}" -p "auto:${FDBSERVERPORT}" -L "${LOGDIR}" -d "${WORKDIR}/fdb/$$" &> "${LOGDIR}/fdbserver.log" &
elif ! "${BINDIR}/fdbserver" --knob_disable_posix_kernel_aio=1 -C "${FDBCONF}" -p "${CLUSTERSTRING}" -L "${LOGDIR}" -d "${WORKDIR}/fdb/${$}" &> "${LOGDIR}/fdbserver.log" &
then
log "Failed to start FDB Server"
# Maybe the server is already running
FDBSERVERID="$(pidof fdbserver)"
let status="${status} + 1"
else
FDBSERVERID="${!}"
fi
if ! kill -0 ${FDBSERVERID} ; then
log "FDB Server start failed."
if [ -z "${FDBSERVERID}" ]; then
log "FDB Server start failed because no process"
let status="${status} + 1"
elif ! kill -0 "${FDBSERVERID}" ; then
log "FDB Server start failed because process terminated unexpectedly"
let status="${status} + 1"
fi
return ${status}
}
function getStatus {
function getStatus
{
local status=0
if [ "${status}" -ne 0 ]; then
:
elif ! date &>> "${LOGDIR}/fdbclient.log"
@@ -209,35 +274,41 @@ function getStatus {
}
# Verify that the cluster is available.
function verifyAvailable {
function verifyAvailable
{
local status=0
if [ -z "${FDBSERVERID}" ]; then
log "FDB Server process is not defined."
let status="${status} + 1"
# Verify that the server is running.
if ! kill -0 "${FDBSERVERID}"
elif ! kill -0 "${FDBSERVERID}"
then
log "FDB server process (${FDBSERVERID}) is not running"
let status="${status} + 1"
return 1
# Display user message.
elif ! displayMessage "Checking cluster availability"
then
log 'Failed to display user message'
let status="${status} + 1"
return 1
# Determine if status json says the database is available.
else
avail=`"${BINDIR}/fdbcli" -C "${FDBCONF}" --exec 'status json' --timeout 10 2> /dev/null | grep -E '"database_available"|"available"' | grep 'true'`
avail=`"${BINDIR}/fdbcli" -C "${FDBCONF}" --exec 'status json' --timeout "${SERVERCHECKS}" 2> /dev/null | grep -E '"database_available"|"available"' | grep 'true'`
log "Avail value: ${avail}" "${DEBUGLEVEL}"
if [[ -n "${avail}" ]] ; then
return 0
:
else
return 1
let status="${status} + 1"
fi
fi
return "${status}"
}
# Configure the database on the server.
function createDatabase {
function createDatabase
{
local status=0
if [ "${status}" -ne 0 ]; then
:
# Ensure that the server is running
@@ -262,7 +333,7 @@ function createDatabase {
# Configure the database.
else
"${BINDIR}/fdbcli" -C "${FDBCONF}" --exec 'configure new single memory; status' --timeout 240 --log --log-dir "${LOGDIR}" &>> "${LOGDIR}/fdbclient.log"
"${BINDIR}/fdbcli" -C "${FDBCONF}" --exec 'configure new single memory; status' --timeout "${CONFIGUREWAIT}" --log --log-dir "${LOGDIR}" &>> "${LOGDIR}/fdbclient.log"
if ! displayMessage "Checking if config succeeded"
then
@@ -270,7 +341,7 @@ function createDatabase {
fi
iteration=0
while [[ "${iteration}" -lt 10 ]] && ! verifyAvailable
while [[ "${iteration}" -lt "${SERVERCHECKS}" ]] && ! verifyAvailable
do
log "Database not created (iteration ${iteration})."
let iteration="${iteration} + 1"
@@ -290,7 +361,10 @@ function createDatabase {
}
# Begin the local cluster from scratch.
function startCluster {
function startCluster
{
local status=0
if [ "${status}" -ne 0 ]; then
:
elif ! createDirectories
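
In localClusterStart.sh the fixed FDBSERVERPORT is gone: each run now builds CLUSTERSTRING from a random 127.x.y.z loopback address plus a port offset from FDBPORTSTART, so several test clusters can coexist on one host without colliding, and createClusterFile embeds that string in fdb.cluster. A standalone sketch of the same selection logic (variables copied from the diff above, with a hypothetical echo at the end to show the resulting cluster-file line):

#!/bin/bash
# --- illustrative sketch, not part of the diff ---
FDBPORTSTART="${FDBPORTSTART:-4000}"
let index2="${RANDOM} % 256"
let index3="${RANDOM} % 256"
let index4="(${RANDOM} % 255) + 1"               # last octet in 1..255
let FDBPORT="(${RANDOM} % 1000) + ${FDBPORTSTART}"
IPADDRESS="127.${index2}.${index3}.${index4}"
CLUSTERSTRING="${IPADDRESS}:${FDBPORT}"

# Cluster file contents, built the same way createClusterFile does it:
description=$(LC_CTYPE=C tr -dc A-Za-z0-9 < /dev/urandom 2> /dev/null | head -c 8)
random_str=$(LC_CTYPE=C tr -dc A-Za-z0-9 < /dev/urandom 2> /dev/null | head -c 8)
echo "${description}:${random_str}@${CLUSTERSTRING}"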

View File

@@ -5,6 +5,8 @@ Release Notes
6.3.5
=====
* Report missing old tlogs information when in recovery before storage servers are fully recovered. `(PR #3706) <https://github.com/apple/foundationdb/pull/3706>`_
Features
--------

View File

@@ -20,6 +20,7 @@
#include "boost/lexical_cast.hpp"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/Status.h"
#include "fdbclient/StatusClient.h"
#include "fdbclient/DatabaseContext.h"
@@ -1235,14 +1236,54 @@ void printStatus(StatusObjectReader statusObj, StatusClient::StatusLevel level,
int minLoss = std::min(availLoss, dataLoss);
const char *faultDomain = machinesAreZones ? "machine" : "zone";
if (minLoss == 1)
outputString += format("1 %s", faultDomain);
else
outputString += format("%d %ss", minLoss, faultDomain);
outputString += format("%d %ss", minLoss, faultDomain);
if (dataLoss > availLoss){
outputString += format(" (%d without data loss)", dataLoss);
}
if (dataLoss == -1) {
ASSERT_WE_THINK(availLoss == -1);
outputString += format(
"\n\n Warning: the database may have data loss and availability loss. Please restart "
"following tlog interfaces, otherwise storage servers may never be able to catch "
"up.\n");
StatusObjectReader logs;
if (statusObjCluster.has("logs")) {
for (StatusObjectReader logEpoch : statusObjCluster.last().get_array()) {
bool possiblyLosingData;
if (logEpoch.get("possibly_losing_data", possiblyLosingData) &&
!possiblyLosingData) {
continue;
}
// Current epoch doesn't have an end version.
int64_t epoch, beginVersion, endVersion = invalidVersion;
bool current;
logEpoch.get("epoch", epoch);
logEpoch.get("begin_version", beginVersion);
logEpoch.get("end_version", endVersion);
logEpoch.get("current", current);
std::string missing_log_interfaces;
if (logEpoch.has("log_interfaces")) {
for (StatusObjectReader logInterface : logEpoch.last().get_array()) {
bool healthy;
std::string address, id;
if (logInterface.get("healthy", healthy) && !healthy) {
logInterface.get("id", id);
logInterface.get("address", address);
missing_log_interfaces += format("%s,%s ", id.c_str(), address.c_str());
}
}
}
outputString += format(
" %s log epoch: %ld begin: %ld end: %s, missing "
"log interfaces(id,address): %s\n",
current ? "Current" : "Old", epoch, beginVersion,
endVersion == invalidVersion ? "(unknown)" : format("%ld", endVersion).c_str(),
missing_log_interfaces.c_str());
}
}
}
}
}
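
Filled in with the sample values from the status schema below (epoch 1, begin_version 23, end_version 112315141, one unhealthy tlog 7f8d623d0cb9966e at 1.2.3.4:1234), the warning emitted by this new block would read roughly:

  Warning: the database may have data loss and availability loss. Please restart following tlog interfaces, otherwise storage servers may never be able to catch up.

  Old log epoch: 1 begin: 23 end: 112315141, missing log interfaces(id,address): 7f8d623d0cb9966e,1.2.3.4:1234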

View File

@@ -33,6 +33,7 @@
#include "fdbclient/DatabaseContext.h"
#include "fdbrpc/simulator.h"
#include "fdbclient/StatusClient.h"
#include "flow/Trace.h"
#include "flow/UnitTest.h"
#include "fdbrpc/ReplicationPolicy.h"
#include "fdbrpc/Replication.h"

View File

@@ -278,15 +278,20 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"run_loop_busy":0.2
}
},
"old_logs":[
"logs":[
{
"logs":[
"log_interfaces":[
{
"id":"7f8d623d0cb9966e",
"healthy":true,
"address":"1.2.3.4:1234"
}
],
"epoch":1,
"current":false,
"begin_version":23,
"end_version":112315141,
"possibly_losing_data":true,
"log_replication_factor":3,
"log_write_anti_quorum":0,
"log_fault_tolerance":2,

View File

@@ -2012,78 +2012,98 @@ ACTOR static Future<JsonBuilderObject> clusterSummaryStatisticsFetcher(WorkerEve
return statusObj;
}
static JsonBuilderArray oldTlogFetcher(int* oldLogFaultTolerance, Reference<AsyncVar<ServerDBInfo>> db, std::unordered_map<NetworkAddress, WorkerInterface> const& address_workers) {
JsonBuilderArray oldTlogsArray;
static JsonBuilderObject tlogFetcher(int* logFaultTolerance, const std::vector<TLogSet>& tLogs,
std::unordered_map<NetworkAddress, WorkerInterface> const& address_workers) {
JsonBuilderObject statusObj;
JsonBuilderArray logsObj;
Optional<int32_t> sat_log_replication_factor, sat_log_write_anti_quorum, sat_log_fault_tolerance,
log_replication_factor, log_write_anti_quorum, log_fault_tolerance, remote_log_replication_factor,
remote_log_fault_tolerance;
if(db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS) {
for(auto it : db->get().logSystemConfig.oldTLogs) {
JsonBuilderObject statusObj;
JsonBuilderArray logsObj;
Optional<int32_t> sat_log_replication_factor, sat_log_write_anti_quorum, sat_log_fault_tolerance, log_replication_factor, log_write_anti_quorum, log_fault_tolerance, remote_log_replication_factor, remote_log_fault_tolerance;
int maxFaultTolerance = 0;
int maxFaultTolerance = 0;
for(int i = 0; i < it.tLogs.size(); i++) {
int failedLogs = 0;
for(auto& log : it.tLogs[i].tLogs) {
JsonBuilderObject logObj;
bool failed = !log.present() || !address_workers.count(log.interf().address());
logObj["id"] = log.id().shortString();
logObj["healthy"] = !failed;
if(log.present()) {
logObj["address"] = log.interf().address().toString();
}
logsObj.push_back(logObj);
if(failed) {
failedLogs++;
}
}
maxFaultTolerance = std::max(maxFaultTolerance, it.tLogs[i].tLogReplicationFactor - 1 - it.tLogs[i].tLogWriteAntiQuorum - failedLogs);
if(it.tLogs[i].isLocal && it.tLogs[i].locality == tagLocalitySatellite) {
sat_log_replication_factor = it.tLogs[i].tLogReplicationFactor;
sat_log_write_anti_quorum = it.tLogs[i].tLogWriteAntiQuorum;
sat_log_fault_tolerance = it.tLogs[i].tLogReplicationFactor - 1 - it.tLogs[i].tLogWriteAntiQuorum - failedLogs;
}
else if(it.tLogs[i].isLocal) {
log_replication_factor = it.tLogs[i].tLogReplicationFactor;
log_write_anti_quorum = it.tLogs[i].tLogWriteAntiQuorum;
log_fault_tolerance = it.tLogs[i].tLogReplicationFactor - 1 - it.tLogs[i].tLogWriteAntiQuorum - failedLogs;
}
else {
remote_log_replication_factor = it.tLogs[i].tLogReplicationFactor;
remote_log_fault_tolerance = it.tLogs[i].tLogReplicationFactor - 1 - failedLogs;
}
for (int i = 0; i < tLogs.size(); i++) {
int failedLogs = 0;
for (auto& log : tLogs[i].tLogs) {
JsonBuilderObject logObj;
bool failed = !log.present() || !address_workers.count(log.interf().address());
logObj["id"] = log.id().shortString();
logObj["healthy"] = !failed;
if (log.present()) {
logObj["address"] = log.interf().address().toString();
}
*oldLogFaultTolerance = std::min(*oldLogFaultTolerance, maxFaultTolerance);
statusObj["logs"] = logsObj;
if (sat_log_replication_factor.present())
statusObj["satellite_log_replication_factor"] = sat_log_replication_factor.get();
if (sat_log_write_anti_quorum.present())
statusObj["satellite_log_write_anti_quorum"] = sat_log_write_anti_quorum.get();
if (sat_log_fault_tolerance.present())
statusObj["satellite_log_fault_tolerance"] = sat_log_fault_tolerance.get();
if (log_replication_factor.present())
statusObj["log_replication_factor"] = log_replication_factor.get();
if (log_write_anti_quorum.present())
statusObj["log_write_anti_quorum"] = log_write_anti_quorum.get();
if (log_fault_tolerance.present())
statusObj["log_fault_tolerance"] = log_fault_tolerance.get();
if (remote_log_replication_factor.present())
statusObj["remote_log_replication_factor"] = remote_log_replication_factor.get();
if (remote_log_fault_tolerance.present())
statusObj["remote_log_fault_tolerance"] = remote_log_fault_tolerance.get();
oldTlogsArray.push_back(statusObj);
logsObj.push_back(logObj);
if (failed) {
failedLogs++;
}
}
// The log generation's fault tolerance is the maximum tlog fault tolerance of each region.
maxFaultTolerance =
std::max(maxFaultTolerance, tLogs[i].tLogReplicationFactor - 1 - tLogs[i].tLogWriteAntiQuorum - failedLogs);
if (tLogs[i].isLocal && tLogs[i].locality == tagLocalitySatellite) {
sat_log_replication_factor = tLogs[i].tLogReplicationFactor;
sat_log_write_anti_quorum = tLogs[i].tLogWriteAntiQuorum;
sat_log_fault_tolerance = tLogs[i].tLogReplicationFactor - 1 - tLogs[i].tLogWriteAntiQuorum - failedLogs;
} else if (tLogs[i].isLocal) {
log_replication_factor = tLogs[i].tLogReplicationFactor;
log_write_anti_quorum = tLogs[i].tLogWriteAntiQuorum;
log_fault_tolerance = tLogs[i].tLogReplicationFactor - 1 - tLogs[i].tLogWriteAntiQuorum - failedLogs;
} else {
remote_log_replication_factor = tLogs[i].tLogReplicationFactor;
remote_log_fault_tolerance = tLogs[i].tLogReplicationFactor - 1 - failedLogs;
}
}
*logFaultTolerance = std::min(*logFaultTolerance, maxFaultTolerance);
statusObj["log_interfaces"] = logsObj;
// We may lose logs in this log generation, storage servers may never be able to catch up this log
// generation.
statusObj["possibly_losing_data"] = maxFaultTolerance < 0;
return oldTlogsArray;
if (sat_log_replication_factor.present())
statusObj["satellite_log_replication_factor"] = sat_log_replication_factor.get();
if (sat_log_write_anti_quorum.present())
statusObj["satellite_log_write_anti_quorum"] = sat_log_write_anti_quorum.get();
if (sat_log_fault_tolerance.present()) statusObj["satellite_log_fault_tolerance"] = sat_log_fault_tolerance.get();
if (log_replication_factor.present()) statusObj["log_replication_factor"] = log_replication_factor.get();
if (log_write_anti_quorum.present()) statusObj["log_write_anti_quorum"] = log_write_anti_quorum.get();
if (log_fault_tolerance.present()) statusObj["log_fault_tolerance"] = log_fault_tolerance.get();
if (remote_log_replication_factor.present())
statusObj["remote_log_replication_factor"] = remote_log_replication_factor.get();
if (remote_log_fault_tolerance.present())
statusObj["remote_log_fault_tolerance"] = remote_log_fault_tolerance.get();
return statusObj;
}
static JsonBuilderObject faultToleranceStatusFetcher(DatabaseConfiguration configuration, ServerCoordinators coordinators, std::vector<WorkerDetails>& workers, int extraTlogEligibleZones, int minReplicasRemaining, bool underMaintenance) {
static JsonBuilderArray tlogFetcher(int* logFaultTolerance, Reference<AsyncVar<ServerDBInfo>> db,
std::unordered_map<NetworkAddress, WorkerInterface> const& address_workers) {
JsonBuilderArray tlogsArray;
JsonBuilderObject tlogsStatus;
tlogsStatus = tlogFetcher(logFaultTolerance, db->get().logSystemConfig.tLogs, address_workers);
tlogsStatus["epoch"] = db->get().logSystemConfig.epoch;
tlogsStatus["current"] = true;
if (db->get().logSystemConfig.recoveredAt.present()) {
tlogsStatus["begin_version"] = db->get().logSystemConfig.recoveredAt.get();
}
tlogsArray.push_back(tlogsStatus);
for (auto it : db->get().logSystemConfig.oldTLogs) {
JsonBuilderObject oldTlogsStatus = tlogFetcher(logFaultTolerance, it.tLogs, address_workers);
oldTlogsStatus["epoch"] = it.epoch;
oldTlogsStatus["current"] = false;
oldTlogsStatus["begin_version"] = it.epochBegin;
oldTlogsStatus["end_version"] = it.epochEnd;
tlogsArray.push_back(oldTlogsStatus);
}
return tlogsArray;
}
static JsonBuilderObject faultToleranceStatusFetcher(DatabaseConfiguration configuration,
ServerCoordinators coordinators,
std::vector<WorkerDetails>& workers, int extraTlogEligibleZones,
int minReplicasRemaining, int oldLogFaultTolerance,
bool underMaintenance) {
JsonBuilderObject statusObj;
// without losing data
@@ -2115,17 +2135,18 @@ static JsonBuilderObject faultToleranceStatusFetcher(DatabaseConfiguration confi
}
maxCoordinatorZoneFailures += 1;
}
// max zone failures that we can tolerate to not lose data
int zoneFailuresWithoutLosingData = std::min(maxZoneFailures, maxCoordinatorZoneFailures);
if (minReplicasRemaining >= 0){
zoneFailuresWithoutLosingData = std::min(zoneFailuresWithoutLosingData, minReplicasRemaining - 1);
}
statusObj["max_zone_failures_without_losing_data"] = std::max(zoneFailuresWithoutLosingData, 0);
// without losing availability
statusObj["max_zone_failures_without_losing_availability"] = std::max(std::min(extraTlogEligibleZones, zoneFailuresWithoutLosingData), 0);
// oldLogFaultTolerance means max failures we can tolerate to lose logs data. -1 means we lose data or availability.
zoneFailuresWithoutLosingData = std::max(std::min(zoneFailuresWithoutLosingData, oldLogFaultTolerance), -1);
statusObj["max_zone_failures_without_losing_data"] = zoneFailuresWithoutLosingData;
statusObj["max_zone_failures_without_losing_availability"] =
std::max(std::min(extraTlogEligibleZones, zoneFailuresWithoutLosingData), -1);
return statusObj;
}
@@ -2521,14 +2542,16 @@ ACTOR Future<StatusReply> clusterGetStatus(
futures2.push_back(clusterSummaryStatisticsFetcher(pMetrics, storageServerFuture, tLogFuture, &status_incomplete_reasons));
state std::vector<JsonBuilderObject> workerStatuses = wait(getAll(futures2));
int oldLogFaultTolerance = 100;
if(db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS && db->get().logSystemConfig.oldTLogs.size() > 0) {
statusObj["old_logs"] = oldTlogFetcher(&oldLogFaultTolerance, db, address_workers);
int logFaultTolerance = 100;
if (db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS) {
statusObj["logs"] = tlogFetcher(&logFaultTolerance, db, address_workers);
}
if(configuration.present()) {
int extraTlogEligibleZones = getExtraTLogEligibleZones(workers, configuration.get());
statusObj["fault_tolerance"] = faultToleranceStatusFetcher(configuration.get(), coordinators, workers, extraTlogEligibleZones, minReplicasRemaining, loadResult.present() && loadResult.get().healthyZone.present());
statusObj["fault_tolerance"] = faultToleranceStatusFetcher(
configuration.get(), coordinators, workers, extraTlogEligibleZones, minReplicasRemaining,
logFaultTolerance, loadResult.present() && loadResult.get().healthyZone.present());
}
state JsonBuilderObject configObj =
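
In faultToleranceStatusFetcher, the log fault tolerance computed by tlogFetcher (the minimum over the current and all old log generations) now caps max_zone_failures_without_losing_data, and both fields may report -1 instead of being clamped at 0 when data or availability may already be lost. A small bash sketch of the combined arithmetic with made-up inputs, mirroring the C++ above:

#!/bin/bash
# --- illustrative sketch, not part of the diff ---
min() { [ "$1" -le "$2" ] && echo "$1" || echo "$2"; }
max() { [ "$1" -ge "$2" ] && echo "$1" || echo "$2"; }

maxZoneFailures=2              # from the replication policy
maxCoordinatorZoneFailures=1   # from the coordinator layout
minReplicasRemaining=2         # -1 when unknown
extraTlogEligibleZones=1
logFaultTolerance=-1           # min over all log generations; negative means logs possibly lost

noDataLoss=$(min "${maxZoneFailures}" "${maxCoordinatorZoneFailures}")
if [ "${minReplicasRemaining}" -ge 0 ]; then
    noDataLoss=$(min "${noDataLoss}" "$((minReplicasRemaining - 1))")
fi
# New in this PR: fold in the log fault tolerance and allow -1 as a result.
noDataLoss=$(max "$(min "${noDataLoss}" "${logFaultTolerance}")" "-1")
echo "max_zone_failures_without_losing_data=${noDataLoss}"
echo "max_zone_failures_without_losing_availability=$(max "$(min "${extraTlogEligibleZones}" "${noDataLoss}")" "-1")"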

View File

@@ -55,8 +55,7 @@ struct AtomicOpsWorkload : TestWorkload {
ubsum = 0;
int64_t randNum = sharedRandomNumber / 10;
if(opType == -1)
opType = randNum % 8;
if (opType == -1) opType = randNum % 10;
switch(opType) {
case 0:
@@ -91,6 +90,18 @@
TEST(true); //Testing atomic ByteMax
opType = MutationRef::ByteMax;
break;
case 8:
TEST(true); // Testing atomic MinV2
opType = MutationRef::MinV2;
break;
case 9:
TEST(true); // Testing atomic AndV2
opType = MutationRef::AndV2;
break;
// case 10:
// TEST(true); // Testing atomic CompareAndClear Not supported yet
// opType = MutationRef::CompareAndClear
// break;
default:
ASSERT(false);
}