Merge pull request #3785 from apple/release-6.3
Merge Release 6.3 to master
commit cf69f455a9
@@ -1,6 +1,5 @@
#!/bin/bash
SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
pkill fdbserver
ulimit -S -c unlimited

unset FDB_NETWORK_OPTION_EXTERNAL_CLIENT_DIRECTORY
@@ -8,4 +7,4 @@ WORKDIR="$(pwd)/tmp/$$"
if [ ! -d "${WORKDIR}" ] ; then
    mkdir -p "${WORKDIR}"
fi
DEBUGLEVEL=0 DISPLAYERROR=1 RANDOMTEST=1 WORKDIR="${WORKDIR}" FDBSERVERPORT="${PORT_FDBSERVER:-4500}" ${SCRIPTDIR}/bindingTestScript.sh 1
DEBUGLEVEL=0 DISPLAYERROR=1 RANDOMTEST=1 WORKDIR="${WORKDIR}" ${SCRIPTDIR}/bindingTestScript.sh 1
@@ -7,7 +7,7 @@ SCRIPTID="${$}"
SAVEONERROR="${SAVEONERROR:-1}"
PYTHONDIR="${BINDIR}/tests/python"
testScript="${BINDIR}/tests/bindingtester/run_binding_tester.sh"
VERSION="1.6"
VERSION="1.8"

source ${SCRIPTDIR}/localClusterStart.sh
@@ -23,19 +23,22 @@ cycles="${1}"

if [ "${DEBUGLEVEL}" -gt 0 ]
then
    echo "Work dir: ${WORKDIR}"
    echo "Bin dir: ${BINDIR}"
    echo "Log dir: ${LOGDIR}"
    echo "Python path: ${PYTHONDIR}"
    echo "Lib dir: ${LIBDIR}"
    echo "Server port: ${FDBSERVERPORT}"
    echo "Script Id: ${SCRIPTID}"
    echo "Version: ${VERSION}"
    echo "Work dir: ${WORKDIR}"
    echo "Bin dir: ${BINDIR}"
    echo "Log dir: ${LOGDIR}"
    echo "Python path: ${PYTHONDIR}"
    echo "Lib dir: ${LIBDIR}"
    echo "Cluster String: ${CLUSTERSTRING}"
    echo "Script Id: ${SCRIPTID}"
    echo "Version: ${VERSION}"
fi

# Begin the cluster using the logic in localClusterStart.sh.
startCluster

# Stop the cluster on exit
trap "stopCluster" EXIT

# Display user message
if [ "${status}" -ne 0 ]; then
    :
@@ -58,8 +61,8 @@ fi
# Display directory and log information, if an error occurred
if [ "${status}" -ne 0 ]
then
    ls "${WORKDIR}" > "${LOGDIR}/dir.log"
    ps -eafw > "${LOGDIR}/process-preclean.log"
    ls "${WORKDIR}" &> "${LOGDIR}/dir.log"
    ps -eafwH &> "${LOGDIR}/process-preclean.log"
    if [ -f "${FDBCONF}" ]; then
        cp -f "${FDBCONF}" "${LOGDIR}/"
    fi
@@ -71,10 +74,15 @@ fi

# Save debug information files, environment, and log information, if an error occurred
if [ "${status}" -ne 0 ] && [ "${SAVEONERROR}" -gt 0 ]; then
    ps -eafw > "${LOGDIR}/process-exit.log"
    netstat -na > "${LOGDIR}/netstat.log"
    df -h > "${LOGDIR}/disk.log"
    env > "${LOGDIR}/env.log"
    ps -eafwH &> "${LOGDIR}/process-exit.log"
    netstat -na &> "${LOGDIR}/netstat.log"
    df -h &> "${LOGDIR}/disk.log"
    env &> "${LOGDIR}/env.log"
fi

# Stop the cluster
if stopCluster; then
    unset FDBSERVERID
fi

exit "${status}"
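Note: with the hunks above, the debug banner now reports the randomized cluster string rather than a fixed server port, and the work directory defaults come from localClusterStart.sh when not supplied. An illustrative invocation (paths and values assumed, not part of the diff):

    DEBUGLEVEL=1 DISPLAYERROR=1 RANDOMTEST=1 ./bindingTestScript.sh 1
    # With DEBUGLEVEL > 0 the script is expected to print lines such as
    #   Cluster String: 127.x.y.z:<random port>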
@@ -5,14 +5,26 @@ WORKDIR="${WORKDIR:-${SCRIPTDIR}/tmp/fdb.work}"
LOGDIR="${WORKDIR}/log"
ETCDIR="${WORKDIR}/etc"
BINDIR="${BINDIR:-${SCRIPTDIR}}"
FDBSERVERPORT="${FDBSERVERPORT:-4500}"
FDBPORTSTART="${FDBPORTSTART:-4000}"
SERVERCHECKS="${SERVERCHECKS:-10}"
CONFIGUREWAIT="${CONFIGUREWAIT:-240}"
FDBCONF="${ETCDIR}/fdb.cluster"
LOGFILE="${LOGFILE:-${LOGDIR}/startcluster.log}"
AUDITCLUSTER="${AUDITCLUSTER:-0}"
AUDITLOG="${AUDITLOG:-/tmp/audit-cluster.log}"

# Initialize the variables
status=0
messagetime=0
messagecount=0
let index2="${RANDOM} % 256"
let index3="${RANDOM} % 256"
let index4="(${RANDOM} % 255) + 1"
let FDBPORT="(${RANDOM} % 1000) + ${FDBPORTSTART}"
# Define a random ip address and port on localhost
IPADDRESS="127.${index2}.${index3}.${index4}"
CLUSTERSTRING="${IPADDRESS}:${FDBPORT}"


function log
{
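Note: every knob introduced above (FDBPORTSTART, SERVERCHECKS, CONFIGUREWAIT, AUDITCLUSTER, AUDITLOG) uses the ${VAR:-default} pattern, so it can be overridden from the environment of whatever sources this file. A minimal sketch of such an override (paths and values are assumptions for illustration):

    # Shift the random port window, allow more availability checks, and enable the audit log
    FDBPORTSTART=5000 SERVERCHECKS=20 AUDITCLUSTER=1 AUDITLOG=/tmp/my-audit.log \
        BINDIR=/path/to/fdb/bin ./bindingTestScript.sh 1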
@@ -92,29 +104,32 @@ function displayMessage
}

# Create the directories used by the server.
function createDirectories {
function createDirectories
{
    local status=0

    # Display user message
    if ! displayMessage "Creating directories"
    then
        echo 'Failed to display user message'
        let status="${status} + 1"


    elif ! mkdir -p "${LOGDIR}" "${ETCDIR}"
    then
        log "Failed to create directories"
        let status="${status} + 1"


    # Display user message
    elif ! displayMessage "Setting file permissions"
    then
        log 'Failed to display user message'
        let status="${status} + 1"


    elif ! chmod 755 "${BINDIR}/fdbserver" "${BINDIR}/fdbcli"
    then
        log "Failed to set file permissions"
        let status="${status} + 1"


    else
        while read filepath
        do
@@ -137,7 +152,10 @@ function createDirectories {
}

# Create a cluster file for the local cluster.
function createClusterFile {
function createClusterFile
{
    local status=0

    if [ "${status}" -ne 0 ]; then
        :
    # Display user message
@@ -148,7 +166,7 @@ function createClusterFile {
    else
        description=$(LC_CTYPE=C tr -dc A-Za-z0-9 < /dev/urandom 2> /dev/null | head -c 8)
        random_str=$(LC_CTYPE=C tr -dc A-Za-z0-9 < /dev/urandom 2> /dev/null | head -c 8)
        echo "$description:$random_str@127.0.0.1:${FDBSERVERPORT}" > "${FDBCONF}"
        echo "${description}:${random_str}@${CLUSTERSTRING}" > "${FDBCONF}"
    fi

    if [ "${status}" -ne 0 ]; then
@@ -161,8 +179,51 @@ function createClusterFile {
    return ${status}
}

# Stop the Cluster from running.
function stopCluster
{
    local status=0

    # Add an audit entry, if enabled
    if [ "${AUDITCLUSTER}" -gt 0 ]; then
        printf '%-15s (%6s) Stopping cluster %-20s (%6s): %s\n' "$(date +'%Y-%m-%d %H:%M:%S')" "${$}" "${CLUSTERSTRING}" "${FDBSERVERID}" >> "${AUDITLOG}"
    fi
    if [ -z "${FDBSERVERID}" ]; then
        log 'FDB Server process is not defined'
        let status="${status} + 1"
    elif ! kill -0 "${FDBSERVERID}"; then
        log "Failed to locate FDB Server process (${FDBSERVERID})"
        let status="${status} + 1"
    elif "${BINDIR}/fdbcli" -C "${FDBCONF}" --exec "kill; kill ${CLUSTERSTRING}; sleep 3" --timeout 120 &>> "${LOGDIR}/fdbcli-kill.log"
    then
        # Ensure that process is dead
        if ! kill -0 "${FDBSERVERID}" 2> /dev/null; then
            log "Killed cluster (${FDBSERVERID}) via cli"
        elif ! kill -9 "${FDBSERVERID}"; then
            log "Failed to kill FDB Server process (${FDBSERVERID}) via cli or kill command"
            let status="${status} + 1"
        else
            log "Forcibly killed FDB Server process (${FDBSERVERID}) since cli failed"
        fi
    elif ! kill -9 "${FDBSERVERID}"; then
        log "Failed to forcibly kill FDB Server process (${FDBSERVERID})"
        let status="${status} + 1"
    else
        log "Forcibly killed FDB Server process (${FDBSERVERID})"
    fi
    return "${status}"
}

# Start the server running.
function startFdbServer {
function startFdbServer
{
    local status=0

    # Add an audit entry, if enabled
    if [ "${AUDITCLUSTER}" -gt 0 ]; then
        printf '%-15s (%6s) Starting cluster %-20s\n' "$(date +'%Y-%m-%d %H:%M:%S')" "${$}" "${CLUSTERSTRING}" >> "${AUDITLOG}"
    fi

    if [ "${status}" -ne 0 ]; then
        :
    elif ! displayMessage "Starting Fdb Server"
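Note on the shutdown path in stopCluster above: in fdbcli, a bare `kill` fetches the list of killable processes and `kill <address>` then targets one of them, which is why the command string issues both before sleeping; the surrounding `kill -0` / `kill -9` calls are the fallback when the cli path does not stop the process. A standalone sketch of the same sequence (address and cluster-file path are placeholders):

    "${BINDIR}/fdbcli" -C "${FDBCONF}" --exec "kill; kill 127.10.20.30:4321; sleep 3" --timeout 120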
@@ -170,25 +231,29 @@ function startFdbServer {
        log 'Failed to display user message'
        let status="${status} + 1"

    elif ! "${BINDIR}/fdbserver" -C "${FDBCONF}" -p "auto:${FDBSERVERPORT}" -L "${LOGDIR}" -d "${WORKDIR}/fdb/$$" &> "${LOGDIR}/fdbserver.log" &
    elif ! "${BINDIR}/fdbserver" --knob_disable_posix_kernel_aio=1 -C "${FDBCONF}" -p "${CLUSTERSTRING}" -L "${LOGDIR}" -d "${WORKDIR}/fdb/${$}" &> "${LOGDIR}/fdbserver.log" &
    then
        log "Failed to start FDB Server"
        # Maybe the server is already running
        FDBSERVERID="$(pidof fdbserver)"
        let status="${status} + 1"
    else
        FDBSERVERID="${!}"
    fi

    if ! kill -0 ${FDBSERVERID} ; then
        log "FDB Server start failed."
    if [ -z "${FDBSERVERID}" ]; then
        log "FDB Server start failed because no process"
        let status="${status} + 1"
    elif ! kill -0 "${FDBSERVERID}" ; then
        log "FDB Server start failed because process terminated unexpectedly"
        let status="${status} + 1"
    fi

    return ${status}
}

function getStatus {
function getStatus
{
    local status=0

    if [ "${status}" -ne 0 ]; then
        :
    elif ! date &>> "${LOGDIR}/fdbclient.log"
@@ -209,35 +274,41 @@ function getStatus {
}

# Verify that the cluster is available.
function verifyAvailable {
function verifyAvailable
{
    local status=0

    if [ -z "${FDBSERVERID}" ]; then
        log "FDB Server process is not defined."
        let status="${status} + 1"
    # Verify that the server is running.
    if ! kill -0 "${FDBSERVERID}"
    elif ! kill -0 "${FDBSERVERID}"
    then
        log "FDB server process (${FDBSERVERID}) is not running"
        let status="${status} + 1"
        return 1

    # Display user message.
    elif ! displayMessage "Checking cluster availability"
    then
        log 'Failed to display user message'
        let status="${status} + 1"
        return 1

    # Determine if status json says the database is available.
    else
        avail=`"${BINDIR}/fdbcli" -C "${FDBCONF}" --exec 'status json' --timeout 10 2> /dev/null | grep -E '"database_available"|"available"' | grep 'true'`
        avail=`"${BINDIR}/fdbcli" -C "${FDBCONF}" --exec 'status json' --timeout "${SERVERCHECKS}" 2> /dev/null | grep -E '"database_available"|"available"' | grep 'true'`
        log "Avail value: ${avail}" "${DEBUGLEVEL}"
        if [[ -n "${avail}" ]] ; then
            return 0
            :
        else
            return 1
            let status="${status} + 1"
        fi
    fi
    return "${status}"
}

# Configure the database on the server.
function createDatabase {
function createDatabase
{
    local status=0

    if [ "${status}" -ne 0 ]; then
        :
    # Ensure that the server is running
@@ -262,7 +333,7 @@ function createDatabase {

    # Configure the database.
    else
        "${BINDIR}/fdbcli" -C "${FDBCONF}" --exec 'configure new single memory; status' --timeout 240 --log --log-dir "${LOGDIR}" &>> "${LOGDIR}/fdbclient.log"
        "${BINDIR}/fdbcli" -C "${FDBCONF}" --exec 'configure new single memory; status' --timeout "${CONFIGUREWAIT}" --log --log-dir "${LOGDIR}" &>> "${LOGDIR}/fdbclient.log"

        if ! displayMessage "Checking if config succeeded"
        then
@@ -270,7 +341,7 @@ function createDatabase {
        fi

        iteration=0
        while [[ "${iteration}" -lt 10 ]] && ! verifyAvailable
        while [[ "${iteration}" -lt "${SERVERCHECKS}" ]] && ! verifyAvailable
        do
            log "Database not created (iteration ${iteration})."
            let iteration="${iteration} + 1"
@@ -290,7 +361,10 @@ function createDatabase {
}

# Begin the local cluster from scratch.
function startCluster {
function startCluster
{
    local status=0

    if [ "${status}" -ne 0 ]; then
        :
    elif ! createDirectories
@@ -5,6 +5,8 @@ Release Notes
6.3.5
=====

* Report missing old tlogs information when in recovery before storage servers are fully recovered. `(PR #3706) <https://github.com/apple/foundationdb/pull/3706>`_

Features
--------
@@ -20,6 +20,7 @@

#include "boost/lexical_cast.hpp"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/Status.h"
#include "fdbclient/StatusClient.h"
#include "fdbclient/DatabaseContext.h"
@@ -1235,14 +1236,54 @@ void printStatus(StatusObjectReader statusObj, StatusClient::StatusLevel level,

    int minLoss = std::min(availLoss, dataLoss);
    const char *faultDomain = machinesAreZones ? "machine" : "zone";
    if (minLoss == 1)
        outputString += format("1 %s", faultDomain);
    else
        outputString += format("%d %ss", minLoss, faultDomain);
    outputString += format("%d %ss", minLoss, faultDomain);

    if (dataLoss > availLoss){
        outputString += format(" (%d without data loss)", dataLoss);
    }

    if (dataLoss == -1) {
        ASSERT_WE_THINK(availLoss == -1);
        outputString += format(
            "\n\n Warning: the database may have data loss and availability loss. Please restart "
            "following tlog interfaces, otherwise storage servers may never be able to catch "
            "up.\n");
        StatusObjectReader logs;
        if (statusObjCluster.has("logs")) {
            for (StatusObjectReader logEpoch : statusObjCluster.last().get_array()) {
                bool possiblyLosingData;
                if (logEpoch.get("possibly_losing_data", possiblyLosingData) &&
                    !possiblyLosingData) {
                    continue;
                }
                // Current epoch doesn't have an end version.
                int64_t epoch, beginVersion, endVersion = invalidVersion;
                bool current;
                logEpoch.get("epoch", epoch);
                logEpoch.get("begin_version", beginVersion);
                logEpoch.get("end_version", endVersion);
                logEpoch.get("current", current);
                std::string missing_log_interfaces;
                if (logEpoch.has("log_interfaces")) {
                    for (StatusObjectReader logInterface : logEpoch.last().get_array()) {
                        bool healthy;
                        std::string address, id;
                        if (logInterface.get("healthy", healthy) && !healthy) {
                            logInterface.get("id", id);
                            logInterface.get("address", address);
                            missing_log_interfaces += format("%s,%s ", id.c_str(), address.c_str());
                        }
                    }
                }
                outputString += format(
                    " %s log epoch: %ld begin: %ld end: %s, missing "
                    "log interfaces(id,address): %s\n",
                    current ? "Current" : "Old", epoch, beginVersion,
                    endVersion == invalidVersion ? "(unknown)" : format("%ld", endVersion).c_str(),
                    missing_log_interfaces.c_str());
            }
        }
    }
}
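For orientation, when `possibly_losing_data` is true for an epoch, the branch above renders roughly the following in the `status` output; the values below reuse the sample entries from the status schema later in this diff and are illustrative only:

    Warning: the database may have data loss and availability loss. Please restart following tlog interfaces, otherwise storage servers may never be able to catch up.
     Old log epoch: 1 begin: 23 end: 112315141, missing log interfaces(id,address): 7f8d623d0cb9966e,1.2.3.4:1234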
@@ -33,6 +33,7 @@
#include "fdbclient/DatabaseContext.h"
#include "fdbrpc/simulator.h"
#include "fdbclient/StatusClient.h"
#include "flow/Trace.h"
#include "flow/UnitTest.h"
#include "fdbrpc/ReplicationPolicy.h"
#include "fdbrpc/Replication.h"
@@ -278,15 +278,20 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
        "run_loop_busy":0.2
      }
    },
    "old_logs":[
    "logs":[
      {
        "logs":[
        "log_interfaces":[
          {
            "id":"7f8d623d0cb9966e",
            "healthy":true,
            "address":"1.2.3.4:1234"
          }
        ],
        "epoch":1,
        "current":false,
        "begin_version":23,
        "end_version":112315141,
        "possibly_losing_data":true,
        "log_replication_factor":3,
        "log_write_anti_quorum":0,
        "log_fault_tolerance":2,
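With `old_logs` renamed to `logs`, the nested list renamed to `log_interfaces`, and each generation tagged with `epoch`, `current`, and `possibly_losing_data`, a quick way to inspect the new shape is to pull it out of `status json` (sketch only; assumes jq is installed and that the field paths follow the schema above):

    fdbcli -C fdb.cluster --exec 'status json' --timeout 10 \
        | jq '.cluster.logs[] | {epoch, current, possibly_losing_data, log_fault_tolerance}'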
@@ -2012,78 +2012,98 @@ ACTOR static Future<JsonBuilderObject> clusterSummaryStatisticsFetcher(WorkerEve
    return statusObj;
}

static JsonBuilderArray oldTlogFetcher(int* oldLogFaultTolerance, Reference<AsyncVar<ServerDBInfo>> db, std::unordered_map<NetworkAddress, WorkerInterface> const& address_workers) {
    JsonBuilderArray oldTlogsArray;
static JsonBuilderObject tlogFetcher(int* logFaultTolerance, const std::vector<TLogSet>& tLogs,
                                     std::unordered_map<NetworkAddress, WorkerInterface> const& address_workers) {
    JsonBuilderObject statusObj;
    JsonBuilderArray logsObj;
    Optional<int32_t> sat_log_replication_factor, sat_log_write_anti_quorum, sat_log_fault_tolerance,
        log_replication_factor, log_write_anti_quorum, log_fault_tolerance, remote_log_replication_factor,
        remote_log_fault_tolerance;

    if(db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS) {
        for(auto it : db->get().logSystemConfig.oldTLogs) {
            JsonBuilderObject statusObj;
            JsonBuilderArray logsObj;
            Optional<int32_t> sat_log_replication_factor, sat_log_write_anti_quorum, sat_log_fault_tolerance, log_replication_factor, log_write_anti_quorum, log_fault_tolerance, remote_log_replication_factor, remote_log_fault_tolerance;
    int maxFaultTolerance = 0;

            int maxFaultTolerance = 0;

            for(int i = 0; i < it.tLogs.size(); i++) {
                int failedLogs = 0;
                for(auto& log : it.tLogs[i].tLogs) {
                    JsonBuilderObject logObj;
                    bool failed = !log.present() || !address_workers.count(log.interf().address());
                    logObj["id"] = log.id().shortString();
                    logObj["healthy"] = !failed;
                    if(log.present()) {
                        logObj["address"] = log.interf().address().toString();
                    }
                    logsObj.push_back(logObj);
                    if(failed) {
                        failedLogs++;
                    }
                }
                maxFaultTolerance = std::max(maxFaultTolerance, it.tLogs[i].tLogReplicationFactor - 1 - it.tLogs[i].tLogWriteAntiQuorum - failedLogs);
                if(it.tLogs[i].isLocal && it.tLogs[i].locality == tagLocalitySatellite) {
                    sat_log_replication_factor = it.tLogs[i].tLogReplicationFactor;
                    sat_log_write_anti_quorum = it.tLogs[i].tLogWriteAntiQuorum;
                    sat_log_fault_tolerance = it.tLogs[i].tLogReplicationFactor - 1 - it.tLogs[i].tLogWriteAntiQuorum - failedLogs;
                }
                else if(it.tLogs[i].isLocal) {
                    log_replication_factor = it.tLogs[i].tLogReplicationFactor;
                    log_write_anti_quorum = it.tLogs[i].tLogWriteAntiQuorum;
                    log_fault_tolerance = it.tLogs[i].tLogReplicationFactor - 1 - it.tLogs[i].tLogWriteAntiQuorum - failedLogs;
                }
                else {
                    remote_log_replication_factor = it.tLogs[i].tLogReplicationFactor;
                    remote_log_fault_tolerance = it.tLogs[i].tLogReplicationFactor - 1 - failedLogs;
                }
    for (int i = 0; i < tLogs.size(); i++) {
        int failedLogs = 0;
        for (auto& log : tLogs[i].tLogs) {
            JsonBuilderObject logObj;
            bool failed = !log.present() || !address_workers.count(log.interf().address());
            logObj["id"] = log.id().shortString();
            logObj["healthy"] = !failed;
            if (log.present()) {
                logObj["address"] = log.interf().address().toString();
            }
            *oldLogFaultTolerance = std::min(*oldLogFaultTolerance, maxFaultTolerance);
            statusObj["logs"] = logsObj;

            if (sat_log_replication_factor.present())
                statusObj["satellite_log_replication_factor"] = sat_log_replication_factor.get();
            if (sat_log_write_anti_quorum.present())
                statusObj["satellite_log_write_anti_quorum"] = sat_log_write_anti_quorum.get();
            if (sat_log_fault_tolerance.present())
                statusObj["satellite_log_fault_tolerance"] = sat_log_fault_tolerance.get();

            if (log_replication_factor.present())
                statusObj["log_replication_factor"] = log_replication_factor.get();
            if (log_write_anti_quorum.present())
                statusObj["log_write_anti_quorum"] = log_write_anti_quorum.get();
            if (log_fault_tolerance.present())
                statusObj["log_fault_tolerance"] = log_fault_tolerance.get();

            if (remote_log_replication_factor.present())
                statusObj["remote_log_replication_factor"] = remote_log_replication_factor.get();
            if (remote_log_fault_tolerance.present())
                statusObj["remote_log_fault_tolerance"] = remote_log_fault_tolerance.get();

            oldTlogsArray.push_back(statusObj);
            logsObj.push_back(logObj);
            if (failed) {
                failedLogs++;
            }
        }
        // The log generation's fault tolerance is the maximum tlog fault tolerance of each region.
        maxFaultTolerance =
            std::max(maxFaultTolerance, tLogs[i].tLogReplicationFactor - 1 - tLogs[i].tLogWriteAntiQuorum - failedLogs);
        if (tLogs[i].isLocal && tLogs[i].locality == tagLocalitySatellite) {
            sat_log_replication_factor = tLogs[i].tLogReplicationFactor;
            sat_log_write_anti_quorum = tLogs[i].tLogWriteAntiQuorum;
            sat_log_fault_tolerance = tLogs[i].tLogReplicationFactor - 1 - tLogs[i].tLogWriteAntiQuorum - failedLogs;
        } else if (tLogs[i].isLocal) {
            log_replication_factor = tLogs[i].tLogReplicationFactor;
            log_write_anti_quorum = tLogs[i].tLogWriteAntiQuorum;
            log_fault_tolerance = tLogs[i].tLogReplicationFactor - 1 - tLogs[i].tLogWriteAntiQuorum - failedLogs;
        } else {
            remote_log_replication_factor = tLogs[i].tLogReplicationFactor;
            remote_log_fault_tolerance = tLogs[i].tLogReplicationFactor - 1 - failedLogs;
        }
    }
    *logFaultTolerance = std::min(*logFaultTolerance, maxFaultTolerance);
    statusObj["log_interfaces"] = logsObj;
    // We may lose logs in this log generation, storage servers may never be able to catch up this log
    // generation.
    statusObj["possibly_losing_data"] = maxFaultTolerance < 0;

    return oldTlogsArray;
    if (sat_log_replication_factor.present())
        statusObj["satellite_log_replication_factor"] = sat_log_replication_factor.get();
    if (sat_log_write_anti_quorum.present())
        statusObj["satellite_log_write_anti_quorum"] = sat_log_write_anti_quorum.get();
    if (sat_log_fault_tolerance.present()) statusObj["satellite_log_fault_tolerance"] = sat_log_fault_tolerance.get();

    if (log_replication_factor.present()) statusObj["log_replication_factor"] = log_replication_factor.get();
    if (log_write_anti_quorum.present()) statusObj["log_write_anti_quorum"] = log_write_anti_quorum.get();
    if (log_fault_tolerance.present()) statusObj["log_fault_tolerance"] = log_fault_tolerance.get();

    if (remote_log_replication_factor.present())
        statusObj["remote_log_replication_factor"] = remote_log_replication_factor.get();
    if (remote_log_fault_tolerance.present())
        statusObj["remote_log_fault_tolerance"] = remote_log_fault_tolerance.get();

    return statusObj;
}

static JsonBuilderObject faultToleranceStatusFetcher(DatabaseConfiguration configuration, ServerCoordinators coordinators, std::vector<WorkerDetails>& workers, int extraTlogEligibleZones, int minReplicasRemaining, bool underMaintenance) {
static JsonBuilderArray tlogFetcher(int* logFaultTolerance, Reference<AsyncVar<ServerDBInfo>> db,
                                    std::unordered_map<NetworkAddress, WorkerInterface> const& address_workers) {
    JsonBuilderArray tlogsArray;
    JsonBuilderObject tlogsStatus;
    tlogsStatus = tlogFetcher(logFaultTolerance, db->get().logSystemConfig.tLogs, address_workers);
    tlogsStatus["epoch"] = db->get().logSystemConfig.epoch;
    tlogsStatus["current"] = true;
    if (db->get().logSystemConfig.recoveredAt.present()) {
        tlogsStatus["begin_version"] = db->get().logSystemConfig.recoveredAt.get();
    }
    tlogsArray.push_back(tlogsStatus);
    for (auto it : db->get().logSystemConfig.oldTLogs) {
        JsonBuilderObject oldTlogsStatus = tlogFetcher(logFaultTolerance, it.tLogs, address_workers);
        oldTlogsStatus["epoch"] = it.epoch;
        oldTlogsStatus["current"] = false;
        oldTlogsStatus["begin_version"] = it.epochBegin;
        oldTlogsStatus["end_version"] = it.epochEnd;
        tlogsArray.push_back(oldTlogsStatus);
    }
    return tlogsArray;
}

static JsonBuilderObject faultToleranceStatusFetcher(DatabaseConfiguration configuration,
                                                     ServerCoordinators coordinators,
                                                     std::vector<WorkerDetails>& workers, int extraTlogEligibleZones,
                                                     int minReplicasRemaining, int oldLogFaultTolerance,
                                                     bool underMaintenance) {
    JsonBuilderObject statusObj;

    // without losing data
@@ -2115,17 +2135,18 @@ static JsonBuilderObject faultToleranceStatusFetcher(DatabaseConfiguration confi
        }
        maxCoordinatorZoneFailures += 1;
    }

    // max zone failures that we can tolerate to not lose data
    int zoneFailuresWithoutLosingData = std::min(maxZoneFailures, maxCoordinatorZoneFailures);

    if (minReplicasRemaining >= 0){
        zoneFailuresWithoutLosingData = std::min(zoneFailuresWithoutLosingData, minReplicasRemaining - 1);
    }

    statusObj["max_zone_failures_without_losing_data"] = std::max(zoneFailuresWithoutLosingData, 0);

    // without losing availablity
    statusObj["max_zone_failures_without_losing_availability"] = std::max(std::min(extraTlogEligibleZones, zoneFailuresWithoutLosingData), 0);
    // oldLogFaultTolerance means max failures we can tolerate to lose logs data. -1 means we lose data or availability.
    zoneFailuresWithoutLosingData = std::max(std::min(zoneFailuresWithoutLosingData, oldLogFaultTolerance), -1);
    statusObj["max_zone_failures_without_losing_data"] = zoneFailuresWithoutLosingData;
    statusObj["max_zone_failures_without_losing_availability"] =
        std::max(std::min(extraTlogEligibleZones, zoneFailuresWithoutLosingData), -1);
    return statusObj;
}
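Because the clamp above now bottoms out at -1 instead of 0, `max_zone_failures_without_losing_data` and `max_zone_failures_without_losing_availability` can report -1 when a log generation may already be losing data. A hedged spot check (assumes jq; field path follows the status output assembled below):

    fdbcli -C fdb.cluster --exec 'status json' --timeout 10 | jq '.cluster.fault_tolerance'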
@@ -2521,14 +2542,16 @@ ACTOR Future<StatusReply> clusterGetStatus(
    futures2.push_back(clusterSummaryStatisticsFetcher(pMetrics, storageServerFuture, tLogFuture, &status_incomplete_reasons));
    state std::vector<JsonBuilderObject> workerStatuses = wait(getAll(futures2));

    int oldLogFaultTolerance = 100;
    if(db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS && db->get().logSystemConfig.oldTLogs.size() > 0) {
        statusObj["old_logs"] = oldTlogFetcher(&oldLogFaultTolerance, db, address_workers);
    int logFaultTolerance = 100;
    if (db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS) {
        statusObj["logs"] = tlogFetcher(&logFaultTolerance, db, address_workers);
    }

    if(configuration.present()) {
        int extraTlogEligibleZones = getExtraTLogEligibleZones(workers, configuration.get());
        statusObj["fault_tolerance"] = faultToleranceStatusFetcher(configuration.get(), coordinators, workers, extraTlogEligibleZones, minReplicasRemaining, loadResult.present() && loadResult.get().healthyZone.present());
        statusObj["fault_tolerance"] = faultToleranceStatusFetcher(
            configuration.get(), coordinators, workers, extraTlogEligibleZones, minReplicasRemaining,
            logFaultTolerance, loadResult.present() && loadResult.get().healthyZone.present());
    }

    state JsonBuilderObject configObj =
@@ -55,8 +55,7 @@ struct AtomicOpsWorkload : TestWorkload {
        ubsum = 0;

        int64_t randNum = sharedRandomNumber / 10;
        if(opType == -1)
            opType = randNum % 8;
        if (opType == -1) opType = randNum % 10;

        switch(opType) {
        case 0:
@@ -91,6 +90,18 @@ struct AtomicOpsWorkload : TestWorkload {
            TEST(true); //Testing atomic ByteMax
            opType = MutationRef::ByteMax;
            break;
        case 8:
            TEST(true); // Testing atomic MinV2
            opType = MutationRef::MinV2;
            break;
        case 9:
            TEST(true); // Testing atomic AndV2
            opType = MutationRef::AndV2;
            break;
        // case 10:
        // 	TEST(true); // Testing atomic CompareAndClear Not supported yet
        // 	opType = MutationRef::CompareAndClear
        // 	break;
        default:
            ASSERT(false);
        }